allow utf-8 in xml names (liborcus) (tdf#141672)

Change-Id: Ib150d55b588a572e4352396f18de2331983b2aae Reviewed-on: https://gerrit.libreoffice.org/c/core/+/114892 Tested-by: Jenkins Reviewed-by: Luboš Luňák <l.lunak@collabora.com>
author: Luboš Luňák <l.lunak@collabora.com> 2021-04-29 20:10:34 +0200
committer: Andras Timar <andras.timar@collabora.com> 2021-05-10 23:31:56 +0200
commit: 036e62326d08cfdcda8b390720ab9c0cf9f8c3f7 (patch)
tree: 394b2d9b7b93b09e7dd717029be6a50a38a24273
parent: 70acef544db618ded97e9550a2519930d30e3fba (diff)
2 files changed, 293 insertions, 0 deletions
diff --git a/external/liborcus/UnpackedTarball_liborcus.mk b/external/liborcus/UnpackedTarball_liborcus.mk
index 791436e66016..6df45cb2fb39 100644
--- a/external/liborcus/UnpackedTarball_liborcus.mk
+++ b/external/liborcus/UnpackedTarball_liborcus.mk
@@ -21,6 +21,10 @@ $(eval $(call gb_UnpackedTarball_add_patches,liborcus,\
 	external/liborcus/0001-protect-the-self-closing-xml-element-code-against-se.patch \
 ))
 
+$(eval $(call gb_UnpackedTarball_add_patches,liborcus,\
+	external/liborcus/allow-utf-8-in-xml-names.patch \
+))
+
 ifeq ($(OS),WNT)
 $(eval $(call gb_UnpackedTarball_add_patches,liborcus,\
 	external/liborcus/windows-constants-hack.patch \
diff --git a/external/liborcus/allow-utf-8-in-xml-names.patch b/external/liborcus/allow-utf-8-in-xml-names.patch
new file mode 100644
index 000000000000..efef24b84053
--- /dev/null
+++ b/external/liborcus/allow-utf-8-in-xml-names.patch
@@ -0,0 +1,289 @@
+From 9889cb660372bc6c3da22fc274c73ea11040415f Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz>
+Date: Thu, 29 Apr 2021 19:12:20 +0200
+Subject: [PATCH] allow utf-8 in xml names (#137)
+
+https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
+has a list of all allowed characters.
+---
+ include/orcus/sax_parser_base.hpp |   3 +
+ src/orcus_test_xml.cpp            |   1 +
+ src/parser/sax_parser_base.cpp    | 186 ++++++++++++++++++++++++++++--
+ test/xml/non-ascii/check.txt      |   4 +
+ test/xml/non-ascii/input.xml      |   4 +
+ 5 files changed, 190 insertions(+), 8 deletions(-)
+ create mode 100644 test/xml/non-ascii/check.txt
+ create mode 100644 test/xml/non-ascii/input.xml
+
+diff --git a/include/orcus/sax_parser_base.hpp b/include/orcus/sax_parser_base.hpp
+index a117b3a2..1aeb8b38 100644
+--- a/include/orcus/sax_parser_base.hpp
++++ b/include/orcus/sax_parser_base.hpp
+@@ -217,6 +217,9 @@ protected:
+     void element_name(parser_element& elem, const char* begin_pos);
+     void attribute_name(pstring& attr_ns, pstring& attr_name);
+     void characters_with_encoded_char(cell_buffer& buf);
++
++    int is_name_char();
++    int is_name_start_char();
+ };
+ 
+ }}
+diff --git a/src/orcus_test_xml.cpp b/src/orcus_test_xml.cpp
+index 98e83297..89c8af72 100644
+--- a/src/orcus_test_xml.cpp
++++ b/src/orcus_test_xml.cpp
+@@ -73,6 +73,7 @@ const char* sax_parser_test_dirs[] = {
+     SRCDIR"/test/xml/bom/",
+     SRCDIR"/test/xml/custom-decl-1/",
+     SRCDIR"/test/xml/cdata-1/",
++    SRCDIR"/test/xml/non-ascii/",
+ };
+ 
+ const char* sax_parser_parse_only_test_dirs[] = {
+diff --git a/src/parser/sax_parser_base.cpp b/src/parser/sax_parser_base.cpp
+index 743130da..ecbd7f99 100644
+--- a/src/parser/sax_parser_base.cpp
++++ b/src/parser/sax_parser_base.cpp
+@@ -296,20 +296,22 @@ void parser_base::value_with_encoded_char(cell_buffer& buf, pstring& str)
+         str = pstring(buf.get(), buf.size());
+ 
+     // Skip the closing quote.
+-    assert(cur_char() == '"');
++    assert(!has_char() || cur_char() == '"');
+     next();
+ }
+ 
+ bool parser_base::value(pstring& str, bool decode)
+ {
+     char c = cur_char();
+-    if (c != '"')
++    if (c != '"' && c != '\'')
+         throw malformed_xml_error("value must be quoted", offset());
+ 
++    char quote_char = c;
++
+     c = next_char_checked();
+ 
+     const char* p0 = mp_char;
+-    for (; c != '"'; c = next_char_checked())
++    for (; c != quote_char; c = next_char_checked())
+     {
+         if (decode && c == '&')
+         {
+@@ -330,19 +332,187 @@ bool parser_base::value(pstring& str, bool decode)
+     return false;
+ }
+ 
++// Classifies the UTF-8 encoded character at mp_char against the XML 1.1 name rules:
++// https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
++// only_start_name == true checks NameStartChar, false checks the wider NameChar.
++// mp_end is assumed to point one past the last readable byte, and mp_char[0] itself
++// must be readable; bytes after it are only read after a bounds check.
++// Return length of the character in bytes, otherwise 0.
++template< bool only_start_name >
++static
++int is_name_char_helper(const char* mp_char, const char* mp_end)
++{
++    const unsigned char first = mp_char[0];
++    // Note that ':' technically is an allowed name character, but it is handled separately
++    // e.g. in element_name(), so here pretend it isn't.
++    if (/*first == ':' ||*/ first == '_' || (first >= 'A' && first <= 'Z') || (first >= 'a' && first <= 'z'))
++        return 1;
++    if (!only_start_name && (first == '-' || first == '.' || (first >= '0' && first <= '9')))
++        return 1;
++
++    if (first <= 0x7f) // other ascii characters are not allowed
++        return 0;
++    if (mp_end - mp_char < 2) // the second byte must lie inside the buffer
++        return 0;
++    const unsigned char second = mp_char[1];
++
++    // 0xb7 = 0xc2 0xb7 utf-8
++    if (!only_start_name && first == 0xc2 && second == 0xb7)
++        return 2;
++
++    // [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
++    // 0xc0 = 0xc3 0x80 utf-8
++    if (first < 0xc3)
++        return 0;
++    // xd7 = 0xc3 0x97 utf-8, 0xf7 = 0xc3 0xb7 utf-8
++    if (first == 0xc3)
++        return second >= 0x80 && second <= 0xbf && second != 0x97 && second != 0xb7 ? 2 : 0;
++    // 0x2ff = 0xcb 0xbf utf-8, 0x300 = 0xcc 0x80 utf-8
++    if (first >= 0xc4 && first <= 0xcb)
++        return 2;
++
++    // [#x0300-#x036F]
++    // 0x0300 = 0xcc 0x80 utf-8, 0x36f = 0xcd 0xaf utf-8
++    if (!only_start_name && first == 0xcc)
++        return 2;
++    if (!only_start_name && first == 0xcd && second <= 0xaf)
++        return 2;
++
++    // [#x370-#x37D] | [#x37F-#x1FFF]
++    // 0x370 = 0xcd 0xb0 utf-8, 0x37e = 0xcd 0xbe
++    if (first < 0xcd)
++        return 0;
++    if (first == 0xcd)
++        return second >= 0xb0 && second != 0xbe ? 2 : 0;
++    // 0x07ff = 0xdf 0xbf utf-8 (the last 2-byte utf-8)
++    if (first <= 0xdf)
++        return 2;
++
++    if (mp_end - mp_char < 3) // from here on first >= 0xe0, so a third byte is needed
++        return 0;
++    const unsigned char third = mp_char[2];
++
++    // 0x0800 = 0xe0 0xa0 0x80 utf-8, 0x1fff = 0xe1 0xbf 0xbf utf-8, 0x2000 = 0xe2 0x80 0x80
++    if (first == 0xe0 || first == 0xe1)
++        return 3;
++
++    // [#x200C-#x200D]
++    // 0x200c = 0xe2 0x80 0x8c utf-8, 0x200d = 0xe2 0x80 0x8d utf-8
++    if (first == 0xe2 && second == 0x80 && (third == 0x8c || third == 0x8d))
++        return 3;
++
++    // [#x203F-#x2040]
++    // 0x203f = 0xe2 0x80 0xbf utf-8, 0x2040 = 0xe2 0x81 0x80 utf-8
++    if (!only_start_name && first == 0xe2 && second == 0x80 && third == 0xbf)
++        return 3;
++    if (!only_start_name && first == 0xe2 && second == 0x81 && third == 0x80)
++        return 3;
++
++    // [#x2070-#x218F]
++    // 0x2070 = 0xe2 0x81 0xb0 utf-8, 0x218f = 0xe2 0x86 0x8f utf-8
++    if (first == 0xe2)
++    {
++        // 0xe2 0x81 with a third byte below 0xb0 is #x2040-#x206F, outside this range.
++        if (second == 0x81 && third >= 0xb0)
++            return 3;
++        if (second > 0x81 && second < 0x86)
++            return 3;
++        if (second == 0x86 && third <= 0x8f)
++            return 3;
++    }
++
++    // [#x2C00-#x2FEF]
++    // 0x2c00 = 0xe2 0xb0 0x80 utf-8, 0x2fef = 0xe2 0xbf 0xaf utf-8
++    if (first == 0xe2)
++    {
++        if (second < 0xb0)
++            return 0;
++        if (second < 0xbf)
++            return 3;
++        if (second == 0xbf && third <= 0xaf)
++            return 3;
++    }
++
++    // [#x3001-#xD7FF]
++    // 0x3001 = 0xe3 0x80 0x81 utf-8, 0xd7ff = 0xed 0x9f 0xbf utf-8, 0xd800 = 0xed 0xa0 0x80 utf-8
++    if (first < 0xe3 || (first == 0xe3 && second == 0x80 && third < 0x81))
++        return 0;
++    if (first < 0xed)
++        return 3;
++    if (first == 0xed && second <= 0x9f)
++        return 3;
++
++    // [#xF900-#xFDCF]
++    // 0xf900 = 0xef 0xa4 0x80 utf-8, 0xfdcf = 0xef 0xb7 0x8f utf-8
++    if (first == 0xef)
++    {
++        if (second < 0xa4)
++            return 0;
++        if (second < 0xb7)
++            return 3;
++        if (second == 0xb7 && third <= 0x8f)
++            return 3;
++    }
++
++    // [#xFDF0-#xFFFD]
++    // 0xfdf0 = 0xef 0xb7 0xb0 utf-8, 0xfffd = 0xef 0xbf 0xbd utf-8
++    if (first == 0xef)
++    {
++        assert(second >= 0xb7);
++        if (second == 0xb7 && third < 0xb0)
++            return 0;
++        if (second < 0xbf) // 0xb8-0xbe lie entirely inside the range
++            return 3;
++        if (second == 0xbf && third <= 0xbd)
++            return 3;
++    }
++
++    if (first < 0xf0)
++        return 0;
++    if (mp_end - mp_char < 4) // all four bytes must lie inside the buffer
++        return 0;
++
++    // [#x10000-#xEFFFF]
++    // 0x10000 = 0xf0 0x90 0x80 0x80 utf-8, 0xeffff = 0xf3 0xaf 0xbf 0xbf utf-8,
++    // 0xf0000 = 0xf3 0xb0 0x80 0x80 utf-8
++    if (first >= 0xf0 && first <= 0xf2) // lead bytes 0xf0-0xf2 are wholly inside the range
++        return 4;
++    if (first == 0xf3 && second < 0xb0)
++        return 4;
++
++    return 0;
++}
++
++int parser_base::is_name_char()
++{ // NameChar test at mp_char; yields the helper's result (byte length of the character, or 0).
++    return is_name_char_helper<false>(mp_char, mp_end);
++}
++
++int parser_base::is_name_start_char()
++{ // NameStartChar test at mp_char; yields the helper's result (byte length of the character, or 0).
++    return is_name_char_helper<true>(mp_char, mp_end);
++}
++
author	Luboš Luňák <l.lunak@collabora.com>	2021-04-29 20:10:34 +0200
committer	Andras Timar <andras.timar@collabora.com>	2021-05-10 23:31:56 +0200
commit	036e62326d08cfdcda8b390720ab9c0cf9f8c3f7 (patch)
tree	394b2d9b7b93b09e7dd717029be6a50a38a24273
parent	70acef544db618ded97e9550a2519930d30e3fba (diff)