diff options
author | Stephan Bergmann <sbergman@redhat.com> | 2022-02-01 15:47:07 +0100 |
---|---|---|
committer | Stephan Bergmann <sbergman@redhat.com> | 2022-02-01 23:36:55 +0100 |
commit | 2f3a0bfbfe110c0837b3c7e04f9ad0969d6e56e4 (patch) | |
tree | f2cb620fb9a93b1dcfc12b6bc83225c75e461383 /sax/source | |
parent | 3eac118b31ed569dbcec845fc32386ebd0022f73 (diff) |
tdf#147088: Also handle U+FFFE, U+FFFF invalid XML 1.0 characters
Change-Id: Ieec81fcde41e3508c6a9aa4250d7050db2fbb442
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/129296
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
Diffstat (limited to 'sax/source')
-rw-r--r-- | sax/source/tools/fastserializer.cxx | 54 |
1 files changed, 35 insertions, 19 deletions
diff --git a/sax/source/tools/fastserializer.cxx b/sax/source/tools/fastserializer.cxx index de035f4717fd..8dcc308a836e 100644 --- a/sax/source/tools/fastserializer.cxx +++ b/sax/source/tools/fastserializer.cxx @@ -25,8 +25,11 @@ #include <comphelper/processfactory.hxx> #include <comphelper/sequence.hxx> +#include <cassert> +#include <optional> #include <string.h> #include <string_view> +#include <utility> #if OSL_DEBUG_LEVEL > 0 #include <iostream> @@ -104,19 +107,33 @@ namespace sax_fastparser { /** Characters not allowed in XML 1.0 XML 1.1 would exclude only U+0000 */ - static bool invalidChar( char c ) + template<typename Int> static std::optional<std::pair<unsigned, Int>> invalidChar( + char const * string, Int length, Int index ) { - if (static_cast<unsigned char>(c) >= 0x20) - return false; + assert(index < length); + auto const c = string[index]; + + if (static_cast<unsigned char>(c) >= 0x20 && c != '\xEF') + return {}; switch (c) { case 0x09: case 0x0a: case 0x0d: - return false; + return {}; + case '\xEF': // U+FFFE, U+FFFF: + if (length - index >= 3 && string[index + 1] == '\xBF') { + switch (string[index + 2]) { + case '\xBE': + return std::pair(0xFFFE, 3); + case '\xBF': + return std::pair(0xFFFF, 3); + } + } + return {}; } - return true; + return std::pair(static_cast<unsigned char>(c), 1); } static bool isHexDigit( char c ) @@ -139,7 +156,7 @@ namespace sax_fastparser { const sal_Int32 kXescapeLen = 7; char bufXescape[kXescapeLen+1]; sal_Int32 nNextXescape = 0; - for (sal_Int32 i = 0; i < nLen; ++i) + for (sal_Int32 i = 0; i < nLen;) { char c = pStr[ i ]; switch( c ) @@ -250,24 +267,19 @@ namespace sax_fastparser { break; } } - if (invalidChar(c)) + if (auto const inv = invalidChar(pStr, nLen, i)) { snprintf( bufXescape, kXescapeLen+1, "_x%04x_", - static_cast<unsigned int>(static_cast<unsigned char>(c))); + inv->first); writeBytes( bufXescape, kXescapeLen); - break; + i += inv->second; + continue; } - /* TODO: also U+FFFE and U+FFFF are not allowed - * in XML 1.0, assuming we're writing UTF-8 - * those should be escaped as well to be - * conformant. Likely that would involve - * scanning for both encoded sequences and - * write as _xHHHH_? */ } #if OSL_DEBUG_LEVEL > 0 else { - if (bGood && invalidChar(pStr[i])) + if (bGood && invalidChar(pStr, nLen, i)) { bGood = false; // The SAL_WARN() for the single character is @@ -279,6 +291,7 @@ namespace sax_fastparser { writeBytes( &c, 1 ); break; } + ++i; } SAL_WARN_IF( !bGood && nLen > 1, "sax", "in '" << OString(pStr,std::min<sal_Int32>(nLen,42)) << "'"); } @@ -671,14 +684,17 @@ namespace sax_fastparser { #if OSL_DEBUG_LEVEL > 0 { bool bGood = true; - for (size_t i=0; i < nLen; ++i) + for (size_t i=0; i < nLen;) { - if (invalidChar(pStr[i])) + if (auto const inv = invalidChar(pStr, nLen, i)) { bGood = false; SAL_WARN("sax", "FastSaxSerializer::writeBytes - illegal XML character 0x" << - std::hex << int(static_cast<unsigned char>(pStr[i]))); + std::hex << inv->first); + i += inv->second; + continue; } + ++i; } SAL_WARN_IF( !bGood && nLen > 1, "sax", "in '" << OString(pStr,std::min<sal_Int32>(nLen,42)) << "'"); } |