From 2f3a0bfbfe110c0837b3c7e04f9ad0969d6e56e4 Mon Sep 17 00:00:00 2001 From: Stephan Bergmann Date: Tue, 1 Feb 2022 15:47:07 +0100 Subject: tdf#147088: Also handle U+FFFE, U+FFFF invalid XML 1.0 characters Change-Id: Ieec81fcde41e3508c6a9aa4250d7050db2fbb442 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/129296 Tested-by: Jenkins Reviewed-by: Stephan Bergmann --- sax/source/tools/fastserializer.cxx | 54 ++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 19 deletions(-) (limited to 'sax') diff --git a/sax/source/tools/fastserializer.cxx b/sax/source/tools/fastserializer.cxx index de035f4717fd..8dcc308a836e 100644 --- a/sax/source/tools/fastserializer.cxx +++ b/sax/source/tools/fastserializer.cxx @@ -25,8 +25,11 @@ #include #include +#include +#include #include #include +#include #if OSL_DEBUG_LEVEL > 0 #include @@ -104,19 +107,33 @@ namespace sax_fastparser { /** Characters not allowed in XML 1.0 XML 1.1 would exclude only U+0000 */ - static bool invalidChar( char c ) + template static std::optional> invalidChar( + char const * string, Int length, Int index ) { - if (static_cast(c) >= 0x20) - return false; + assert(index < length); + auto const c = string[index]; + + if (static_cast(c) >= 0x20 && c != '\xEF') + return {}; switch (c) { case 0x09: case 0x0a: case 0x0d: - return false; + return {}; + case '\xEF': // U+FFFE, U+FFFF: + if (length - index >= 3 && string[index + 1] == '\xBF') { + switch (string[index + 2]) { + case '\xBE': + return std::pair(0xFFFE, 3); + case '\xBF': + return std::pair(0xFFFF, 3); + } + } + return {}; } - return true; + return std::pair(static_cast(c), 1); } static bool isHexDigit( char c ) @@ -139,7 +156,7 @@ namespace sax_fastparser { const sal_Int32 kXescapeLen = 7; char bufXescape[kXescapeLen+1]; sal_Int32 nNextXescape = 0; - for (sal_Int32 i = 0; i < nLen; ++i) + for (sal_Int32 i = 0; i < nLen;) { char c = pStr[ i ]; switch( c ) @@ -250,24 +267,19 @@ namespace sax_fastparser { break; } } - if (invalidChar(c)) + if (auto const inv = invalidChar(pStr, nLen, i)) { snprintf( bufXescape, kXescapeLen+1, "_x%04x_", - static_cast(static_cast(c))); + inv->first); writeBytes( bufXescape, kXescapeLen); - break; + i += inv->second; + continue; } - /* TODO: also U+FFFE and U+FFFF are not allowed - * in XML 1.0, assuming we're writing UTF-8 - * those should be escaped as well to be - * conformant. Likely that would involve - * scanning for both encoded sequences and - * write as _xHHHH_? */ } #if OSL_DEBUG_LEVEL > 0 else { - if (bGood && invalidChar(pStr[i])) + if (bGood && invalidChar(pStr, nLen, i)) { bGood = false; // The SAL_WARN() for the single character is @@ -279,6 +291,7 @@ namespace sax_fastparser { writeBytes( &c, 1 ); break; } + ++i; } SAL_WARN_IF( !bGood && nLen > 1, "sax", "in '" << OString(pStr,std::min(nLen,42)) << "'"); } @@ -671,14 +684,17 @@ namespace sax_fastparser { #if OSL_DEBUG_LEVEL > 0 { bool bGood = true; - for (size_t i=0; i < nLen; ++i) + for (size_t i=0; i < nLen;) { - if (invalidChar(pStr[i])) + if (auto const inv = invalidChar(pStr, nLen, i)) { bGood = false; SAL_WARN("sax", "FastSaxSerializer::writeBytes - illegal XML character 0x" << - std::hex << int(static_cast(pStr[i]))); + std::hex << inv->first); + i += inv->second; + continue; } + ++i; } SAL_WARN_IF( !bGood && nLen > 1, "sax", "in '" << OString(pStr,std::min(nLen,42)) << "'"); } -- cgit