summaryrefslogtreecommitdiff
path: root/sax/source
diff options
context:
space:
mode:
author: Stephan Bergmann <sbergman@redhat.com> 2022-02-01 15:47:07 +0100
committer: Stephan Bergmann <sbergman@redhat.com> 2022-02-01 23:36:55 +0100
commit: 2f3a0bfbfe110c0837b3c7e04f9ad0969d6e56e4 (patch)
tree: f2cb620fb9a93b1dcfc12b6bc83225c75e461383 /sax/source
parent: 3eac118b31ed569dbcec845fc32386ebd0022f73 (diff)
tdf#147088: Also handle U+FFFE, U+FFFF invalid XML 1.0 characters
Change-Id: Ieec81fcde41e3508c6a9aa4250d7050db2fbb442
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/129296
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
Diffstat (limited to 'sax/source')
-rw-r--r-- sax/source/tools/fastserializer.cxx | 54
1 files changed, 35 insertions, 19 deletions
diff --git a/sax/source/tools/fastserializer.cxx b/sax/source/tools/fastserializer.cxx
index de035f4717fd..8dcc308a836e 100644
--- a/sax/source/tools/fastserializer.cxx
+++ b/sax/source/tools/fastserializer.cxx
@@ -25,8 +25,11 @@
#include <comphelper/processfactory.hxx>
#include <comphelper/sequence.hxx>
+#include <cassert>
+#include <optional>
#include <string.h>
#include <string_view>
+#include <utility>
#if OSL_DEBUG_LEVEL > 0
#include <iostream>
@@ -104,19 +107,33 @@ namespace sax_fastparser {
/** Characters not allowed in XML 1.0
XML 1.1 would exclude only U+0000
*/
- static bool invalidChar( char c )
+ template<typename Int> static std::optional<std::pair<unsigned, Int>> invalidChar(
+ char const * string, Int length, Int index )
{
- if (static_cast<unsigned char>(c) >= 0x20)
- return false;
+ assert(index < length);
+ auto const c = string[index];
+
+ if (static_cast<unsigned char>(c) >= 0x20 && c != '\xEF')
+ return {};
switch (c)
{
case 0x09:
case 0x0a:
case 0x0d:
- return false;
+ return {};
+ case '\xEF': // U+FFFE, U+FFFF:
+ if (length - index >= 3 && string[index + 1] == '\xBF') {
+ switch (string[index + 2]) {
+ case '\xBE':
+ return std::pair(0xFFFE, 3);
+ case '\xBF':
+ return std::pair(0xFFFF, 3);
+ }
+ }
+ return {};
}
- return true;
+ return std::pair(static_cast<unsigned char>(c), 1);
}
static bool isHexDigit( char c )
@@ -139,7 +156,7 @@ namespace sax_fastparser {
const sal_Int32 kXescapeLen = 7;
char bufXescape[kXescapeLen+1];
sal_Int32 nNextXescape = 0;
- for (sal_Int32 i = 0; i < nLen; ++i)
+ for (sal_Int32 i = 0; i < nLen;)
{
char c = pStr[ i ];
switch( c )
@@ -250,24 +267,19 @@ namespace sax_fastparser {
break;
}
}
- if (invalidChar(c))
+ if (auto const inv = invalidChar(pStr, nLen, i))
{
snprintf( bufXescape, kXescapeLen+1, "_x%04x_",
- static_cast<unsigned int>(static_cast<unsigned char>(c)));
+ inv->first);
writeBytes( bufXescape, kXescapeLen);
- break;
+ i += inv->second;
+ continue;
}
- /* TODO: also U+FFFE and U+FFFF are not allowed
- * in XML 1.0, assuming we're writing UTF-8
- * those should be escaped as well to be
- * conformant. Likely that would involve
- * scanning for both encoded sequences and
- * write as _xHHHH_? */
}
#if OSL_DEBUG_LEVEL > 0
else
{
- if (bGood && invalidChar(pStr[i]))
+ if (bGood && invalidChar(pStr, nLen, i))
{
bGood = false;
// The SAL_WARN() for the single character is
@@ -279,6 +291,7 @@ namespace sax_fastparser {
writeBytes( &c, 1 );
break;
}
+ ++i;
}
SAL_WARN_IF( !bGood && nLen > 1, "sax", "in '" << OString(pStr,std::min<sal_Int32>(nLen,42)) << "'");
}
@@ -671,14 +684,17 @@ namespace sax_fastparser {
#if OSL_DEBUG_LEVEL > 0
{
bool bGood = true;
- for (size_t i=0; i < nLen; ++i)
+ for (size_t i=0; i < nLen;)
{
- if (invalidChar(pStr[i]))
+ if (auto const inv = invalidChar(pStr, nLen, i))
{
bGood = false;
SAL_WARN("sax", "FastSaxSerializer::writeBytes - illegal XML character 0x" <<
- std::hex << int(static_cast<unsigned char>(pStr[i])));
+ std::hex << inv->first);
+ i += inv->second;
+ continue;
}
+ ++i;
}
SAL_WARN_IF( !bGood && nLen > 1, "sax", "in '" << OString(pStr,std::min<sal_Int32>(nLen,42)) << "'");
}