diff options
author | dante <dante19031999@gmail.com> | 2021-01-01 19:42:37 +0100 |
---|---|---|
committer | Stephan Bergmann <sbergman@redhat.com> | 2021-01-19 08:19:46 +0100 |
commit | ae5ec5b944cf2378806498072c50d473a3ac62ed (patch) | |
tree | 56fd5795c9e4cbfa160a66d2cd8f9e6a28b91452 /sax | |
parent | 2197f69f1b91ff13d0dcd078685ab23466241197 (diff) |
Use customized XML entities on XML export.
This will mainly be used in MathML export for Unicode characters.
It will be used mostly for MathML.
Change-Id: I59b96d44facbd01fa517317a0ae54d64d29b0a19
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/108562
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
Diffstat (limited to 'sax')
-rw-r--r-- | sax/source/expatwrap/saxwriter.cxx | 242 |
1 files changed, 157 insertions, 85 deletions
diff --git a/sax/source/expatwrap/saxwriter.cxx b/sax/source/expatwrap/saxwriter.cxx index f6a58b0bb4cb..04fd90762cd9 100644 --- a/sax/source/expatwrap/saxwriter.cxx +++ b/sax/source/expatwrap/saxwriter.cxx @@ -117,6 +117,9 @@ private: /// @throws SAXException void FinishStartElement(); + // Search for the correct replacement + const ReplacementPair* findXMLReplacement(const sal_Unicode* pStr, sal_Int32 nStrLen); + public: explicit SaxWriterHelper(Reference<XOutputStream> const& m_TempOut) : m_out(m_TempOut) @@ -193,6 +196,10 @@ public: void setCustomEntityNames( const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements); + + // Calculate length for convertToXML + sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization, + bool bNormalizeWhitespace); }; const bool g_bValidCharsBelow32[32] = { @@ -282,10 +289,10 @@ bool SaxWriterHelper::convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen, for (sal_Int32 i = 0; i < nStrLen; i++) { - sal_uInt16 c = pStr[i]; + sal_Unicode c = pStr[i]; if (IsInvalidChar(c)) bRet = false; - else if ((c >= 0x0001) && (c <= 0x007F)) + else if ((c >= 0x0001) && (c <= 0x007F)) // Deal with ascii { if (bDoNormalization) { @@ -414,26 +421,80 @@ bool SaxWriterHelper::convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen, rPos++; } } - else if (c >= 0xd800 && c < 0xdc00) - { - // 1. surrogate: save (until 2. surrogate) - OSL_ENSURE(nSurrogate == 0, "left-over Unicode surrogate"); - nSurrogate = ((c & 0x03ff) + 0x0040); - } - else if (c >= 0xdc00 && c < 0xe000) + else { - // 2. 
surrogate: write as UTF-8 - OSL_ENSURE(nSurrogate != 0, "lone 2nd Unicode surrogate"); + // Deal with replacements + if (bDoNormalization && !m_Replacements.empty()) + { + // search + const ReplacementPair* it = findXMLReplacement(&pStr[i], nStrLen - i); + + // replace + if (it != nullptr) + { + OString name = ::rtl::OUStringToOString(it->name, RTL_TEXTENCODING_UTF8); + if (rPos + name.getLength() > SEQUENCESIZE) + AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>(name.getStr()), + name.getLength()); + else + { + memcpy(&(pTarget[rPos]), name.getStr(), name.getLength()); + rPos += name.getLength(); + } + i += it->replacement.getLength() - 1; + continue; + } + } + + // Deal with other uniciode cases + if (c >= 0xd800 && c < 0xdc00) + { + // 1. surrogate: save (until 2. surrogate) + OSL_ENSURE(nSurrogate == 0, "left-over Unicode surrogate"); + nSurrogate = ((c & 0x03ff) + 0x0040); + } + else if (c >= 0xdc00 && c < 0xe000) + { + // 2. surrogate: write as UTF-8 + OSL_ENSURE(nSurrogate != 0, "lone 2nd Unicode surrogate"); + + nSurrogate = (nSurrogate << 10) | (c & 0x03ff); + if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000) + { + sal_Int8 aBytes[] = { sal_Int8(0xF0 | ((nSurrogate >> 18) & 0x0F)), + sal_Int8(0x80 | ((nSurrogate >> 12) & 0x3F)), + sal_Int8(0x80 | ((nSurrogate >> 6) & 0x3F)), + sal_Int8(0x80 | ((nSurrogate >> 0) & 0x3F)) }; + if ((rPos + 4) > SEQUENCESIZE) + AddBytes(pTarget, rPos, aBytes, 4); + else + { + pTarget[rPos] = aBytes[0]; + rPos++; + pTarget[rPos] = aBytes[1]; + rPos++; + pTarget[rPos] = aBytes[2]; + rPos++; + pTarget[rPos] = aBytes[3]; + rPos++; + } + } + else + { + OSL_FAIL("illegal Unicode character"); + bRet = false; + } - nSurrogate = (nSurrogate << 10) | (c & 0x03ff); - if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000) + // reset surrogate + nSurrogate = 0; + } + else if (c > 0x07FF) { - sal_Int8 aBytes[] = { sal_Int8(0xF0 | ((nSurrogate >> 18) & 0x0F)), - sal_Int8(0x80 | ((nSurrogate >> 
12) & 0x3F)), - sal_Int8(0x80 | ((nSurrogate >> 6) & 0x3F)), - sal_Int8(0x80 | ((nSurrogate >> 0) & 0x3F)) }; - if ((rPos + 4) > SEQUENCESIZE) - AddBytes(pTarget, rPos, aBytes, 4); + sal_Int8 aBytes[] + = { sal_Int8(0xE0 | ((c >> 12) & 0x0F)), sal_Int8(0x80 | ((c >> 6) & 0x3F)), + sal_Int8(0x80 | ((c >> 0) & 0x3F)) }; + if ((rPos + 3) > SEQUENCESIZE) + AddBytes(pTarget, rPos, aBytes, 3); else { pTarget[rPos] = aBytes[0]; @@ -442,50 +503,24 @@ bool SaxWriterHelper::convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen, rPos++; pTarget[rPos] = aBytes[2]; rPos++; - pTarget[rPos] = aBytes[3]; - rPos++; } } else { - OSL_FAIL("illegal Unicode character"); - bRet = false; - } - - // reset surrogate - nSurrogate = 0; - } - else if (c > 0x07FF) - { - sal_Int8 aBytes[] - = { sal_Int8(0xE0 | ((c >> 12) & 0x0F)), sal_Int8(0x80 | ((c >> 6) & 0x3F)), - sal_Int8(0x80 | ((c >> 0) & 0x3F)) }; - if ((rPos + 3) > SEQUENCESIZE) - AddBytes(pTarget, rPos, aBytes, 3); - else - { - pTarget[rPos] = aBytes[0]; - rPos++; - pTarget[rPos] = aBytes[1]; - rPos++; - pTarget[rPos] = aBytes[2]; - rPos++; - } - } - else - { - sal_Int8 aBytes[] - = { sal_Int8(0xC0 | ((c >> 6) & 0x1F)), sal_Int8(0x80 | ((c >> 0) & 0x3F)) }; - if ((rPos + 2) > SEQUENCESIZE) - AddBytes(pTarget, rPos, aBytes, 2); - else - { - pTarget[rPos] = aBytes[0]; - rPos++; - pTarget[rPos] = aBytes[1]; - rPos++; + sal_Int8 aBytes[] + = { sal_Int8(0xC0 | ((c >> 6) & 0x1F)), sal_Int8(0x80 | ((c >> 0) & 0x3F)) }; + if ((rPos + 2) > SEQUENCESIZE) + AddBytes(pTarget, rPos, aBytes, 2); + else + { + pTarget[rPos] = aBytes[0]; + rPos++; + pTarget[rPos] = aBytes[1]; + rPos++; + } } } + OSL_ENSURE(rPos <= SEQUENCESIZE, "not reset current position"); if (rPos == SEQUENCESIZE) rPos = writeSequence(); @@ -848,7 +883,8 @@ bool SaxWriterHelper::comment(const OUString& rComment) return bRet; } -sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization, bool bNormalizeWhitespace) +sal_Int32 SaxWriterHelper::calcXMLByteLength(const 
OUString& rStr, bool bDoNormalization, + bool bNormalizeWhitespace) { sal_Int32 nOutputLength = 0; sal_uInt32 nSurrogate = 0; @@ -897,26 +933,45 @@ sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization, bool bN nOutputLength++; } } - else if (c >= 0xd800 && c < 0xdc00) - { - // save surrogate - nSurrogate = ((c & 0x03ff) + 0x0040); - } - else if (c >= 0xdc00 && c < 0xe000) - { - // 2. surrogate: write as UTF-8 (if range is OK - nSurrogate = (nSurrogate << 10) | (c & 0x03ff); - if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000) - nOutputLength += 4; - nSurrogate = 0; - } - else if (c > 0x07FF) - { - nOutputLength += 3; - } else { - nOutputLength += 2; + // Deal with replacements + if (bDoNormalization && !m_Replacements.empty()) + { + // search + const ReplacementPair* it = findXMLReplacement(&pStr[i], nStrLen - i); + + if (it != nullptr) + { + nOutputLength + += ::rtl::OUStringToOString(it->name, RTL_TEXTENCODING_UTF8).getLength(); + i += it->replacement.getLength() - 1; + continue; + } + } + + // Deal with other unicode cases + if (c >= 0xd800 && c < 0xdc00) + { + // save surrogate + nSurrogate = ((c & 0x03ff) + 0x0040); + } + else if (c >= 0xdc00 && c < 0xe000) + { + // 2. 
surrogate: write as UTF-8 (if range is OK + nSurrogate = (nSurrogate << 10) | (c & 0x03ff); + if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000) + nOutputLength += 4; + nSurrogate = 0; + } + else if (c > 0x07FF) + { + nOutputLength += 3; + } + else + { + nOutputLength += 2; + } } // surrogate processing @@ -927,6 +982,23 @@ sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization, bool bN return nOutputLength; } +const ReplacementPair* SaxWriterHelper::findXMLReplacement(const sal_Unicode* pStr, + sal_Int32 nStrLen) +{ + for (size_t iter = 0; iter < m_Replacements.size(); ++iter) + { + if (m_Replacements[iter].replacement.getLength() > nStrLen) + continue; + sal_Int32 matches = m_Replacements[iter].replacement.compareTo( + std::u16string_view(pStr, m_Replacements[iter].replacement.getLength())); + if (matches == 0) + return &m_Replacements[iter]; + if (matches > 0) + return nullptr; + } + return nullptr; +} + /** returns position of first ascii 10 within the string, -1 when no 10 in string. */ sal_Int32 getFirstLineBreak(const OUString& str) throw() @@ -1115,7 +1187,7 @@ void SAXWriter::startElement(const OUString& aName, const Reference<XAttributeLi sal_Int32 nAttribCount = xAttribs.is() ? 
xAttribs->getLength() : 0; nLength++; // "<" - nLength += calcXMLByteLength(aName, false, false); // the tag name + nLength += m_pSaxWriterHelper->calcXMLByteLength(aName, false, false); // the tag name sal_Int16 n; for (n = 0; n < static_cast<sal_Int16>(nAttribCount); n++) @@ -1123,13 +1195,13 @@ void SAXWriter::startElement(const OUString& aName, const Reference<XAttributeLi nLength++; // " " OUString tmp = xAttribs->getNameByIndex(n); - nLength += calcXMLByteLength(tmp, false, false); + nLength += m_pSaxWriterHelper->calcXMLByteLength(tmp, false, false); nLength += 2; // =" tmp = xAttribs->getValueByIndex(n); - nLength += calcXMLByteLength(tmp, true, true); + nLength += m_pSaxWriterHelper->calcXMLByteLength(tmp, true, true); nLength += 1; // " } @@ -1191,7 +1263,7 @@ void SAXWriter::endElement(const OUString& aName) // only ascii chars allowed sal_Int32 nLength(0); if (m_bAllowLineBreak) - nLength = 3 + calcXMLByteLength(aName, false, false); + nLength = 3 + m_pSaxWriterHelper->calcXMLByteLength(aName, false, false); sal_Int32 nPrefix = getIndentPrefixLength(nLength); if (nPrefix >= 0) @@ -1233,7 +1305,7 @@ void SAXWriter::characters(const OUString& aChars) { sal_Int32 nFirstLineBreakOccurrence = getFirstLineBreak(aChars); - nLength = calcXMLByteLength(aChars, !m_bIsCDATA, false); + nLength = m_pSaxWriterHelper->calcXMLByteLength(aChars, !m_bIsCDATA, false); nIndentPrefix = getIndentPrefixLength( nFirstLineBreakOccurrence >= 0 ? nFirstLineBreakOccurrence : nLength); } @@ -1280,11 +1352,11 @@ void SAXWriter::processingInstruction(const OUString& aTarget, const OUString& a if (m_bAllowLineBreak) { nLength = 2; // "<?" 
- nLength += calcXMLByteLength(aTarget, false, false); + nLength += m_pSaxWriterHelper->calcXMLByteLength(aTarget, false, false); nLength += 1; // " " - nLength += calcXMLByteLength(aData, false, false); + nLength += m_pSaxWriterHelper->calcXMLByteLength(aData, false, false); nLength += 2; // "?>" } @@ -1355,7 +1427,7 @@ void SAXWriter::comment(const OUString& sComment) if (m_bAllowLineBreak) { nLength = 4; // "<!--" - nLength += calcXMLByteLength(sComment, false, false); + nLength += m_pSaxWriterHelper->calcXMLByteLength(sComment, false, false); nLength += 3; } @@ -1398,7 +1470,7 @@ void SAXWriter::unknown(const OUString& sString) sal_Int32 nLength(0); if (m_bAllowLineBreak) - nLength = calcXMLByteLength(sString, false, false); + nLength = m_pSaxWriterHelper->calcXMLByteLength(sString, false, false); sal_Int32 nPrefix = getIndentPrefixLength(nLength); if (nPrefix >= 0) |