summaryrefslogtreecommitdiff
path: root/sax
diff options
context:
space:
mode:
authordante <dante19031999@gmail.com>2021-01-01 19:42:37 +0100
committerStephan Bergmann <sbergman@redhat.com>2021-01-19 08:19:46 +0100
commitae5ec5b944cf2378806498072c50d473a3ac62ed (patch)
tree56fd5795c9e4cbfa160a66d2cd8f9e6a28b91452 /sax
parent2197f69f1b91ff13d0dcd078685ab23466241197 (diff)
Use customized XML entities on XML export.
This will be mainly used on MathML export for Unicode characters. It will be used mostly for MathML. Change-Id: I59b96d44facbd01fa517317a0ae54d64d29b0a19 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/108562 Tested-by: Jenkins Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
Diffstat (limited to 'sax')
-rw-r--r--sax/source/expatwrap/saxwriter.cxx242
1 files changed, 157 insertions, 85 deletions
diff --git a/sax/source/expatwrap/saxwriter.cxx b/sax/source/expatwrap/saxwriter.cxx
index f6a58b0bb4cb..04fd90762cd9 100644
--- a/sax/source/expatwrap/saxwriter.cxx
+++ b/sax/source/expatwrap/saxwriter.cxx
@@ -117,6 +117,9 @@ private:
/// @throws SAXException
void FinishStartElement();
+ // Search for the correct replacement
+ const ReplacementPair* findXMLReplacement(const sal_Unicode* pStr, sal_Int32 nStrLen);
+
public:
explicit SaxWriterHelper(Reference<XOutputStream> const& m_TempOut)
: m_out(m_TempOut)
@@ -193,6 +196,10 @@ public:
void setCustomEntityNames(
const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>&
replacements);
+
+ // Calculate length for convertToXML
+ sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization,
+ bool bNormalizeWhitespace);
};
const bool g_bValidCharsBelow32[32] = {
@@ -282,10 +289,10 @@ bool SaxWriterHelper::convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen,
for (sal_Int32 i = 0; i < nStrLen; i++)
{
- sal_uInt16 c = pStr[i];
+ sal_Unicode c = pStr[i];
if (IsInvalidChar(c))
bRet = false;
- else if ((c >= 0x0001) && (c <= 0x007F))
+ else if ((c >= 0x0001) && (c <= 0x007F)) // Deal with ascii
{
if (bDoNormalization)
{
@@ -414,26 +421,80 @@ bool SaxWriterHelper::convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen,
rPos++;
}
}
- else if (c >= 0xd800 && c < 0xdc00)
- {
- // 1. surrogate: save (until 2. surrogate)
- OSL_ENSURE(nSurrogate == 0, "left-over Unicode surrogate");
- nSurrogate = ((c & 0x03ff) + 0x0040);
- }
- else if (c >= 0xdc00 && c < 0xe000)
+ else
{
- // 2. surrogate: write as UTF-8
- OSL_ENSURE(nSurrogate != 0, "lone 2nd Unicode surrogate");
+ // Deal with replacements
+ if (bDoNormalization && !m_Replacements.empty())
+ {
+ // search
+ const ReplacementPair* it = findXMLReplacement(&pStr[i], nStrLen - i);
+
+ // replace
+ if (it != nullptr)
+ {
+ OString name = ::rtl::OUStringToOString(it->name, RTL_TEXTENCODING_UTF8);
+ if (rPos + name.getLength() > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>(name.getStr()),
+ name.getLength());
+ else
+ {
+ memcpy(&(pTarget[rPos]), name.getStr(), name.getLength());
+ rPos += name.getLength();
+ }
+ i += it->replacement.getLength() - 1;
+ continue;
+ }
+ }
+
+ // Deal with other unicode cases
+ if (c >= 0xd800 && c < 0xdc00)
+ {
+ // 1. surrogate: save (until 2. surrogate)
+ OSL_ENSURE(nSurrogate == 0, "left-over Unicode surrogate");
+ nSurrogate = ((c & 0x03ff) + 0x0040);
+ }
+ else if (c >= 0xdc00 && c < 0xe000)
+ {
+ // 2. surrogate: write as UTF-8
+ OSL_ENSURE(nSurrogate != 0, "lone 2nd Unicode surrogate");
+
+ nSurrogate = (nSurrogate << 10) | (c & 0x03ff);
+ if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000)
+ {
+ sal_Int8 aBytes[] = { sal_Int8(0xF0 | ((nSurrogate >> 18) & 0x0F)),
+ sal_Int8(0x80 | ((nSurrogate >> 12) & 0x3F)),
+ sal_Int8(0x80 | ((nSurrogate >> 6) & 0x3F)),
+ sal_Int8(0x80 | ((nSurrogate >> 0) & 0x3F)) };
+ if ((rPos + 4) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, aBytes, 4);
+ else
+ {
+ pTarget[rPos] = aBytes[0];
+ rPos++;
+ pTarget[rPos] = aBytes[1];
+ rPos++;
+ pTarget[rPos] = aBytes[2];
+ rPos++;
+ pTarget[rPos] = aBytes[3];
+ rPos++;
+ }
+ }
+ else
+ {
+ OSL_FAIL("illegal Unicode character");
+ bRet = false;
+ }
- nSurrogate = (nSurrogate << 10) | (c & 0x03ff);
- if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000)
+ // reset surrogate
+ nSurrogate = 0;
+ }
+ else if (c > 0x07FF)
{
- sal_Int8 aBytes[] = { sal_Int8(0xF0 | ((nSurrogate >> 18) & 0x0F)),
- sal_Int8(0x80 | ((nSurrogate >> 12) & 0x3F)),
- sal_Int8(0x80 | ((nSurrogate >> 6) & 0x3F)),
- sal_Int8(0x80 | ((nSurrogate >> 0) & 0x3F)) };
- if ((rPos + 4) > SEQUENCESIZE)
- AddBytes(pTarget, rPos, aBytes, 4);
+ sal_Int8 aBytes[]
+ = { sal_Int8(0xE0 | ((c >> 12) & 0x0F)), sal_Int8(0x80 | ((c >> 6) & 0x3F)),
+ sal_Int8(0x80 | ((c >> 0) & 0x3F)) };
+ if ((rPos + 3) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, aBytes, 3);
else
{
pTarget[rPos] = aBytes[0];
@@ -442,50 +503,24 @@ bool SaxWriterHelper::convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen,
rPos++;
pTarget[rPos] = aBytes[2];
rPos++;
- pTarget[rPos] = aBytes[3];
- rPos++;
}
}
else
{
- OSL_FAIL("illegal Unicode character");
- bRet = false;
- }
-
- // reset surrogate
- nSurrogate = 0;
- }
- else if (c > 0x07FF)
- {
- sal_Int8 aBytes[]
- = { sal_Int8(0xE0 | ((c >> 12) & 0x0F)), sal_Int8(0x80 | ((c >> 6) & 0x3F)),
- sal_Int8(0x80 | ((c >> 0) & 0x3F)) };
- if ((rPos + 3) > SEQUENCESIZE)
- AddBytes(pTarget, rPos, aBytes, 3);
- else
- {
- pTarget[rPos] = aBytes[0];
- rPos++;
- pTarget[rPos] = aBytes[1];
- rPos++;
- pTarget[rPos] = aBytes[2];
- rPos++;
- }
- }
- else
- {
- sal_Int8 aBytes[]
- = { sal_Int8(0xC0 | ((c >> 6) & 0x1F)), sal_Int8(0x80 | ((c >> 0) & 0x3F)) };
- if ((rPos + 2) > SEQUENCESIZE)
- AddBytes(pTarget, rPos, aBytes, 2);
- else
- {
- pTarget[rPos] = aBytes[0];
- rPos++;
- pTarget[rPos] = aBytes[1];
- rPos++;
+ sal_Int8 aBytes[]
+ = { sal_Int8(0xC0 | ((c >> 6) & 0x1F)), sal_Int8(0x80 | ((c >> 0) & 0x3F)) };
+ if ((rPos + 2) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, aBytes, 2);
+ else
+ {
+ pTarget[rPos] = aBytes[0];
+ rPos++;
+ pTarget[rPos] = aBytes[1];
+ rPos++;
+ }
}
}
+
OSL_ENSURE(rPos <= SEQUENCESIZE, "not reset current position");
if (rPos == SEQUENCESIZE)
rPos = writeSequence();
@@ -848,7 +883,8 @@ bool SaxWriterHelper::comment(const OUString& rComment)
return bRet;
}
-sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization, bool bNormalizeWhitespace)
+sal_Int32 SaxWriterHelper::calcXMLByteLength(const OUString& rStr, bool bDoNormalization,
+ bool bNormalizeWhitespace)
{
sal_Int32 nOutputLength = 0;
sal_uInt32 nSurrogate = 0;
@@ -897,26 +933,45 @@ sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization, bool bN
nOutputLength++;
}
}
- else if (c >= 0xd800 && c < 0xdc00)
- {
- // save surrogate
- nSurrogate = ((c & 0x03ff) + 0x0040);
- }
- else if (c >= 0xdc00 && c < 0xe000)
- {
- // 2. surrogate: write as UTF-8 (if range is OK
- nSurrogate = (nSurrogate << 10) | (c & 0x03ff);
- if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000)
- nOutputLength += 4;
- nSurrogate = 0;
- }
- else if (c > 0x07FF)
- {
- nOutputLength += 3;
- }
else
{
- nOutputLength += 2;
+ // Deal with replacements
+ if (bDoNormalization && !m_Replacements.empty())
+ {
+ // search
+ const ReplacementPair* it = findXMLReplacement(&pStr[i], nStrLen - i);
+
+ if (it != nullptr)
+ {
+ nOutputLength
+ += ::rtl::OUStringToOString(it->name, RTL_TEXTENCODING_UTF8).getLength();
+ i += it->replacement.getLength() - 1;
+ continue;
+ }
+ }
+
+ // Deal with other unicode cases
+ if (c >= 0xd800 && c < 0xdc00)
+ {
+ // save surrogate
+ nSurrogate = ((c & 0x03ff) + 0x0040);
+ }
+ else if (c >= 0xdc00 && c < 0xe000)
+ {
+ // 2. surrogate: write as UTF-8 (if range is OK
+ nSurrogate = (nSurrogate << 10) | (c & 0x03ff);
+ if (rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000)
+ nOutputLength += 4;
+ nSurrogate = 0;
+ }
+ else if (c > 0x07FF)
+ {
+ nOutputLength += 3;
+ }
+ else
+ {
+ nOutputLength += 2;
+ }
}
// surrogate processing
@@ -927,6 +982,23 @@ sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization, bool bN
return nOutputLength;
}
+/** Looks for a custom entity whose replacement text begins exactly at pStr.
+ *
+ *  Entries of m_Replacements are tried in index order; each one is compared
+ *  against the prefix of the input that has the same length as the entry's
+ *  replacement text.
+ *
+ *  @param pStr     first UTF-16 code unit of the text still to be written
+ *  @param nStrLen  number of code units available starting at pStr
+ *  @return pointer to the matching entry inside m_Replacements, or nullptr
+ *          when no entry matches at this position
+ */
+const ReplacementPair* SaxWriterHelper::findXMLReplacement(const sal_Unicode* pStr,
+                                                           sal_Int32 nStrLen)
+{
+    for (size_t iter = 0; iter < m_Replacements.size(); ++iter)
+    {
+        const ReplacementPair& rCandidate = m_Replacements[iter];
+        const sal_Int32 nCandidateLen = rCandidate.replacement.getLength();
+
+        // An empty replacement would compare equal to the zero-length prefix of
+        // any input. The callers advance by getLength() - 1 after a match, so
+        // such an entry would keep them on the same character forever.
+        if (nCandidateLen == 0)
+            continue;
+
+        // The comparison below reads nCandidateLen code units from pStr, so
+        // entries longer than the remaining input must be skipped.
+        if (nCandidateLen > nStrLen)
+            continue;
+
+        const sal_Int32 matches
+            = rCandidate.replacement.compareTo(std::u16string_view(pStr, nCandidateLen));
+        if (matches == 0)
+            return &rCandidate;
+
+        // NOTE(review): stopping at the first entry that compares greater than
+        // the input presumably relies on m_Replacements being sorted ascending
+        // by replacement text - confirm against setCustomEntityNames.
+        if (matches > 0)
+            return nullptr;
+    }
+    return nullptr;
+}
+
+
/** returns position of first ascii 10 within the string, -1 when no 10 in string.
*/
sal_Int32 getFirstLineBreak(const OUString& str) throw()
@@ -1115,7 +1187,7 @@ void SAXWriter::startElement(const OUString& aName, const Reference<XAttributeLi
sal_Int32 nAttribCount = xAttribs.is() ? xAttribs->getLength() : 0;
nLength++; // "<"
- nLength += calcXMLByteLength(aName, false, false); // the tag name
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(aName, false, false); // the tag name
sal_Int16 n;
for (n = 0; n < static_cast<sal_Int16>(nAttribCount); n++)
@@ -1123,13 +1195,13 @@ void SAXWriter::startElement(const OUString& aName, const Reference<XAttributeLi
nLength++; // " "
OUString tmp = xAttribs->getNameByIndex(n);
- nLength += calcXMLByteLength(tmp, false, false);
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(tmp, false, false);
nLength += 2; // ="
tmp = xAttribs->getValueByIndex(n);
- nLength += calcXMLByteLength(tmp, true, true);
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(tmp, true, true);
nLength += 1; // "
}
@@ -1191,7 +1263,7 @@ void SAXWriter::endElement(const OUString& aName)
// only ascii chars allowed
sal_Int32 nLength(0);
if (m_bAllowLineBreak)
- nLength = 3 + calcXMLByteLength(aName, false, false);
+ nLength = 3 + m_pSaxWriterHelper->calcXMLByteLength(aName, false, false);
sal_Int32 nPrefix = getIndentPrefixLength(nLength);
if (nPrefix >= 0)
@@ -1233,7 +1305,7 @@ void SAXWriter::characters(const OUString& aChars)
{
sal_Int32 nFirstLineBreakOccurrence = getFirstLineBreak(aChars);
- nLength = calcXMLByteLength(aChars, !m_bIsCDATA, false);
+ nLength = m_pSaxWriterHelper->calcXMLByteLength(aChars, !m_bIsCDATA, false);
nIndentPrefix = getIndentPrefixLength(
nFirstLineBreakOccurrence >= 0 ? nFirstLineBreakOccurrence : nLength);
}
@@ -1280,11 +1352,11 @@ void SAXWriter::processingInstruction(const OUString& aTarget, const OUString& a
if (m_bAllowLineBreak)
{
nLength = 2; // "<?"
- nLength += calcXMLByteLength(aTarget, false, false);
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(aTarget, false, false);
nLength += 1; // " "
- nLength += calcXMLByteLength(aData, false, false);
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(aData, false, false);
nLength += 2; // "?>"
}
@@ -1355,7 +1427,7 @@ void SAXWriter::comment(const OUString& sComment)
if (m_bAllowLineBreak)
{
nLength = 4; // "<!--"
- nLength += calcXMLByteLength(sComment, false, false);
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(sComment, false, false);
nLength += 3;
}
@@ -1398,7 +1470,7 @@ void SAXWriter::unknown(const OUString& sString)
sal_Int32 nLength(0);
if (m_bAllowLineBreak)
- nLength = calcXMLByteLength(sString, false, false);
+ nLength = m_pSaxWriterHelper->calcXMLByteLength(sString, false, false);
sal_Int32 nPrefix = getIndentPrefixLength(nLength);
if (nPrefix >= 0)