summaryrefslogtreecommitdiff
path: root/writerfilter
diff options
context:
space:
mode:
author Mike Kaganski <mike.kaganski@collabora.com> 2017-08-23 09:09:57 +0300
committer Aron Budea <aron.budea@collabora.com> 2017-11-22 16:27:11 +0100
commit f3af5926f93f46e96caa6b85e08723f150c94463 (patch)
tree 347e600db3cfd28e4317539876c29b061d675bb7 /writerfilter
parent 3c01d0ff65342432055dcb575601f3b3f446e8bd (diff)
tdf#111964: only trim XML whitespace
OUString::trim() uses rtl_uString_newTrim, which relies upon rtl_ImplIsWhitespace. The latter treats as whitespaces not only characters with values less than or equal to 32, but also Unicode General Punctuation area Space and some Control characters. Thus, using OUString::trim() is incorrect when the goal is to trim XML whitespace, which is defined as one of 0x09, 0x0A, 0x0D, 0x20. The comments for OUString::trim() and rtl_uString_newTrim are corrected to describe which characters are considered whitespace. A unit test included. Change-Id: I45a132be923a52dcd5a4c35aeecb53d423b49fec Reviewed-on: https://gerrit.libreoffice.org/41444 Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com> Tested-by: Mike Kaganski <mike.kaganski@collabora.com> (cherry picked from commit 5b518ab051cc04e672ceb01da42b06625a1a4ce9) Reviewed-on: https://gerrit.libreoffice.org/44758 Reviewed-by: Aron Budea <aron.budea@collabora.com> Tested-by: Aron Budea <aron.budea@collabora.com>
Diffstat (limited to 'writerfilter')
-rw-r--r-- writerfilter/source/ooxml/OOXMLFastContextHandler.cxx 28
1 files changed, 27 insertions, 1 deletions
diff --git a/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx b/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
index bb59ed9bebdc..1485ce0ed177 100644
--- a/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
+++ b/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
@@ -620,6 +620,32 @@ void OOXMLFastContextHandler::endTxbxContent()
mpParserState->endTxbxContent();
}
+namespace {
+// White space in the XML schema sense is exactly one of:
+// #x20 (space), #x9 (tab), #xA (line feed) or #xD (carriage return)
+bool IsXMLWhitespace(sal_Unicode cChar)
+{
+    const bool bBlank = cChar == 0x20 || cChar == 0x9;
+    return bBlank || cChar == 0xA || cChar == 0xD;
+}
+
+// Strips leading and trailing XML white space; any other character is kept.
+OUString TrimXMLWhitespace(const OUString & sText)
+{
+    const sal_Int32 nSize = sText.getLength();
+    sal_Int32 nFirst = 0;        // index of the first character to keep
+    sal_Int32 nPastLast = nSize; // one past the last character to keep
+    while (nFirst < nSize && IsXMLWhitespace(sText[nFirst]))
+        ++nFirst;
+    while (nPastLast > nFirst && IsXMLWhitespace(sText[nPastLast - 1]))
+        --nPastLast;
+    if (nFirst == 0 && nPastLast == nSize)
+        return sText; // nothing to strip, hand back the input unchanged
+    if (nFirst == nPastLast)
+        return OUString(); // the text consisted of white space only
+    return sText.copy(nFirst, nPastLast - nFirst);
+}
+}
+
void OOXMLFastContextHandler::text(const OUString & sText)
{
if (isForwardEvents())
@@ -631,7 +657,7 @@ void OOXMLFastContextHandler::text(const OUString & sText)
// tabs are converted to spaces
if (!IsPreserveSpace())
{
- sNormalizedText = sNormalizedText.trim().replaceAll("\t", " ");
+ sNormalizedText = TrimXMLWhitespace(sNormalizedText).replaceAll("\t", " ");
}
mpStream->utext(reinterpret_cast < const sal_uInt8 * >
(sNormalizedText.getStr()),