diff options
author | Mike Kaganski <mike.kaganski@collabora.com> | 2017-08-23 09:09:57 +0300 |
---|---|---|
committer | Aron Budea <aron.budea@collabora.com> | 2017-11-22 16:27:11 +0100 |
commit | f3af5926f93f46e96caa6b85e08723f150c94463 (patch) | |
tree | 347e600db3cfd28e4317539876c29b061d675bb7 | |
parent | 3c01d0ff65342432055dcb575601f3b3f446e8bd (diff) |
tdf#111964: only trim XML whitespace
OUString::trim() uses rtl_uString_newTrim, which relies upon
rtl_ImplIsWhitespace. The latter treats as whitespace not only
characters with values less than or equal to 32, but also the Space
characters of the Unicode General Punctuation area and some Control
characters. Thus, using OUString::trim() is incorrect when the goal
is to trim XML whitespace, which is defined as one of 0x09, 0x0A,
0x0D, 0x20.
The comments for OUString::trim() and rtl_uString_newTrim are
corrected to describe which characters are considered whitespace.
A unit test is included.
Change-Id: I45a132be923a52dcd5a4c35aeecb53d423b49fec
Reviewed-on: https://gerrit.libreoffice.org/41444
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
Tested-by: Mike Kaganski <mike.kaganski@collabora.com>
(cherry picked from commit 5b518ab051cc04e672ceb01da42b06625a1a4ce9)
Reviewed-on: https://gerrit.libreoffice.org/44758
Reviewed-by: Aron Budea <aron.budea@collabora.com>
Tested-by: Aron Budea <aron.budea@collabora.com>
-rw-r--r-- | include/rtl/ustring.h | 4 | ||||
-rw-r--r-- | include/rtl/ustring.hxx | 4 | ||||
-rw-r--r-- | sw/qa/extras/ooxmlexport/data/tdf111964.docx | bin | 0 -> 1481 bytes | |||
-rw-r--r-- | sw/qa/extras/ooxmlexport/ooxmlexport9.cxx | 10 | ||||
-rw-r--r-- | writerfilter/source/ooxml/OOXMLFastContextHandler.cxx | 28 |
5 files changed, 43 insertions, 3 deletions
diff --git a/include/rtl/ustring.h b/include/rtl/ustring.h index 831ecd66d9be..50dbd75a5ecc 100644 --- a/include/rtl/ustring.h +++ b/include/rtl/ustring.h @@ -2023,7 +2023,9 @@ SAL_DLLPUBLIC void SAL_CALL rtl_uString_newToAsciiUpperCase( string. The new string results from removing all characters with values less than - or equal to 32 (the space character) form both ends of str. + or equal to 32 (the space character), and also Unicode General Punctuation + area Space and some Control characters, form both ends of str (see + rtl_ImplIsWhitespace). This function cannot be used for language-specific conversion. The new string does not necessarily have a reference count of 1 (in cases where diff --git a/include/rtl/ustring.hxx b/include/rtl/ustring.hxx index 337e8509a53d..bc87c2936eef 100644 --- a/include/rtl/ustring.hxx +++ b/include/rtl/ustring.hxx @@ -2635,7 +2635,9 @@ public: of the string. All characters that have codes less than or equal to - 32 (the space character) are considered to be white space. + 32 (the space character), and Unicode General Punctuation area Space + and some Control characters are considered to be white space (see + rtl_ImplIsWhitespace). If the string doesn't contain white spaces at both ends, then the new string is assigned with str. 
diff --git a/sw/qa/extras/ooxmlexport/data/tdf111964.docx b/sw/qa/extras/ooxmlexport/data/tdf111964.docx Binary files differnew file mode 100644 index 000000000000..7cb85a1d87df --- /dev/null +++ b/sw/qa/extras/ooxmlexport/data/tdf111964.docx diff --git a/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx b/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx index b2c8a417109c..8c1537ffc268 100644 --- a/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx +++ b/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx @@ -384,6 +384,16 @@ DECLARE_OOXMLEXPORT_TEST(testTdf107684, "tdf107684.odt") assertXPath(pXmlDoc, "//w:style[@w:styleId='Heading1']/w:pPr/w:outlineLvl", 1); } +DECLARE_OOXMLEXPORT_TEST(testTdf111964, "tdf111964.docx") +{ + xmlDocPtr pXmlDoc = parseExport("word/document.xml"); + if (!pXmlDoc) + return; + // Unicode spaces that are not XML whitespace must not be trimmed + const sal_Unicode sWSReference [] { 0x2002, 0x2002, 0x2002, 0x2002, 0x2002, 0 }; + assertXPathContent(pXmlDoc, "/w:document/w:body/w:p/w:r[4]/w:t", sWSReference); +} + CPPUNIT_PLUGIN_IMPLEMENT(); /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx b/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx index bb59ed9bebdc..1485ce0ed177 100644 --- a/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx +++ b/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx @@ -620,6 +620,32 @@ void OOXMLFastContextHandler::endTxbxContent() mpParserState->endTxbxContent(); } +namespace { +// XML schema defines white space as one of four characters: +// #x9 (tab), #xA (line feed), #xD (carriage return), and #x20 (space) +bool IsXMLWhitespace(sal_Unicode cChar) +{ + return cChar == 0x9 || cChar == 0xA || cChar == 0xD || cChar == 0x20; +} + +OUString TrimXMLWhitespace(const OUString & sText) +{ + sal_Int32 nTrimmedStart = 0; + const sal_Int32 nLen = sText.getLength(); + sal_Int32 nTrimmedEnd = nLen - 1; + while (nTrimmedStart < nLen && IsXMLWhitespace(sText[nTrimmedStart])) + 
++nTrimmedStart; + while (nTrimmedStart <= nTrimmedEnd && IsXMLWhitespace(sText[nTrimmedEnd])) + --nTrimmedEnd; + if ((nTrimmedStart == 0) && (nTrimmedEnd == nLen - 1)) + return sText; + else if (nTrimmedStart > nTrimmedEnd) + return OUString(); + else + return sText.copy(nTrimmedStart, nTrimmedEnd-nTrimmedStart+1); +} +} + void OOXMLFastContextHandler::text(const OUString & sText) { if (isForwardEvents()) @@ -631,7 +657,7 @@ void OOXMLFastContextHandler::text(const OUString & sText) // tabs are converted to spaces if (!IsPreserveSpace()) { - sNormalizedText = sNormalizedText.trim().replaceAll("\t", " "); + sNormalizedText = TrimXMLWhitespace(sNormalizedText).replaceAll("\t", " "); } mpStream->utext(reinterpret_cast < const sal_uInt8 * > (sNormalizedText.getStr()), |