From f3af5926f93f46e96caa6b85e08723f150c94463 Mon Sep 17 00:00:00 2001
From: Mike Kaganski
Date: Wed, 23 Aug 2017 09:09:57 +0300
Subject: tdf#111964: only trim XML whitespace

OUString::trim() uses rtl_uString_newTrim, which relies upon
rtl_ImplIsWhitespace. The latter treats as whitespaces not only
characters with values less than or equal to 32, but also Unicode
General Punctuation area Space and some Control characters.

Thus, using OUString::trim() is incorrect when the goal is to trim
XML whitespace, which is defined as one of 0x09, 0x0A, 0x0D, 0x20.

The comments for OUString::trim() and rtl_uString_newTrim are
corrected to describe which characters are considered whitespace.

A unit test included.

Change-Id: I45a132be923a52dcd5a4c35aeecb53d423b49fec
Reviewed-on: https://gerrit.libreoffice.org/41444
Reviewed-by: Mike Kaganski
Tested-by: Mike Kaganski
(cherry picked from commit 5b518ab051cc04e672ceb01da42b06625a1a4ce9)
Reviewed-on: https://gerrit.libreoffice.org/44758
Reviewed-by: Aron Budea
Tested-by: Aron Budea
---
 .../source/ooxml/OOXMLFastContextHandler.cxx | 28 +++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'writerfilter')

diff --git a/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx b/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
index bb59ed9bebdc..1485ce0ed177 100644
--- a/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
+++ b/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
@@ -620,6 +620,32 @@ void OOXMLFastContextHandler::endTxbxContent()
     mpParserState->endTxbxContent();
 }
 
+namespace {
+// XML schema defines white space as one of four characters:
+// #x9 (tab), #xA (line feed), #xD (carriage return), and #x20 (space)
+bool IsXMLWhitespace(sal_Unicode cChar)
+{
+    return cChar == 0x9 || cChar == 0xA || cChar == 0xD || cChar == 0x20;
+}
+
+OUString TrimXMLWhitespace(const OUString & sText)
+{
+    sal_Int32 nTrimmedStart = 0;
+    const sal_Int32 nLen = sText.getLength();
+    sal_Int32 nTrimmedEnd = nLen - 1;
+    while (nTrimmedStart < nLen && IsXMLWhitespace(sText[nTrimmedStart]))
+        ++nTrimmedStart;
+    while (nTrimmedStart <= nTrimmedEnd && IsXMLWhitespace(sText[nTrimmedEnd]))
+        --nTrimmedEnd;
+    if ((nTrimmedStart == 0) && (nTrimmedEnd == nLen - 1))
+        return sText;
+    else if (nTrimmedStart > nTrimmedEnd)
+        return OUString();
+    else
+        return sText.copy(nTrimmedStart, nTrimmedEnd-nTrimmedStart+1);
+}
+}
+
 void OOXMLFastContextHandler::text(const OUString & sText)
 {
     if (isForwardEvents())
@@ -631,7 +657,7 @@ void OOXMLFastContextHandler::text(const OUString & sText)
         // tabs are converted to spaces
         if (!IsPreserveSpace())
         {
-            sNormalizedText = sNormalizedText.trim().replaceAll("\t", " ");
+            sNormalizedText = TrimXMLWhitespace(sNormalizedText).replaceAll("\t", " ");
         }
         mpStream->utext(reinterpret_cast < const sal_uInt8 * >
                         (sNormalizedText.getStr()),
--
cgit