summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Michael Stahl <Michael.Stahl@cib.de> 2019-10-28 14:31:23 +0100
committer: Vasily Melenchuk <vasily.melenchuk@cib.de> 2021-04-13 15:25:11 +0300
commit: c8e04eb4455eb412466b65a085464a81bd2519fa (patch)
tree: 560393e14eef7b31088c4cf2fb7240175fa62af9
parent: 4ebc328779539d9bc0cb0d7759c27e1e155c129e (diff)
svl: HTMLParser: stop inserting control character garbage into Writer
E.g. rhbz433940-1.html contains literal ^G characters that are inserted as-is into SwTextNodes.

This now triggers assert about CH_TXT_ATR_FIELDSTART in SwSubFont::GetTextSize_() that was added in 19a559b0ec9b806519c405651d6d2b2e14712b4a.

Change-Id: I6aa7de41a04069e15b40865fd57894dae0fc10db
Reviewed-on: https://gerrit.libreoffice.org/81606
Reviewed-by: Michael Stahl <michael.stahl@cib.de>
Tested-by: Michael Stahl <michael.stahl@cib.de>
(cherry picked from commit 35d248cab1f0d4800f72abb5cb6afb56f40d9083)
Reviewed-on: https://gerrit.libreoffice.org/81652
Tested-by: Jenkins
Reviewed-by: Caolán McNamara <caolanm@redhat.com>
Tested-by: Caolán McNamara <caolanm@redhat.com>
-rw-r--r-- svtools/source/svhtml/parhtml.cxx | 19
1 files changed, 15 insertions, 4 deletions
diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx
index 6f9e33a8bd2e..42cd5d1295cd 100644
--- a/svtools/source/svhtml/parhtml.cxx
+++ b/svtools/source/svhtml/parhtml.cxx
@@ -31,6 +31,7 @@
#include <tools/datetime.hxx>
#include <unotools/datetime.hxx>
#include <svl/inettype.hxx>
+#include <svl/lngmisc.hxx>
#include <com/sun/star/beans/PropertyAttribute.hpp>
#include <com/sun/star/document/XDocumentProperties.hpp>
@@ -456,8 +457,12 @@ HtmlTokenId HTMLParser::ScanText( const sal_Unicode cBreak )
else
nNextCh = 0U;
- if ( ! rtl::isUnicodeCodePoint( cChar ) )
+ if (!rtl::isUnicodeCodePoint(cChar)
+ || (linguistic::IsControlChar(cChar)
+ && cChar != '\r' && cChar != '\n' && cChar != '\t'))
+ {
cChar = '?';
+ }
}
else if( rtl::isAsciiAlpha( nNextCh ) )
{
@@ -753,8 +758,11 @@ HtmlTokenId HTMLParser::ScanText( const sal_Unicode cBreak )
else
{
do {
+ if (!linguistic::IsControlChar(nNextCh))
+ {
// All remaining characters make their way into the text.
- sTmpBuffer.appendUtf32( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
+ }
if( MAX_LEN == sTmpBuffer.getLength() )
{
aToken += sTmpBuffer;
@@ -989,8 +997,11 @@ HtmlTokenId HTMLParser::GetNextRawToken()
}
SAL_FALLTHROUGH;
default:
- // all remaining characters are appended to the buffer
- sTmpBuffer.appendUtf32( nNextCh );
+ if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t')
+ {
+ // all remaining characters are appended to the buffer
+ sTmpBuffer.appendUtf32( nNextCh );
+ }
break;
}