diff options
author | Michael Stahl <Michael.Stahl@cib.de> | 2019-10-28 14:31:23 +0100 |
---|---|---|
committer | Vasily Melenchuk <vasily.melenchuk@cib.de> | 2021-04-13 15:25:11 +0300 |
commit | c8e04eb4455eb412466b65a085464a81bd2519fa (patch) | |
tree | 560393e14eef7b31088c4cf2fb7240175fa62af9 /svtools | |
parent | 4ebc328779539d9bc0cb0d7759c27e1e155c129e (diff) |
svl: HTMLParser: stop inserting control character garbage into Writer
For example, rhbz433940-1.html contains literal ^G (BEL, U+0007) characters
that are inserted as-is into SwTextNodes.
This now triggers the assertion about CH_TXT_ATR_FIELDSTART in
SwSubFont::GetTextSize_() that was added in commit
19a559b0ec9b806519c405651d6d2b2e14712b4a.
Change-Id: I6aa7de41a04069e15b40865fd57894dae0fc10db
Reviewed-on: https://gerrit.libreoffice.org/81606
Reviewed-by: Michael Stahl <michael.stahl@cib.de>
Tested-by: Michael Stahl <michael.stahl@cib.de>
(cherry picked from commit 35d248cab1f0d4800f72abb5cb6afb56f40d9083)
Reviewed-on: https://gerrit.libreoffice.org/81652
Tested-by: Jenkins
Reviewed-by: Caolán McNamara <caolanm@redhat.com>
Tested-by: Caolán McNamara <caolanm@redhat.com>
Diffstat (limited to 'svtools')
-rw-r--r-- | svtools/source/svhtml/parhtml.cxx | 19 |
1 file changed, 15 insertions, 4 deletions
diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx index 6f9e33a8bd2e..42cd5d1295cd 100644 --- a/svtools/source/svhtml/parhtml.cxx +++ b/svtools/source/svhtml/parhtml.cxx @@ -31,6 +31,7 @@ #include <tools/datetime.hxx> #include <unotools/datetime.hxx> #include <svl/inettype.hxx> +#include <svl/lngmisc.hxx> #include <com/sun/star/beans/PropertyAttribute.hpp> #include <com/sun/star/document/XDocumentProperties.hpp> @@ -456,8 +457,12 @@ HtmlTokenId HTMLParser::ScanText( const sal_Unicode cBreak ) else nNextCh = 0U; - if ( ! rtl::isUnicodeCodePoint( cChar ) ) + if (!rtl::isUnicodeCodePoint(cChar) + || (linguistic::IsControlChar(cChar) + && cChar != '\r' && cChar != '\n' && cChar != '\t')) + { cChar = '?'; + } } else if( rtl::isAsciiAlpha( nNextCh ) ) { @@ -753,8 +758,11 @@ HtmlTokenId HTMLParser::ScanText( const sal_Unicode cBreak ) else { do { + if (!linguistic::IsControlChar(nNextCh)) + { // All remaining characters make their way into the text. - sTmpBuffer.appendUtf32( nNextCh ); + sTmpBuffer.appendUtf32( nNextCh ); + } if( MAX_LEN == sTmpBuffer.getLength() ) { aToken += sTmpBuffer; @@ -989,8 +997,11 @@ HtmlTokenId HTMLParser::GetNextRawToken() } SAL_FALLTHROUGH; default: - // all remaining characters are appended to the buffer - sTmpBuffer.appendUtf32( nNextCh ); + if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t') + { + // all remaining characters are appended to the buffer + sTmpBuffer.appendUtf32( nNextCh ); + } break; } |