diff options
author | Miklos Vajna <vmiklos@collabora.com> | 2022-10-25 15:55:34 +0200 |
---|---|---|
committer | Miklos Vajna <vmiklos@collabora.com> | 2022-10-25 18:15:47 +0200 |
commit | b38730ae0ae92ca49b84a45853c2ed098ee9064f (patch) | |
tree | a08c26370a2b73fe6c56b395bffb0b197a956789 /svtools/source/svhtml | |
parent | d55358c7c31e2e9f124ee90d78eba2db3f1af756 (diff) |
sw html import: fix handling of CDATA
If the HTML contained markup like <![CDATA[...]]>, we simply
ignored it during import, even though e.g. the ODT import handles that
correctly.
The reason for this is that the svtools/ HTMLParser had code to parse
<!-- ... --> style comments, but not CDATA sections.
Fix the problem by introducing a new HtmlTokenId::CDATA, producing a
matching token content in HTMLParser::GetNextToken_(), and finally mapping
it to normal text on the Writer side.
Note that HtmlTokenId doesn't allow non-on-off tokens past ONOFF_START,
nor does it allow inserting a single token before ONOFF_START (that breaks
getOnToken()), so for now just add a second, dummy token to avoid
breakage.
Change-Id: I605c3c21dc11986fda5d93d36148788a638e97b4
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141813
Reviewed-by: Miklos Vajna <vmiklos@collabora.com>
Tested-by: Jenkins
Diffstat (limited to 'svtools/source/svhtml')
-rw-r--r-- | svtools/source/svhtml/htmlkywd.cxx | 4 | ||||
-rw-r--r-- | svtools/source/svhtml/parhtml.cxx | 39 |
2 files changed, 43 insertions, 0 deletions
diff --git a/svtools/source/svhtml/htmlkywd.cxx b/svtools/source/svhtml/htmlkywd.cxx index 2d51910d85e9..584322fac8bc 100644 --- a/svtools/source/svhtml/htmlkywd.cxx +++ b/svtools/source/svhtml/htmlkywd.cxx @@ -27,6 +27,9 @@ #include <svtools/htmltokn.h> #include <svtools/htmlkywd.hxx> +// If this is odd, then getOnToken() breaks. +static_assert(static_cast<sal_Int16>(HtmlTokenId::ABBREVIATION_ON) % 2 == 0); + namespace { template<typename T> @@ -64,6 +67,7 @@ using HTML_TokenEntry = TokenEntry<HtmlTokenId>; HTML_TokenEntry const aHTMLTokenTab[] = { {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_comment), HtmlTokenId::COMMENT}, {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_doctype), HtmlTokenId::DOCTYPE}, + {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_cdata), HtmlTokenId::CDATA}, {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_anchor), HtmlTokenId::ANCHOR_ON}, {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_abbreviation), HtmlTokenId::ABBREVIATION_ON}, // HTML 3.0 {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_acronym), HtmlTokenId::ACRONYM_ON}, // HTML 3.0 diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx index 0ac10578981c..e705c98013e4 100644 --- a/svtools/source/svhtml/parhtml.cxx +++ b/svtools/source/svhtml/parhtml.cxx @@ -1053,6 +1053,10 @@ HtmlTokenId HTMLParser::GetNextToken_() do { sTmpBuffer.appendUtf32( nNextCh ); nNextCh = GetNextChar(); + if (std::u16string_view(sTmpBuffer) == u"![CDATA[") + { + break; + } } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) && !linguistic::IsControlChar(nNextCh) && IsParserWorking() && !rInput.eof() ); @@ -1152,6 +1156,41 @@ HtmlTokenId HTMLParser::GetNextToken_() nNextCh = '>'; } } + else if (nRet == HtmlTokenId::CDATA) + { + // Read until the closing ]]>. 
+ bool bDone = false; + while (!bDone && !rInput.eof() && IsParserWorking()) + { + if (nNextCh == '>') + { + if (sTmpBuffer.getLength() >= 2) + { + bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']' + && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']'; + if (bDone) + { + // Ignore ]] at the end. + sTmpBuffer.setLength(sTmpBuffer.getLength() - 2); + } + } + if (!bDone) + { + sTmpBuffer.appendUtf32(nNextCh); + } + } + else if (!linguistic::IsControlChar(nNextCh)) + { + sTmpBuffer.appendUtf32(nNextCh); + } + if (!bDone) + { + nNextCh = GetNextChar(); + } + } + aToken = sTmpBuffer; + sTmpBuffer.setLength(0); + } else { // TokenString not needed anymore |