sw html import: fix handling of CDATA

In case the HTML contained markup like <![CDATA[...]]>, we simply ignored it during import, even though e.g. the ODT import handles that correctly. The reason for this is that the svtools/ HTMLParser had code to parse <!-- ... --> style comments, but not for CDATA. Fix the problem by introducing a new HtmlTokenId::CDATA, producing matching token content in HTMLParser::GetNextToken_(), and finally mapping it to normal text on the Writer side. Note that HtmlTokenId doesn't allow non-on-off tokens past ONOFF_START, nor does it allow inserting a single token before ONOFF_START (it breaks getOnToken()), so for now just add a second, dummy token to avoid breakage. Change-Id: I605c3c21dc11986fda5d93d36148788a638e97b4 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141813 Reviewed-by: Miklos Vajna <vmiklos@collabora.com> Tested-by: Jenkins
author: Miklos Vajna <vmiklos@collabora.com> 2022-10-25 15:55:34 +0200
committer: Miklos Vajna <vmiklos@collabora.com> 2022-10-25 18:15:47 +0200
commit: b38730ae0ae92ca49b84a45853c2ed098ee9064f (patch)
tree: a08c26370a2b73fe6c56b395bffb0b197a956789 /svtools/source/svhtml
parent: d55358c7c31e2e9f124ee90d78eba2db3f1af756 (diff)
2 files changed, 43 insertions, 0 deletions
diff --git a/svtools/source/svhtml/htmlkywd.cxx b/svtools/source/svhtml/htmlkywd.cxx
index 2d51910d85e9..584322fac8bc 100644
--- a/svtools/source/svhtml/htmlkywd.cxx
+++ b/svtools/source/svhtml/htmlkywd.cxx
@@ -27,6 +27,9 @@
 #include <svtools/htmltokn.h>
 #include <svtools/htmlkywd.hxx>
 
+// If this is odd, then getOnToken() breaks.
+static_assert(static_cast<sal_Int16>(HtmlTokenId::ABBREVIATION_ON) % 2 == 0);
+
 namespace {
 
 template<typename T>
@@ -64,6 +67,7 @@ using HTML_TokenEntry = TokenEntry<HtmlTokenId>;
 HTML_TokenEntry const aHTMLTokenTab[] = {
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_comment),         HtmlTokenId::COMMENT},
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_doctype),         HtmlTokenId::DOCTYPE},
+    {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_cdata),           HtmlTokenId::CDATA},
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_anchor),          HtmlTokenId::ANCHOR_ON},
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_abbreviation),    HtmlTokenId::ABBREVIATION_ON},  // HTML 3.0
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_acronym),         HtmlTokenId::ACRONYM_ON},   // HTML 3.0
diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx
index 0ac10578981c..e705c98013e4 100644
--- a/svtools/source/svhtml/parhtml.cxx
+++ b/svtools/source/svhtml/parhtml.cxx
@@ -1053,6 +1053,10 @@ HtmlTokenId HTMLParser::GetNextToken_()
                     do {
                         sTmpBuffer.appendUtf32( nNextCh );
                         nNextCh = GetNextChar();
+                        if (std::u16string_view(sTmpBuffer) == u"![CDATA[")
+                        {
+                            break;
+                        }
                     } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) &&
                             !linguistic::IsControlChar(nNextCh) &&
                              IsParserWorking() && !rInput.eof() );
@@ -1152,6 +1156,41 @@ HtmlTokenId HTMLParser::GetNextToken_()
                             nNextCh = '>';
                         }
                     }
+                    else if (nRet == HtmlTokenId::CDATA)
+                    {
+                        // Read until the closing ]]>.
+                        bool bDone = false;
+                        while (!bDone && !rInput.eof() && IsParserWorking())
+                        {
+                            if (nNextCh == '>')
+                            {
+                                if (sTmpBuffer.getLength() >= 2)
+                                {
+                                    bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']'
+                                            && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']';
+                                    if (bDone)
+                                    {
+                                        // Ignore ]] at the end.
+                                        sTmpBuffer.setLength(sTmpBuffer.getLength() - 2);
+                                    }
+                                }
+                                if (!bDone)
+                                {
+                                    sTmpBuffer.appendUtf32(nNextCh);
+                                }
+                            }
+                            else if (!linguistic::IsControlChar(nNextCh))
+                            {
+                                sTmpBuffer.appendUtf32(nNextCh);
+                            }
+                            if (!bDone)
+                            {
+                                nNextCh = GetNextChar();
+                            }
+                        }
+                        aToken = sTmpBuffer;
+                        sTmpBuffer.setLength(0);
+                    }
                     else
                     {
                         // TokenString not needed anymore
author	Miklos Vajna <vmiklos@collabora.com>	2022-10-25 15:55:34 +0200
committer	Miklos Vajna <vmiklos@collabora.com>	2022-10-25 18:15:47 +0200
commit	b38730ae0ae92ca49b84a45853c2ed098ee9064f (patch)
tree	a08c26370a2b73fe6c56b395bffb0b197a956789 /svtools/source/svhtml
parent	d55358c7c31e2e9f124ee90d78eba2db3f1af756 (diff)