sw html import: fix handling of CDATA

In case the HTML contained markup like <![CDATA[...]]>, we simply ignored it during import, even if e.g. the ODT import handles that correctly. The reason for this is that the svtools/ HTMLParser had code to parse <!-- ... --> style comments, but not for CDATA. Fix the problem by introducing a new HtmlTokenId::CDATA, producing a matching token content in HTMLParser::GetNextToken_(), and finally map it to normal text on the Writer side. Note that HtmlTokenId doesn't allow non-on-off tokens past ONOFF_START, nor does it allow inserting a single token before ONOFF_START (it breaks getOnToken()), so for now just add a second, dummy token to avoid breakage. Change-Id: I605c3c21dc11986fda5d93d36148788a638e97b4 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141813 Reviewed-by: Miklos Vajna <vmiklos@collabora.com> Tested-by: Jenkins
author: Miklos Vajna <vmiklos@collabora.com> 2022-10-25 15:55:34 +0200
committer: Miklos Vajna <vmiklos@collabora.com> 2022-10-25 18:15:47 +0200
commit: b38730ae0ae92ca49b84a45853c2ed098ee9064f (patch)
tree: a08c26370a2b73fe6c56b395bffb0b197a956789 /svtools/qa
parent: d55358c7c31e2e9f124ee90d78eba2db3f1af756 (diff)
1 files changed, 23 insertions, 0 deletions
diff --git a/svtools/qa/unit/testHtmlReader.cxx b/svtools/qa/unit/testHtmlReader.cxx
index 146458a200eb..37f74e903bcc 100644
--- a/svtools/qa/unit/testHtmlReader.cxx
+++ b/svtools/qa/unit/testHtmlReader.cxx
@@ -27,6 +27,7 @@ public:
 
     OUString m_aDocument;
     int m_nLineBreakCount = 0;
+    OUString m_aCdata;
 };
 
 TestHTMLParser::TestHTMLParser(SvStream& rStream)
@@ -40,6 +41,8 @@ void TestHTMLParser::NextToken(HtmlTokenId nToken)
         m_aDocument += aToken;
     else if (nToken == HtmlTokenId::LINEBREAK)
         ++m_nLineBreakCount;
+    else if (nToken == HtmlTokenId::CDATA)
+        m_aCdata = aToken;
 }
 
 /// Tests HTMLParser.
@@ -76,6 +79,26 @@ CPPUNIT_TEST_FIXTURE(Test, testLineBreak)
     // This was 2, <br></br> was interpreted as 2 line breaks in XHTML mode.
     CPPUNIT_ASSERT_EQUAL(1, xParser->m_nLineBreakCount);
 }
+
+CPPUNIT_TEST_FIXTURE(Test, testCdata)
+{
+    // Given a document with a CDATA section (holding entity-like text) between "A" and "C":
+    SvMemoryStream aStream;
+    OString aDocument("A<![CDATA[B &uuml; &lt;]]>C");
+    aStream.WriteBytes(aDocument.getStr(), aDocument.getLength());
+    aStream.Seek(0); // rewind to offset 0 before the stream is handed to the parser
+
+    // When parsing that HTML with the test parser, which records the last CDATA token it sees:
+    tools::SvRef<TestHTMLParser> xParser = new TestHTMLParser(aStream);
+    xParser->CallParser();
+
+    // Then make sure that we get a CDATA token whose content is the verbatim inner text:
+    // Without the accompanying fix in place, this test would have failed with:
+    // - Expected: B &uuml; &lt;
+    // - Actual  :
+    // i.e. the content inside CDATA was lost.
+    CPPUNIT_ASSERT_EQUAL(OUString("B &uuml; &lt;"), xParser->m_aCdata);
+}
 }
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
author	Miklos Vajna <vmiklos@collabora.com>	2022-10-25 15:55:34 +0200
committer	Miklos Vajna <vmiklos@collabora.com>	2022-10-25 18:15:47 +0200
commit	b38730ae0ae92ca49b84a45853c2ed098ee9064f (patch)
tree	a08c26370a2b73fe6c56b395bffb0b197a956789 /svtools/qa
parent	d55358c7c31e2e9f124ee90d78eba2db3f1af756 (diff)