diff options
author | Miklos Vajna <vmiklos@collabora.com> | 2022-10-25 15:55:34 +0200 |
---|---|---|
committer | Miklos Vajna <vmiklos@collabora.com> | 2022-10-25 18:15:47 +0200 |
commit | b38730ae0ae92ca49b84a45853c2ed098ee9064f (patch) | |
tree | a08c26370a2b73fe6c56b395bffb0b197a956789 /svtools/qa | |
parent | d55358c7c31e2e9f124ee90d78eba2db3f1af756 (diff) |
sw html import: fix handling of CDATA
When the HTML contained markup like <![CDATA[...]]>, we simply
ignored it during import, even though e.g. the ODT import handles that
correctly.
The reason for this is that the svtools/ HTMLParser had code to parse
<!-- ... --> style comments, but none for CDATA sections.
Fix the problem by introducing a new HtmlTokenId::CDATA, producing a
matching token content in HTMLParser::GetNextToken_(), and finally map
it to normal text on the Writer side.
Note that HtmlTokenId doesn't allow non-on-off tokens past ONOFF_START,
nor does it allow inserting a single token before ONOFF_START (that breaks
getOnToken()), so for now just add a second, dummy token to avoid
breakage.
Change-Id: I605c3c21dc11986fda5d93d36148788a638e97b4
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141813
Reviewed-by: Miklos Vajna <vmiklos@collabora.com>
Tested-by: Jenkins
Diffstat (limited to 'svtools/qa')
-rw-r--r-- | svtools/qa/unit/testHtmlReader.cxx | 23 |
1 files changed, 23 insertions, 0 deletions
diff --git a/svtools/qa/unit/testHtmlReader.cxx b/svtools/qa/unit/testHtmlReader.cxx index 146458a200eb..37f74e903bcc 100644 --- a/svtools/qa/unit/testHtmlReader.cxx +++ b/svtools/qa/unit/testHtmlReader.cxx @@ -27,6 +27,7 @@ public: OUString m_aDocument; int m_nLineBreakCount = 0; + OUString m_aCdata; }; TestHTMLParser::TestHTMLParser(SvStream& rStream) @@ -40,6 +41,8 @@ void TestHTMLParser::NextToken(HtmlTokenId nToken) m_aDocument += aToken; else if (nToken == HtmlTokenId::LINEBREAK) ++m_nLineBreakCount; + else if (nToken == HtmlTokenId::CDATA) + m_aCdata = aToken; } /// Tests HTMLParser. @@ -76,6 +79,26 @@ CPPUNIT_TEST_FIXTURE(Test, testLineBreak) // This was 2, <br></br> was interpreted as 2 line breaks in XHTML mode. CPPUNIT_ASSERT_EQUAL(1, xParser->m_nLineBreakCount); } + +CPPUNIT_TEST_FIXTURE(Test, testCdata) +{ + // Given a document with CDATA: + SvMemoryStream aStream; + OString aDocument("A<![CDATA[B ü <]]>C"); + aStream.WriteBytes(aDocument.getStr(), aDocument.getLength()); + aStream.Seek(0); + + // When parsing that HTML: + tools::SvRef<TestHTMLParser> xParser = new TestHTMLParser(aStream); + xParser->CallParser(); + + // Then make sure that we get a cdata token with the correct content: + // Without the accompanying fix in place, this test would have failed with: + // - Expected: B ü < + // - Actual : + // i.e. the content inside CDATA was lost. + CPPUNIT_ASSERT_EQUAL(OUString("B ü <"), xParser->m_aCdata); +} } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |