tdf#66597 Fix PDF text extraction for complex text

Implement a more thorough strategy for embedding textual content in PDF files: * If there is a unique one to one or one to many mapping between each glyph index and Unicode code points, use ToUnicode CMAP. * If there is a many to one or many to many mapping, use an ActualText span embedding the original string, since ToUnicode can’t handle these. * If one glyph is used for several Unicode code points, also use ActualText since ToUnicode can map each glyph in the font only once. * Limit ActualText to a single cluster at a time, since using it for whole words or sentences breaks text selection and highlighting in PDF viewers (there will be no way to tell which glyphs belong to which characters). * Keep generating (now) redundant ToUnicode entries for compatibility with old tools not supporting ActualText. Change-Id: I33261811b59b3b8fe2164c2c21d3c52c417e6208 Reviewed-on: https://gerrit.libreoffice.org/53315 Tested-by: Jenkins <ci@libreoffice.org> Reviewed-by: Miklos Vajna <vmiklos@collabora.co.uk>
author: Khaled Hosny <khaledhosny@eglug.org> 2018-04-26 12:55:26 +0200
committer: Miklos Vajna <vmiklos@collabora.co.uk> 2018-04-27 11:23:14 +0200
commit: c688b01d9102832226251fc84045408afe392459 (patch)
tree: e000d416369c3d4b032cf2614ce8e9d59eb0e68f /vcl/qa
parent: dfdc165a48d711b867961d1f75ee36a1c9596dc0 (diff)
3 files changed, 195 insertions, 1 deletions
diff --git a/vcl/qa/cppunit/pdfexport/data/tdf66597-1.odt b/vcl/qa/cppunit/pdfexport/data/tdf66597-1.odt
new file mode 100644
index 000000000000..7fecc55c6386
--- /dev/null
+++ b/vcl/qa/cppunit/pdfexport/data/tdf66597-1.odt
diff --git a/vcl/qa/cppunit/pdfexport/data/tdf66597-2.odt b/vcl/qa/cppunit/pdfexport/data/tdf66597-2.odt
new file mode 100644
index 000000000000..3d7b5e59cc9d
--- /dev/null
+++ b/vcl/qa/cppunit/pdfexport/data/tdf66597-2.odt
diff --git a/vcl/qa/cppunit/pdfexport/pdfexport.cxx b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
index b9fe20df099f..d280f561fc64 100644
--- a/vcl/qa/cppunit/pdfexport/pdfexport.cxx
+++ b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
@@ -75,8 +75,12 @@ public:
     void testTdf115117_1a();
     /// Test writing ToUnicode CMAP for RTL ligatures.
     void testTdf115117_2();
-    /// Text extracting RTL text with ligatures.
+    /// Test extracting RTL text with ligatures.
     void testTdf115117_2a();
+    /// Test writing ToUnicode CMAP for doubly encoded glyphs.
+    void testTdf66597_1();
+    /// Test writing ActualText for many to one glyph to Unicode mapping.
+    void testTdf66597_2();
 #endif
 #endif
 
@@ -101,6 +105,8 @@ public:
     CPPUNIT_TEST(testTdf115117_1a);
     CPPUNIT_TEST(testTdf115117_2);
     CPPUNIT_TEST(testTdf115117_2a);
+    CPPUNIT_TEST(testTdf66597_1);
+    CPPUNIT_TEST(testTdf66597_2);
 #endif
 #endif
     CPPUNIT_TEST_SUITE_END();
@@ -976,6 +982,194 @@ void PdfExportTest::testTdf115117_2a()
     OUString aActualText(aChars.data(), aChars.size());
     CPPUNIT_ASSERT_EQUAL(aExpectedText, aActualText);
 }
+
+// Exports tdf66597-1.odt and checks both the ToUnicode CMAP and the ActualText span of the result. This requires Amiri font, if it is missing the test will fail.
+void PdfExportTest::testTdf66597_1()
+{
+    // FIXME: Fallback font is used on Windows for some reason, so the whole test body is compiled out there.
+#if !defined _WIN32
+    vcl::filter::PDFDocument aDocument;
+    load("tdf66597-1.odt", aDocument);
+
+    {
+        // Get access to ToUnicode of the first font object found among the document elements.
+        vcl::filter::PDFObjectElement* pToUnicode = nullptr;
+        for (const auto& aElement : aDocument.GetElements())
+        {
+            auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get());
+            if (!pObject)
+                continue;
+            auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type"));
+            if (pType && pType->GetValue() == "Font")
+            {
+                auto pName = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("BaseFont"));
+                CPPUNIT_ASSERT_MESSAGE("BaseFont name is missing or shorter than the subset id", pName && pName->GetValue().getLength() >= 7);
+                auto aName = pName->GetValue().copy(7); // skip the 7-character subset id prefix
+                CPPUNIT_ASSERT_EQUAL_MESSAGE("Unexpected font name", OString("Amiri-Regular"), aName);
+                auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode"));
+                CPPUNIT_ASSERT(pToUnicodeRef);
+                pToUnicode = pToUnicodeRef->LookupObject();
+                break;
+            }
+        }
+
+        CPPUNIT_ASSERT(pToUnicode);
+        auto pStream = pToUnicode->GetStream();
+        CPPUNIT_ASSERT(pStream);
+        SvMemoryStream aObjectStream;
+        ZCodec aZCodec;
+        aZCodec.BeginCompression();
+        pStream->GetMemory().Seek(0);
+        aZCodec.Decompress(pStream->GetMemory(), aObjectStream);
+        CPPUNIT_ASSERT(aZCodec.EndCompression());
+        aObjectStream.Seek(0);
+        // The <01> is glyph id, <0020> is code point.
+        // The document has three characters <space><nbspace><space>, but the font
+        // reuses the same glyph for space and nbspace so we should have a single
+        // CMAP entry for the space, and nbspace will be handled with ActualText
+        // (tested in the next block below).
+        std::string aCmap("1 beginbfchar\n"
+                          "<01> <0020>\n"
+                          "endbfchar");
+        std::string aData(static_cast<const char*>(aObjectStream.GetData()), aObjectStream.GetSize());
+        auto nPos = aData.find(aCmap);
+        CPPUNIT_ASSERT(nPos != std::string::npos);
+    }
+
+    {
+        auto aPages = aDocument.GetPages();
+        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), aPages.size());
+        // Get page contents and stream.
+        auto pContents = aPages[0]->LookupObject("Contents");
+        CPPUNIT_ASSERT(pContents);
+        auto pStream = pContents->GetStream();
+        CPPUNIT_ASSERT(pStream);
+        auto& rObjectStream = pStream->GetMemory();
+
+        // Uncompress the stream.
+        SvMemoryStream aUncompressed;
+        ZCodec aZCodec;
+        aZCodec.BeginCompression();
+        rObjectStream.Seek(0);
+        aZCodec.Decompress(rObjectStream, aUncompressed);
+        CPPUNIT_ASSERT(aZCodec.EndCompression());
+
+        // Make sure the expected ActualText is present: exactly one span, and it carries the nbspace (00A0).
+        std::string aData(static_cast<const char*>(aUncompressed.GetData()), aUncompressed.GetSize());
+
+        std::string aActualText("/Span<</ActualText<");
+        size_t nCount = 0;
+        size_t nPos = 0;
+        while ((nPos = aData.find(aActualText, nPos)) != std::string::npos)
+        {
+            nCount++;
+            nPos += aActualText.length();
+        }
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("There should be one ActualText entry!", static_cast<size_t>(1), nCount);
+
+        aActualText = "/Span<</ActualText<FEFF00A0>>>";
+        nPos = aData.find(aActualText);
+        CPPUNIT_ASSERT_MESSAGE("ActualText not found!", nPos != std::string::npos);
+    }
+#endif
+}
+
+// Exports tdf66597-2.odt and checks both the ToUnicode CMAP and the ActualText spans of the result. This requires Reem Kufi font, if it is missing the test will fail.
+void PdfExportTest::testTdf66597_2()
+{
+    // FIXME: Fallback font is used on Windows for some reason, so the whole test body is compiled out there.
+#if !defined _WIN32
+    vcl::filter::PDFDocument aDocument;
+    load("tdf66597-2.odt", aDocument);
+
+    {
+        // Get access to ToUnicode of the first font object found among the document elements.
+        vcl::filter::PDFObjectElement* pToUnicode = nullptr;
+        for (const auto& aElement : aDocument.GetElements())
+        {
+            auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get());
+            if (!pObject)
+                continue;
+            auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type"));
+            if (pType && pType->GetValue() == "Font")
+            {
+                auto pName = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("BaseFont"));
+                CPPUNIT_ASSERT_MESSAGE("BaseFont name is missing or shorter than the subset id", pName && pName->GetValue().getLength() >= 7);
+                auto aName = pName->GetValue().copy(7); // skip the 7-character subset id prefix
+                CPPUNIT_ASSERT_EQUAL_MESSAGE("Unexpected font name", OString("ReemKufi-Regular"), aName);
+                auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode"));
+                CPPUNIT_ASSERT(pToUnicodeRef);
+                pToUnicode = pToUnicodeRef->LookupObject();
+                break;
+            }
+        }
+
+        CPPUNIT_ASSERT(pToUnicode);
+        auto pStream = pToUnicode->GetStream();
+        CPPUNIT_ASSERT(pStream);
+        SvMemoryStream aObjectStream;
+        ZCodec aZCodec;
+        aZCodec.BeginCompression();
+        pStream->GetMemory().Seek(0);
+        aZCodec.Decompress(pStream->GetMemory(), aObjectStream);
+        CPPUNIT_ASSERT(aZCodec.EndCompression());
+        aObjectStream.Seek(0);
+        std::string aCmap("8 beginbfchar\n" // expected CMAP block that must appear verbatim in the stream
+                          "<02> <0632>\n"
+                          "<03> <0020>\n"
+                          "<04> <0648>\n"
+                          "<05> <0647>\n"
+                          "<06> <062F>\n"
+                          "<08> <062C>\n"
+                          "<09> <0628>\n"
+                          "<0B> <0623>\n"
+                          "endbfchar");
+        std::string aData(static_cast<const char*>(aObjectStream.GetData()), aObjectStream.GetSize());
+        auto nPos = aData.find(aCmap);
+        CPPUNIT_ASSERT(nPos != std::string::npos);
+    }
+
+    {
+        auto aPages = aDocument.GetPages();
+        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), aPages.size());
+        // Get page contents and stream.
+        auto pContents = aPages[0]->LookupObject("Contents");
+        CPPUNIT_ASSERT(pContents);
+        auto pStream = pContents->GetStream();
+        CPPUNIT_ASSERT(pStream);
+        auto& rObjectStream = pStream->GetMemory();
+
+        // Uncompress the stream.
+        SvMemoryStream aUncompressed;
+        ZCodec aZCodec;
+        aZCodec.BeginCompression();
+        rObjectStream.Seek(0);
+        aZCodec.Decompress(rObjectStream, aUncompressed);
+        CPPUNIT_ASSERT(aZCodec.EndCompression());
+
+        // Make sure the expected ActualText is present: one span per listed code, and no others.
+        std::string aData(static_cast<const char*>(aUncompressed.GetData()), aUncompressed.GetSize());
+
+        std::vector<std::string> aCodes({ "0632", "062C", "0628", "0623" }); // hex codes expected inside ActualText spans
+        std::string aActualText("/Span<</ActualText<");
+        size_t nCount = 0;
+        size_t nPos = 0;
+        while ((nPos = aData.find(aActualText, nPos)) != std::string::npos)
+        {
+            nCount++;
+            nPos += aActualText.length();
+        }
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Number of ActualText entries does not match!", aCodes.size(), nCount);
+
+        for (const auto& aCode : aCodes)
+        {
+            aActualText = "/Span<</ActualText<FEFF" + aCode + ">>>";
+            nPos = aData.find(aActualText);
+            CPPUNIT_ASSERT_MESSAGE("ActualText not found for " + aCode, nPos != std::string::npos);
+        }
+    }
+#endif
+}
 #endif
 #endif
author	Khaled Hosny <khaledhosny@eglug.org>	2018-04-26 12:55:26 +0200
committer	Miklos Vajna <vmiklos@collabora.co.uk>	2018-04-27 11:23:14 +0200
commit	c688b01d9102832226251fc84045408afe392459 (patch)
tree	e000d416369c3d4b032cf2614ce8e9d59eb0e68f /vcl/qa
parent	dfdc165a48d711b867961d1f75ee36a1c9596dc0 (diff)