summaryrefslogtreecommitdiff
path: root/vcl/qa
diff options
context:
space:
mode:
authorKhaled Hosny <khaledhosny@eglug.org>2018-04-26 12:55:26 +0200
committerMiklos Vajna <vmiklos@collabora.co.uk>2018-04-27 11:23:14 +0200
commitc688b01d9102832226251fc84045408afe392459 (patch)
treee000d416369c3d4b032cf2614ce8e9d59eb0e68f /vcl/qa
parentdfdc165a48d711b867961d1f75ee36a1c9596dc0 (diff)
tdf#66597 Fix PDF text extraction for complex text
Implement a more through strategy for embedding textual content in PDF files: * If there is unique one to one or one to many mapping between each glyph index and Unicode code points, use ToUnicode CMAP. * If there is many to one or many to many mapping, use an ActualText span embedding the original string, since ToUnicode can’t handle these. * If the one glyph is used for several Unicode code points, also use ActualText since ToUnicode can map each glyph in the font only once. * Limit ActualText to single cluster at a time, since using it for whole words or sentences breaks text selection and highlighting in PDF viewers (there will be no way to tell which glyphs belong to which characters). * Keep generating (now) redundant ToUnicode entries for compatibility with old tools not supporting ActualText. Change-Id: I33261811b59b3b8fe2164c2c21d3c52c417e6208 Reviewed-on: https://gerrit.libreoffice.org/53315 Tested-by: Jenkins <ci@libreoffice.org> Reviewed-by: Miklos Vajna <vmiklos@collabora.co.uk>
Diffstat (limited to 'vcl/qa')
-rw-r--r--vcl/qa/cppunit/pdfexport/data/tdf66597-1.odtbin0 -> 8154 bytes
-rw-r--r--vcl/qa/cppunit/pdfexport/data/tdf66597-2.odtbin0 -> 8265 bytes
-rw-r--r--vcl/qa/cppunit/pdfexport/pdfexport.cxx196
3 files changed, 195 insertions, 1 deletions
diff --git a/vcl/qa/cppunit/pdfexport/data/tdf66597-1.odt b/vcl/qa/cppunit/pdfexport/data/tdf66597-1.odt
new file mode 100644
index 000000000000..7fecc55c6386
--- /dev/null
+++ b/vcl/qa/cppunit/pdfexport/data/tdf66597-1.odt
Binary files differ
diff --git a/vcl/qa/cppunit/pdfexport/data/tdf66597-2.odt b/vcl/qa/cppunit/pdfexport/data/tdf66597-2.odt
new file mode 100644
index 000000000000..3d7b5e59cc9d
--- /dev/null
+++ b/vcl/qa/cppunit/pdfexport/data/tdf66597-2.odt
Binary files differ
diff --git a/vcl/qa/cppunit/pdfexport/pdfexport.cxx b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
index b9fe20df099f..d280f561fc64 100644
--- a/vcl/qa/cppunit/pdfexport/pdfexport.cxx
+++ b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
@@ -75,8 +75,12 @@ public:
void testTdf115117_1a();
/// Test writing ToUnicode CMAP for RTL ligatures.
void testTdf115117_2();
- /// Text extracting RTL text with ligatures.
+ /// Test extracting RTL text with ligatures.
void testTdf115117_2a();
+ /// Test writing ToUnicode CMAP for doubly encoded glyphs.
+ void testTdf66597_1();
+ /// Test writing ActualText for many to one glyph to Unicode mapping.
+ void testTdf66597_2();
#endif
#endif
@@ -101,6 +105,8 @@ public:
CPPUNIT_TEST(testTdf115117_1a);
CPPUNIT_TEST(testTdf115117_2);
CPPUNIT_TEST(testTdf115117_2a);
+ CPPUNIT_TEST(testTdf66597_1);
+ CPPUNIT_TEST(testTdf66597_2);
#endif
#endif
CPPUNIT_TEST_SUITE_END();
@@ -976,6 +982,194 @@ void PdfExportTest::testTdf115117_2a()
OUString aActualText(aChars.data(), aChars.size());
CPPUNIT_ASSERT_EQUAL(aExpectedText, aActualText);
}
+
+// This requires Amiri font, if it is missing the test will fail.
+void PdfExportTest::testTdf66597_1()
+{
+ // FIXME: Fallback font is used on Windows for some reason.
+#if !defined _WIN32
+ vcl::filter::PDFDocument aDocument;
+ load("tdf66597-1.odt", aDocument);
+
+ {
+ // Get access to ToUnicode of the first font
+ vcl::filter::PDFObjectElement* pToUnicode = nullptr;
+ for (const auto& aElement : aDocument.GetElements())
+ {
+ auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get());
+ if (!pObject)
+ continue;
+ auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type"));
+ if (pType && pType->GetValue() == "Font")
+ {
+ auto pName = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("BaseFont"));
+ auto aName = pName->GetValue().copy(7); // skip the subset id
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Unexpected font name", OString("Amiri-Regular"), aName);
+
+ auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode"));
+ CPPUNIT_ASSERT(pToUnicodeRef);
+ pToUnicode = pToUnicodeRef->LookupObject();
+ break;
+ }
+ }
+
+ CPPUNIT_ASSERT(pToUnicode);
+ auto pStream = pToUnicode->GetStream();
+ CPPUNIT_ASSERT(pStream);
+ SvMemoryStream aObjectStream;
+ ZCodec aZCodec;
+ aZCodec.BeginCompression();
+ pStream->GetMemory().Seek(0);
+ aZCodec.Decompress(pStream->GetMemory(), aObjectStream);
+ CPPUNIT_ASSERT(aZCodec.EndCompression());
+ aObjectStream.Seek(0);
+ // The <01> is glyph id, <0020> is code point.
+ // The document has three characters <space><nbspace><space>, but the font
+ // reuses the same glyph for space and nbspace so we should have a single
+ // CMAP entry for the space, and nbspace will be handled with ActualText
+ // (tested above).
+ std::string aCmap("1 beginbfchar\n"
+ "<01> <0020>\n"
+ "endbfchar");
+ std::string aData(static_cast<const char*>(aObjectStream.GetData()), aObjectStream.GetSize());
+ auto nPos = aData.find(aCmap);
+ CPPUNIT_ASSERT(nPos != std::string::npos);
+ }
+
+ {
+ auto aPages = aDocument.GetPages();
+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), aPages.size());
+ // Get page contents and stream.
+ auto pContents = aPages[0]->LookupObject("Contents");
+ CPPUNIT_ASSERT(pContents);
+ auto pStream = pContents->GetStream();
+ CPPUNIT_ASSERT(pStream);
+ auto& rObjectStream = pStream->GetMemory();
+
+ // Uncompress the stream.
+ SvMemoryStream aUncompressed;
+ ZCodec aZCodec;
+ aZCodec.BeginCompression();
+ rObjectStream.Seek(0);
+ aZCodec.Decompress(rObjectStream, aUncompressed);
+ CPPUNIT_ASSERT(aZCodec.EndCompression());
+
+ // Make sure the expected ActualText is present.
+ std::string aData(static_cast<const char*>(aUncompressed.GetData()), aUncompressed.GetSize());
+
+ std::string aActualText("/Span<</ActualText<");
+ size_t nCount = 0;
+ size_t nPos = 0;
+ while ((nPos = aData.find(aActualText, nPos)) != std::string::npos)
+ {
+ nCount++;
+ nPos += aActualText.length();
+ }
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("The should be one ActualText entry!", static_cast<size_t>(1), nCount);
+
+ aActualText = "/Span<</ActualText<FEFF00A0>>>";
+ nPos = aData.find(aActualText);
+ CPPUNIT_ASSERT_MESSAGE("ActualText not found!", nPos != std::string::npos);
+ }
+#endif
+}
+
+// This requires Reem Kufi font, if it is missing the test will fail.
+void PdfExportTest::testTdf66597_2()
+{
+ // FIXME: Fallback font is used on Windows for some reason.
+#if !defined _WIN32
+ vcl::filter::PDFDocument aDocument;
+ load("tdf66597-2.odt", aDocument);
+
+ {
+ // Get access to ToUnicode of the first font
+ vcl::filter::PDFObjectElement* pToUnicode = nullptr;
+ for (const auto& aElement : aDocument.GetElements())
+ {
+ auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get());
+ if (!pObject)
+ continue;
+ auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type"));
+ if (pType && pType->GetValue() == "Font")
+ {
+ auto pName = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("BaseFont"));
+ auto aName = pName->GetValue().copy(7); // skip the subset id
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Unexpected font name", OString("ReemKufi-Regular"), aName);
+
+ auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode"));
+ CPPUNIT_ASSERT(pToUnicodeRef);
+ pToUnicode = pToUnicodeRef->LookupObject();
+ break;
+ }
+ }
+
+ CPPUNIT_ASSERT(pToUnicode);
+ auto pStream = pToUnicode->GetStream();
+ CPPUNIT_ASSERT(pStream);
+ SvMemoryStream aObjectStream;
+ ZCodec aZCodec;
+ aZCodec.BeginCompression();
+ pStream->GetMemory().Seek(0);
+ aZCodec.Decompress(pStream->GetMemory(), aObjectStream);
+ CPPUNIT_ASSERT(aZCodec.EndCompression());
+ aObjectStream.Seek(0);
+ std::string aCmap("8 beginbfchar\n"
+ "<02> <0632>\n"
+ "<03> <0020>\n"
+ "<04> <0648>\n"
+ "<05> <0647>\n"
+ "<06> <062F>\n"
+ "<08> <062C>\n"
+ "<09> <0628>\n"
+ "<0B> <0623>\n"
+ "endbfchar");
+ std::string aData(static_cast<const char*>(aObjectStream.GetData()), aObjectStream.GetSize());
+ auto nPos = aData.find(aCmap);
+ CPPUNIT_ASSERT(nPos != std::string::npos);
+ }
+
+ {
+ auto aPages = aDocument.GetPages();
+ CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), aPages.size());
+ // Get page contents and stream.
+ auto pContents = aPages[0]->LookupObject("Contents");
+ CPPUNIT_ASSERT(pContents);
+ auto pStream = pContents->GetStream();
+ CPPUNIT_ASSERT(pStream);
+ auto& rObjectStream = pStream->GetMemory();
+
+ // Uncompress the stream.
+ SvMemoryStream aUncompressed;
+ ZCodec aZCodec;
+ aZCodec.BeginCompression();
+ rObjectStream.Seek(0);
+ aZCodec.Decompress(rObjectStream, aUncompressed);
+ CPPUNIT_ASSERT(aZCodec.EndCompression());
+
+ // Make sure the expected ActualText is present.
+ std::string aData(static_cast<const char*>(aUncompressed.GetData()), aUncompressed.GetSize());
+
+ std::vector<std::string> aCodes({ "0632", "062C", "0628", "0623" });
+ std::string aActualText("/Span<</ActualText<");
+ size_t nCount = 0;
+ size_t nPos = 0;
+ while ((nPos = aData.find(aActualText, nPos)) != std::string::npos)
+ {
+ nCount++;
+ nPos += aActualText.length();
+ }
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Number of ActualText entries does not match!", aCodes.size(), nCount);
+
+ for (const auto& aCode : aCodes)
+ {
+ aActualText = "/Span<</ActualText<FEFF" + aCode + ">>>";
+ nPos = aData.find(aActualText);
+ CPPUNIT_ASSERT_MESSAGE("ActualText not found for " + aCode, nPos != std::string::npos);
+ }
+ }
+#endif
+}
#endif
#endif