diff options
author | Khaled Hosny <khaledhosny@eglug.org> | 2018-03-21 16:54:10 +0200 |
---|---|---|
committer | Miklos Vajna <vmiklos@collabora.co.uk> | 2018-03-21 18:31:53 +0100 |
commit | b94a66ebc8db6c5ca9c7dcfdfbb06b49deae4939 (patch) | |
tree | 9bc490e09c28bdd3092fed7073cfec8711102839 /vcl/qa | |
parent | c75705b2306d3fd41e71eb4613773b62bdaa9ca5 (diff) |
tdf#115117: Fix PDF ToUnicode CMAP for ligatures
Move the glyph to character(s) mapping to CommonSalLayout where we have
enough information to do this properly. This correctly handles ligatures
at end of run that wasn’t handled before, and also fixes a bug in the
PDF writer code when there is more than one ligature in the run (it
forgot to clear aCodeUnitsPerGlyph vector after each iteration). Also
drop the “temporary” fix for rotated glyph from 2009 that does not seem
to be needed now (the document from that bug exports correctly after this
change).
Change-Id: I5b5b1f4470bbd0ef05cbbc86dfa29d2ff51249ea
Reviewed-on: https://gerrit.libreoffice.org/51617
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Miklos Vajna <vmiklos@collabora.co.uk>
Diffstat (limited to 'vcl/qa')
-rw-r--r-- | vcl/qa/cppunit/pdfexport/data/tdf115117-1.odt | bin | 0 -> 8566 bytes | |||
-rw-r--r-- | vcl/qa/cppunit/pdfexport/data/tdf115117-2.odt | bin | 0 -> 8629 bytes | |||
-rw-r--r-- | vcl/qa/cppunit/pdfexport/pdfexport.cxx | 217 |
3 files changed, 217 insertions, 0 deletions
diff --git a/vcl/qa/cppunit/pdfexport/data/tdf115117-1.odt b/vcl/qa/cppunit/pdfexport/data/tdf115117-1.odt Binary files differnew file mode 100644 index 000000000000..63fe82946ef1 --- /dev/null +++ b/vcl/qa/cppunit/pdfexport/data/tdf115117-1.odt diff --git a/vcl/qa/cppunit/pdfexport/data/tdf115117-2.odt b/vcl/qa/cppunit/pdfexport/data/tdf115117-2.odt Binary files differnew file mode 100644 index 000000000000..c1e1f6d4392c --- /dev/null +++ b/vcl/qa/cppunit/pdfexport/data/tdf115117-2.odt diff --git a/vcl/qa/cppunit/pdfexport/pdfexport.cxx b/vcl/qa/cppunit/pdfexport/pdfexport.cxx index 9ae6e19be328..fc46f22fec44 100644 --- a/vcl/qa/cppunit/pdfexport/pdfexport.cxx +++ b/vcl/qa/cppunit/pdfexport/pdfexport.cxx @@ -8,6 +8,7 @@ */ #include <config_features.h> +#include <config_test.h> #include <com/sun/star/frame/Desktop.hpp> #include <com/sun/star/frame/XStorable.hpp> @@ -25,6 +26,7 @@ #include <tools/zcodec.hxx> #if HAVE_FEATURE_PDFIUM #include <fpdf_edit.h> +#include <fpdf_text.h> #include <fpdfview.h> #endif @@ -67,6 +69,16 @@ public: void testTdf99680(); void testTdf99680_2(); void testTdf108963(); +#if !TEST_FONTS_MISSING + /// Test writing ToUnicode CMAP for LTR ligatures. + void testTdf115117_1(); + /// Text extracting LTR text with ligatures. + void testTdf115117_1a(); + /// Test writing ToUnicode CMAP for RTL ligatures. + void testTdf115117_2(); + /// Text extracting RTL text with ligatures. + void testTdf115117_2a(); +#endif #endif CPPUNIT_TEST_SUITE(PdfExportTest); @@ -85,6 +97,12 @@ public: CPPUNIT_TEST(testTdf99680); CPPUNIT_TEST(testTdf99680_2); CPPUNIT_TEST(testTdf108963); +#if !TEST_FONTS_MISSING + CPPUNIT_TEST(testTdf115117_1); + CPPUNIT_TEST(testTdf115117_1a); + CPPUNIT_TEST(testTdf115117_2); + CPPUNIT_TEST(testTdf115117_2a); +#endif #endif CPPUNIT_TEST_SUITE_END(); }; @@ -760,6 +778,205 @@ void PdfExportTest::testTdf108963() CPPUNIT_ASSERT_EQUAL(1, nYellowPathCount); } + +#if !TEST_FONTS_MISSING +// This requires Carlito font, if it is missing the test will most likely +// fail. +void PdfExportTest::testTdf115117_1() +{ + vcl::filter::PDFDocument aDocument; + load("tdf115117-1.odt", aDocument); + + vcl::filter::PDFObjectElement* pToUnicode = nullptr; + + // Get access to ToUnicode of the first font + for (const auto& aElement : aDocument.GetElements()) + { + auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get()); + if (!pObject) + continue; + auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type")); + if (pType && pType->GetValue() == "Font") + { + auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode")); + CPPUNIT_ASSERT(pToUnicodeRef); + pToUnicode = pToUnicodeRef->LookupObject(); + break; + } + } + + CPPUNIT_ASSERT(pToUnicode); + auto pStream = pToUnicode->GetStream(); + CPPUNIT_ASSERT(pStream); + SvMemoryStream aObjectStream; + ZCodec aZCodec; + aZCodec.BeginCompression(); + pStream->GetMemory().Seek(0); + aZCodec.Decompress(pStream->GetMemory(), aObjectStream); + CPPUNIT_ASSERT(aZCodec.EndCompression()); + aObjectStream.Seek(0); + // The first values, <01> <02> etc., are glyph ids, they might change order + // if we changed how font subsets are created. + // The second values, <00740069> etc., are Unicode code points in hex, + // <00740069> is U+0074 and U+0069 i.e. "ti" which is a ligature in + // Carlito/Callibri. This test is failing if any of the second values + // changed which means we are not detecting ligatures and writing CMAP + // entries for them correctly. If glyph order in the subset changes then + // the order here will changes and the PDF has to be carefully inspected to + // ensure that the new values are correct before updating the string below. + OString aCmap("9 beginbfchar\n" + "<01> <00740069>\n" + "<02> <0020>\n" + "<03> <0074>\n" + "<04> <0065>\n" + "<05> <0073>\n" + "<06> <00660069>\n" + "<07> <0066006C>\n" + "<08> <006600660069>\n" + "<09> <00660066006C>\n" + "endbfchar"); + auto pStart = static_cast<const char*>(aObjectStream.GetData()); + const char* pEnd = pStart + aObjectStream.GetSize(); + auto it = std::search(pStart, pEnd, aCmap.getStr(), aCmap.getStr() + aCmap.getLength()); + CPPUNIT_ASSERT(it != pEnd); +} + +// This requires DejaVu Sans font, if it is missing the test will most likely +// fail. +void PdfExportTest::testTdf115117_2() +{ + // See the comments in testTdf115117_1() for explanation. + + vcl::filter::PDFDocument aDocument; + load("tdf115117-2.odt", aDocument); + + vcl::filter::PDFObjectElement* pToUnicode = nullptr; + + for (const auto& aElement : aDocument.GetElements()) + { + auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get()); + if (!pObject) + continue; + auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type")); + if (pType && pType->GetValue() == "Font") + { + auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode")); + CPPUNIT_ASSERT(pToUnicodeRef); + pToUnicode = pToUnicodeRef->LookupObject(); + break; + } + } + + CPPUNIT_ASSERT(pToUnicode); + auto pStream = pToUnicode->GetStream(); + CPPUNIT_ASSERT(pStream); + SvMemoryStream aObjectStream; + ZCodec aZCodec; + aZCodec.BeginCompression(); + pStream->GetMemory().Seek(0); + aZCodec.Decompress(pStream->GetMemory(), aObjectStream); + CPPUNIT_ASSERT(aZCodec.EndCompression()); + aObjectStream.Seek(0); + OString aCmap("7 beginbfchar\n" + "<01> <06440627>\n" + "<02> <0020>\n" + "<03> <0641>\n" + "<04> <0642>\n" + "<05> <0648>\n" + "<06> <06440627>\n" + "<07> <0628>\n" + "endbfchar"); + auto pStart = static_cast<const char*>(aObjectStream.GetData()); + const char* pEnd = pStart + aObjectStream.GetSize(); + auto it = std::search(pStart, pEnd, aCmap.getStr(), aCmap.getStr() + aCmap.getLength()); + CPPUNIT_ASSERT(it != pEnd); +} + +void PdfExportTest::testTdf115117_1a() +{ + // Import the bugdoc and export as PDF. + OUString aURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "tdf115117-1.odt"; + mxComponent = loadFromDesktop(aURL); + CPPUNIT_ASSERT(mxComponent.is()); + + uno::Reference<frame::XStorable> xStorable(mxComponent, uno::UNO_QUERY); + utl::TempFile aTempFile; + aTempFile.EnableKillingFile(); + utl::MediaDescriptor aMediaDescriptor; + aMediaDescriptor["FilterName"] <<= OUString("writer_pdf_Export"); + xStorable->storeToURL(aTempFile.GetURL(), aMediaDescriptor.getAsConstPropertyValueList()); + + // Parse the export result with pdfium. + SvFileStream aFile(aTempFile.GetURL(), StreamMode::READ); + SvMemoryStream aMemory; + aMemory.WriteStream(aFile); + mpPdfDocument = FPDF_LoadMemDocument(aMemory.GetData(), aMemory.GetSize(), /*password=*/nullptr); + CPPUNIT_ASSERT(mpPdfDocument); + + // The document has one page. + CPPUNIT_ASSERT_EQUAL(1, FPDF_GetPageCount(mpPdfDocument)); + mpPdfPage = FPDF_LoadPage(mpPdfDocument, /*page_index=*/0); + CPPUNIT_ASSERT(mpPdfPage); + + auto pPdfTextPage = FPDFText_LoadPage(mpPdfPage); + CPPUNIT_ASSERT(pPdfTextPage); + + // Extract the text from the page. This pdfium API is a bit higher level + // than we want and might apply heuristic that give false positive, but it + // is a good approximation in addition to the check in testTdf115117_1(). + int nChars = FPDFText_CountChars(pPdfTextPage); + CPPUNIT_ASSERT_EQUAL(44, nChars); + + OUString aExpectedText = "ti ti test ti\r\nti test fi fl ffi ffl test fi"; + std::vector<sal_uInt32> aChars(nChars); + for (int i = 0; i < nChars; i++) + aChars[i] = FPDFText_GetUnicode(pPdfTextPage, i); + OUString aActualText(aChars.data(), aChars.size()); + CPPUNIT_ASSERT_EQUAL(aExpectedText, aActualText); +} + +void PdfExportTest::testTdf115117_2a() +{ + // See the comments in testTdf115117_1a() for explanation. + + // Import the bugdoc and export as PDF. + OUString aURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "tdf115117-2.odt"; + mxComponent = loadFromDesktop(aURL); + CPPUNIT_ASSERT(mxComponent.is()); + + uno::Reference<frame::XStorable> xStorable(mxComponent, uno::UNO_QUERY); + utl::TempFile aTempFile; + aTempFile.EnableKillingFile(); + utl::MediaDescriptor aMediaDescriptor; + aMediaDescriptor["FilterName"] <<= OUString("writer_pdf_Export"); + xStorable->storeToURL(aTempFile.GetURL(), aMediaDescriptor.getAsConstPropertyValueList()); + + // Parse the export result with pdfium. + SvFileStream aFile(aTempFile.GetURL(), StreamMode::READ); + SvMemoryStream aMemory; + aMemory.WriteStream(aFile); + mpPdfDocument = FPDF_LoadMemDocument(aMemory.GetData(), aMemory.GetSize(), /*password=*/nullptr); + CPPUNIT_ASSERT(mpPdfDocument); + + // The document has one page. + CPPUNIT_ASSERT_EQUAL(1, FPDF_GetPageCount(mpPdfDocument)); + mpPdfPage = FPDF_LoadPage(mpPdfDocument, /*page_index=*/0); + CPPUNIT_ASSERT(mpPdfPage); + + auto pPdfTextPage = FPDFText_LoadPage(mpPdfPage); + CPPUNIT_ASSERT(pPdfTextPage); + + int nChars = FPDFText_CountChars(pPdfTextPage); + CPPUNIT_ASSERT_EQUAL(13, nChars); + + OUString aExpectedText = u"\u0627\u0644 \u0628\u0627\u0644 \u0648\u0642\u0641 \u0627\u0644"; + std::vector<sal_uInt32> aChars(nChars); + for (int i = 0; i < nChars; i++) + aChars[i] = FPDFText_GetUnicode(pPdfTextPage, i); + OUString aActualText(aChars.data(), aChars.size()); + CPPUNIT_ASSERT_EQUAL(aExpectedText, aActualText); +} +#endif #endif CPPUNIT_TEST_SUITE_REGISTRATION(PdfExportTest); |