summary | refs | log | tree | commit | diff
path: root/svx
diff options
context:
space:
mode:
author: Ashod Nakashian <ashod.nakashian@collabora.co.uk> 2018-04-22 10:48:51 -0400
committer: Ashod Nakashian <ashod.nakashian@collabora.co.uk> 2018-06-04 12:36:28 -0400
commit: 9d03df990070b6e96cfb22d4413fd3e5d6a15763 (patch)
tree: ae1e311bf5ce896554b8189249f699313bb58443 /svx
parent: d633c02c1ecdf4148c81a6b41c769b72c52ea6b2 (diff)
svx: import processed PDF text
Some PDFs don't include spaces in the text. Instead, they rely on the explicit positioning of each character to render visually separated words. LaTeX seems to be prone to this approach, though not exclusively.

Luckily, PDFium does process text and inserts "generated" spaces where necessary, which is what we retrieve and use as the text string while importing.

Change-Id: Ic21fe6c8416ecaba66f06b6260f1d6b040ff12af
(cherry picked from commit da4b44d6afc01de4fb08251732ddcbdbd832b71f)
Diffstat (limited to 'svx')
-rw-r--r-- svx/source/svdraw/svdpdf.cxx 30
-rw-r--r-- svx/source/svdraw/svdpdf.hxx 12
2 files changed, 25 insertions, 17 deletions
diff --git a/svx/source/svdraw/svdpdf.cxx b/svx/source/svdraw/svdpdf.cxx
index ddb82cd9bf6b..a732989f092a 100644
--- a/svx/source/svdraw/svdpdf.cxx
+++ b/svx/source/svdraw/svdpdf.cxx
@@ -227,13 +227,18 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
<< ", height: " << dPageHeight);
SetupPageScale(dPageWidth, dPageHeight);
+ // Load the page text to extract it when we get text elements.
+ FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage);
+
const int nPageObjectCount = FPDFPage_CountObject(pPdfPage);
for (int nPageObjectIndex = 0; nPageObjectIndex < nPageObjectCount; ++nPageObjectIndex)
{
FPDF_PAGEOBJECT pPageObject = FPDFPage_GetObject(pPdfPage, nPageObjectIndex);
- ImportPdfObject(pPageObject, nPageObjectIndex);
+ ImportPdfObject(pPageObject, pTextPage, nPageObjectIndex);
}
+ FPDFText_ClosePage(pTextPage);
+
#if 0
// Now do the text.
FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage);
@@ -993,8 +998,8 @@ void ImpSdrPdfImport::checkClip()
}
bool ImpSdrPdfImport::isClip() const { return !maClip.getB2DRange().isEmpty(); }
-
-void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex)
+void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex)
{
if (pPageObject == nullptr)
return;
@@ -1003,7 +1008,7 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje
switch (nPageObjectType)
{
case FPDF_PAGEOBJ_TEXT:
- ImportText(pPageObject, nPageObjectIndex);
+ ImportText(pPageObject, pTextPage, nPageObjectIndex);
break;
case FPDF_PAGEOBJ_PATH:
ImportPath(pPageObject, nPageObjectIndex);
@@ -1015,7 +1020,7 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje
SAL_WARN("sd.filter", "Got page object SHADING: " << nPageObjectIndex);
break;
case FPDF_PAGEOBJ_FORM:
- ImportForm(pPageObject, nPageObjectIndex);
+ ImportForm(pPageObject, pTextPage, nPageObjectIndex);
break;
default:
SAL_WARN("sd.filter", "Unknown PDF page object #" << nPageObjectIndex
@@ -1024,7 +1029,8 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje
}
}
-void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex)
+void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex)
{
SAL_WARN("sd.filter", "Got page object FORM: " << nPageObjectIndex);
@@ -1039,14 +1045,15 @@ void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectInd
for (int nIndex = 0; nIndex < nCount; ++nIndex)
{
FPDF_PAGEOBJECT pFormObject = FPDFFormObj_GetSubObject(pPageObject, nIndex);
- ImportPdfObject(pFormObject, -1);
+ ImportPdfObject(pFormObject, pTextPage, -1);
}
// Restore the old one.
mCurMatrix = aOldMatrix;
}
-void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex)
+void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex)
{
SAL_WARN("sd.filter", "Got page object TEXT: " << nPageObjectIndex);
float left;
@@ -1078,14 +1085,15 @@ void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectInd
SAL_WARN("sd.filter", "Got TEXT origin: " << aPos);
SAL_WARN("sd.filter", "Got TEXT Bounds: " << aRect);
- const int nChars = FPDFTextObj_CountChars(pPageObject);
+ const int nChars = FPDFTextObj_CountChars(pPageObject) * 2;
std::unique_ptr<sal_Unicode[]> pText(new sal_Unicode[nChars + 1]); // + terminating null
unsigned short* pShortText = reinterpret_cast<unsigned short*>(pText.get());
- const int nActualChars = FPDFTextObj_GetText(pPageObject, 0, nChars, pShortText);
+ const int nActualChars
+ = FPDFTextObj_GetTextProcessed(pPageObject, pTextPage, 0, nChars, pShortText);
if (nActualChars <= 0)
{
- SAL_WARN("sd.filter", "Got not TEXT");
+ SAL_WARN("sd.filter", "Got no TEXT");
return;
}
diff --git a/svx/source/svdraw/svdpdf.hxx b/svx/source/svdraw/svdpdf.hxx
index d36c939be91a..4b2fba1bf8cd 100644
--- a/svx/source/svdraw/svdpdf.hxx
+++ b/svx/source/svdraw/svdpdf.hxx
@@ -42,6 +42,8 @@ class SdrObject;
class SvdProgressInfo;
typedef void* FPDF_DOCUMENT;
typedef void* FPDF_PAGEOBJECT; // (text, path, etc.)
+typedef void* FPDF_TEXTPAGE;
+
// Helper Class to import PDF
class ImpSdrPdfImport final
{
@@ -85,7 +87,6 @@ class ImpSdrPdfImport final
double d() const { return md; }
double e() const { return me; }
double f() const { return mf; }
-
/// Mutliply this * other.
void Concatinate(const Matrix& other)
{
@@ -155,7 +156,6 @@ class ImpSdrPdfImport final
/// Correct the vertical coordinate to start at the top.
/// PDF coordinate system has orign at the bottom right.
double correctVertOrigin(double offsetPts) const { return mdPageHeightPts - offsetPts; }
-
/// Convert PDF points to logic (twips).
tools::Rectangle PointsToLogic(double left, double right, double top, double bottom) const;
Point PointsToLogic(double x, double y) const;
@@ -164,11 +164,12 @@ class ImpSdrPdfImport final
void checkClip();
bool isClip() const;
- void ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
- void ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
+ void ImportPdfObject(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex);
+ void ImportForm(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, int nPageObjectIndex);
void ImportImage(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
void ImportPath(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
- void ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
+ void ImportText(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, int nPageObjectIndex);
void ImportText(const Point& rPos, const Size& rSize, const OUString& rStr);
void SetupPageScale(const double dPageWidth, const double dPageHeight);
@@ -192,7 +193,6 @@ public:
~ImpSdrPdfImport();
int GetPageCount() const { return mnPageCount; }
-
size_t DoImport(SdrObjList& rDestList, size_t nInsPos, int nPageNumber,
SvdProgressInfo* pProgrInfo = nullptr);
};