summaryrefslogtreecommitdiff
path: root/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2
blob: ab5564a8735341ef915db70dea83739886399f09 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
From 5f83d0a3fac4f8ccef457c03b74433ffd7b12e2a Mon Sep 17 00:00:00 2001
From: Ashod Nakashian <ashod.nakashian@collabora.co.uk>
Date: Tue, 5 Jun 2018 11:28:30 +0200
Subject: [PATCH 02/14] svx: more accurate PDF text importing

---
 pdfium/fpdfsdk/fpdf_editpage.cpp | 84 ++++++++++++++++++++++++++++++++++++++++
 pdfium/public/fpdf_edit.h        | 36 +++++++++++++++++
 2 files changed, 120 insertions(+)

diff --git a/pdfium/fpdfsdk/fpdf_editpage.cpp b/pdfium/fpdfsdk/fpdf_editpage.cpp
index 912df63..3244943 100644
--- a/pdfium/fpdfsdk/fpdf_editpage.cpp
+++ b/pdfium/fpdfsdk/fpdf_editpage.cpp
@@ -12,6 +12,7 @@
 #include <vector>
 
 #include "core/fpdfapi/edit/cpdf_pagecontentgenerator.h"
+#include "core/fpdfapi/font/cpdf_font.h"
 #include "core/fpdfapi/page/cpdf_form.h"
 #include "core/fpdfapi/page/cpdf_formobject.h"
 #include "core/fpdfapi/page/cpdf_imageobject.h"
@@ -626,6 +627,26 @@ FPDFPageObj_SetLineCap(FPDF_PAGEOBJECT page_object, int line_cap) {
   return true;
 }
 
+// Reports the character count of a text page object, as given by
+// CPDF_TextObject::CountChars(). A null handle yields 0.
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object) {
+  auto* text = static_cast<CPDF_TextObject*>(text_object);
+  if (text == nullptr)
+    return 0;
+  return text->CountChars();
+}
+
+// Reports the font size of a text page object, as given by GetFontSize().
+// A null handle yields 0.
+// NOTE(review): if GetFontSize() returns a fractional value, the int return
+// type truncates it -- confirm callers are fine with that.
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object) {
+  auto* text = static_cast<CPDF_TextObject*>(text_object);
+  return text ? text->GetFontSize() : 0;
+}
+
 FPDF_EXPORT void FPDF_CALLCONV
 FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
                       double* a,
@@ -642,3 +663,66 @@ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
   *c = matrix.c;
   *d = matrix.d;
 }
+
+// Returns the char code stored for the character at |index|, or 0 when
+// |text_object| is null or |index| lies outside [0, CountChars()).
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index) {
+  if (!text_object || index < 0)
+    return 0;
+  CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object);
+  // index == CountChars() is one past the last character, so reject with >=.
+  if (index >= pTxtObj->CountChars())
+    return 0;
+  CPDF_TextObjectItem info;
+  pTxtObj->GetCharInfo(index, &info);
+  return info.m_CharCode;
+}
+
+// Copies text from |text_object| into |result| as NUL-terminated UTF-16LE.
+// char_start - index of the first character to copy.
+// char_count - maximum number of characters to copy; |result| must have room
+//              for char_count + 1 16-bit units (the extra one is the NUL).
+// Returns the number of 16-bit units written, NUL included, or 0 when an
+// argument is invalid or |char_start| is at or past the end of the text.
+FPDF_EXPORT int FPDF_CALLCONV FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
+                                                  int char_start,
+                                                  int char_count,
+                                                  unsigned short* result) {
+  if (!text_object || char_start < 0 || char_count < 0 || !result)
+    return 0;
+
+  CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object);
+  int char_available = pTxtObj->CountChars() - char_start;
+  if (char_available <= 0)
+    return 0;
+
+  CPDF_Font* pFont = pTxtObj->GetFont();
+  WideString str;
+  for (uint32_t charcode : pTxtObj->GetCharCodes()) {
+    if (charcode != CPDF_Font::kInvalidCharCode)
+      str += pFont->UnicodeFromCharCode(charcode);
+  }
+
+  // Clamp the requested window to the decoded text. Copying the whole string
+  // regardless of |char_start| / |char_count| could overrun |result|.
+  const size_t start = static_cast<size_t>(char_start);
+  const size_t length = str.GetLength();
+  const size_t available = length > start ? length - start : 0;
+  const size_t wanted = std::min(static_cast<size_t>(char_count), available);
+  if (wanted == 0) {
+    // Writing out "", which has a character count of 1 due to the NUL.
+    *result = '\0';
+    return 1;
+  }
+  str = str.Mid(start, wanted);
+
+  // UTF16LE_Encode doesn't handle surrogate pairs properly, so it is expected
+  // the number of items to stay the same.
+  ByteString byte_str = str.UTF16LE_Encode();
+  size_t byte_str_len = byte_str.GetLength();
+  int ret_count = byte_str_len / sizeof(unsigned short);
+  ASSERT(ret_count <= char_count + 1);  // +1 to account for the NUL terminator.
+  memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len);
+  return ret_count;
+}
diff --git a/pdfium/public/fpdf_edit.h b/pdfium/public/fpdf_edit.h
index 3f45495..602849f 100644
--- a/pdfium/public/fpdf_edit.h
+++ b/pdfium/public/fpdf_edit.h
@@ -971,6 +971,26 @@ FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document,
                           FPDF_FONT font,
                           float font_size);
 
+// Get the number of characters in a text object.
+//
+// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
+//               or FPDFPageObj_NewTextObjEx.
+// Return Value:
+// The character count of the text object, or 0 if text_object is NULL.
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object);
+
+
+// Get the font size of a text object.
+//
+// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
+//               or FPDFPageObj_NewTextObjEx.
+//
+// Return Value:
+// The font size converted to int, or 0 if text_object is NULL.
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object);
+
 // Get the matrix of a particular text object.
 //
 // text_object - Handle of text object returned by FPDFPageObj_NewTextObj
@@ -986,6 +1006,22 @@ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
                       double* c,
                       double* d);
 
+// Get the character code of a specific character in a text object.
+// NOTE: the code is returned as stored; it is not mapped to Unicode.
+// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
+//               or FPDFPageObj_NewTextObjEx.
+// index - The index of the character to query.
+// Return Value:
+// The stored character code, or 0 if text_object is NULL or index < 0.
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index);
+
+// Write the text of text_object to result as NUL-terminated UTF-16LE.
+// Returns the 16-bit units written (NUL included), or 0 on invalid input.
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object, int char_start,
+                    int char_count, unsigned short* result);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-- 
2.16.3