summaryrefslogtreecommitdiff
path: root/sdext/source
diff options
context:
space:
mode:
author: Caolán McNamara <caolan.mcnamara@collabora.com> 2023-09-26 15:52:17 +0100
committer: Caolán McNamara <caolan.mcnamara@collabora.com> 2023-09-28 10:14:49 +0200
commit: 9dd0af943df70d7797196ee8f9717596f28b1849 (patch)
tree: 8155140cfa293dfd2eaecd08422bf018981860e4 /sdext/source
parent: 25b8fdd3b939a221ba00ca37fbf89adaf893aab7 (diff)
cool#7307 short-circuit pdf parsing during detect if no 'AdditionalStreams'
looks to me that we ignore the contents of the AdditionalStream and re-parse to get it in the final importer, in which case we could presumably parse the mimetype in AdditionalStream here and drop the extraction of the stream.

Change-Id: I28e42c2b2fe8d4e10591e523260b08a0d0f7ca28
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/157287
Tested-by: Jenkins
Reviewed-by: Caolán McNamara <caolan.mcnamara@collabora.com>
Diffstat (limited to 'sdext/source')
-rw-r--r-- sdext/source/pdfimport/filterdet.cxx 39
1 files changed, 39 insertions, 0 deletions
diff --git a/sdext/source/pdfimport/filterdet.cxx b/sdext/source/pdfimport/filterdet.cxx
index ef29e8a2c022..5f6392e18983 100644
--- a/sdext/source/pdfimport/filterdet.cxx
+++ b/sdext/source/pdfimport/filterdet.cxx
@@ -36,6 +36,7 @@
#include <comphelper/hash.hxx>
#include <cppuhelper/supportsservice.hxx>
#include <comphelper/diagnose_ex.hxx>
+#include <tools/stream.hxx>
#include <memory>
#include <utility>
#include <string.h>
@@ -512,6 +513,40 @@ bool checkDocChecksum( const OUString& rInPDFFileURL,
&& (0 == memcmp(nChecksum.data(), nTestChecksum, nChecksum.size()));
}
+/* https://github.com/CollaboraOnline/online/issues/7307
+
+ Light-weight detection to determine if this is a hybrid
+ pdf document worth parsing to get its AdditionalStream
+ and mimetype.
+
+ How it works: opens rSysUPath read-only, seeks to 4096 bytes before
+ the size reported by remainingSize() (or to offset 0 when that size
+ is smaller than 4096), reads every remaining line, then walks those
+ lines from the last one backwards.
+
+ Returns true if a line starting with "/AdditionalStreams " (note the
+ trailing space) is met before a line that is exactly "trailer".
+ Returns false if "trailer" is met first, or if neither kind of line
+ occurs in the lines that were read.
+
+ getAdditionalStream() calls this first and returns early, without
+ invoking pdfparse::PDFReader::read, when the result is false.
+
+ TODO: a) do we really ignore the contents of the AdditionalStream
+ and re-parse to get it in the final importer?
+ b) in which case we could presumably parse the mimetype in
+ AdditionalStream here and drop the extraction of the stream.
+*/
+static bool detectHasAdditionalStreams(const OUString& rSysUPath)
+{
+ SvFileStream aHybridDetect(rSysUPath, StreamMode::READ);
+ std::vector<OString> aTrailingLines;
+ const sal_uInt64 nLen = aHybridDetect.remainingSize(); // presumably the full file length, as nothing has been read yet -- TODO confirm
+ aHybridDetect.Seek(nLen - std::min<sal_uInt64>(nLen, 4096)); // std::min caps the subtrahend at nLen so the unsigned subtraction cannot wrap
+ OString aLine;
+ while (aHybridDetect.ReadLine(aLine)) // the first line collected may be a fragment, since the seek can land mid-line
+ aTrailingLines.push_back(aLine);
+ bool bAdditionalStreams(false);
+ for (auto it = aTrailingLines.rbegin(); it != aTrailingLines.rend(); ++it) // scan backwards from the end of the file
+ {
+ if (*it == "trailer") // exact match only; reaching it first means no /AdditionalStreams line was seen after it
+ break;
+ if (it->startsWith("/AdditionalStreams ")) // prefix match at the very start of the line, including the trailing space
+ {
+ bAdditionalStreams = true;
+ break;
+ }
+ }
+ return bAdditionalStreams;
+}
+
+
uno::Reference< io::XStream > getAdditionalStream( const OUString& rInPDFFileURL,
OUString& rOutMimetype,
OUString& io_rPwd,
@@ -524,6 +559,10 @@ uno::Reference< io::XStream > getAdditionalStream( const OUString&
OUString aSysUPath;
if( osl_getSystemPathFromFileURL( rInPDFFileURL.pData, &aSysUPath.pData ) != osl_File_E_None )
return xEmbed;
+
+ if (!detectHasAdditionalStreams(aSysUPath))
+ return xEmbed;
+
aPDFFile = OUStringToOString( aSysUPath, osl_getThreadTextEncoding() );
std::unique_ptr<pdfparse::PDFEntry> pEntry( pdfparse::PDFReader::read( aPDFFile.getStr() ));