cool#7307 short-circuit pdf parsing during detect if no 'AdditionalStreams'

It looks to me as though we ignore the contents of the AdditionalStream and re-parse to get it in the final importer, in which case we could presumably parse the mimetype in AdditionalStream here and drop the extraction of the stream.

Change-Id: I28e42c2b2fe8d4e10591e523260b08a0d0f7ca28
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/157287
Tested-by: Jenkins
Reviewed-by: Caolán McNamara <caolan.mcnamara@collabora.com>
author: Caolán McNamara <caolan.mcnamara@collabora.com> 2023-09-26 15:52:17 +0100
committer: Caolán McNamara <caolan.mcnamara@collabora.com> 2023-09-28 10:14:49 +0200
commit: 9dd0af943df70d7797196ee8f9717596f28b1849 (patch)
tree: 8155140cfa293dfd2eaecd08422bf018981860e4
parent: 25b8fdd3b939a221ba00ca37fbf89adaf893aab7 (diff)
1 files changed, 39 insertions, 0 deletions
diff --git a/sdext/source/pdfimport/filterdet.cxx b/sdext/source/pdfimport/filterdet.cxx
index ef29e8a2c022..5f6392e18983 100644
--- a/sdext/source/pdfimport/filterdet.cxx
+++ b/sdext/source/pdfimport/filterdet.cxx
@@ -36,6 +36,7 @@
 #include <comphelper/hash.hxx>
 #include <cppuhelper/supportsservice.hxx>
 #include <comphelper/diagnose_ex.hxx>
+#include <tools/stream.hxx>
 #include <memory>
 #include <utility>
 #include <string.h>
@@ -512,6 +513,40 @@ bool checkDocChecksum( const OUString& rInPDFFileURL,
         && (0 == memcmp(nChecksum.data(), nTestChecksum, nChecksum.size()));
 }
 
+/* https://github.com/CollaboraOnline/online/issues/7307
+
+   Light-weight detection to determine if this is a hybrid
+   pdf document worth parsing to get its AdditionalStream
+   and mimetype.
+
+   TODO: a) do we really ignore the contents of the AdditionalStream
+   and re-parse to get it in the final importer?
+         b) in which case we could presumably parse the mimetype in
+   AdditionalStream here and drop the extraction of the stream.
+*/
+static bool detectHasAdditionalStreams(const OUString& rSysUPath)
+{
+    // Position the stream so that at most the last 4096 remaining bytes are read.
+    SvFileStream aTail(rSysUPath, StreamMode::READ);
+    const sal_uInt64 nRemaining = aTail.remainingSize();
+    const sal_uInt64 nWindow = std::min<sal_uInt64>(nRemaining, 4096);
+    aTail.Seek(nRemaining - nWindow);
+    // Gather every line from the seek position up to the end of the stream.
+    std::vector<OString> aLines;
+    for (OString aCurrent; aTail.ReadLine(aCurrent);)
+        aLines.push_back(aCurrent);
+    // Scan from the last line backwards, giving up at the first "trailer" line.
+    for (auto it = aLines.rbegin(); it != aLines.rend(); ++it)
+    {
+        if (it->startsWith("/AdditionalStreams "))
+            return true;
+        if (*it == "trailer")
+            return false;
+    }
+    // No "/AdditionalStreams " line was seen in the scanned lines.
+    return false;
+}
+
 uno::Reference< io::XStream > getAdditionalStream( const OUString&                          rInPDFFileURL,
                                                    OUString&                                rOutMimetype,
                                                    OUString&                                io_rPwd,
@@ -524,6 +559,10 @@ uno::Reference< io::XStream > getAdditionalStream( const OUString&
     OUString aSysUPath;
     if( osl_getSystemPathFromFileURL( rInPDFFileURL.pData, &aSysUPath.pData ) != osl_File_E_None )
         return xEmbed;
+
+    if (!detectHasAdditionalStreams(aSysUPath))
+        return xEmbed;
+
     aPDFFile = OUStringToOString( aSysUPath, osl_getThreadTextEncoding() );
 
     std::unique_ptr<pdfparse::PDFEntry> pEntry( pdfparse::PDFReader::read( aPDFFile.getStr() ));
author	Caolán McNamara <caolan.mcnamara@collabora.com>	2023-09-26 15:52:17 +0100
committer	Caolán McNamara <caolan.mcnamara@collabora.com>	2023-09-28 10:14:49 +0200
commit	9dd0af943df70d7797196ee8f9717596f28b1849 (patch)
tree	8155140cfa293dfd2eaecd08422bf018981860e4
parent	25b8fdd3b939a221ba00ca37fbf89adaf893aab7 (diff)