diff options
author | Caolán McNamara <caolan.mcnamara@collabora.com> | 2023-09-26 15:52:17 +0100 |
---|---|---|
committer | Caolán McNamara <caolan.mcnamara@collabora.com> | 2023-09-28 10:14:49 +0200 |
commit | 9dd0af943df70d7797196ee8f9717596f28b1849 (patch) | |
tree | 8155140cfa293dfd2eaecd08422bf018981860e4 | |
parent | 25b8fdd3b939a221ba00ca37fbf89adaf893aab7 (diff) |
cool#7307 short-circuit pdf parsing during detect if no 'AdditionalStreams'
It looks to me as though we ignore the contents of the AdditionalStream and
re-parse to get it in the final importer. If so, we could
presumably parse the mimetype in AdditionalStream here and drop the
extraction of the stream.
Change-Id: I28e42c2b2fe8d4e10591e523260b08a0d0f7ca28
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/157287
Tested-by: Jenkins
Reviewed-by: Caolán McNamara <caolan.mcnamara@collabora.com>
-rw-r--r-- | sdext/source/pdfimport/filterdet.cxx | 39 |
1 file changed, 39 insertions, 0 deletions
```diff
diff --git a/sdext/source/pdfimport/filterdet.cxx b/sdext/source/pdfimport/filterdet.cxx
index ef29e8a2c022..5f6392e18983 100644
--- a/sdext/source/pdfimport/filterdet.cxx
+++ b/sdext/source/pdfimport/filterdet.cxx
@@ -36,6 +36,7 @@
 #include <comphelper/hash.hxx>
 #include <cppuhelper/supportsservice.hxx>
 #include <comphelper/diagnose_ex.hxx>
+#include <tools/stream.hxx>
 #include <memory>
 #include <utility>
 #include <string.h>
@@ -512,6 +513,40 @@ bool checkDocChecksum( const OUString& rInPDFFileURL,
         && (0 == memcmp(nChecksum.data(), nTestChecksum, nChecksum.size()));
 }
 
+/* https://github.com/CollaboraOnline/online/issues/7307
+
+   Light-weight detection to determine if this is a hybrid
+   pdf document worth parsing to get its AdditionalStream
+   and mimetype.
+
+   TODO: a) do we really ignore the contents of the AdditionalStream
+            and re-parse to get it in the final importer?
+         b) in which case we could presumably parse the mimetype in
+            AdditionalStream here and drop the extraction of the stream.
+*/
+static bool detectHasAdditionalStreams(const OUString& rSysUPath)
+{
+    SvFileStream aHybridDetect(rSysUPath, StreamMode::READ);
+    std::vector<OString> aTrailingLines;
+    const sal_uInt64 nLen = aHybridDetect.remainingSize();
+    aHybridDetect.Seek(nLen - std::min<sal_uInt64>(nLen, 4096));
+    OString aLine;
+    while (aHybridDetect.ReadLine(aLine))
+        aTrailingLines.push_back(aLine);
+    bool bAdditionalStreams(false);
+    for (auto it = aTrailingLines.rbegin(); it != aTrailingLines.rend(); ++it)
+    {
+        if (*it == "trailer")
+            break;
+        if (it->startsWith("/AdditionalStreams "))
+        {
+            bAdditionalStreams = true;
+            break;
+        }
+    }
+    return bAdditionalStreams;
+}
+
 uno::Reference< io::XStream > getAdditionalStream( const OUString& rInPDFFileURL,
                                                    OUString& rOutMimetype,
                                                    OUString& io_rPwd,
@@ -524,6 +559,10 @@ uno::Reference< io::XStream > getAdditionalStream( const OUString&
     OUString aSysUPath;
     if( osl_getSystemPathFromFileURL( rInPDFFileURL.pData, &aSysUPath.pData ) != osl_File_E_None )
         return xEmbed;
+
+    if (!detectHasAdditionalStreams(aSysUPath))
+        return xEmbed;
+
     aPDFFile = OUStringToOString( aSysUPath, osl_getThreadTextEncoding() );
 
     std::unique_ptr<pdfparse::PDFEntry> pEntry( pdfparse::PDFReader::read( aPDFFile.getStr() ));
```