diff options
-rw-r--r-- | xmlsecurity/inc/pdfio/pdfdocument.hxx | 50 | ||||
-rw-r--r-- | xmlsecurity/qa/unit/pdfsigning/data/pdf16adobe.pdf | bin | 0 -> 81882 bytes | |||
-rw-r--r-- | xmlsecurity/qa/unit/pdfsigning/pdfsigning.cxx | 14 | ||||
-rw-r--r-- | xmlsecurity/source/pdfio/pdfdocument.cxx | 439 |
4 files changed, 382 insertions, 121 deletions
diff --git a/xmlsecurity/inc/pdfio/pdfdocument.hxx b/xmlsecurity/inc/pdfio/pdfdocument.hxx index 95663e6c190b..37457c024d42 100644 --- a/xmlsecurity/inc/pdfio/pdfdocument.hxx +++ b/xmlsecurity/inc/pdfio/pdfdocument.hxx @@ -45,7 +45,40 @@ enum class TokenizeMode /// Till the first %%EOF token. EOF_TOKEN, /// Till the end of the current object. - END_OF_OBJECT + END_OF_OBJECT, + /// Same as END_OF_OBJECT, but for object streams (no endobj keyword). + STORED_OBJECT +}; + +/// The type column of an entry in a cross-reference stream. +enum class XRefEntryType +{ + /// xref "n" or xref stream "1". + NOT_COMPRESSED, + /// xref stream "2". + COMPRESSED +}; + +/// An entry in a cross-reference stream. +struct XRefEntry +{ + XRefEntryType m_eType; + /** + * Non-compressed: The byte offset of the object, starting from the + * beginning of the file. + * Compressed: The object number of the object stream in which this object is + * stored. + */ + sal_uInt64 m_nOffset; + /** + * Non-compressed: The generation number of the object. + * Compressed: The index of this object within the object stream. + */ + sal_uInt64 m_nGenerationNumber; + /// Are changed as part of an incremental update? + bool m_bDirty; + + XRefEntry(); }; /** @@ -60,9 +93,7 @@ class XMLSECURITY_DLLPUBLIC PDFDocument /// This vector owns all elements. std::vector< std::unique_ptr<PDFElement> > m_aElements; /// Object ID <-> object offset map. - std::map<size_t, size_t> m_aXRef; - /// Object ID <-> "are changed as part of an incremental update?" map. - std::map<size_t, bool> m_aXRefDirty; + std::map<size_t, XRefEntry> m_aXRef; /// Object offset <-> Object pointer map. std::map<size_t, PDFObjectElement*> m_aOffsetObjects; /// Object ID <-> Object pointer map. @@ -80,8 +111,6 @@ class XMLSECURITY_DLLPUBLIC PDFDocument static int AsHex(char ch); /// Decode a hex dump. static std::vector<unsigned char> DecodeHexString(PDFHexStringElement* pElement); - /// Tokenize elements from current offset. 
- bool Tokenize(SvStream& rStream, TokenizeMode eMode); public: PDFDocument(); @@ -99,7 +128,14 @@ public: std::vector<PDFObjectElement*> GetPages(); /// Remember the end location of an EOF token. void PushBackEOF(size_t nOffset); - const std::map<size_t, PDFObjectElement*>& GetIDObjects() const; + /// Look up object based on object number, possibly by parsing object streams. + PDFObjectElement* LookupObject(size_t nObjectNumber); + /// Access to the input document, even after the input stream is gone. + SvMemoryStream& GetEditBuffer(); + /// Tokenize elements from current offset. + bool Tokenize(SvStream& rStream, TokenizeMode eMode, std::vector< std::unique_ptr<PDFElement> >& rElements, PDFObjectElement* pObject); + /// Register an object (owned directly or indirectly by m_aElements) as a provider for a given ID. + void SetIDObject(size_t nID, PDFObjectElement* pObject); /// Read elements from the start of the stream till its end. bool Read(SvStream& rStream); diff --git a/xmlsecurity/qa/unit/pdfsigning/data/pdf16adobe.pdf b/xmlsecurity/qa/unit/pdfsigning/data/pdf16adobe.pdf Binary files differ new file mode 100644 index 000000000000..ac1c5f37b972 --- /dev/null +++ b/xmlsecurity/qa/unit/pdfsigning/data/pdf16adobe.pdf diff --git a/xmlsecurity/qa/unit/pdfsigning/pdfsigning.cxx b/xmlsecurity/qa/unit/pdfsigning/pdfsigning.cxx index 469ded6978c6..2f7ef572c581 100644 --- a/xmlsecurity/qa/unit/pdfsigning/pdfsigning.cxx +++ b/xmlsecurity/qa/unit/pdfsigning/pdfsigning.cxx @@ -57,6 +57,8 @@ public: void testPDFRemoveAll(); /// Test a PDF 1.4 document, signed by Adobe. void testPDF14Adobe(); + /// Test a PDF 1.6 document, signed by Adobe. 
+ void testPDF16Adobe(); CPPUNIT_TEST_SUITE(PDFSigningTest); CPPUNIT_TEST(testPDFAdd); @@ -64,6 +66,7 @@ public: CPPUNIT_TEST(testPDFRemove); CPPUNIT_TEST(testPDFRemoveAll); CPPUNIT_TEST(testPDF14Adobe); + CPPUNIT_TEST(testPDF16Adobe); CPPUNIT_TEST_SUITE_END(); }; @@ -254,6 +257,17 @@ void PDFSigningTest::testPDF14Adobe() #endif } +void PDFSigningTest::testPDF16Adobe() +{ +#ifndef _WIN32 + // Contains a cross-reference stream, object streams and a compressed + // stream with a predictor. And a valid signature. + // Found signatures was 0, as parsing failed due to lack of support for + // these features. + verify(m_directories.getURLFromSrc(DATA_DIRECTORY) + "pdf16adobe.pdf", 1); +#endif +} + CPPUNIT_TEST_SUITE_REGISTRATION(PDFSigningTest); CPPUNIT_PLUGIN_IMPLEMENT(); diff --git a/xmlsecurity/source/pdfio/pdfdocument.cxx b/xmlsecurity/source/pdfio/pdfdocument.cxx index b690b5d62f77..894247f8f202 100644 --- a/xmlsecurity/source/pdfio/pdfdocument.cxx +++ b/xmlsecurity/source/pdfio/pdfdocument.cxx @@ -78,6 +78,7 @@ public: class PDFReferenceElement; class PDFDictionaryElement; class PDFArrayElement; +class PDFStreamElement; /// Indirect object: something with a unique ID. class PDFObjectElement : public PDFElement @@ -93,6 +94,12 @@ class PDFObjectElement : public PDFElement PDFDictionaryElement* m_pDictionaryElement; /// The contained direct array, if any. PDFArrayElement* m_pArrayElement; + /// The stream of this object, used when this is an object stream. + PDFStreamElement* m_pStreamElement; + /// Objects of an object stream. + std::vector< std::unique_ptr<PDFObjectElement> > m_aStoredElements; + /// Elements of an object in an object stream. 
+ std::vector< std::unique_ptr<PDFElement> > m_aElements; public: PDFObjectElement(PDFDocument& rDoc, double fObjectValue, double fGenerationValue); @@ -107,7 +114,11 @@ public: PDFDictionaryElement* GetDictionary() const; void SetDictionary(PDFDictionaryElement* pDictionaryElement); void SetArray(PDFArrayElement* pArrayElement); + void SetStream(PDFStreamElement* pStreamElement); PDFArrayElement* GetArray() const; + /// Parse objects stored in this object stream. + void ParseStoredObjects(); + std::vector< std::unique_ptr<PDFElement> >& GetStoredElements(); }; /// Dictionary object: a set key-value pairs. @@ -175,7 +186,7 @@ public: /// Assuming the reference points to a number object, return its value. double LookupNumber(SvStream& rStream) const; /// Lookup referenced object, without assuming anything about its contents. - PDFObjectElement* LookupObject() const; + PDFObjectElement* LookupObject(); int GetObjectValue() const; int GetGenerationValue() const; }; @@ -275,6 +286,14 @@ public: PDFElement* Lookup(const OString& rDictionaryKey); }; +XRefEntry::XRefEntry() + : m_eType(XRefEntryType::NOT_COMPRESSED), + m_nOffset(0), + m_nGenerationNumber(0), + m_bDirty(false) +{ +} + PDFDocument::PDFDocument() : m_pTrailer(nullptr), m_pXRefStream(nullptr) @@ -315,14 +334,15 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat // Write signature object. 
sal_Int32 nSignatureId = m_aXRef.size(); - sal_uInt64 nSignatureOffset = m_aEditBuffer.Tell(); - m_aXRef[nSignatureId] = nSignatureOffset; - m_aXRefDirty[nSignatureId] = true; + XRefEntry aSignatureEntry; + aSignatureEntry.m_nOffset = m_aEditBuffer.Tell(); + aSignatureEntry.m_bDirty = true; + m_aXRef[nSignatureId] = aSignatureEntry; OStringBuffer aSigBuffer; aSigBuffer.append(nSignatureId); aSigBuffer.append(" 0 obj\n"); aSigBuffer.append("<</Contents <"); - sal_Int64 nSignatureContentOffset = nSignatureOffset + aSigBuffer.getLength(); + sal_Int64 nSignatureContentOffset = aSignatureEntry.m_nOffset + aSigBuffer.getLength(); // Reserve space for the PKCS#7 object. const int MAX_SIGNATURE_CONTENT_LENGTH = 50000; OStringBuffer aContentFiller(MAX_SIGNATURE_CONTENT_LENGTH); @@ -337,7 +357,7 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat aSigBuffer.append(" "); aSigBuffer.append(nSignatureContentOffset + MAX_SIGNATURE_CONTENT_LENGTH + 1); aSigBuffer.append(" "); - sal_uInt64 nSignatureLastByteRangeOffset = nSignatureOffset + aSigBuffer.getLength(); + sal_uInt64 nSignatureLastByteRangeOffset = aSignatureEntry.m_nOffset + aSigBuffer.getLength(); // We don't know how many bytes we need for the last ByteRange value, this // should be enough. OStringBuffer aByteRangeFiller; @@ -358,8 +378,10 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat // Write appearance object. 
sal_Int32 nAppearanceId = m_aXRef.size(); - m_aXRef[nAppearanceId] = m_aEditBuffer.Tell(); - m_aXRefDirty[nAppearanceId] = true; + XRefEntry aAppearanceEntry; + aAppearanceEntry.m_nOffset = m_aEditBuffer.Tell(); + aAppearanceEntry.m_bDirty = true; + m_aXRef[nAppearanceId] = aAppearanceEntry; m_aEditBuffer.WriteUInt32AsString(nAppearanceId); m_aEditBuffer.WriteCharPtr(" 0 obj\n"); m_aEditBuffer.WriteCharPtr("<</Type/XObject\n/Subtype/Form\n"); @@ -368,8 +390,10 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat // Write the Annot object, references nSignatureId and nAppearanceId. sal_Int32 nAnnotId = m_aXRef.size(); - m_aXRef[nAnnotId] = m_aEditBuffer.Tell(); - m_aXRefDirty[nAnnotId] = true; + XRefEntry aAnnotEntry; + aAnnotEntry.m_nOffset = m_aEditBuffer.Tell(); + aAnnotEntry.m_bDirty = true; + m_aXRef[nAnnotId] = aAnnotEntry; m_aEditBuffer.WriteUInt32AsString(nAnnotId); m_aEditBuffer.WriteCharPtr(" 0 obj\n"); m_aEditBuffer.WriteCharPtr("<</Type/Annot/Subtype/Widget/F 132\n"); @@ -406,8 +430,8 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Sign: invalid first page obj id"); return false; } - m_aXRef[nFirstPageId] = m_aEditBuffer.Tell(); - m_aXRefDirty[nFirstPageId] = true; + m_aXRef[nFirstPageId].m_nOffset = m_aEditBuffer.Tell(); + m_aXRef[nFirstPageId].m_bDirty = true; m_aEditBuffer.WriteUInt32AsString(nFirstPageId); m_aEditBuffer.WriteCharPtr(" 0 obj\n"); m_aEditBuffer.WriteCharPtr("<<"); @@ -459,8 +483,8 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Sign: invalid catalog obj id"); return false; } - m_aXRef[nCatalogId] = m_aEditBuffer.Tell(); - m_aXRefDirty[nCatalogId] = true; + m_aXRef[nCatalogId].m_nOffset = m_aEditBuffer.Tell(); + m_aXRef[nCatalogId].m_bDirty = true; m_aEditBuffer.WriteUInt32AsString(nCatalogId); m_aEditBuffer.WriteCharPtr(" 0 obj\n"); 
m_aEditBuffer.WriteCharPtr("<<"); @@ -510,8 +534,8 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat for (const auto& rXRef : m_aXRef) { size_t nObject = rXRef.first; - size_t nOffset = rXRef.second; - if (!m_aXRefDirty[nObject]) + size_t nOffset = rXRef.second.m_nOffset; + if (!rXRef.second.m_bDirty) continue; m_aEditBuffer.WriteUInt32AsString(nObject); @@ -632,13 +656,13 @@ bool PDFDocument::Write(SvStream& rStream) return rStream.good(); } -bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) +bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode, std::vector< std::unique_ptr<PDFElement> >& rElements, PDFObjectElement* pObjectElement) { + // Last seen object token. + PDFObjectElement* pObject = pObjectElement; bool bInXRef = false; // The next number will be an xref offset. bool bInStartXRef = false; - // Last seen object token. - PDFObjectElement* pObject = nullptr; // Dictionary depth, so we know when we're outside any dictionaries. int nDictionaryDepth = 0; // Last seen array token that's outside any dictionaries. 
@@ -655,9 +679,9 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) case '%': { auto pComment = new PDFCommentElement(*this); - m_aElements.push_back(std::unique_ptr<PDFElement>(pComment)); + rElements.push_back(std::unique_ptr<PDFElement>(pComment)); rStream.SeekRel(-1); - if (!m_aElements.back()->Read(rStream)) + if (!rElements.back()->Read(rStream)) return false; if (eMode == TokenizeMode::EOF_TOKEN && !m_aEOFs.empty() && m_aEOFs.back() == rStream.Tell()) { @@ -673,28 +697,28 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) rStream.SeekRel(-2); if (ch == '<') { - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFDictionaryElement())); + rElements.push_back(std::unique_ptr<PDFElement>(new PDFDictionaryElement())); ++nDictionaryDepth; } else - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFHexStringElement())); - if (!m_aElements.back()->Read(rStream)) + rElements.push_back(std::unique_ptr<PDFElement>(new PDFHexStringElement())); + if (!rElements.back()->Read(rStream)) return false; break; } case '>': { - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFEndDictionaryElement())); + rElements.push_back(std::unique_ptr<PDFElement>(new PDFEndDictionaryElement())); --nDictionaryDepth; rStream.SeekRel(-1); - if (!m_aElements.back()->Read(rStream)) + if (!rElements.back()->Read(rStream)) return false; break; } case '[': { auto pArr = new PDFArrayElement(); - m_aElements.push_back(std::unique_ptr<PDFElement>(pArr)); + rElements.push_back(std::unique_ptr<PDFElement>(pArr)); if (nDictionaryDepth == 0) { // The array is attached directly, inform the object. 
@@ -703,32 +727,32 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) pObject->SetArray(pArray); } rStream.SeekRel(-1); - if (!m_aElements.back()->Read(rStream)) + if (!rElements.back()->Read(rStream)) return false; break; } case ']': { - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFEndArrayElement())); + rElements.push_back(std::unique_ptr<PDFElement>(new PDFEndArrayElement())); pArray = nullptr; rStream.SeekRel(-1); - if (!m_aElements.back()->Read(rStream)) + if (!rElements.back()->Read(rStream)) return false; break; } case '/': { - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFNameElement())); + rElements.push_back(std::unique_ptr<PDFElement>(new PDFNameElement())); rStream.SeekRel(-1); - if (!m_aElements.back()->Read(rStream)) + if (!rElements.back()->Read(rStream)) return false; break; } case '(': { - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFLiteralStringElement())); + rElements.push_back(std::unique_ptr<PDFElement>(new PDFLiteralStringElement())); rStream.SeekRel(-1); - if (!m_aElements.back()->Read(rStream)) + if (!rElements.back()->Read(rStream)) return false; break; } @@ -738,7 +762,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) { // Numbering object: an integer or a real. 
PDFNumberElement* pNumberElement = new PDFNumberElement(); - m_aElements.push_back(std::unique_ptr<PDFElement>(pNumberElement)); + rElements.push_back(std::unique_ptr<PDFElement>(pNumberElement)); rStream.SeekRel(-1); if (!pNumberElement->Read(rStream)) return false; @@ -761,15 +785,15 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) bool bObj = aKeyword == "obj"; if (bObj || aKeyword == "R") { - size_t nElements = m_aElements.size(); + size_t nElements = rElements.size(); if (nElements < 2) { SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Tokenize: expected at least two tokens before 'obj' or 'R' keyword"); return false; } - auto pObjectNumber = dynamic_cast<PDFNumberElement*>(m_aElements[nElements - 2].get()); - auto pGenerationNumber = dynamic_cast<PDFNumberElement*>(m_aElements[nElements - 1].get()); + auto pObjectNumber = dynamic_cast<PDFNumberElement*>(rElements[nElements - 2].get()); + auto pGenerationNumber = dynamic_cast<PDFNumberElement*>(rElements[nElements - 1].get()); if (!pObjectNumber || !pGenerationNumber) { SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Tokenize: missing object or generation number before 'obj' or 'R' keyword"); @@ -779,34 +803,34 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) if (bObj) { pObject = new PDFObjectElement(*this, pObjectNumber->GetValue(), pGenerationNumber->GetValue()); - m_aElements.push_back(std::unique_ptr<PDFElement>(pObject)); + rElements.push_back(std::unique_ptr<PDFElement>(pObject)); m_aOffsetObjects[pObjectNumber->GetLocation()] = pObject; m_aIDObjects[pObjectNumber->GetValue()] = pObject; } else { - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFReferenceElement(*this, pObjectNumber->GetValue(), pGenerationNumber->GetValue()))); + rElements.push_back(std::unique_ptr<PDFElement>(new PDFReferenceElement(*this, pObjectNumber->GetValue(), pGenerationNumber->GetValue()))); if (pArray) // Reference is part of a direct (non-dictionary) array, inform the array. 
- pArray->PushBack(m_aElements.back().get()); + pArray->PushBack(rElements.back().get()); } - if (!m_aElements.back()->Read(rStream)) + if (!rElements.back()->Read(rStream)) return false; } else if (aKeyword == "stream") { // Look up the length of the stream from the parent object's dictionary. size_t nLength = 0; - for (size_t nElement = 0; nElement < m_aElements.size(); ++nElement) + for (size_t nElement = 0; nElement < rElements.size(); ++nElement) { // Iterate in reverse order. - size_t nIndex = m_aElements.size() - nElement - 1; - PDFElement* pElement = m_aElements[nIndex].get(); - auto pObjectElement = dynamic_cast<PDFObjectElement*>(pElement); - if (!pObjectElement) + size_t nIndex = rElements.size() - nElement - 1; + PDFElement* pElement = rElements[nIndex].get(); + auto pObj = dynamic_cast<PDFObjectElement*>(pElement); + if (!pObj) continue; - PDFElement* pLookup = pObjectElement->Lookup("Length"); + PDFElement* pLookup = pObj->Lookup("Length"); auto pReference = dynamic_cast<PDFReferenceElement*>(pLookup); if (pReference) { @@ -828,20 +852,23 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) } PDFDocument::SkipLineBreaks(rStream); - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFStreamElement(nLength))); - if (!m_aElements.back()->Read(rStream)) + auto pStreamElement = new PDFStreamElement(nLength); + if (pObject) + pObject->SetStream(pStreamElement); + rElements.push_back(std::unique_ptr<PDFElement>(pStreamElement)); + if (!rElements.back()->Read(rStream)) return false; } else if (aKeyword == "endstream") { - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFEndStreamElement())); - if (!m_aElements.back()->Read(rStream)) + rElements.push_back(std::unique_ptr<PDFElement>(new PDFEndStreamElement())); + if (!rElements.back()->Read(rStream)) return false; } else if (aKeyword == "endobj") { - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFEndObjectElement())); - if (!m_aElements.back()->Read(rStream)) + 
rElements.push_back(std::unique_ptr<PDFElement>(new PDFEndObjectElement())); + if (!rElements.back()->Read(rStream)) return false; if (eMode == TokenizeMode::END_OF_OBJECT) { @@ -850,9 +877,9 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) } } else if (aKeyword == "true" || aKeyword == "false") - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFBooleanElement(aKeyword.toBoolean()))); + rElements.push_back(std::unique_ptr<PDFElement>(new PDFBooleanElement(aKeyword.toBoolean()))); else if (aKeyword == "null") - m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFNullElement())); + rElements.push_back(std::unique_ptr<PDFElement>(new PDFNullElement())); else if (aKeyword == "xref") // Allow 'f' and 'n' keywords. bInXRef = true; @@ -862,7 +889,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) else if (aKeyword == "trailer") { m_pTrailer = new PDFTrailerElement(*this); - m_aElements.push_back(std::unique_ptr<PDFElement>(m_pTrailer)); + rElements.push_back(std::unique_ptr<PDFElement>(m_pTrailer)); } else if (aKeyword == "startxref") { @@ -890,6 +917,11 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) return true; } +void PDFDocument::SetIDObject(size_t nID, PDFObjectElement* pObject) +{ + m_aIDObjects[nID] = pObject; +} + bool PDFDocument::Read(SvStream& rStream) { // Check file magic. 
@@ -917,16 +949,30 @@ bool PDFDocument::Read(SvStream& rStream) while (true) { rStream.Seek(nStartXRef); - ReadXRef(rStream); - if (!Tokenize(rStream, TokenizeMode::EOF_TOKEN)) + OString aKeyword = ReadKeyword(rStream); + if (aKeyword.isEmpty()) + ReadXRefStream(rStream); + + else { - SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Read: failed to tokenizer trailer after xref"); - return false; + if (aKeyword != "xref") + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Read: xref is not the first keyword"); + return false; + } + ReadXRef(rStream); + if (!Tokenize(rStream, TokenizeMode::EOF_TOKEN, m_aElements, nullptr)) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Read: failed to tokenizer trailer after xref"); + return false; + } } PDFNumberElement* pPrev = nullptr; if (m_pTrailer) pPrev = dynamic_cast<PDFNumberElement*>(m_pTrailer->Lookup("Prev")); + else if (m_pXRefStream) + pPrev = dynamic_cast<PDFNumberElement*>(m_pXRefStream->Lookup("Prev")); if (pPrev) nStartXRef = pPrev->GetValue(); @@ -942,7 +988,7 @@ bool PDFDocument::Read(SvStream& rStream) // Then we can tokenize the stream. rStream.Seek(0); - return Tokenize(rStream, TokenizeMode::END_OF_STREAM); + return Tokenize(rStream, TokenizeMode::END_OF_STREAM, m_aElements, nullptr); } OString PDFDocument::ReadKeyword(SvStream& rStream) @@ -997,7 +1043,7 @@ size_t PDFDocument::FindStartXRef(SvStream& rStream) void PDFDocument::ReadXRefStream(SvStream& rStream) { // Look up the stream length in the object dictionary. - if (!Tokenize(rStream, TokenizeMode::END_OF_OBJECT)) + if (!Tokenize(rStream, TokenizeMode::END_OF_OBJECT, m_aElements, nullptr)) { SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: failed to read object"); return; @@ -1024,6 +1070,9 @@ void PDFDocument::ReadXRefStream(SvStream& rStream) return; } + // So that the Prev key can be looked up later. 
+ m_pXRefStream = pObject; + PDFElement* pLookup = pObject->Lookup("Length"); auto pNumber = dynamic_cast<PDFNumberElement*>(pLookup); if (!pNumber) @@ -1095,25 +1144,37 @@ void PDFDocument::ReadXRefStream(SvStream& rStream) // Look up the first and the last entry we need to read. auto pIndex = dynamic_cast<PDFArrayElement*>(pObject->Lookup("Index")); + size_t nFirstObject = 0; + size_t nNumberOfObjects = 0; if (!pIndex || pIndex->GetElements().size() < 2) { - SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index not found or has < 2 elements"); - return; + auto pSize = dynamic_cast<PDFNumberElement*>(pObject->Lookup("Size")); + if (pSize) + nNumberOfObjects = pSize->GetValue(); + else + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index not found or has < 2 elements"); + return; + } } - - const std::vector<PDFElement*>& rIndexElements = pIndex->GetElements(); - auto pFirstObject = dynamic_cast<PDFNumberElement*>(rIndexElements[0]); - if (!pFirstObject) + else { - SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index has no first object"); - return; - } + const std::vector<PDFElement*>& rIndexElements = pIndex->GetElements(); + auto pFirstObject = dynamic_cast<PDFNumberElement*>(rIndexElements[0]); + if (!pFirstObject) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index has no first object"); + return; + } + nFirstObject = pFirstObject->GetValue(); - auto pNumberOfObjects = dynamic_cast<PDFNumberElement*>(rIndexElements[1]); - if (!pNumberOfObjects) - { - SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index has no number of objects"); - return; + auto pNumberOfObjects = dynamic_cast<PDFNumberElement*>(rIndexElements[1]); + if (!pNumberOfObjects) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index has no number of objects"); + return; + } + nNumberOfObjects = pNumberOfObjects->GetValue(); } // Look up the format of a single entry. 
@@ -1145,15 +1206,14 @@ void PDFDocument::ReadXRefStream(SvStream& rStream) return; } - size_t nSize = pNumberOfObjects->GetValue(); aStream.Seek(0); // This is the line as read from the stream. std::vector<unsigned char> aOrigLine(nLineLength); // This is the line as it appears after tweaking according to nPredictor. std::vector<unsigned char> aFilteredLine(nLineLength); - for (size_t nEntry = 0; nEntry < nSize; ++nEntry) + for (size_t nEntry = 0; nEntry < nNumberOfObjects; ++nEntry) { - size_t nIndex = pFirstObject->GetValue() + nEntry; + size_t nIndex = nFirstObject + nEntry; aStream.ReadBytes(aOrigLine.data(), aOrigLine.size()); if (aOrigLine[0] + 10 != nPredictor) @@ -1210,12 +1270,15 @@ void PDFDocument::ReadXRefStream(SvStream& rStream) } // "n" entry of the xref table - if (nType == 1) + if (nType == 1 || nType == 2) { if (m_aXRef.find(nIndex) == m_aXRef.end()) { - m_aXRef[nIndex] = nStreamOffset; - m_aXRefDirty[nIndex] = false; + XRefEntry aEntry; + aEntry.m_eType = nType == 1 ? 
XRefEntryType::NOT_COMPRESSED : XRefEntryType::COMPRESSED; + aEntry.m_nOffset = nStreamOffset; + aEntry.m_nGenerationNumber = nGenerationNumber; + m_aXRef[nIndex] = aEntry; } } } @@ -1223,19 +1286,6 @@ void PDFDocument::ReadXRefStream(SvStream& rStream) void PDFDocument::ReadXRef(SvStream& rStream) { - OString aKeyword = ReadKeyword(rStream); - if (aKeyword.isEmpty()) - { - ReadXRefStream(rStream); - return; - } - - if (aKeyword != "xref") - { - SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRef: xref is not the first keyword"); - return; - } - PDFDocument::SkipWhitespace(rStream); while (true) @@ -1288,7 +1338,7 @@ void PDFDocument::ReadXRef(SvStream& rStream) } PDFDocument::SkipWhitespace(rStream); - aKeyword = ReadKeyword(rStream); + OString aKeyword = ReadKeyword(rStream); if (aKeyword != "f" && aKeyword != "n") { SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRef: unexpected keyword"); @@ -1298,9 +1348,13 @@ void PDFDocument::ReadXRef(SvStream& rStream) // offset with an older one. if (m_aXRef.find(nIndex) == m_aXRef.end()) { - m_aXRef[nIndex] = aOffset.GetValue(); + XRefEntry aEntry; + aEntry.m_nOffset = aOffset.GetValue(); + aEntry.m_nGenerationNumber = aGenerationNumber.GetValue(); // Initially only the first entry is dirty. 
- m_aXRefDirty[nIndex] = nIndex == 0; + if (nIndex == 0) + aEntry.m_bDirty = true; + m_aXRef[nIndex] = aEntry; } PDFDocument::SkipWhitespace(rStream); } @@ -1346,13 +1400,13 @@ void PDFDocument::SkipLineBreaks(SvStream& rStream) size_t PDFDocument::GetObjectOffset(size_t nIndex) const { auto it = m_aXRef.find(nIndex); - if (it == m_aXRef.end()) + if (it == m_aXRef.end() || it->second.m_eType == XRefEntryType::COMPRESSED) { SAL_WARN("xmlsecurity.pdfio", "PDFDocument::GetObjectOffset: wanted to look up index #" << nIndex << ", but failed"); return 0; } - return it->second; + return it->second.m_nOffset; } const std::vector< std::unique_ptr<PDFElement> >& PDFDocument::GetElements() @@ -1360,11 +1414,6 @@ const std::vector< std::unique_ptr<PDFElement> >& PDFDocument::GetElements() return m_aElements; } -const std::map<size_t, PDFObjectElement*>& PDFDocument::GetIDObjects() const -{ - return m_aIDObjects; -} - std::vector<PDFObjectElement*> PDFDocument::GetPages() { std::vector<PDFObjectElement*> aRet; @@ -2011,7 +2060,8 @@ PDFObjectElement::PDFObjectElement(PDFDocument& rDoc, double fObjectValue, doubl m_nDictionaryOffset(0), m_nDictionaryLength(0), m_pDictionaryElement(nullptr), - m_pArrayElement(nullptr) + m_pArrayElement(nullptr), + m_pStreamElement(nullptr) { } @@ -2236,7 +2286,14 @@ PDFElement* PDFDictionaryElement::Lookup(const std::map<OString, PDFElement*>& r PDFElement* PDFObjectElement::Lookup(const OString& rDictionaryKey) { if (m_aDictionary.empty()) - PDFDictionaryElement::Parse(m_rDoc.GetElements(), this, m_aDictionary); + { + if (!m_aElements.empty()) + // This is a stored object in an object stream. + PDFDictionaryElement::Parse(m_aElements, this, m_aDictionary); + else + // Normal object: elements are stored as members of the document itself. 
+ PDFDictionaryElement::Parse(m_rDoc.GetElements(), this, m_aDictionary); + } return PDFDictionaryElement::Lookup(m_aDictionary, rDictionaryKey); } @@ -2332,11 +2389,139 @@ void PDFObjectElement::SetArray(PDFArrayElement* pArrayElement) m_pArrayElement = pArrayElement; } +void PDFObjectElement::SetStream(PDFStreamElement* pStreamElement) +{ + m_pStreamElement = pStreamElement; +} + PDFArrayElement* PDFObjectElement::GetArray() const { return m_pArrayElement; } +void PDFObjectElement::ParseStoredObjects() +{ + if (!m_pStreamElement) + { + SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: no stream"); + return; + } + + auto pType = dynamic_cast<PDFNameElement*>(Lookup("Type")); + if (!pType || pType->GetValue() != "ObjStm") + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: missing or unexpected type: " << (pType ? pType->GetValue() : OString("(missing)"))); + return; + } + + auto pFilter = dynamic_cast<PDFNameElement*>(Lookup("Filter")); + if (!pFilter || pFilter->GetValue() != "FlateDecode") + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: missing or unexpected filter"); + return; + } + + auto pFirst = dynamic_cast<PDFNumberElement*>(Lookup("First")); + if (!pFirst) + { + SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: no First"); + return; + } + + auto pN = dynamic_cast<PDFNumberElement*>(Lookup("N")); + if (!pN) + { + SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: no N"); + return; + } + size_t nN = pN->GetValue(); + + auto pLength = dynamic_cast<PDFNumberElement*>(Lookup("Length")); + if (!pLength) + { + SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: no length"); + return; + } + size_t nLength = pLength->GetValue(); + + // Read and decompress it. 
+ SvMemoryStream& rEditBuffer = m_rDoc.GetEditBuffer(); + rEditBuffer.Seek(m_pStreamElement->GetOffset()); + std::vector<char> aBuf(nLength); + rEditBuffer.ReadBytes(aBuf.data(), aBuf.size()); + SvMemoryStream aSource(aBuf.data(), aBuf.size(), StreamMode::READ); + SvMemoryStream aStream; + ZCodec aZCodec; + aZCodec.BeginCompression(); + aZCodec.Decompress(aSource, aStream); + if (!aZCodec.EndCompression()) + { + SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: decompression failed"); + return; + } + + aStream.Seek(STREAM_SEEK_TO_END); + nLength = aStream.Tell(); + aStream.Seek(0); + std::vector<size_t> aObjNums; + std::vector<size_t> aOffsets; + std::vector<size_t> aLengths; + // First iterate over and find out the lengths. + for (size_t nObject = 0; nObject < nN; ++nObject) + { + PDFNumberElement aObjNum; + if (!aObjNum.Read(aStream)) + { + SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: failed to read object number"); + return; + } + aObjNums.push_back(aObjNum.GetValue()); + + PDFDocument::SkipWhitespace(aStream); + + PDFNumberElement aByteOffset; + if (!aByteOffset.Read(aStream)) + { + SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: failed to read byte offset"); + return; + } + aOffsets.push_back(pFirst->GetValue() + aByteOffset.GetValue()); + + if (aOffsets.size() > 1) + aLengths.push_back(aOffsets.back() - aOffsets[aOffsets.size() - 2]); + if (nObject + 1 == nN) + aLengths.push_back(nLength - aOffsets.back()); + + PDFDocument::SkipWhitespace(aStream); + } + + // Now create streams with the proper length and tokenize the data. 
+ for (size_t nObject = 0; nObject < nN; ++nObject) + { + size_t nObjNum = aObjNums[nObject]; + size_t nOffset = aOffsets[nObject]; + size_t nLen = aLengths[nObject]; + + aStream.Seek(nOffset); + m_aStoredElements.push_back(std::unique_ptr<PDFObjectElement>(new PDFObjectElement(m_rDoc, nObjNum, 0))); + PDFObjectElement* pStored = m_aStoredElements.back().get(); + + aBuf.clear(); + aBuf.resize(nLen); + aStream.ReadBytes(aBuf.data(), aBuf.size()); + SvMemoryStream aStoredStream(aBuf.data(), aBuf.size(), StreamMode::READ); + + m_rDoc.Tokenize(aStoredStream, TokenizeMode::STORED_OBJECT, pStored->GetStoredElements(), pStored); + // This is how references know the object is stored inside this object stream. + m_rDoc.SetIDObject(nObjNum, pStored); + } +} + +std::vector< std::unique_ptr<PDFElement> >& PDFObjectElement::GetStoredElements() +{ + return m_aElements; +} + PDFReferenceElement::PDFReferenceElement(PDFDocument& rDoc, int fObjectValue, int fGenerationValue) : m_rDoc(rDoc), m_fObjectValue(fObjectValue), @@ -2409,17 +2594,43 @@ double PDFReferenceElement::LookupNumber(SvStream& rStream) const return aNumber.GetValue(); } -PDFObjectElement* PDFReferenceElement::LookupObject() const +PDFObjectElement* PDFReferenceElement::LookupObject() { - const std::map<size_t, PDFObjectElement*>& rIDObjects = m_rDoc.GetIDObjects(); - auto it = rIDObjects.find(m_fObjectValue); - if (it != rIDObjects.end()) - return it->second; + return m_rDoc.LookupObject(m_fObjectValue); +} - SAL_WARN("xmlsecurity.pdfio", "PDFReferenceElement::LookupObject: can't find obj " << m_fObjectValue); +PDFObjectElement* PDFDocument::LookupObject(size_t nObjectNumber) +{ + auto itIDObjects = m_aIDObjects.find(nObjectNumber); + auto itXRef = m_aXRef.find(nObjectNumber); + if (itIDObjects == m_aIDObjects.end() && itXRef != m_aXRef.end()) + { + // We don't have an object for this number yet, but there is an xref + // entry for it. 
+ const XRefEntry& rEntry = itXRef->second; + if (rEntry.m_eType == XRefEntryType::COMPRESSED) + { + // It's a compressed entry, try parsing the stored objects. + if (PDFObjectElement* pObjectStream = LookupObject(rEntry.m_nOffset)) + // This registers new IDs. + pObjectStream->ParseStoredObjects(); + } + // Find again, now that the new objects are registered. + itIDObjects = m_aIDObjects.find(nObjectNumber); + } + + if (itIDObjects != m_aIDObjects.end()) + return itIDObjects->second; + + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::LookupObject: can't find obj " << nObjectNumber); return nullptr; } +SvMemoryStream& PDFDocument::GetEditBuffer() +{ + return m_aEditBuffer; +} + int PDFReferenceElement::GetObjectValue() const { return m_fObjectValue; |