From 56cc352d8229c16604f39a21bd77a05b422470f4 Mon Sep 17 00:00:00 2001 From: Miklos Vajna Date: Fri, 28 Oct 2016 17:34:55 +0200 Subject: xmlsecurity PDF verify: initial support of cross-reference streams This adds support for cross-reference streams (which can be used instead of plain-text cross-reference tables) + also one stream predictor. The actual parsed data is still not used, though. Change-Id: Ia806abd8a97636a1bd25dfdafea377b088800f00 --- xmlsecurity/inc/pdfio/pdfdocument.hxx | 15 +- xmlsecurity/source/pdfio/pdfdocument.cxx | 252 ++++++++++++++++++++++++++++++- 2 files changed, 258 insertions(+), 9 deletions(-) diff --git a/xmlsecurity/inc/pdfio/pdfdocument.hxx b/xmlsecurity/inc/pdfio/pdfdocument.hxx index 8ef7afdf2c66..80a8de68f1fd 100644 --- a/xmlsecurity/inc/pdfio/pdfdocument.hxx +++ b/xmlsecurity/inc/pdfio/pdfdocument.hxx @@ -38,6 +38,16 @@ public: virtual ~PDFElement() { } }; +enum class TokenizeMode +{ + /// Full file. + END_OF_STREAM, + /// Till the first %%EOF token. + EOF_TOKEN, + /// Till the end of the current object. + END_OF_OBJECT +}; + /** * In-memory representation of an on-disk PDF document. * @@ -64,8 +74,8 @@ class XMLSECURITY_DLLPUBLIC PDFDocument static int AsHex(char ch); /// Decode a hex dump. static std::vector DecodeHexString(PDFHexStringElement* pElement); - /// Tokenize elements from current offset, optionally only till the next EOF. - bool Tokenize(SvStream& rStream, bool bPartial); + /// Tokenize elements from current offset. + bool Tokenize(SvStream& rStream, TokenizeMode eMode); public: PDFDocument(); @@ -74,6 +84,7 @@ public: static OString ReadKeyword(SvStream& rStream); static size_t FindStartXRef(SvStream& rStream); void ReadXRef(SvStream& rStream); + void ReadXRefStream(SvStream& rStream); static void SkipWhitespace(SvStream& rStream); /// Instead of all whitespace, just skip CR and NL characters. static void SkipLineBreaks(SvStream& rStream); diff --git a/xmlsecurity/source/pdfio/pdfdocument.cxx b/xmlsecurity/source/pdfio/pdfdocument.cxx index 1ca73618cff5..1fef027f4163 100644 --- a/xmlsecurity/source/pdfio/pdfdocument.cxx +++ b/xmlsecurity/source/pdfio/pdfdocument.cxx @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -184,10 +185,12 @@ public: class PDFStreamElement : public PDFElement { size_t m_nLength; + sal_uInt64 m_nOffset; public: PDFStreamElement(size_t nLength); bool Read(SvStream& rStream) override; + sal_uInt64 GetOffset() const; }; /// End of a stream: 'endstream' keyword. @@ -629,7 +632,7 @@ bool PDFDocument::Write(SvStream& rStream) return rStream.good(); } -bool PDFDocument::Tokenize(SvStream& rStream, bool bPartial) +bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode) { bool bInXRef = false; // The next number will be an xref offset. @@ -656,7 +659,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, bool bPartial) rStream.SeekRel(-1); if (!m_aElements.back()->Read(rStream)) return false; - if (bPartial && !m_aEOFs.empty() && m_aEOFs.back() == rStream.Tell()) + if (eMode == TokenizeMode::EOF_TOKEN && !m_aEOFs.empty() && m_aEOFs.back() == rStream.Tell()) { // Found EOF and partial parsing requested, we're done. return true; @@ -834,6 +837,11 @@ bool PDFDocument::Tokenize(SvStream& rStream, bool bPartial) m_aElements.push_back(std::unique_ptr(new PDFEndObjectElement())); if (!m_aElements.back()->Read(rStream)) return false; + if (eMode == TokenizeMode::END_OF_OBJECT) + { + // Found endobj and only object parsing was requested, we're done. + return true; + } } else if (aKeyword == "true" || aKeyword == "false") m_aElements.push_back(std::unique_ptr(new PDFBooleanElement(aKeyword.toBoolean()))); @@ -904,7 +912,7 @@ bool PDFDocument::Read(SvStream& rStream) { rStream.Seek(nStartXRef); ReadXRef(rStream); - if (!Tokenize(rStream, /*bPartial=*/true)) + if (!Tokenize(rStream, TokenizeMode::EOF_TOKEN)) { SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Read: failed to tokenizer trailer after xref"); return false; @@ -931,7 +939,7 @@ bool PDFDocument::Read(SvStream& rStream) // Then we can tokenize the stream. rStream.Seek(0); - return Tokenize(rStream, /*bPartial=*/false); + return Tokenize(rStream, TokenizeMode::END_OF_STREAM); } OString PDFDocument::ReadKeyword(SvStream& rStream) @@ -983,9 +991,232 @@ size_t PDFDocument::FindStartXRef(SvStream& rStream) return aNumber.GetValue(); } +void PDFDocument::ReadXRefStream(SvStream& rStream) +{ + // Look up the stream length in the object dictionary. + if (!Tokenize(rStream, TokenizeMode::END_OF_OBJECT)) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: failed to read object"); + return; + } + + if (m_aElements.empty()) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: no tokens found"); + return; + } + + PDFObjectElement* pObject = nullptr; + for (const auto& pElement : m_aElements) + { + if (auto pObj = dynamic_cast(pElement.get())) + { + pObject = pObj; + break; + } + } + if (!pObject) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: no object token found"); + return; + } + + PDFElement* pLookup = pObject->Lookup("Length"); + auto pNumber = dynamic_cast(pLookup); + if (!pNumber) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: stream length is not provided"); + return; + } + sal_uInt64 nLength = pNumber->GetValue(); + + // Look up the stream offset. + PDFStreamElement* pStream = nullptr; + for (const auto& pElement : m_aElements) + { + if (auto pS = dynamic_cast(pElement.get())) + { + pStream = pS; + break; + } + } + if (!pStream) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: no stream token found"); + return; + } + + // Read and decompress it. + rStream.Seek(pStream->GetOffset()); + std::vector aBuf(nLength); + rStream.ReadBytes(aBuf.data(), aBuf.size()); + + auto pFilter = dynamic_cast(pObject->Lookup("Filter")); + if (!pFilter) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: no Filter found"); + return; + } + + if (pFilter->GetValue() != "FlateDecode") + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: unexpected filter: " << pFilter->GetValue()); + return; + } + + int nColumns = 1; + int nPredictor = 1; + if (auto pDecodeParams = dynamic_cast(pObject->Lookup("DecodeParms"))) + { + const std::map& rItems = pDecodeParams->GetItems(); + auto it = rItems.find("Columns"); + if (it != rItems.end()) + if (auto pColumns = dynamic_cast(it->second)) + nColumns = pColumns->GetValue(); + it = rItems.find("Predictor"); + if (it != rItems.end()) + if (auto pPredictor = dynamic_cast(it->second)) + nPredictor = pPredictor->GetValue(); + } + + SvMemoryStream aSource(aBuf.data(), aBuf.size(), StreamMode::READ); + SvMemoryStream aStream; + ZCodec aZCodec; + aZCodec.BeginCompression(); + aZCodec.Decompress(aSource, aStream); + if (!aZCodec.EndCompression()) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: decompression failed"); + return; + } + + // Look up the first and the last entry we need to read. + auto pIndex = dynamic_cast(pObject->Lookup("Index")); + if (!pIndex || pIndex->GetElements().size() < 2) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index not found or has < 2 elements"); + return; + } + + const std::vector& rIndexElements = pIndex->GetElements(); + auto pFirstObject = dynamic_cast(rIndexElements[0]); + if (!pFirstObject) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index has no first object"); + return; + } + + auto pNumberOfObjects = dynamic_cast(rIndexElements[1]); + if (!pNumberOfObjects) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index has no number of objects"); + return; + } + + // Look up the format of a single entry. + const int nWSize = 3; + auto pW = dynamic_cast(pObject->Lookup("W")); + if (!pW || pW->GetElements().size() < nWSize) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: W not found or has < 3 elements"); + return; + } + int aW[nWSize]; + // First character is the (kind of) repeated predictor. + int nLineLength = 1; + for (size_t i = 0; i < nWSize; ++i) + { + auto pI = dynamic_cast(pW->GetElements()[i]); + if (!pI) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: W contains non-number"); + return; + } + aW[i] = pI->GetValue(); + nLineLength += aW[i]; + } + + if (nLineLength - 1 != nColumns) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: /DecodeParms/Columns is inconsitent with /W"); + return; + } + + size_t nSize = pNumberOfObjects->GetValue(); + aStream.Seek(0); + // This is the line as read from the stream. + std::vector aOrigLine(nLineLength); + // This is the line as it appears after tweaking according to nPredictor. + std::vector aFilteredLine(nLineLength); + for (size_t nEntry = 0; nEntry < nSize; ++nEntry) + { + size_t nIndex = pFirstObject->GetValue() + nEntry; + + aStream.ReadBytes(aOrigLine.data(), aOrigLine.size()); + if (aOrigLine[0] + 10 != nPredictor) + { + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: in-stream predictor is inconsistent with /DecodeParms/Predictor for object #" << nIndex); + return; + } + + for (int i = 0; i < nLineLength; ++i) + { + switch (nPredictor) + { + case 1: + // No prediction. + break; + case 12: + // PNG prediction: up (on all rows). + aFilteredLine[i] = aFilteredLine[i] + aOrigLine[i]; + break; + default: + SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: unexpected predictor: " << nPredictor); + return; + break; + } + } + + // First character is already handled above. + int nPos = 1; + size_t nType = 0; + // Start of the current field in the stream data. + int nOffset = nPos; + for (; nPos < nOffset + aW[0]; ++nPos) + { + unsigned char nCh = aFilteredLine[nPos]; + nType = (nType << 8) + nCh; + } + + // Start of the object in the file stream. + size_t nStreamOffset = 0; + nOffset = nPos; + for (; nPos < nOffset + aW[1]; ++nPos) + { + unsigned char nCh = aFilteredLine[nPos]; + nStreamOffset = (nStreamOffset << 8) + nCh; + } + + size_t nGenerationNumber = 0; + nOffset = nPos; + for (; nPos < nOffset + aW[2]; ++nPos) + { + unsigned char nCh = aFilteredLine[nPos]; + nGenerationNumber = (nGenerationNumber << 8) + nCh; + } + } +} + void PDFDocument::ReadXRef(SvStream& rStream) { - if (ReadKeyword(rStream) != "xref") + OString aKeyword = ReadKeyword(rStream); + if (aKeyword.isEmpty()) + { + ReadXRefStream(rStream); + return; + } + + if (aKeyword != "xref") { SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRef: xref is not the first keyword"); return; @@ -1043,7 +1274,7 @@ void PDFDocument::ReadXRef(SvStream& rStream) } PDFDocument::SkipWhitespace(rStream); - OString aKeyword = ReadKeyword(rStream); + aKeyword = ReadKeyword(rStream); if (aKeyword != "f" && aKeyword != "n") { SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRef: unexpected keyword"); @@ -2323,18 +2554,25 @@ sal_uInt64 PDFNameElement::GetLength() const } PDFStreamElement::PDFStreamElement(size_t nLength) - : m_nLength(nLength) + : m_nLength(nLength), + m_nOffset(0) { } bool PDFStreamElement::Read(SvStream& rStream) { SAL_INFO("xmlsecurity.pdfio", "PDFStreamElement::Read: length is " << m_nLength); + m_nOffset = rStream.Tell(); rStream.SeekRel(m_nLength); return rStream.good(); } +sal_uInt64 PDFStreamElement::GetOffset() const +{ + return m_nOffset; +} + bool PDFEndStreamElement::Read(SvStream& /*rStream*/) { return true; -- cgit