diff options
Diffstat (limited to 'xmlreader/source')
-rw-r--r-- | xmlreader/source/makefile.mk | 53 | ||||
-rw-r--r-- | xmlreader/source/pad.cxx | 85 | ||||
-rw-r--r-- | xmlreader/source/span.cxx | 66 | ||||
-rw-r--r-- | xmlreader/source/xmlreader.cxx | 1054 |
4 files changed, 1258 insertions, 0 deletions
diff --git a/xmlreader/source/makefile.mk b/xmlreader/source/makefile.mk new file mode 100644 index 000000000000..cb71e21a16b1 --- /dev/null +++ b/xmlreader/source/makefile.mk @@ -0,0 +1,53 @@ +#************************************************************************* +# +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# Copyright 2000, 2010 Oracle and/or its affiliates. +# +# OpenOffice.org - a multi-platform office productivity suite +# +# This file is part of OpenOffice.org. +# +# OpenOffice.org is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License version 3 +# only, as published by the Free Software Foundation. +# +# OpenOffice.org is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License version 3 for more details +# (a copy is included in the LICENSE file that accompanied this code). +# +# You should have received a copy of the GNU Lesser General Public License +# version 3 along with OpenOffice.org. If not, see +# <http://www.openoffice.org/license.html> +# for a copy of the LGPLv3 License. +# +#***********************************************************************/ + +PRJ = .. +PRJNAME = xmlreader +TARGET = xmlreader + +ENABLE_EXCEPTIONS = TRUE +VISIBILITY_HIDDEN = TRUE + +.INCLUDE: settings.mk + +CDEFS += -DOOO_DLLIMPLEMENTATION_XMLREADER + +SLOFILES = \ + $(SLO)/pad.obj \ + $(SLO)/span.obj \ + $(SLO)/xmlreader.obj + +SHL1IMPLIB = i$(SHL1TARGET) +SHL1OBJS = $(SLOFILES) +SHL1RPATH = URELIB +SHL1STDLIBS = \ + $(SALLIB) +SHL1TARGET = xmlreader +SHL1USE_EXPORTS = name +DEF1NAME = $(SHL1TARGET) + +.INCLUDE: target.mk diff --git a/xmlreader/source/pad.cxx b/xmlreader/source/pad.cxx new file mode 100644 index 000000000000..b1673c4a4431 --- /dev/null +++ b/xmlreader/source/pad.cxx @@ -0,0 +1,85 @@ +/************************************************************************* +* +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* Copyright 2000, 2010 Oracle and/or its affiliates. +* +* OpenOffice.org - a multi-platform office productivity suite +* +* This file is part of OpenOffice.org. +* +* OpenOffice.org is free software: you can redistribute it and/or modify +* it under the terms of the GNU Lesser General Public License version 3 +* only, as published by the Free Software Foundation. +* +* OpenOffice.org is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Lesser General Public License version 3 for more details +* (a copy is included in the LICENSE file that accompanied this code). +* +* You should have received a copy of the GNU Lesser General Public License +* version 3 along with OpenOffice.org. If not, see +* <http://www.openoffice.org/license.html> +* for a copy of the LGPLv3 License. +* +************************************************************************/ + +#include "precompiled_xmlreader.hxx" +#include "sal/config.h" + +#include "osl/diagnose.h" +#include "rtl/string.h" +#include "sal/types.h" +#include "xmlreader/pad.hxx" +#include "xmlreader/span.hxx" + +namespace xmlreader { + +void Pad::add(char const * begin, sal_Int32 length) { + OSL_ASSERT( + begin != 0 && length >= 0 && !(span_.is() && buffer_.getLength() != 0)); + if (length != 0) { + flushSpan(); + if (buffer_.getLength() == 0) { + span_ = Span(begin, length); + } else { + buffer_.append(begin, length); + } + } +} + +void Pad::addEphemeral(char const * begin, sal_Int32 length) { + OSL_ASSERT( + begin != 0 && length >= 0 && !(span_.is() && buffer_.getLength() != 0)); + if (length != 0) { + flushSpan(); + buffer_.append(begin, length); + } +} + +void Pad::clear() { + OSL_ASSERT(!(span_.is() && buffer_.getLength() != 0)); + span_.clear(); + buffer_.setLength(0); +} + +Span Pad::get() const { + OSL_ASSERT(!(span_.is() && buffer_.getLength() != 0)); + if (span_.is()) { + return span_; + } else if (buffer_.getLength() == 0) { + return Span(RTL_CONSTASCII_STRINGPARAM("")); + } else { + return Span(buffer_.getStr(), buffer_.getLength()); + } +} + +void Pad::flushSpan() { + if (span_.is()) { + buffer_.append(span_.begin, span_.length); + span_.clear(); + } +} + +} diff --git a/xmlreader/source/span.cxx b/xmlreader/source/span.cxx new file mode 100644 index 000000000000..3b936553b431 --- /dev/null +++ b/xmlreader/source/span.cxx @@ -0,0 +1,66 @@ +/************************************************************************* +* +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* Copyright 2000, 2010 Oracle and/or its affiliates. +* +* OpenOffice.org - a multi-platform office productivity suite +* +* This file is part of OpenOffice.org. +* +* OpenOffice.org is free software: you can redistribute it and/or modify +* it under the terms of the GNU Lesser General Public License version 3 +* only, as published by the Free Software Foundation. +* +* OpenOffice.org is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Lesser General Public License version 3 for more details +* (a copy is included in the LICENSE file that accompanied this code). +* +* You should have received a copy of the GNU Lesser General Public License +* version 3 along with OpenOffice.org. If not, see +* <http://www.openoffice.org/license.html> +* for a copy of the LGPLv3 License. +* +************************************************************************/ + +#include "precompiled_xmlreader.hxx" +#include "sal/config.h" + +#include "com/sun/star/uno/RuntimeException.hpp" +#include "com/sun/star/uno/XInterface.hpp" +#include "osl/diagnose.h" +#include "rtl/textcvt.h" +#include "rtl/textenc.h" +#include "rtl/ustring.h" +#include "rtl/ustring.hxx" +#include "sal/types.h" +#include "xmlreader/span.hxx" + +namespace xmlreader { + +namespace { + +namespace css = com::sun::star; + +} + +rtl::OUString Span::convertFromUtf8() const { + OSL_ASSERT(is()); + rtl_uString * s = 0; + if (!rtl_convertStringToUString( + &s, begin, length, RTL_TEXTENCODING_UTF8, + (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR | + RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR | + RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR))) + { + throw css::uno::RuntimeException( + rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM("cannot convert from UTF-8")), + css::uno::Reference< css::uno::XInterface >()); + } + return rtl::OUString(s, SAL_NO_ACQUIRE); +} + +} diff --git a/xmlreader/source/xmlreader.cxx b/xmlreader/source/xmlreader.cxx new file mode 100644 index 000000000000..27350a8f0947 --- /dev/null +++ b/xmlreader/source/xmlreader.cxx @@ -0,0 +1,1054 @@ +/************************************************************************* +* +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* Copyright 2000, 2010 Oracle and/or its affiliates. +* +* OpenOffice.org - a multi-platform office productivity suite +* +* This file is part of OpenOffice.org. +* +* OpenOffice.org is free software: you can redistribute it and/or modify +* it under the terms of the GNU Lesser General Public License version 3 +* only, as published by the Free Software Foundation. +* +* OpenOffice.org is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Lesser General Public License version 3 for more details +* (a copy is included in the LICENSE file that accompanied this code). +* +* You should have received a copy of the GNU Lesser General Public License +* version 3 along with OpenOffice.org. If not, see +* <http://www.openoffice.org/license.html> +* for a copy of the LGPLv3 License. +* +************************************************************************/ + +#include "precompiled_xmlreader.hxx" +#include "sal/config.h" + +#include <climits> +#include <cstddef> + +#include "com/sun/star/container/NoSuchElementException.hpp" +#include "com/sun/star/uno/Reference.hxx" +#include "com/sun/star/uno/RuntimeException.hpp" +#include "com/sun/star/uno/XInterface.hpp" +#include "osl/diagnose.h" +#include "osl/file.h" +#include "rtl/string.h" +#include "rtl/ustring.h" +#include "rtl/ustring.hxx" +#include "sal/types.h" +#include "xmlreader/pad.hxx" +#include "xmlreader/span.hxx" +#include "xmlreader/xmlreader.hxx" + +namespace xmlreader { + +namespace { + +namespace css = com::sun::star; + +bool isSpace(char c) { + switch (c) { + case '\x09': + case '\x0A': + case '\x0D': + case ' ': + return true; + default: + return false; + } +} + +} + +XmlReader::XmlReader(rtl::OUString const & fileUrl) + SAL_THROW(( + css::container::NoSuchElementException, css::uno::RuntimeException)): + fileUrl_(fileUrl) +{ + switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read)) + { + case osl_File_E_None: + break; + case osl_File_E_NOENT: + throw css::container::NoSuchElementException( + fileUrl_, css::uno::Reference< css::uno::XInterface >()); + default: + throw css::uno::RuntimeException( + (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + oslFileError e = osl_getFileSize(fileHandle_, &fileSize_); + if (e == osl_File_E_None) { + e = osl_mapFile( + fileHandle_, &fileAddress_, fileSize_, 0, + osl_File_MapFlag_WillNeed); + } + if (e != osl_File_E_None) { + e = osl_closeFile(fileHandle_); + if (e != osl_File_E_None) { + OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); + } + throw css::uno::RuntimeException( + (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + namespaceIris_.push_back( + Span( + RTL_CONSTASCII_STRINGPARAM( + "http://www.w3.org/XML/1998/namespace"))); + namespaces_.push_back( + NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML)); + pos_ = static_cast< char * >(fileAddress_); + end_ = pos_ + fileSize_; + state_ = STATE_CONTENT; +} + +XmlReader::~XmlReader() { + oslFileError e = osl_unmapFile(fileAddress_, fileSize_); + if (e != osl_File_E_None) { + OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e)); + } + e = osl_closeFile(fileHandle_); + if (e != osl_File_E_None) { + OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); + } +} + +int XmlReader::registerNamespaceIri(Span const & iri) { + int id = toNamespaceId(namespaceIris_.size()); + namespaceIris_.push_back(iri); + if (iri.equals( + Span( + RTL_CONSTASCII_STRINGPARAM( + "http://www.w3.org/2001/XMLSchema-instance")))) + { + // Old user layer .xcu files used the xsi namespace prefix without + // declaring a corresponding namespace binding, see issue 77174; reading + // those files during migration would fail without this hack that can be + // removed once migration is no longer relevant (see + // configmgr::Components::parseModificationLayer): + namespaces_.push_back( + NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id)); + } + return id; +} + +XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) +{ + switch (state_) { + case STATE_CONTENT: + switch (reportText) { + case TEXT_NONE: + return handleSkippedText(data, nsId); + case TEXT_RAW: + return handleRawText(data); + case TEXT_NORMALIZED: + return handleNormalizedText(data); + } + case STATE_START_TAG: + return handleStartTag(nsId, data); + case STATE_END_TAG: + return handleEndTag(); + case STATE_EMPTY_ELEMENT_TAG: + handleElementEnd(); + return RESULT_END; + default: // STATE_DONE + return RESULT_DONE; + } +} + +bool XmlReader::nextAttribute(int * nsId, Span * localName) { + OSL_ASSERT(nsId != 0 && localName != 0); + if (firstAttribute_) { + currentAttribute_ = attributes_.begin(); + firstAttribute_ = false; + } else { + ++currentAttribute_; + } + if (currentAttribute_ == attributes_.end()) { + return false; + } + if (currentAttribute_->nameColon == 0) { + *nsId = NAMESPACE_NONE; + *localName = Span( + currentAttribute_->nameBegin, + currentAttribute_->nameEnd - currentAttribute_->nameBegin); + } else { + *nsId = getNamespaceId( + Span( + currentAttribute_->nameBegin, + currentAttribute_->nameColon - currentAttribute_->nameBegin)); + *localName = Span( + currentAttribute_->nameColon + 1, + currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); + } + return true; +} + +Span XmlReader::getAttributeValue(bool fullyNormalize) { + return handleAttributeValue( + currentAttribute_->valueBegin, currentAttribute_->valueEnd, + fullyNormalize); +} + +int XmlReader::getNamespaceId(Span const & prefix) const { + for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin()); + i != namespaces_.rend(); ++i) + { + if (prefix.equals(i->prefix)) { + return i->nsId; + } + } + return NAMESPACE_UNKNOWN; +} + +rtl::OUString XmlReader::getUrl() const { + return fileUrl_; +} + +void XmlReader::normalizeLineEnds(Span const & text) { + char const * p = text.begin; + sal_Int32 n = text.length; + for (;;) { + sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); + if (i < 0) { + break; + } + pad_.add(p, i); + p += i + 1; + n -= i + 1; + if (n == 0 || *p != '\x0A') { + pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); + } + } + pad_.add(p, n); +} + +void XmlReader::skipSpace() { + while (isSpace(peek())) { + ++pos_; + } +} + +bool XmlReader::skipComment() { + if (rtl_str_shortenedCompare_WithLength( + pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), + RTL_CONSTASCII_LENGTH("--")) != + 0) + { + return false; + } + pos_ += RTL_CONSTASCII_LENGTH("--"); + sal_Int32 i = rtl_str_indexOfStr_WithLength( + pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); + if (i < 0) { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "premature end (within comment) of ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + pos_ += i + RTL_CONSTASCII_LENGTH("--"); + if (read() != '>') { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "illegal \"--\" within comment in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + return true; +} + +void XmlReader::skipProcessingInstruction() { + sal_Int32 i = rtl_str_indexOfStr_WithLength( + pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); + if (i < 0) { + throw css::uno::RuntimeException( + (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + pos_ += i + RTL_CONSTASCII_LENGTH("?>"); +} + +void XmlReader::skipDocumentTypeDeclaration() { + // Neither is it checked that the doctypedecl is at the correct position in + // the document, nor that it is well-formed: + for (;;) { + char c = read(); + switch (c) { + case '\0': // i.e., EOF + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "premature end (within DTD) of ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + case '"': + case '\'': + { + sal_Int32 i = rtl_str_indexOfChar_WithLength( + pos_, end_ - pos_, c); + if (i < 0) { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "premature end (within DTD) of ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + pos_ += i + 1; + } + break; + case '>': + return; + case '[': + for (;;) { + c = read(); + switch (c) { + case '\0': // i.e., EOF + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "premature end (within DTD) of ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + case '"': + case '\'': + { + sal_Int32 i = rtl_str_indexOfChar_WithLength( + pos_, end_ - pos_, c); + if (i < 0) { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "premature end (within DTD) of ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + pos_ += i + 1; + } + break; + case '<': + switch (read()) { + case '\0': // i.e., EOF + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "premature end (within DTD) of ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + case '!': + skipComment(); + break; + case '?': + skipProcessingInstruction(); + break; + default: + break; + } + break; + case ']': + skipSpace(); + if (read() != '>') { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "missing \">\" of DTD in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + return; + default: + break; + } + } + default: + break; + } + } +} + +Span XmlReader::scanCdataSection() { + if (rtl_str_shortenedCompare_WithLength( + pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), + RTL_CONSTASCII_LENGTH("[CDATA[")) != + 0) + { + return Span(); + } + pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); + char const * begin = pos_; + sal_Int32 i = rtl_str_indexOfStr_WithLength( + pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); + if (i < 0) { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "premature end (within CDATA section) of ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); + return Span(begin, i); +} + +bool XmlReader::scanName(char const ** nameColon) { + OSL_ASSERT(nameColon != 0 && *nameColon == 0); + for (char const * begin = pos_;; ++pos_) { + switch (peek()) { + case '\0': // i.e., EOF + case '\x09': + case '\x0A': + case '\x0D': + case ' ': + case '/': + case '=': + case '>': + return pos_ != begin; + case ':': + *nameColon = pos_; + break; + default: + break; + } + } +} + +int XmlReader::scanNamespaceIri(char const * begin, char const * end) { + OSL_ASSERT(begin != 0 && begin <= end); + Span iri(handleAttributeValue(begin, end, false)); + for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { + if (namespaceIris_[i].equals(iri)) { + return toNamespaceId(i); + } + } + return XmlReader::NAMESPACE_UNKNOWN; +} + +char const * XmlReader::handleReference(char const * position, char const * end) +{ + OSL_ASSERT(position != 0 && *position == '&' && position < end); + ++position; + if (*position == '#') { + ++position; + sal_Int32 val = 0; + char const * p; + if (*position == 'x') { + ++position; + p = position; + for (;; ++position) { + char c = *position; + if (c >= '0' && c <= '9') { + val = 16 * val + (c - '0'); + } else if (c >= 'A' && c <= 'F') { + val = 16 * val + (c - 'A') + 10; + } else if (c >= 'a' && c <= 'f') { + val = 16 * val + (c - 'a') + 10; + } else { + break; + } + if (val > 0x10FFFF) { // avoid overflow + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "'&#x...' too large in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + } + } else { + p = position; + for (;; ++position) { + char c = *position; + if (c >= '0' && c <= '9') { + val = 10 * val + (c - '0'); + } else { + break; + } + if (val > 0x10FFFF) { // avoid overflow + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "'&#...' too large in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + } + } + if (position == p || *position++ != ';') { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + OSL_ASSERT(val >= 0 && val <= 0x10FFFF); + if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || + (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) + { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "character reference denoting invalid character in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + char buf[4]; + sal_Int32 len; + if (val < 0x80) { + buf[0] = static_cast< char >(val); + len = 1; + } else if (val < 0x800) { + buf[0] = static_cast< char >((val >> 6) | 0xC0); + buf[1] = static_cast< char >((val & 0x3F) | 0x80); + len = 2; + } else if (val < 0x10000) { + buf[0] = static_cast< char >((val >> 12) | 0xE0); + buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); + buf[2] = static_cast< char >((val & 0x3F) | 0x80); + len = 3; + } else { + buf[0] = static_cast< char >((val >> 18) | 0xF0); + buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); + buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); + buf[3] = static_cast< char >((val & 0x3F) | 0x80); + len = 4; + } + pad_.addEphemeral(buf, len); + return position; + } else { + struct EntityRef { + char const * inBegin; + sal_Int32 inLength; + char const * outBegin; + sal_Int32 outLength; + }; + static EntityRef const refs[] = { + { RTL_CONSTASCII_STRINGPARAM("amp;"), + RTL_CONSTASCII_STRINGPARAM("&") }, + { RTL_CONSTASCII_STRINGPARAM("lt;"), + RTL_CONSTASCII_STRINGPARAM("<") }, + { RTL_CONSTASCII_STRINGPARAM("gt;"), + RTL_CONSTASCII_STRINGPARAM(">") }, + { RTL_CONSTASCII_STRINGPARAM("apos;"), + RTL_CONSTASCII_STRINGPARAM("'") }, + { RTL_CONSTASCII_STRINGPARAM("quot;"), + RTL_CONSTASCII_STRINGPARAM("\"") } }; + for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) { + if (rtl_str_shortenedCompare_WithLength( + position, end - position, refs[i].inBegin, refs[i].inLength, + refs[i].inLength) == + 0) + { + position += refs[i].inLength; + pad_.add(refs[i].outBegin, refs[i].outLength); + return position; + } + } + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } +} + +Span XmlReader::handleAttributeValue( + char const * begin, char const * end, bool fullyNormalize) +{ + pad_.clear(); + if (fullyNormalize) { + while (begin != end && isSpace(*begin)) { + ++begin; + } + while (end != begin && isSpace(end[-1])) { + --end; + } + char const * p = begin; + enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; + // a single true space character can go into the current span, + // everything else breaks the span + Space space = SPACE_NONE; + while (p != end) { + switch (*p) { + case '\x09': + case '\x0A': + case '\x0D': + switch (space) { + case SPACE_NONE: + pad_.add(begin, p - begin); + pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); + space = SPACE_BREAK; + break; + case SPACE_SPAN: + pad_.add(begin, p - begin); + space = SPACE_BREAK; + break; + case SPACE_BREAK: + break; + } + begin = ++p; + break; + case ' ': + switch (space) { + case SPACE_NONE: + ++p; + space = SPACE_SPAN; + break; + case SPACE_SPAN: + pad_.add(begin, p - begin); + begin = ++p; + space = SPACE_BREAK; + break; + case SPACE_BREAK: + begin = ++p; + break; + } + break; + case '&': + pad_.add(begin, p - begin); + p = handleReference(p, end); + begin = p; + space = SPACE_NONE; + break; + default: + ++p; + space = SPACE_NONE; + break; + } + } + pad_.add(begin, p - begin); + } else { + char const * p = begin; + while (p != end) { + switch (*p) { + case '\x09': + case '\x0A': + pad_.add(begin, p - begin); + begin = ++p; + pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); + break; + case '\x0D': + pad_.add(begin, p - begin); + ++p; + if (peek() == '\x0A') { + ++p; + } + begin = p; + pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); + break; + case '&': + pad_.add(begin, p - begin); + p = handleReference(p, end); + begin = p; + break; + default: + ++p; + break; + } + } + pad_.add(begin, p - begin); + } + return pad_.get(); +} + +XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { + OSL_ASSERT(nsId != 0 && localName); + char const * nameBegin = pos_; + char const * nameColon = 0; + if (!scanName(&nameColon)) { + throw css::uno::RuntimeException( + (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + char const * nameEnd = pos_; + NamespaceList::size_type inheritedNamespaces = namespaces_.size(); + bool hasDefaultNs = false; + int defaultNsId = NAMESPACE_NONE; + attributes_.clear(); + for (;;) { + char const * p = pos_; + skipSpace(); + if (peek() == '/' || peek() == '>') { + break; + } + if (pos_ == p) { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "missing whitespace before attribute in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + char const * attrNameBegin = pos_; + char const * attrNameColon = 0; + if (!scanName(&attrNameColon)) { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + char const * attrNameEnd = pos_; + skipSpace(); + if (read() != '=') { + throw css::uno::RuntimeException( + (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + skipSpace(); + char del = read(); + if (del != '\'' && del != '"') { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + char const * valueBegin = pos_; + sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); + if (i < 0) { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM( + "unterminated attribute value in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + char const * valueEnd = pos_ + i; + pos_ += i + 1; + if (attrNameColon == 0 && + Span(attrNameBegin, attrNameEnd - attrNameBegin).equals( + RTL_CONSTASCII_STRINGPARAM("xmlns"))) + { + hasDefaultNs = true; + defaultNsId = scanNamespaceIri(valueBegin, valueEnd); + } else if (attrNameColon != 0 && + Span(attrNameBegin, attrNameColon - attrNameBegin).equals( + RTL_CONSTASCII_STRINGPARAM("xmlns"))) + { + namespaces_.push_back( + NamespaceData( + Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), + scanNamespaceIri(valueBegin, valueEnd))); + } else { + attributes_.push_back( + AttributeData( + attrNameBegin, attrNameEnd, attrNameColon, valueBegin, + valueEnd)); + } + } + if (!hasDefaultNs && !elements_.empty()) { + defaultNsId = elements_.top().defaultNamespaceId; + } + firstAttribute_ = true; + if (peek() == '/') { + state_ = STATE_EMPTY_ELEMENT_TAG; + ++pos_; + } else { + state_ = STATE_CONTENT; + } + if (peek() != '>') { + throw css::uno::RuntimeException( + (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + ++pos_; + elements_.push( + ElementData( + Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, + defaultNsId)); + if (nameColon == 0) { + *nsId = defaultNsId; + *localName = Span(nameBegin, nameEnd - nameBegin); + } else { + *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); + *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); + } + return RESULT_BEGIN; +} + +XmlReader::Result XmlReader::handleEndTag() { + if (elements_.empty()) { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + char const * nameBegin = pos_; + char const * nameColon = 0; + if (!scanName(&nameColon) || + !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) + { + throw css::uno::RuntimeException( + (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + handleElementEnd(); + skipSpace(); + if (peek() != '>') { + throw css::uno::RuntimeException( + (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + ++pos_; + return RESULT_END; +} + +void XmlReader::handleElementEnd() { + OSL_ASSERT(!elements_.empty()); + namespaces_.resize(elements_.top().inheritedNamespaces); + elements_.pop(); + state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT; +} + +XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { + for (;;) { + sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<'); + if (i < 0) { + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + } + pos_ += i + 1; + switch (peek()) { + case '!': + ++pos_; + if (!skipComment() && !scanCdataSection().is()) { + skipDocumentTypeDeclaration(); + } + break; + case '/': + ++pos_; + return handleEndTag(); + case '?': + ++pos_; + skipProcessingInstruction(); + break; + default: + return handleStartTag(nsId, data); + } + } +} + +XmlReader::Result XmlReader::handleRawText(Span * text) { + pad_.clear(); + for (char const * begin = pos_;;) { + switch (peek()) { + case '\0': // i.e., EOF + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + case '\x0D': + pad_.add(begin, pos_ - begin); + ++pos_; + if (peek() != '\x0A') { + pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); + } + begin = pos_; + break; + case '&': + pad_.add(begin, pos_ - begin); + pos_ = handleReference(pos_, end_); + begin = pos_; + break; + case '<': + pad_.add(begin, pos_ - begin); + ++pos_; + switch (peek()) { + case '!': + ++pos_; + if (!skipComment()) { + Span cdata(scanCdataSection()); + if (cdata.is()) { + normalizeLineEnds(cdata); + } else { + skipDocumentTypeDeclaration(); + } + } + begin = pos_; + break; + case '/': + *text = pad_.get(); + ++pos_; + state_ = STATE_END_TAG; + return RESULT_TEXT; + case '?': + ++pos_; + skipProcessingInstruction(); + begin = pos_; + break; + default: + *text = pad_.get(); + state_ = STATE_START_TAG; + return RESULT_TEXT; + } + break; + default: + ++pos_; + break; + } + } +} + +XmlReader::Result XmlReader::handleNormalizedText(Span * text) { + pad_.clear(); + char const * flowBegin = pos_; + char const * flowEnd = pos_; + enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; + // a single true space character can go into the current flow, + // everything else breaks the flow + Space space = SPACE_START; + for (;;) { + switch (peek()) { + case '\0': // i.e., EOF + throw css::uno::RuntimeException( + (rtl::OUString( + RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + + fileUrl_), + css::uno::Reference< css::uno::XInterface >()); + case '\x09': + case '\x0A': + case '\x0D': + switch (space) { + case SPACE_START: + case SPACE_BREAK: + break; + case SPACE_NONE: + case SPACE_SPAN: + space = SPACE_BREAK; + break; + } + ++pos_; + break; + case ' ': + switch (space) { + case SPACE_START: + case SPACE_BREAK: + break; + case SPACE_NONE: + space = SPACE_SPAN; + break; + case SPACE_SPAN: + space = SPACE_BREAK; + break; + } + ++pos_; + break; + case '&': + switch (space) { + case SPACE_START: + break; + case SPACE_NONE: + case SPACE_SPAN: + pad_.add(flowBegin, pos_ - flowBegin); + break; + case SPACE_BREAK: + pad_.add(flowBegin, flowEnd - flowBegin); + pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); + break; + } + pos_ = handleReference(pos_, end_); + flowBegin = pos_; + flowEnd = pos_; + space = SPACE_NONE; + break; + case '<': + ++pos_; + switch (peek()) { + case '!': + ++pos_; + if (skipComment()) { + space = SPACE_BREAK; + } else { + Span cdata(scanCdataSection()); + if (cdata.is()) { + // CDATA is not normalized (similar to character + // references; it keeps the code simple), but it might + // arguably be better to normalize it: + switch (space) { + case SPACE_START: + break; + case SPACE_NONE: + case SPACE_SPAN: + pad_.add(flowBegin, pos_ - flowBegin); + break; + case SPACE_BREAK: + pad_.add(flowBegin, flowEnd - flowBegin); + pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); + break; + } + normalizeLineEnds(cdata); + flowBegin = pos_; + flowEnd = pos_; + space = SPACE_NONE; + } else { + skipDocumentTypeDeclaration(); + } + } + break; + case '/': + ++pos_; + pad_.add(flowBegin, flowEnd - flowBegin); + *text = pad_.get(); + state_ = STATE_END_TAG; + return RESULT_TEXT; + case '?': + ++pos_; + skipProcessingInstruction(); + space = SPACE_BREAK; + break; + default: + pad_.add(flowBegin, flowEnd - flowBegin); + *text = pad_.get(); + state_ = STATE_START_TAG; + return RESULT_TEXT; + } + break; + default: + switch (space) { + case SPACE_START: + flowBegin = pos_; + break; + case SPACE_NONE: + case SPACE_SPAN: + break; + case SPACE_BREAK: + pad_.add(flowBegin, flowEnd - flowBegin); + pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); + flowBegin = pos_; + break; + } + flowEnd = ++pos_; + space = SPACE_NONE; + break; + } + } +} + +int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { + OSL_ASSERT(pos <= INT_MAX); + return static_cast< int >(pos); +} + +} |