/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*************************************************************************
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * Copyright 2000, 2010 Oracle and/or its affiliates.
 *
 * OpenOffice.org - a multi-platform office productivity suite
 *
 * This file is part of OpenOffice.org.
 *
 * OpenOffice.org is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3
 * only, as published by the Free Software Foundation.
 *
 * OpenOffice.org is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License version 3 for more details
 * (a copy is included in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU Lesser General Public License
 * version 3 along with OpenOffice.org.  If not, see
 * <http://www.openoffice.org/license.html>
 * for a copy of the LGPLv3 License.
* ************************************************************************/ #include "sal/config.h" #include #include #include #include "com/sun/star/container/NoSuchElementException.hpp" #include "com/sun/star/uno/Reference.hxx" #include "com/sun/star/uno/RuntimeException.hpp" #include "com/sun/star/uno/XInterface.hpp" #include "osl/file.h" #include "rtl/oustringostreaminserter.hxx" #include "rtl/string.h" #include "rtl/ustring.h" #include "rtl/ustring.hxx" #include "sal/log.hxx" #include "sal/types.h" #include "xmlreader/pad.hxx" #include "xmlreader/span.hxx" #include "xmlreader/xmlreader.hxx" namespace xmlreader { namespace { namespace css = com::sun::star; bool isSpace(char c) { switch (c) { case '\x09': case '\x0A': case '\x0D': case ' ': return true; default: return false; } } } XmlReader::XmlReader(rtl::OUString const & fileUrl) SAL_THROW(( css::container::NoSuchElementException, css::uno::RuntimeException)): fileUrl_(fileUrl) { switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read)) { case osl_File_E_None: break; case osl_File_E_NOENT: throw css::container::NoSuchElementException( fileUrl_, css::uno::Reference< css::uno::XInterface >()); default: throw css::uno::RuntimeException( (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } oslFileError e = osl_getFileSize(fileHandle_, &fileSize_); if (e == osl_File_E_None) { e = osl_mapFile( fileHandle_, &fileAddress_, fileSize_, 0, osl_File_MapFlag_WillNeed); } if (e != osl_File_E_None) { e = osl_closeFile(fileHandle_); if (e != osl_File_E_None) { SAL_WARN( "xmlreader", "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e); } throw css::uno::RuntimeException( (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } namespaceIris_.push_back( Span( RTL_CONSTASCII_STRINGPARAM( "http://www.w3.org/XML/1998/namespace"))); namespaces_.push_back( 
NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML)); pos_ = static_cast< char * >(fileAddress_); end_ = pos_ + fileSize_; state_ = STATE_CONTENT; } XmlReader::~XmlReader() { oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_); if (e != osl_File_E_None) { SAL_WARN( "xmlreader", "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e); } e = osl_closeFile(fileHandle_); if (e != osl_File_E_None) { SAL_WARN( "xmlreader", "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e); } } int XmlReader::registerNamespaceIri(Span const & iri) { int id = toNamespaceId(namespaceIris_.size()); namespaceIris_.push_back(iri); if (iri.equals( Span( RTL_CONSTASCII_STRINGPARAM( "http://www.w3.org/2001/XMLSchema-instance")))) { // Old user layer .xcu files used the xsi namespace prefix without // declaring a corresponding namespace binding, see issue 77174; reading // those files during migration would fail without this hack that can be // removed once migration is no longer relevant (see // configmgr::Components::parseModificationLayer): namespaces_.push_back( NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id)); } return id; } XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) { switch (state_) { case STATE_CONTENT: switch (reportText) { case TEXT_NONE: return handleSkippedText(data, nsId); case TEXT_RAW: return handleRawText(data); case TEXT_NORMALIZED: return handleNormalizedText(data); } case STATE_START_TAG: return handleStartTag(nsId, data); case STATE_END_TAG: return handleEndTag(); case STATE_EMPTY_ELEMENT_TAG: handleElementEnd(); return RESULT_END; default: // STATE_DONE return RESULT_DONE; } } bool XmlReader::nextAttribute(int * nsId, Span * localName) { assert(nsId != 0 && localName != 0); if (firstAttribute_) { currentAttribute_ = attributes_.begin(); firstAttribute_ = false; } else { ++currentAttribute_; } if (currentAttribute_ == attributes_.end()) { return false; } if 
(currentAttribute_->nameColon == 0) { *nsId = NAMESPACE_NONE; *localName = Span( currentAttribute_->nameBegin, currentAttribute_->nameEnd - currentAttribute_->nameBegin); } else { *nsId = getNamespaceId( Span( currentAttribute_->nameBegin, currentAttribute_->nameColon - currentAttribute_->nameBegin)); *localName = Span( currentAttribute_->nameColon + 1, currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); } return true; } Span XmlReader::getAttributeValue(bool fullyNormalize) { return handleAttributeValue( currentAttribute_->valueBegin, currentAttribute_->valueEnd, fullyNormalize); } int XmlReader::getNamespaceId(Span const & prefix) const { for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin()); i != namespaces_.rend(); ++i) { if (prefix.equals(i->prefix)) { return i->nsId; } } return NAMESPACE_UNKNOWN; } rtl::OUString XmlReader::getUrl() const { return fileUrl_; } void XmlReader::normalizeLineEnds(Span const & text) { char const * p = text.begin; sal_Int32 n = text.length; for (;;) { sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); if (i < 0) { break; } pad_.add(p, i); p += i + 1; n -= i + 1; if (n == 0 || *p != '\x0A') { pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); } } pad_.add(p, n); } void XmlReader::skipSpace() { while (isSpace(peek())) { ++pos_; } } bool XmlReader::skipComment() { if (rtl_str_shortenedCompare_WithLength( pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), RTL_CONSTASCII_LENGTH("--")) != 0) { return false; } pos_ += RTL_CONSTASCII_LENGTH("--"); sal_Int32 i = rtl_str_indexOfStr_WithLength( pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); if (i < 0) { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "premature end (within comment) of ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } pos_ += i + RTL_CONSTASCII_LENGTH("--"); if (read() != '>') { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "illegal \"--\" within comment in 
")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } return true; } void XmlReader::skipProcessingInstruction() { sal_Int32 i = rtl_str_indexOfStr_WithLength( pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); if (i < 0) { throw css::uno::RuntimeException( (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '()); } pos_ += i + RTL_CONSTASCII_LENGTH("?>"); } void XmlReader::skipDocumentTypeDeclaration() { // Neither is it checked that the doctypedecl is at the correct position in // the document, nor that it is well-formed: for (;;) { char c = read(); switch (c) { case '\0': // i.e., EOF throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "premature end (within DTD) of ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); case '"': case '\'': { sal_Int32 i = rtl_str_indexOfChar_WithLength( pos_, end_ - pos_, c); if (i < 0) { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "premature end (within DTD) of ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } pos_ += i + 1; } break; case '>': return; case '[': for (;;) { c = read(); switch (c) { case '\0': // i.e., EOF throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "premature end (within DTD) of ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); case '"': case '\'': { sal_Int32 i = rtl_str_indexOfChar_WithLength( pos_, end_ - pos_, c); if (i < 0) { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "premature end (within DTD) of ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } pos_ += i + 1; } break; case '<': switch (read()) { case '\0': // i.e., EOF throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "premature end (within DTD) of ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); case '!': skipComment(); break; case '?': skipProcessingInstruction(); break; default: break; } break; case ']': 
skipSpace(); if (read() != '>') { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "missing \">\" of DTD in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } return; default: break; } } default: break; } } } Span XmlReader::scanCdataSection() { if (rtl_str_shortenedCompare_WithLength( pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), RTL_CONSTASCII_LENGTH("[CDATA[")) != 0) { return Span(); } pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); char const * begin = pos_; sal_Int32 i = rtl_str_indexOfStr_WithLength( pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); if (i < 0) { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "premature end (within CDATA section) of ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); return Span(begin, i); } bool XmlReader::scanName(char const ** nameColon) { assert(nameColon != 0 && *nameColon == 0); for (char const * begin = pos_;; ++pos_) { switch (peek()) { case '\0': // i.e., EOF case '\x09': case '\x0A': case '\x0D': case ' ': case '/': case '=': case '>': return pos_ != begin; case ':': *nameColon = pos_; break; default: break; } } } int XmlReader::scanNamespaceIri(char const * begin, char const * end) { assert(begin != 0 && begin <= end); Span iri(handleAttributeValue(begin, end, false)); for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { if (namespaceIris_[i].equals(iri)) { return toNamespaceId(i); } } return XmlReader::NAMESPACE_UNKNOWN; } char const * XmlReader::handleReference(char const * position, char const * end) { assert(position != 0 && *position == '&' && position < end); ++position; if (*position == '#') { ++position; sal_Int32 val = 0; char const * p; if (*position == 'x') { ++position; p = position; for (;; ++position) { char c = *position; if (c >= '0' && c <= '9') { val = 16 * val + (c - '0'); } else if (c >= 'A' && c <= 'F') { val = 16 * val + (c - 'A') + 
10; } else if (c >= 'a' && c <= 'f') { val = 16 * val + (c - 'a') + 10; } else { break; } if (val > 0x10FFFF) { // avoid overflow throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "'&#x...' too large in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } } } else { p = position; for (;; ++position) { char c = *position; if (c >= '0' && c <= '9') { val = 10 * val + (c - '0'); } else { break; } if (val > 0x10FFFF) { // avoid overflow throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "'&#...' too large in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } } } if (position == p || *position++ != ';') { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } assert(val >= 0 && val <= 0x10FFFF); if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "character reference denoting invalid character in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } char buf[4]; sal_Int32 len; if (val < 0x80) { buf[0] = static_cast< char >(val); len = 1; } else if (val < 0x800) { buf[0] = static_cast< char >((val >> 6) | 0xC0); buf[1] = static_cast< char >((val & 0x3F) | 0x80); len = 2; } else if (val < 0x10000) { buf[0] = static_cast< char >((val >> 12) | 0xE0); buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); buf[2] = static_cast< char >((val & 0x3F) | 0x80); len = 3; } else { buf[0] = static_cast< char >((val >> 18) | 0xF0); buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); buf[3] = static_cast< char >((val & 0x3F) | 0x80); len = 4; } pad_.addEphemeral(buf, len); return position; } else { struct EntityRef { char const * inBegin; sal_Int32 
inLength; char const * outBegin; sal_Int32 outLength; }; static EntityRef const refs[] = { { RTL_CONSTASCII_STRINGPARAM("amp;"), RTL_CONSTASCII_STRINGPARAM("&") }, { RTL_CONSTASCII_STRINGPARAM("lt;"), RTL_CONSTASCII_STRINGPARAM("<") }, { RTL_CONSTASCII_STRINGPARAM("gt;"), RTL_CONSTASCII_STRINGPARAM(">") }, { RTL_CONSTASCII_STRINGPARAM("apos;"), RTL_CONSTASCII_STRINGPARAM("'") }, { RTL_CONSTASCII_STRINGPARAM("quot;"), RTL_CONSTASCII_STRINGPARAM("\"") } }; for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) { if (rtl_str_shortenedCompare_WithLength( position, end - position, refs[i].inBegin, refs[i].inLength, refs[i].inLength) == 0) { position += refs[i].inLength; pad_.add(refs[i].outBegin, refs[i].outLength); return position; } } throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } } Span XmlReader::handleAttributeValue( char const * begin, char const * end, bool fullyNormalize) { pad_.clear(); if (fullyNormalize) { while (begin != end && isSpace(*begin)) { ++begin; } while (end != begin && isSpace(end[-1])) { --end; } char const * p = begin; enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; // a single true space character can go into the current span, // everything else breaks the span Space space = SPACE_NONE; while (p != end) { switch (*p) { case '\x09': case '\x0A': case '\x0D': switch (space) { case SPACE_NONE: pad_.add(begin, p - begin); pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); space = SPACE_BREAK; break; case SPACE_SPAN: pad_.add(begin, p - begin); space = SPACE_BREAK; break; case SPACE_BREAK: break; } begin = ++p; break; case ' ': switch (space) { case SPACE_NONE: ++p; space = SPACE_SPAN; break; case SPACE_SPAN: pad_.add(begin, p - begin); begin = ++p; space = SPACE_BREAK; break; case SPACE_BREAK: begin = ++p; break; } break; case '&': pad_.add(begin, p - begin); p = handleReference(p, end); begin = p; space = SPACE_NONE; 
break; default: ++p; space = SPACE_NONE; break; } } pad_.add(begin, p - begin); } else { char const * p = begin; while (p != end) { switch (*p) { case '\x09': case '\x0A': pad_.add(begin, p - begin); begin = ++p; pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); break; case '\x0D': pad_.add(begin, p - begin); ++p; if (peek() == '\x0A') { ++p; } begin = p; pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); break; case '&': pad_.add(begin, p - begin); p = handleReference(p, end); begin = p; break; default: ++p; break; } } pad_.add(begin, p - begin); } return pad_.get(); } XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { assert(nsId != 0 && localName); char const * nameBegin = pos_; char const * nameColon = 0; if (!scanName(&nameColon)) { throw css::uno::RuntimeException( (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } char const * nameEnd = pos_; NamespaceList::size_type inheritedNamespaces = namespaces_.size(); bool hasDefaultNs = false; int defaultNsId = NAMESPACE_NONE; attributes_.clear(); for (;;) { char const * p = pos_; skipSpace(); if (peek() == '/' || peek() == '>') { break; } if (pos_ == p) { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "missing whitespace before attribute in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } char const * attrNameBegin = pos_; char const * attrNameColon = 0; if (!scanName(&attrNameColon)) { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } char const * attrNameEnd = pos_; skipSpace(); if (read() != '=') { throw css::uno::RuntimeException( (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } skipSpace(); char del = read(); if (del != '\'' && del != '"') { throw css::uno::RuntimeException( 
(rtl::OUString( RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } char const * valueBegin = pos_; sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); if (i < 0) { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM( "unterminated attribute value in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } char const * valueEnd = pos_ + i; pos_ += i + 1; if (attrNameColon == 0 && Span(attrNameBegin, attrNameEnd - attrNameBegin).equals( RTL_CONSTASCII_STRINGPARAM("xmlns"))) { hasDefaultNs = true; defaultNsId = scanNamespaceIri(valueBegin, valueEnd); } else if (attrNameColon != 0 && Span(attrNameBegin, attrNameColon - attrNameBegin).equals( RTL_CONSTASCII_STRINGPARAM("xmlns"))) { namespaces_.push_back( NamespaceData( Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), scanNamespaceIri(valueBegin, valueEnd))); } else { attributes_.push_back( AttributeData( attrNameBegin, attrNameEnd, attrNameColon, valueBegin, valueEnd)); } } if (!hasDefaultNs && !elements_.empty()) { defaultNsId = elements_.top().defaultNamespaceId; } firstAttribute_ = true; if (peek() == '/') { state_ = STATE_EMPTY_ELEMENT_TAG; ++pos_; } else { state_ = STATE_CONTENT; } if (peek() != '>') { throw css::uno::RuntimeException( (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } ++pos_; elements_.push( ElementData( Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, defaultNsId)); if (nameColon == 0) { *nsId = defaultNsId; *localName = Span(nameBegin, nameEnd - nameBegin); } else { *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); } return RESULT_BEGIN; } XmlReader::Result XmlReader::handleEndTag() { if (elements_.empty()) { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM("spurious end tag in 
")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } char const * nameBegin = pos_; char const * nameColon = 0; if (!scanName(&nameColon) || !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) { throw css::uno::RuntimeException( (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } handleElementEnd(); skipSpace(); if (peek() != '>') { throw css::uno::RuntimeException( (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } ++pos_; return RESULT_END; } void XmlReader::handleElementEnd() { assert(!elements_.empty()); namespaces_.resize(elements_.top().inheritedNamespaces); elements_.pop(); state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT; } XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { for (;;) { sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<'); if (i < 0) { throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); } pos_ += i + 1; switch (peek()) { case '!': ++pos_; if (!skipComment() && !scanCdataSection().is()) { skipDocumentTypeDeclaration(); } break; case '/': ++pos_; return handleEndTag(); case '?': ++pos_; skipProcessingInstruction(); break; default: return handleStartTag(nsId, data); } } } XmlReader::Result XmlReader::handleRawText(Span * text) { pad_.clear(); for (char const * begin = pos_;;) { switch (peek()) { case '\0': // i.e., EOF throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); case '\x0D': pad_.add(begin, pos_ - begin); ++pos_; if (peek() != '\x0A') { pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); } begin = pos_; break; case '&': pad_.add(begin, pos_ - begin); pos_ = handleReference(pos_, end_); begin = pos_; break; case '<': 
pad_.add(begin, pos_ - begin); ++pos_; switch (peek()) { case '!': ++pos_; if (!skipComment()) { Span cdata(scanCdataSection()); if (cdata.is()) { normalizeLineEnds(cdata); } else { skipDocumentTypeDeclaration(); } } begin = pos_; break; case '/': *text = pad_.get(); ++pos_; state_ = STATE_END_TAG; return RESULT_TEXT; case '?': ++pos_; skipProcessingInstruction(); begin = pos_; break; default: *text = pad_.get(); state_ = STATE_START_TAG; return RESULT_TEXT; } break; default: ++pos_; break; } } } XmlReader::Result XmlReader::handleNormalizedText(Span * text) { pad_.clear(); char const * flowBegin = pos_; char const * flowEnd = pos_; enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; // a single true space character can go into the current flow, // everything else breaks the flow Space space = SPACE_START; for (;;) { switch (peek()) { case '\0': // i.e., EOF throw css::uno::RuntimeException( (rtl::OUString( RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + fileUrl_), css::uno::Reference< css::uno::XInterface >()); case '\x09': case '\x0A': case '\x0D': switch (space) { case SPACE_START: case SPACE_BREAK: break; case SPACE_NONE: case SPACE_SPAN: space = SPACE_BREAK; break; } ++pos_; break; case ' ': switch (space) { case SPACE_START: case SPACE_BREAK: break; case SPACE_NONE: space = SPACE_SPAN; break; case SPACE_SPAN: space = SPACE_BREAK; break; } ++pos_; break; case '&': switch (space) { case SPACE_START: break; case SPACE_NONE: case SPACE_SPAN: pad_.add(flowBegin, pos_ - flowBegin); break; case SPACE_BREAK: pad_.add(flowBegin, flowEnd - flowBegin); pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); break; } pos_ = handleReference(pos_, end_); flowBegin = pos_; flowEnd = pos_; space = SPACE_NONE; break; case '<': ++pos_; switch (peek()) { case '!': ++pos_; if (skipComment()) { space = SPACE_BREAK; } else { Span cdata(scanCdataSection()); if (cdata.is()) { // CDATA is not normalized (similar to character // references; it keeps the code simple), but it might 
// arguably be better to normalize it: switch (space) { case SPACE_START: break; case SPACE_NONE: case SPACE_SPAN: pad_.add(flowBegin, pos_ - flowBegin); break; case SPACE_BREAK: pad_.add(flowBegin, flowEnd - flowBegin); pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); break; } normalizeLineEnds(cdata); flowBegin = pos_; flowEnd = pos_; space = SPACE_NONE; } else { skipDocumentTypeDeclaration(); } } break; case '/': ++pos_; pad_.add(flowBegin, flowEnd - flowBegin); *text = pad_.get(); state_ = STATE_END_TAG; return RESULT_TEXT; case '?': ++pos_; skipProcessingInstruction(); space = SPACE_BREAK; break; default: pad_.add(flowBegin, flowEnd - flowBegin); *text = pad_.get(); state_ = STATE_START_TAG; return RESULT_TEXT; } break; default: switch (space) { case SPACE_START: flowBegin = pos_; break; case SPACE_NONE: case SPACE_SPAN: break; case SPACE_BREAK: pad_.add(flowBegin, flowEnd - flowBegin); pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); flowBegin = pos_; break; } flowEnd = ++pos_; space = SPACE_NONE; break; } } } int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { assert(pos <= INT_MAX); return static_cast< int >(pos); } } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */