/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 . */ #include #include #include #include #include #include #include #include #include #include #include // structure to store the actual data template struct SvParser_Impl { OUString aToken; // parsed token sal_uInt64 nFilePos; // actual position in stream sal_uInt32 nlLineNr; // actual line number sal_uInt32 nlLinePos; // actual column number tools::Long nTokenValue; // extra value (RTF) bool bTokenHasValue; // indicates whether nTokenValue is valid T nToken; // actual Token sal_uInt32 nNextCh; // actual character T nSaveToken; // the token from Continue rtl_TextToUnicodeConverter hConv; rtl_TextToUnicodeContext hContext; SvParser_Impl() : nFilePos(0) , nlLineNr(0) , nlLinePos(0) , nTokenValue(0) , bTokenHasValue(false) , nToken(static_cast(0)) , nNextCh(0) , nSaveToken(static_cast(0)) , hConv( nullptr ) , hContext( reinterpret_cast(1) ) { } }; template SvParser::TokenStackType::TokenStackType() : nTokenValue(0) , bTokenHasValue(false) , nTokenId(static_cast(0)) { } // Constructor template SvParser::SvParser( SvStream& rIn, sal_uInt8 nStackSize ) : rInput( rIn ) , nlLineNr( 1 ) , nlLinePos( 1 ) , nConversionErrors( 0 ) , pImplData( nullptr ) , m_nTokenIndex(0) , nTokenValue( 0 ) , bTokenHasValue( false ) , bFuzzing(comphelper::IsFuzzing()) , eState( SvParserState::NotStarted ) , eSrcEnc( RTL_TEXTENCODING_DONTKNOW ) , nNextChPos(0) , nNextCh(0) , bSwitchToUCS2(false) , bRTF_InTextRead(false) , nTokenStackSize( nStackSize ) , nTokenStackPos( 0 ) { eState = SvParserState::NotStarted; if( nTokenStackSize < 3 ) nTokenStackSize = 3; pTokenStack.reset(new TokenStackType[ nTokenStackSize ]); pTokenStackPos = pTokenStack.get(); } template SvParser::~SvParser() { if( pImplData && pImplData->hConv ) { rtl_destroyTextToUnicodeContext( pImplData->hConv, pImplData->hContext ); rtl_destroyTextToUnicodeConverter( pImplData->hConv ); } pTokenStack.reset(); } template SvParserState SvParser::GetStatus() const { return eState; } template sal_uInt32 SvParser::GetLineNr() const { return nlLineNr; } template sal_uInt32 SvParser::GetLinePos() const { return nlLinePos; } template void SvParser::IncLineNr() { ++nlLineNr; } template sal_uInt32 SvParser::IncLinePos() { return ++nlLinePos; } template void SvParser::SetLineNr( sal_uInt32 nlNum ) { nlLineNr = nlNum; } template void SvParser::SetLinePos( sal_uInt32 nlPos ) { nlLinePos = nlPos; } template bool SvParser::IsParserWorking() const { return SvParserState::Working == eState; } template rtl_TextEncoding SvParser::GetSrcEncoding() const { return eSrcEnc; } template void SvParser::SetSwitchToUCS2( bool bSet ) { bSwitchToUCS2 = bSet; } template bool SvParser::IsSwitchToUCS2() const { return bSwitchToUCS2; } template sal_uInt16 SvParser::GetCharSize() const { return (RTL_TEXTENCODING_UCS2 == eSrcEnc) ? 2 : 1; } template Link SvParser::GetAsynchCallLink() const { return LINK( const_cast(this), SvParser, NewDataRead ); } template void SvParser::ClearTxtConvContext() { if( pImplData && pImplData->hConv ) rtl_resetTextToUnicodeContext( pImplData->hConv, pImplData->hContext ); } template void SvParser::SetSrcEncoding( rtl_TextEncoding eEnc ) { if( eEnc == eSrcEnc ) return; if( pImplData && pImplData->hConv ) { rtl_destroyTextToUnicodeContext( pImplData->hConv, pImplData->hContext ); rtl_destroyTextToUnicodeConverter( pImplData->hConv ); pImplData->hConv = nullptr; pImplData->hContext = reinterpret_cast(1); } if( rtl_isOctetTextEncoding(eEnc) || RTL_TEXTENCODING_UCS2 == eEnc ) { eSrcEnc = eEnc; if( !pImplData ) pImplData.reset(new SvParser_Impl); pImplData->hConv = rtl_createTextToUnicodeConverter( eSrcEnc ); DBG_ASSERT( pImplData->hConv, "SvParser::SetSrcEncoding: no converter for source encoding" ); if( !pImplData->hConv ) eSrcEnc = RTL_TEXTENCODING_DONTKNOW; else pImplData->hContext = rtl_createTextToUnicodeContext( pImplData->hConv ); } else { SAL_WARN( "svtools", "SvParser::SetSrcEncoding: invalid source encoding" ); eSrcEnc = RTL_TEXTENCODING_DONTKNOW; } } template void SvParser::RereadLookahead() { rInput.Seek(nNextChPos); nNextCh = GetNextChar(); } template sal_uInt32 SvParser::GetNextChar() { sal_uInt32 c = 0U; // When reading multiple bytes, we don't have to care about the file // position when we run into the pending state. The file position is // maintained by SaveState/RestoreState. if( bSwitchToUCS2 && 0 == rInput.Tell() ) { rInput.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW); if (rInput.good()) { sal_uInt64 nPos = rInput.Tell(); if (nPos == 2) eSrcEnc = RTL_TEXTENCODING_UCS2; else if (nPos == 3) SetSrcEncoding(RTL_TEXTENCODING_UTF8); else // Try to detect encoding without BOM { std::vector buf(65535); // Arbitrarily chosen 64KiB buffer const size_t nSize = rInput.ReadBytes(buf.data(), buf.size()); rInput.Seek(0); if (nSize > 0) { UErrorCode uerr = U_ZERO_ERROR; UCharsetDetector* ucd = ucsdet_open(&uerr); ucsdet_setText(ucd, buf.data(), nSize, &uerr); if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr)) { const char* pEncodingName = ucsdet_getName(match, &uerr); if (U_SUCCESS(uerr)) { if (strcmp("UTF-8", pEncodingName) == 0) { SetSrcEncoding(RTL_TEXTENCODING_UTF8); } else if (strcmp("UTF-16LE", pEncodingName) == 0) { eSrcEnc = RTL_TEXTENCODING_UCS2; rInput.SetEndian(SvStreamEndian::LITTLE); } else if (strcmp("UTF-16BE", pEncodingName) == 0) { eSrcEnc = RTL_TEXTENCODING_UCS2; rInput.SetEndian(SvStreamEndian::BIG); } } } ucsdet_close(ucd); } } } bSwitchToUCS2 = false; } bool bErr; nNextChPos = rInput.Tell(); if( RTL_TEXTENCODING_UCS2 == eSrcEnc ) { sal_Unicode cUC; rInput.ReadUtf16(cUC); bErr = !rInput.good(); if( !bErr ) { c = cUC; if (rtl::isHighSurrogate(cUC)) { const sal_uInt64 nPos = rInput.Tell(); rInput.ReadUtf16(cUC); if (rtl::isLowSurrogate(cUC)) // can only be true when ReadUtf16 succeeded c = rtl::combineSurrogates(c, cUC); else rInput.Seek(nPos); // process lone high surrogate } } } else { sal_Size nChars = 0; do { char c1; // signed, that's the text converter expects rInput.ReadChar( c1 ); bErr = !rInput.good(); if( !bErr ) { if ( RTL_TEXTENCODING_DONTKNOW == eSrcEnc || RTL_TEXTENCODING_SYMBOL == eSrcEnc ) { // no conversion shall take place c = reinterpret_cast( c1 ); nChars = 1; } else { assert(pImplData && pImplData->hConv && "no text converter!"); sal_Unicode cUC; sal_uInt32 nInfo = 0; sal_Size nCvtBytes; nChars = rtl_convertTextToUnicode( pImplData->hConv, pImplData->hContext, &c1, 1, &cUC, 1, RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR| RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR| RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR, &nInfo, &nCvtBytes); if( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 ) { // The conversion wasn't successful because we haven't // read enough characters. if( pImplData->hContext != reinterpret_cast(1) ) { sal_Unicode sCh[2]; while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 ) { rInput.ReadChar( c1 ); bErr = !rInput.good(); if( bErr ) break; nChars = rtl_convertTextToUnicode( pImplData->hConv, pImplData->hContext, &c1, 1, sCh , 2, RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR| RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR| RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR, &nInfo, &nCvtBytes); } if( !bErr ) { if( 1 == nChars && 0 == nInfo ) { c = sal_uInt32( sCh[0] ); } else if( 2 == nChars && 0 == nInfo ) { c = rtl::combineSurrogates( sCh[0], sCh[1] ); } else if( 0 != nChars || 0 != nInfo ) { DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0, "source buffer is too small" ); DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0, "there is a conversion error" ); DBG_ASSERT( 0 == nChars, "there is a converted character, but an error" ); // There are still errors, but nothing we can // do c = '?'; nChars = 1; ++nConversionErrors; } } } else { char sBuffer[10]; sBuffer[0] = c1; sal_uInt16 nLen = 1; while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 && nLen < 10 ) { rInput.ReadChar( c1 ); bErr = !rInput.good(); if( bErr ) break; sBuffer[nLen++] = c1; nChars = rtl_convertTextToUnicode( pImplData->hConv, nullptr, sBuffer, nLen, &cUC, 1, RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR| RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR| RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR, &nInfo, &nCvtBytes); } if( !bErr ) { if( 1 == nChars && 0 == nInfo ) { DBG_ASSERT( nCvtBytes == nLen, "no all bytes have been converted!" ); c = cUC; } else { DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0, "source buffer is too small" ); DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0, "there is a conversion error" ); DBG_ASSERT( 0 == nChars, "there is a converted character, but an error" ); // There are still errors, so we use the first // character and restart after that. c = reinterpret_cast( sBuffer[0] ); rInput.SeekRel( -(nLen-1) ); nChars = 1; ++nConversionErrors; } } } } else if( 1 == nChars && 0 == nInfo ) { // The conversion was successful DBG_ASSERT( nCvtBytes == 1, "no all bytes have been converted!" ); c = cUC; } else if( 0 != nChars || 0 != nInfo ) { DBG_ASSERT( 0 == nChars, "there is a converted character, but an error" ); DBG_ASSERT( 0 != nInfo, "there is no converted character and no error" ); // #73398#: If the character could not be converted, // because a conversion is not available, do no conversion at all. c = reinterpret_cast( c1 ); nChars = 1; ++nConversionErrors; } } } } while( 0 == nChars && !bErr ); } if ( ! rtl::isUnicodeScalarValue( c ) ) c = '?' ; if (bFuzzing && nConversionErrors > 128) { SAL_WARN("svtools", "SvParser::GetNextChar too many conversion errors while fuzzing, abandoning for performance"); bErr = true; } if( bErr ) { if( ERRCODE_IO_PENDING == rInput.GetError() ) { eState = SvParserState::Pending; return c; } else return sal_Unicode(EOF); } if( c == '\n' ) { IncLineNr(); SetLinePos( 1 ); } else IncLinePos(); return c; } template T SvParser::GetNextToken() { T nRet = static_cast(0); if( !nTokenStackPos ) { aToken.setLength( 0 ); // empty token buffer nTokenValue = -1; // marker for no value read bTokenHasValue = false; nRet = GetNextToken_(); if( SvParserState::Pending == eState ) return nRet; } ++pTokenStackPos; if( pTokenStackPos == pTokenStack.get() + nTokenStackSize ) pTokenStackPos = pTokenStack.get(); // pop from stack ?? if( nTokenStackPos ) { --nTokenStackPos; nTokenValue = pTokenStackPos->nTokenValue; bTokenHasValue = pTokenStackPos->bTokenHasValue; aToken = pTokenStackPos->sToken; nRet = pTokenStackPos->nTokenId; ++m_nTokenIndex; } // no, now push actual value on stack else if( SvParserState::Working == eState ) { pTokenStackPos->sToken = aToken; pTokenStackPos->nTokenValue = nTokenValue; pTokenStackPos->bTokenHasValue = bTokenHasValue; pTokenStackPos->nTokenId = nRet; ++m_nTokenIndex; } else if( SvParserState::Accepted != eState && SvParserState::Pending != eState ) eState = SvParserState::Error; // an error occurred return nRet; } template T SvParser::SkipToken( short nCnt ) // "skip" n Tokens backward { pTokenStackPos = GetStackPtr( nCnt ); short nTmp = nTokenStackPos - nCnt; if( nTmp < 0 ) nTmp = 0; else if( nTmp > nTokenStackSize ) nTmp = nTokenStackSize; nTokenStackPos = sal_uInt8(nTmp); m_nTokenIndex -= nTmp; // restore values aToken = pTokenStackPos->sToken; nTokenValue = pTokenStackPos->nTokenValue; bTokenHasValue = pTokenStackPos->bTokenHasValue; return pTokenStackPos->nTokenId; } template typename SvParser::TokenStackType* SvParser::GetStackPtr( short nCnt ) { sal_uInt8 nCurrentPos = sal_uInt8(pTokenStackPos - pTokenStack.get()); if( nCnt > 0 ) { if( nCnt >= nTokenStackSize ) nCnt = (nTokenStackSize-1); if( nCurrentPos + nCnt < nTokenStackSize ) nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt); else nCurrentPos = sal::static_int_cast< sal_uInt8 >( nCurrentPos + (nCnt - nTokenStackSize)); } else if( nCnt < 0 ) { if( -nCnt >= nTokenStackSize ) nCnt = -nTokenStackSize+1; if( -nCnt <= nCurrentPos ) nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt); else nCurrentPos = sal::static_int_cast< sal_uInt8 >( nCurrentPos + (nCnt + nTokenStackSize)); } return pTokenStack.get() + nCurrentPos; } // to read asynchronous from SvStream template T SvParser::GetSaveToken() const { return pImplData ? pImplData->nSaveToken : static_cast(0); } template void SvParser::SaveState( T nToken ) { // save actual status if( !pImplData ) { pImplData.reset(new SvParser_Impl); pImplData->nSaveToken = static_cast(0); } pImplData->nFilePos = rInput.Tell(); pImplData->nToken = nToken; pImplData->aToken = aToken; pImplData->nlLineNr = nlLineNr; pImplData->nlLinePos = nlLinePos; pImplData->nTokenValue= nTokenValue; pImplData->bTokenHasValue = bTokenHasValue; pImplData->nNextCh = nNextCh; } template void SvParser::RestoreState() { // restore old status if( !pImplData ) return; if( ERRCODE_IO_PENDING == rInput.GetError() ) rInput.ResetError(); aToken = pImplData->aToken; nlLineNr = pImplData->nlLineNr; nlLinePos = pImplData->nlLinePos; nTokenValue= pImplData->nTokenValue; bTokenHasValue=pImplData->bTokenHasValue; nNextCh = pImplData->nNextCh; pImplData->nSaveToken = pImplData->nToken; rInput.Seek( pImplData->nFilePos ); } template void SvParser::Continue( T ) { } // expanded out version of // IMPL_LINK_NOARG( SvParser, NewDataRead, LinkParamNone*, void ) // since it can't cope with template methods template void SvParser::LinkStubNewDataRead(void * instance, LinkParamNone* data) { return static_cast *>(instance)->NewDataRead(data); } template void SvParser::NewDataRead(SAL_UNUSED_PARAMETER LinkParamNone*) { switch( eState ) { case SvParserState::Pending: eState = SvParserState::Working; RestoreState(); Continue( pImplData->nToken ); if( ERRCODE_IO_PENDING == rInput.GetError() ) rInput.ResetError(); if( SvParserState::Pending != eState ) ReleaseRef(); // ready otherwise! break; case SvParserState::NotStarted: case SvParserState::Working: break; default: ReleaseRef(); // ready otherwise! break; } } template class SVT_DLLPUBLIC SvParser; template class SVT_DLLPUBLIC SvParser; /*======================================================================== * * SvKeyValueIterator. * *======================================================================*/ typedef std::vector SvKeyValueList_Impl; struct SvKeyValueIterator::Impl { SvKeyValueList_Impl maList; sal_uInt16 mnPos; Impl() : mnPos(0) {} }; SvKeyValueIterator::SvKeyValueIterator() : mpImpl(new Impl) {} SvKeyValueIterator::~SvKeyValueIterator() = default; bool SvKeyValueIterator::GetFirst (SvKeyValue &rKeyVal) { mpImpl->mnPos = mpImpl->maList.size(); return GetNext (rKeyVal); } bool SvKeyValueIterator::GetNext (SvKeyValue &rKeyVal) { if (mpImpl->mnPos > 0) { rKeyVal = mpImpl->maList[--mpImpl->mnPos]; return true; } else { // Nothing to do. return false; } } void SvKeyValueIterator::Append (const SvKeyValue &rKeyVal) { mpImpl->maList.push_back(rKeyVal); } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */