diff options
Diffstat (limited to 'sax/source/expatwrap/xml2utf.cxx')
-rw-r--r-- | sax/source/expatwrap/xml2utf.cxx | 570 |
1 files changed, 570 insertions, 0 deletions
diff --git a/sax/source/expatwrap/xml2utf.cxx b/sax/source/expatwrap/xml2utf.cxx new file mode 100644 index 000000000000..bbd72b2a0d8b --- /dev/null +++ b/sax/source/expatwrap/xml2utf.cxx @@ -0,0 +1,570 @@ +/************************************************************************* + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * Copyright 2000, 2010 Oracle and/or its affiliates. + * + * OpenOffice.org - a multi-platform office productivity suite + * + * This file is part of OpenOffice.org. + * + * OpenOffice.org is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3 + * only, as published by the Free Software Foundation. + * + * OpenOffice.org is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License version 3 for more details + * (a copy is included in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU Lesser General Public License + * version 3 along with OpenOffice.org. If not, see + * <http://www.openoffice.org/license.html> + * for a copy of the LGPLv3 License. + * + ************************************************************************/ +#include <string.h> + +#include <sal/types.h> + +#include <rtl/textenc.h> +#include <rtl/tencinfo.h> + + +#include <com/sun/star/io/XInputStream.hpp> + +using namespace rtl; +using namespace ::com::sun::star::uno; +using namespace ::com::sun::star::io; + +#include "xml2utf.hxx" + +namespace sax_expatwrap { + +sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead ) + throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException ) +{ + + Sequence<sal_Int8> seqIn; + + if( ! m_in.is() ) { + throw NotConnectedException(); + } + if( ! m_bStarted ) { + nMaxToRead = Max( 512 , nMaxToRead ); // it should be possible to find the encoding attribute + // within the first 512 bytes == 128 chars in UCS-4 + } + + sal_Int32 nRead; + Sequence< sal_Int8 > seqStart; + while( sal_True ) + { + nRead = m_in->readSomeBytes( seq , nMaxToRead ); + + if( nRead + seqStart.getLength()) + { + // if nRead is 0, the file is already eof. + if( ! m_bStarted && nRead ) + { + // ensure that enough data is available to parse encoding + if( seqStart.getLength() ) + { + // prefix with what we had so far. + sal_Int32 nLength = seq.getLength(); + seq.realloc( seqStart.getLength() + nLength ); + + memmove (seq.getArray() + seqStart.getLength(), + seq.getConstArray(), + nLength); + memcpy (seq.getArray(), + seqStart.getConstArray(), + seqStart.getLength()); + } + + // autodetection with the first bytes + if( ! isEncodingRecognizable( seq ) ) + { + // remember what we have so far. + seqStart = seq; + + // read more ! + continue; + } + if( scanForEncoding( seq ) || m_sEncoding.getLength() ) { + // initialize decoding + initializeDecoding(); + } + nRead = seq.getLength(); + seqStart = Sequence < sal_Int8 > (); + } + + // do the encoding + if( m_pText2Unicode && m_pUnicode2Text && + m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) { + + Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq ); + seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() ); + } + + if( ! m_bStarted ) + { + // it must now be ensured, that no encoding attribute exist anymore + // ( otherwise the expat-Parser will crash ) + // This must be done after decoding ! + // ( e.g. Files decoded in ucs-4 cannot be read properly ) + m_bStarted = sal_True; + removeEncoding( seq ); + } + nRead = seq.getLength(); + } + + break; + } + return nRead; +} + + +XMLFile2UTFConverter::~XMLFile2UTFConverter() +{ + if( m_pText2Unicode ) + delete m_pText2Unicode; + if( m_pUnicode2Text ) + delete m_pUnicode2Text; +} + + +void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq ) +{ + const sal_Int8 *pSource = seq.getArray(); + if( ! strncmp( (const char * ) pSource , "<?xml" , 4) ) + { + + // scan for encoding + OString str( (sal_Char * ) pSource , seq.getLength() ); + + // cut sequence to first line break + // find first line break; + int nMax = str.indexOf( 10 ); + if( nMax >= 0 ) + { + str = str.copy( 0 , nMax ); + } + + int nFound = str.indexOf( " encoding" ); + if( nFound >= 0 ) { + int nStop; + int nStart = str.indexOf( "\"" , nFound ); + if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) + { + nStart = str.indexOf( "'" , nFound ); + nStop = str.indexOf( "'" , nStart +1 ); + } + else + { + nStop = str.indexOf( "\"" , nStart +1); + } + + if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) + { + // remove encoding tag from file + memmove( &( seq.getArray()[nFound] ) , + &( seq.getArray()[nStop+1]) , + seq.getLength() - nStop -1); + seq.realloc( seq.getLength() - ( nStop+1 - nFound ) ); +// str = String( (char * ) seq.getArray() , seq.getLen() ); + } + } + } +} + +// Checks, if enough data has been accumulated to recognize the encoding +sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq) +{ + const sal_Int8 *pSource = seq.getConstArray(); + sal_Bool bCheckIfFirstClosingBracketExsists = sal_False; + + if( seq.getLength() < 8 ) { + // no recognition possible, when less than 8 bytes are available + return sal_False; + } + + if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) { + // scan if the <?xml tag finishes within this buffer + bCheckIfFirstClosingBracketExsists = sal_True; + } + else if( ('<' == pSource[0] || '<' == pSource[2] ) && + ( ('?' == pSource[4] || '?' == pSource[6] ) ) ) + { + // check for utf-16 + bCheckIfFirstClosingBracketExsists = sal_True; + } + else if( ( '<' == pSource[1] || '<' == pSource[3] ) && + ( '?' == pSource[5] || '?' == pSource[7] ) ) + { + // check for + bCheckIfFirstClosingBracketExsists = sal_True; + } + + if( bCheckIfFirstClosingBracketExsists ) + { + for( sal_Int32 i = 0; i < seq.getLength() ; i ++ ) + { + // whole <?xml tag is valid + if( '>' == pSource[ i ] ) + { + return sal_True; + } + } + return sal_False; + } + + // No <? tag in front, no need for a bigger buffer + return sal_True; +} + +sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq ) +{ + const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() ); + sal_Bool bReturn = sal_True; + + if( seq.getLength() < 4 ) { + // no recognition possible, when less than 4 bytes are available + return sal_False; + } + + // first level : detect possible file formats + if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) { + + // scan for encoding + OString str( (const sal_Char *) pSource , seq.getLength() ); + + // cut sequence to first line break + //find first line break; + int nMax = str.indexOf( 10 ); + if( nMax >= 0 ) + { + str = str.copy( 0 , nMax ); + } + + int nFound = str.indexOf( " encoding" ); + if( nFound < str.getLength() ) { + int nStop; + int nStart = str.indexOf( "\"" , nFound ); + if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) + { + nStart = str.indexOf( "'" , nFound ); + nStop = str.indexOf( "'" , nStart +1 ); + } + else + { + nStop = str.indexOf( "\"" , nStart +1); + } + if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) + { + // encoding found finally + m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 ); + } + } + } + else if( 0xFE == pSource[0] && + 0xFF == pSource[1] ) { + // UTF-16 big endian + // conversion is done so that encoding information can be easily extracted + m_sEncoding = "utf-16"; + } + else if( 0xFF == pSource[0] && + 0xFE == pSource[1] ) { + // UTF-16 little endian + // conversion is done so that encoding information can be easily extracted + m_sEncoding = "utf-16"; + } + else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) { + // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.) + // The byte order mark is simply added + + // simply add the byte order mark ! + seq.realloc( seq.getLength() + 2 ); + memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); + ((sal_uInt8*)seq.getArray())[0] = 0xFE; + ((sal_uInt8*)seq.getArray())[1] = 0xFF; + + m_sEncoding = "utf-16"; + } + else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) { + // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.) + // The byte order mark is simply added + + seq.realloc( seq.getLength() + 2 ); + memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); + ((sal_uInt8*)seq.getArray())[0] = 0xFF; + ((sal_uInt8*)seq.getArray())[1] = 0xFE; + + m_sEncoding = "utf-16"; + } + else if( 0xEF == pSource[0] && + 0xBB == pSource[1] && + 0xBF == pSource[2] ) + { + // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order + // The BOM is removed. + memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 ); + seq.realloc( seq.getLength() - 3 ); + m_sEncoding = "utf-8"; + } + else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) { + // UCS-4 big endian + m_sEncoding = "ucs-4"; + } + else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) { + // UCS-4 little endian + m_sEncoding = "ucs-4"; + } + else if( 0x4c == pSource[0] && 0x6f == pSource[1] && + 0xa7 == static_cast<unsigned char> (pSource[2]) && + 0x94 == static_cast<unsigned char> (pSource[3]) ) { + // EBCDIC + bReturn = sal_False; // must be extended + } + else { + // other + // UTF8 is directly recognized by the parser. + bReturn = sal_False; + } + + return bReturn; +} + +void XMLFile2UTFConverter::initializeDecoding() +{ + + if( m_sEncoding.getLength() ) + { + rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() ); + if( encoding != RTL_TEXTENCODING_UTF8 ) + { + m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding ); + m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 ); + } + } +} + + +//---------------------------------------------- +// +// Text2UnicodeConverter +// +//---------------------------------------------- +Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding ) +{ + rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() ); + if( RTL_TEXTENCODING_DONTKNOW == encoding ) + { + m_bCanContinue = sal_False; + m_bInitialized = sal_False; + } + else + { + init( encoding ); + } +} + +Text2UnicodeConverter::~Text2UnicodeConverter() +{ + if( m_bInitialized ) + { + rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode ); + rtl_destroyUnicodeToTextConverter( m_convText2Unicode ); + } +} + +void Text2UnicodeConverter::init( rtl_TextEncoding encoding ) +{ + m_bCanContinue = sal_True; + m_bInitialized = sal_True; + + m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding); + m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode ); + m_rtlEncoding = encoding; +} + + +Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText ) +{ + sal_uInt32 uiInfo; + sal_Size nSrcCvtBytes = 0; + sal_Size nTargetCount = 0; + sal_Size nSourceCount = 0; + + // the whole source size + sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength(); + Sequence<sal_Unicode> seqUnicode ( nSourceSize ); + + const sal_Int8 *pbSource = seqText.getConstArray(); + sal_Int8 *pbTempMem = 0; + + if( m_seqSource.getLength() ) { + // put old rest and new byte sequence into one array + pbTempMem = new sal_Int8[ nSourceSize ]; + memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() ); + memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() ); + pbSource = pbTempMem; + + // set to zero again + m_seqSource = Sequence< sal_Int8 >(); + } + + while( sal_True ) { + + /* All invalid characters are transformed to the unicode undefined char */ + nTargetCount += rtl_convertTextToUnicode( + m_convText2Unicode, + m_contextText2Unicode, + ( const sal_Char * ) &( pbSource[nSourceCount] ), + nSourceSize - nSourceCount , + &( seqUnicode.getArray()[ nTargetCount ] ), + seqUnicode.getLength() - nTargetCount, + RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT | + RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT | + RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT, + &uiInfo, + &nSrcCvtBytes ); + nSourceCount += nSrcCvtBytes; + + if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) { + // save necessary bytes for next conversion + seqUnicode.realloc( seqUnicode.getLength() * 2 ); + continue; + } + break; + } + if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) { + m_seqSource.realloc( nSourceSize - nSourceCount ); + memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount ); + } + + + if( pbTempMem ) { + delete [] pbTempMem; + } + + // set to correct unicode size + seqUnicode.realloc( nTargetCount ); + + return seqUnicode; +} + + + +//---------------------------------------------- +// +// Unicode2TextConverter +// +//---------------------------------------------- +Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding ) +{ + init( encoding ); +} + + +Unicode2TextConverter::~Unicode2TextConverter() +{ + if( m_bInitialized ) { + rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text ); + rtl_destroyUnicodeToTextConverter( m_convUnicode2Text ); + } +} + + +Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize) +{ + sal_Unicode *puTempMem = 0; + + if( m_seqSource.getLength() ) { + // For surrogates ! + // put old rest and new byte sequence into one array + // In general when surrogates are used, they should be rarely + // cut off between two convert()-calls. So this code is used + // rarely and the extra copy is acceptable. + puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()]; + memcpy( puTempMem , + m_seqSource.getConstArray() , + m_seqSource.getLength() * sizeof( sal_Unicode ) ); + memcpy( + &(puTempMem[ m_seqSource.getLength() ]) , + puSource , + nSourceSize*sizeof( sal_Unicode ) ); + puSource = puTempMem; + nSourceSize += m_seqSource.getLength(); + + m_seqSource = Sequence< sal_Unicode > (); + } + + + sal_Size nTargetCount = 0; + sal_Size nSourceCount = 0; + + sal_uInt32 uiInfo; + sal_Size nSrcCvtChars; + + // take nSourceSize * 3 as preference + // this is an upper boundary for converting to utf8, + // which most often used as the target. + sal_Int32 nSeqSize = nSourceSize * 3; + + Sequence<sal_Int8> seqText( nSeqSize ); + sal_Char *pTarget = (sal_Char *) seqText.getArray(); + while( sal_True ) { + + nTargetCount += rtl_convertUnicodeToText( + m_convUnicode2Text, + m_contextUnicode2Text, + &( puSource[nSourceCount] ), + nSourceSize - nSourceCount , + &( pTarget[nTargetCount] ), + nSeqSize - nTargetCount, + RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT | + RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT , + &uiInfo, + &nSrcCvtChars); + nSourceCount += nSrcCvtChars; + + if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) { + nSeqSize = nSeqSize *2; + seqText.realloc( nSeqSize ); // double array size + pTarget = ( sal_Char * ) seqText.getArray(); + continue; + } + break; + } + + // for surrogates + if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) { + m_seqSource.realloc( nSourceSize - nSourceCount ); + memcpy( m_seqSource.getArray() , + &(puSource[nSourceCount]), + (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) ); + } + + if( puTempMem ) { + delete [] puTempMem; + } + + // reduce the size of the buffer (fast, no copy necessary) + seqText.realloc( nTargetCount ); + + return seqText; +} + +void Unicode2TextConverter::init( rtl_TextEncoding encoding ) +{ + m_bCanContinue = sal_True; + m_bInitialized = sal_True; + + m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding ); + m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text ); + m_rtlEncoding = encoding; +}; + + +} |