diff options
-rw-r--r-- | include/svtools/svparser.hxx | 1 | ||||
-rw-r--r-- | svtools/Library_svt.mk | 1 | ||||
-rw-r--r-- | svtools/source/svrtf/svparser.cxx | 99 |
3 files changed, 40 insertions, 61 deletions
diff --git a/include/svtools/svparser.hxx b/include/svtools/svparser.hxx index b0ba450a7285..d54b4a972f15 100644 --- a/include/svtools/svparser.hxx +++ b/include/svtools/svparser.hxx @@ -62,7 +62,6 @@ protected: sal_uInt64 nNextChPos; sal_uInt32 nNextCh; // current character codepoint in UTF32 for the "lex" - bool bUCS2BSrcEnc : 1; // or as big-endian UCS2 bool bSwitchToUCS2 : 1; // switching is allowed bool bRTF_InTextRead : 1; // only for RTF-Parser!!! diff --git a/svtools/Library_svt.mk b/svtools/Library_svt.mk index 59bd29ac240e..31685bbd300c 100644 --- a/svtools/Library_svt.mk +++ b/svtools/Library_svt.mk @@ -61,6 +61,7 @@ $(eval $(call gb_Library_use_libraries,svt,\ $(eval $(call gb_Library_use_externals,svt,\ boost_headers \ + icui18n \ icuuc \ icu_headers \ )) diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx index 3da34404f517..0fec7a97097e 100644 --- a/svtools/source/svrtf/svparser.cxx +++ b/svtools/source/svrtf/svparser.cxx @@ -25,6 +25,7 @@ #include <rtl/tencinfo.h> #include <rtl/character.hxx> #include <sal/log.hxx> +#include <unicode/ucsdet.h> #include <vector> #include <climits> @@ -85,7 +86,6 @@ SvParser<T>::SvParser( SvStream& rIn, sal_uInt8 nStackSize ) , eSrcEnc( RTL_TEXTENCODING_DONTKNOW ) , nNextChPos(0) , nNextCh(0) - , bUCS2BSrcEnc(false) , bSwitchToUCS2(false) , bRTF_InTextRead(false) , nTokenStackSize( nStackSize ) @@ -188,87 +188,66 @@ sal_uInt32 SvParser<T>::GetNextChar() // When reading multiple bytes, we don't have to care about the file // position when we run into the pending state. The file position is // maintained by SaveState/RestoreState. - bool bErr; if( bSwitchToUCS2 && 0 == rInput.Tell() ) { - unsigned char c1; - bool bSeekBack = true; - - rInput.ReadUChar( c1 ); - bErr = !rInput.good(); - if( !bErr ) + rInput.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW); + if (rInput.good()) { - if( 0xff == c1 || 0xfe == c1 ) - { - unsigned char c2; - rInput.ReadUChar( c2 ); - bErr = !rInput.good(); - if( !bErr ) - { - if( 0xfe == c1 && 0xff == c2 ) - { - eSrcEnc = RTL_TEXTENCODING_UCS2; - bUCS2BSrcEnc = true; - bSeekBack = false; - } - else if( 0xff == c1 && 0xfe == c2 ) - { - eSrcEnc = RTL_TEXTENCODING_UCS2; - bUCS2BSrcEnc = false; - bSeekBack = false; - } - } - } - else if( 0xef == c1 || 0xbb == c1 ) // check for UTF-8 BOM + sal_uInt64 nPos = rInput.Tell(); + if (nPos == 2) + eSrcEnc = RTL_TEXTENCODING_UCS2; + else if (nPos == 3) + SetSrcEncoding(RTL_TEXTENCODING_UTF8); + else // Try to detect encoding without BOM { - unsigned char c2; - rInput.ReadUChar( c2 ); - bErr = !rInput.good(); - if( !bErr ) + std::vector<char> buf(65535); // Arbitrarily chosen 64KiB buffer + const size_t nSize = rInput.ReadBytes(buf.data(), buf.size()); + rInput.Seek(0); + if (nSize > 0) { - if( ( 0xef == c1 && 0xbb == c2 ) || ( 0xbb == c1 && 0xef == c2 ) ) + UErrorCode uerr = U_ZERO_ERROR; + UCharsetDetector* ucd = ucsdet_open(&uerr); + ucsdet_setText(ucd, buf.data(), nSize, &uerr); + if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr)) { - unsigned char c3(0); - rInput.ReadUChar( c3 ); - bErr = !rInput.good(); - if( !bErr && ( 0xbf == c3 ) ) + const char* pEncodingName = ucsdet_getName(match, &uerr); + + if (U_SUCCESS(uerr)) { - SetSrcEncoding(RTL_TEXTENCODING_UTF8); - bSeekBack = false; + if (strcmp("UTF-8", pEncodingName) == 0) + { + SetSrcEncoding(RTL_TEXTENCODING_UTF8); + } + else if (strcmp("UTF-16LE", pEncodingName) == 0) + { + eSrcEnc = RTL_TEXTENCODING_UCS2; + rInput.SetEndian(SvStreamEndian::LITTLE); + } + else if (strcmp("UTF-16BE", pEncodingName) == 0) + { + eSrcEnc = RTL_TEXTENCODING_UCS2; + rInput.SetEndian(SvStreamEndian::BIG); + } } } + + ucsdet_close(ucd); } } } - if( bSeekBack ) - rInput.Seek( 0 ); - bSwitchToUCS2 = false; } + bool bErr; nNextChPos = rInput.Tell(); if( RTL_TEXTENCODING_UCS2 == eSrcEnc ) { - unsigned char c1, c2; - - rInput.ReadUChar( c1 ).ReadUChar( c2 ); - if( 2 == rInput.Tell() && rInput.good() && - ( (bUCS2BSrcEnc && 0xfe == c1 && 0xff == c2) || - (!bUCS2BSrcEnc && 0xff == c1 && 0xfe == c2) ) ) - rInput.ReadUChar( c1 ).ReadUChar( c2 ); - + sal_Unicode cUC; + rInput.ReadUtf16(cUC); bErr = !rInput.good(); if( !bErr ) - { - sal_Unicode cUC = USHRT_MAX; - if( bUCS2BSrcEnc ) - cUC = (sal_Unicode(c1) << 8) | c2; - else - cUC = (sal_Unicode(c2) << 8) | c1; - c = cUC; - } } else { |