diff options
author | Michael Brauer <mib@openoffice.org> | 2001-10-15 07:49:57 +0000 |
---|---|---|
committer | Michael Brauer <mib@openoffice.org> | 2001-10-15 07:49:57 +0000 |
commit | 2836dbdcd0ac9f9d065b8c4084294590fed9dccf (patch) | |
tree | c6369458752b51194d1237691db1bff1393d826d /svtools/source/svrtf | |
parent | 2a11fe558866221f77b38ac52964caa30bc06795 (diff) |
#86025#: use of text converter for conversion to unicode
Diffstat (limited to 'svtools/source/svrtf')
-rw-r--r-- | svtools/source/svrtf/svparser.cxx | 238 |
1 files changed, 146 insertions, 92 deletions
diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx index e79f071697d3..6a7e1476207c 100644 --- a/svtools/source/svrtf/svparser.cxx +++ b/svtools/source/svrtf/svparser.cxx @@ -2,9 +2,9 @@ * * $RCSfile: svparser.cxx,v $ * - * $Revision: 1.2 $ + * $Revision: 1.3 $ * - * last change: $Author: mib $ $Date: 2000-12-13 14:12:56 $ + * last change: $Author: mib $ $Date: 2001-10-15 08:49:57 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -70,6 +70,9 @@ #endif #define _SVSTDARR_USHORTS #include <svstdarr.hxx> +#ifndef _RTL_TEXTCVT_H +#include <rtl/textcvt.h> +#endif #define SVPAR_CSM_ @@ -115,6 +118,8 @@ struct SvParser_Impl int nSaveToken; // das Token vom Continue + rtl_TextToUnicodeConverter hConv; + #ifdef ASYNCHRON_TEST // HACK _SvLockBytes_Impl* pLB; @@ -123,6 +128,12 @@ _SvLockBytes_Impl* pLB; #ifndef PRODUCT SvFileStream aOut; #endif + + SvParser_Impl() : + nSaveToken(0), hConv( 0 ) + { + } + }; @@ -183,6 +194,11 @@ delete pImplData->pLB; pImplData->aOut.Close(); #endif + if( pImplData && pImplData->hConv ) + { + rtl_destroyTextToUnicodeConverter( pImplData->hConv ); + } + delete pImplData; #ifdef MPW @@ -197,17 +213,34 @@ delete pImplData->pLB; void SvParser::SetSrcEncoding( rtl_TextEncoding eEnc ) { - if( ( eEnc < RTL_TEXTENCODING_STD_COUNT && - eEnc != RTL_TEXTENCODING_UTF7 ) || - RTL_TEXTENCODING_UCS2 == eEnc ) - { - eSrcEnc = eEnc; - } - else + + if( eEnc != eSrcEnc ) { - DBG_ASSERT( !this, - "SvParser::SetSrcEncoding: invalid source encoding" ); - eSrcEnc = RTL_TEXTENCODING_DONTKNOW; + if( pImplData && pImplData->hConv ) + { + rtl_destroyTextToUnicodeConverter( pImplData->hConv ); + pImplData->hConv = 0; + } + + if( eEnc < RTL_TEXTENCODING_STD_COUNT || + RTL_TEXTENCODING_UCS2 == eEnc ) + { + eSrcEnc = eEnc; + if( !pImplData ) + pImplData = new SvParser_Impl; + pImplData->hConv = rtl_createTextToUnicodeConverter( eSrcEnc ); + DBG_ASSERT( pImplData->hConv, + "SvParser::SetSrcEncoding: no converter for source encoding" ); + if( !pImplData->hConv ) + eSrcEnc = RTL_TEXTENCODING_DONTKNOW; + + } + else + { + DBG_ASSERT( !this, + "SvParser::SetSrcEncoding: invalid source encoding" ); + eSrcEnc = RTL_TEXTENCODING_DONTKNOW; + } } } @@ -250,6 +283,9 @@ sal_Unicode SvParser::GetNextChar() //HACK #else + // When reading muliple bytes, we don't have to care about the file + // position when we run inti the pending state. The file position is + // maintained by SaveState/RestoreState. BOOL bErr; if( bSwitchToUCS2 && 0 == rInput.Tell() ) { @@ -285,105 +321,123 @@ sal_Unicode SvParser::GetNextChar() bSwitchToUCS2 = FALSE; } - if( RTL_TEXTENCODING_UCS2 == eSrcEnc || RTL_TEXTENCODING_UTF8 == eSrcEnc ) + if( RTL_TEXTENCODING_UCS2 == eSrcEnc ) { sal_Unicode cUC = USHRT_MAX; - sal_uChar c1, c2, c3; + sal_uChar c1, c2; + + rInput >> c1 >> c2; + if( 2 == rInput.Tell() && + !(rInput.IsEof() || rInput.GetError()) && + ( (bUCS2BSrcEnc && 0xfe == c1 && 0xff == c2) || + (!bUCS2BSrcEnc && 0xff == c1 && 0xfe == c2) ) ) + rInput >> c1 >> c2; + + if( !(bErr = (rInput.IsEof() || rInput.GetError())) ) + { + if( bUCS2BSrcEnc ) + cUC = (sal_Unicode(c1) << 8) | c2; + else + cUC = (sal_Unicode(c2) << 8) | c1; + } - if( RTL_TEXTENCODING_UTF8 == eSrcEnc ) + if( !bErr ) + { + c = cUC; + } + } + else + { + sal_Char c1; // signed, that's the text converter expects + rInput >> c1; + if( !(bErr = (rInput.IsEof() || rInput.GetError())) ) { - rInput >> c1; - if( !(bErr = (rInput.IsEof() || rInput.GetError())) ) + if( RTL_TEXTENCODING_DONTKNOW == eSrcEnc ) { - switch( c1 >> 4 ) + // no convserion shall take place + c = (sal_Unicode)c1; + } + else + { + DBG_ASSERT( pImplData && pImplData->hConv, + "no text converter!" ) + + sal_Unicode cUC; + sal_uInt32 nInfo = 0; + sal_Size nCvtBytes; + sal_Size nChars = rtl_convertTextToUnicode( + pImplData->hConv, 0, &c1, 1, &cUC, 1, + RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR| + RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR| + RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR| + RTL_TEXTTOUNICODE_FLAGS_FLUSH, + &nInfo, &nCvtBytes); + if( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL) != 0 ) { - case 0: case 1: case 2: case 3: - case 4: case 5: case 6: case 7: - // 0xxxxxxx - cUC = c1; - break; - - case 12: case 13: - // 110x xxxx 10xx xxxx - rInput >> c2; - if( !(bErr = (rInput.IsEof() || rInput.GetError())) ) + // The conversion wasn't successfull because we haven't + // read enough characters. + sal_Char sBuffer[10]; + sBuffer[0] = c1; + sal_uInt16 nLen = 1; + while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL) != 0 && + nLen < 10 ) { - if( (c2 & 0xC0) == 0x80 ) - { - cUC = (sal_Unicode(c1 & 0x1F) << 6) | - (c2 & 0x3F); - } - else - { - // Kein UTF-8? Dann Zeichen direkt einfuegen - cUC = c1; - rInput.SeekRel( -1 ); - } + rInput >> c1; + if( (bErr = (rInput.IsEof() || rInput.GetError())) ) + break; + + sBuffer[nLen++] = c1; + nChars = rtl_convertTextToUnicode( + pImplData->hConv, 0, sBuffer, nLen, &cUC, 1, + RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR| + RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR| + RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR| + RTL_TEXTTOUNICODE_FLAGS_FLUSH, + &nInfo, &nCvtBytes); } - break; - - case 14: - // 1110 xxxx 10xx xxxx 10xx xxxx - rInput >> c2 >> c3; - if( !(bErr = (rInput.IsEof() || rInput.GetError())) ) + if( !bErr ) { - if( (c2 & 0xC0) == 0x80 && (c3 & 0xC0) == 0x80 ) + if( 1 == nChars && 0 == nInfo ) { - cUC = (sal_Unicode(c1 & 0x0F) << 12) | - (sal_Unicode(c2 & 0x3F) << 6) | - (c3 & 0x3F); + DBG_ASSERT( nCvtBytes == nLen, + "no all bytes have been converted!" ); + c = cUC; } else { - // Kein UTF-8? Dann Zeichen direkt einfuegen - cUC = c1; - rInput.SeekRel( -2 ); + DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL) == 0, + "source buffer is to small" ); + DBG_ASSERT( 0 == nChars, + "there is a converted character, but an error" ); + DBG_ASSERT( 0 != nInfo, + "there is no converted character and no error" ); + // There are still errors, so we use the first + // character and restart after that. + c = (sal_Unicode)sBuffer[0]; + rInput.SeekRel( -(nLen-1) ); } } - break; - - default: - cUC = c1; - break; } - } - } - else - { - rInput >> c1 >> c2; - if( 2 == rInput.Tell() && ! - !(rInput.IsEof() || rInput.GetError()) && - ( (bUCS2BSrcEnc && 0xfe == c1 && 0xff == c2) || - (!bUCS2BSrcEnc && 0xff == c1 && 0xfe == c2) ) ) - rInput >> c1 >> c2; - - if( !(bErr = (rInput.IsEof() || rInput.GetError())) ) - { - if( bUCS2BSrcEnc ) - cUC = (sal_Unicode(c1) << 8) | c2; + else if( 1 == nChars && 0 == nInfo ) + { + // The conversion was successfull + DBG_ASSERT( nCvtBytes == 1, + "no all bytes have been converted!" ); + c = cUC; + } else - cUC = (sal_Unicode(c2) << 8) | c1; - } - } + { + DBG_ASSERT( 0 == nChars, + "there is a converted character, but an error" ); + DBG_ASSERT( 0 != nInfo, + "there is no converted character and no error" ); + // #73398#: If the character could not be converted, + // because a conversion is not available, do no conversion at all. + c = (sal_Unicode)c1; - if( !bErr ) - { - c = cUC; - } - } - else - { - sal_uChar c1; - rInput >> c1; - if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc) - { - // #73398#: If the character could not be converted, - // because a conversion is not available, do no conversion at all. - sal_Unicode cUC = ByteString::ConvertToUnicode( c1, eSrcEnc ); - c = 0U != cUC ? cUC : (sal_Unicode)c; + } + } } - - bErr = rInput.IsEof() || rInput.GetError(); } if( bErr ) { |