diff options
author | Mark Hung <marklh9@gmail.com> | 2015-12-27 00:46:49 +0800 |
---|---|---|
committer | Mark Hung <marklh9@gmail.com> | 2016-02-13 08:05:09 +0000 |
commit | 4647e778993250b8c9431e2890750916fb986ecc (patch) | |
tree | 99d285ec6a33aeca2d9df32d30d2aea801066a37 /svtools/source/svrtf | |
parent | 3596613153289dae204b5abdc7446b303021f597 (diff) |
tdf#81129 Support reading non-BMP characters in HTML documents.
1. Allow numeric character references (&#nnnn;) to exceed 0xFFFF in HTMLParser::ScanText().
2. Return a character as sal_uInt32 (UTF-32) instead of sal_Unicode (UTF-16)
from SvParser::GetNextChar().
Conflicts:
sw/qa/extras/htmlexport/htmlexport.cxx
Change-Id: Ida455040970fae800f0f11471b27f53461fb78e4
Reviewed-on: https://gerrit.libreoffice.org/21152
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Mark Hung <marklh9@gmail.com>
Diffstat (limited to 'svtools/source/svrtf')
-rw-r--r-- | svtools/source/svrtf/parrtf.cxx | 6 | ||||
-rw-r--r-- | svtools/source/svrtf/svparser.cxx | 29 |
2 files changed, 23 insertions, 12 deletions
diff --git a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx index f6f75eb73162..bdc73d363970 100644 --- a/svtools/source/svrtf/parrtf.cxx +++ b/svtools/source/svrtf/parrtf.cxx @@ -191,7 +191,7 @@ int SvRTFParser::_GetNextToken() // can be also \{, \}, \'88 for( sal_uInt8 m = 0; m < nUCharOverread; ++m ) { - sal_Unicode cAnsi = nNextCh; + sal_uInt32 cAnsi = nNextCh; while( 0xD == cAnsi ) cAnsi = GetNextChar(); while( 0xA == cAnsi ) @@ -382,7 +382,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak ) case '}': case '{': case '+': // I found in a RTF file - aStrBuffer.append(nNextCh); + aStrBuffer.append(sal_Unicode(nNextCh)); break; case '~': // nonbreaking space aStrBuffer.append(static_cast< sal_Unicode >(0xA0)); @@ -484,7 +484,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak ) { do { // all other characters end up in the text - aStrBuffer.append(nNextCh); + aStrBuffer.appendUtf32(nNextCh); if (sal_Unicode(EOF) == (nNextCh = GetNextChar())) { diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx index b5c377b72ea0..b862e66766ca 100644 --- a/svtools/source/svrtf/svparser.cxx +++ b/svtools/source/svrtf/svparser.cxx @@ -22,6 +22,7 @@ #include <tools/debug.hxx> #include <rtl/textcvt.h> #include <rtl/tencinfo.h> +#include <rtl/character.hxx> #include <vector> @@ -35,7 +36,7 @@ struct SvParser_Impl long nTokenValue; // extra value (RTF) bool bTokenHasValue; // indicates whether nTokenValue is valid int nToken; // actual Token - sal_Unicode nNextCh; // actual character + sal_uInt32 nNextCh; // actual character int nSaveToken; // the token from Continue rtl_TextToUnicodeConverter hConv; @@ -148,9 +149,9 @@ void SvParser::RereadLookahead() nNextCh = GetNextChar(); } -sal_Unicode SvParser::GetNextChar() +sal_uInt32 SvParser::GetNextChar() { - sal_Unicode c = 0U; + sal_uInt32 c = 0U; // When reading multiple bytes, we don't have to care about the file // position when we run into the pending state. 
The file position is @@ -257,7 +258,7 @@ sal_Unicode SvParser::GetNextChar() ) { // no convserion shall take place - c = (sal_Unicode)c1; + c = reinterpret_cast<sal_uChar&>( c1 ); nChars = 1; } else @@ -280,6 +281,7 @@ sal_Unicode SvParser::GetNextChar() // read enough characters. if( pImplData->hContext != reinterpret_cast<rtl_TextToUnicodeContext>(1) ) { + sal_Unicode sCh[2]; while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL) != 0 ) { rInput.ReadChar( c1 ); @@ -289,7 +291,7 @@ sal_Unicode SvParser::GetNextChar() nChars = rtl_convertTextToUnicode( pImplData->hConv, pImplData->hContext, - &c1, 1, &cUC, 1, + &c1, 1, sCh , 2, RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR| RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR| RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR, @@ -299,7 +301,11 @@ sal_Unicode SvParser::GetNextChar() { if( 1 == nChars && 0 == nInfo ) { - c = cUC; + c = sal_uInt32( sCh[0] ); + } + else if( 2 == nChars && 0 == nInfo ) + { + c = rtl::combineSurrogates( sCh[0], sCh[1] ); } else if( 0 != nChars || 0 != nInfo ) { @@ -311,7 +317,7 @@ sal_Unicode SvParser::GetNextChar() "there is a converted character, but an error" ); // There are still errors, but nothing we can // do - c = (sal_Unicode)'?'; + c = (sal_uInt32)'?'; nChars = 1; } } @@ -356,7 +362,7 @@ sal_Unicode SvParser::GetNextChar() // There are still errors, so we use the first // character and restart after that. - c = (sal_Unicode)sBuffer[0]; + c = reinterpret_cast<sal_uChar&>( sBuffer[0] ); rInput.SeekRel( -(nLen-1) ); nChars = 1; } @@ -378,7 +384,7 @@ sal_Unicode SvParser::GetNextChar() "there is no converted character and no error" ); // #73398#: If the character could not be converted, // because a conversion is not available, do no conversion at all. - c = (sal_Unicode)c1; + c = reinterpret_cast<sal_uChar&>( c1 ); nChars = 1; } @@ -387,6 +393,10 @@ sal_Unicode SvParser::GetNextChar() } while( 0 == nChars && !bErr ); } + + if ( ! rtl::isValidCodePoint( c ) ) + c = (sal_uInt32) '?' 
; + if( bErr ) { if( ERRCODE_IO_PENDING == rInput.GetError() ) @@ -405,6 +415,7 @@ sal_Unicode SvParser::GetNextChar() } else IncLinePos(); + return c; } |