summary | refs | log | tree | commit | diff
path: root/svtools/source/svrtf
diff options
context:
space:
mode:
author: Mark Hung <marklh9@gmail.com> 2015-12-27 00:46:49 +0800
committer: Mark Hung <marklh9@gmail.com> 2016-02-13 08:05:09 +0000
commit: 4647e778993250b8c9431e2890750916fb986ecc (patch)
tree: 99d285ec6a33aeca2d9df32d30d2aea801066a37 /svtools/source/svrtf
parent: 3596613153289dae204b5abdc7446b303021f597 (diff)
tdf#81129 Support reading non-BMP characters in HTML documents.
1. Allow character entity ( &#nnnn; ) to exceed 0xffff in HTMLParser::ScanText()
2. Return a character as sal_uInt32 ( utf32 ) instead of sal_Unicode ( utf16 ) from SvParser::GetNextChar().

Conflicts: sw/qa/extras/htmlexport/htmlexport.cxx

Change-Id: Ida455040970fae800f0f11471b27f53461fb78e4
Reviewed-on: https://gerrit.libreoffice.org/21152
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Mark Hung <marklh9@gmail.com>
Diffstat (limited to 'svtools/source/svrtf')
-rw-r--r-- svtools/source/svrtf/parrtf.cxx 6
-rw-r--r-- svtools/source/svrtf/svparser.cxx 29
2 files changed, 23 insertions, 12 deletions
diff --git a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx
index f6f75eb73162..bdc73d363970 100644
--- a/svtools/source/svrtf/parrtf.cxx
+++ b/svtools/source/svrtf/parrtf.cxx
@@ -191,7 +191,7 @@ int SvRTFParser::_GetNextToken()
// can be also \{, \}, \'88
for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
{
- sal_Unicode cAnsi = nNextCh;
+ sal_uInt32 cAnsi = nNextCh;
while( 0xD == cAnsi )
cAnsi = GetNextChar();
while( 0xA == cAnsi )
@@ -382,7 +382,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak )
case '}':
case '{':
case '+': // I found in a RTF file
- aStrBuffer.append(nNextCh);
+ aStrBuffer.append(sal_Unicode(nNextCh));
break;
case '~': // nonbreaking space
aStrBuffer.append(static_cast< sal_Unicode >(0xA0));
@@ -484,7 +484,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak )
{
do {
// all other characters end up in the text
- aStrBuffer.append(nNextCh);
+ aStrBuffer.appendUtf32(nNextCh);
if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
{
diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx
index b5c377b72ea0..b862e66766ca 100644
--- a/svtools/source/svrtf/svparser.cxx
+++ b/svtools/source/svrtf/svparser.cxx
@@ -22,6 +22,7 @@
#include <tools/debug.hxx>
#include <rtl/textcvt.h>
#include <rtl/tencinfo.h>
+#include <rtl/character.hxx>
#include <vector>
@@ -35,7 +36,7 @@ struct SvParser_Impl
long nTokenValue; // extra value (RTF)
bool bTokenHasValue; // indicates whether nTokenValue is valid
int nToken; // actual Token
- sal_Unicode nNextCh; // actual character
+ sal_uInt32 nNextCh; // actual character
int nSaveToken; // the token from Continue
rtl_TextToUnicodeConverter hConv;
@@ -148,9 +149,9 @@ void SvParser::RereadLookahead()
nNextCh = GetNextChar();
}
-sal_Unicode SvParser::GetNextChar()
+sal_uInt32 SvParser::GetNextChar()
{
- sal_Unicode c = 0U;
+ sal_uInt32 c = 0U;
// When reading multiple bytes, we don't have to care about the file
// position when we run into the pending state. The file position is
@@ -257,7 +258,7 @@ sal_Unicode SvParser::GetNextChar()
)
{
// no convserion shall take place
- c = (sal_Unicode)c1;
+ c = reinterpret_cast<sal_uChar&>( c1 );
nChars = 1;
}
else
@@ -280,6 +281,7 @@ sal_Unicode SvParser::GetNextChar()
// read enough characters.
if( pImplData->hContext != reinterpret_cast<rtl_TextToUnicodeContext>(1) )
{
+ sal_Unicode sCh[2];
while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL) != 0 )
{
rInput.ReadChar( c1 );
@@ -289,7 +291,7 @@ sal_Unicode SvParser::GetNextChar()
nChars = rtl_convertTextToUnicode(
pImplData->hConv, pImplData->hContext,
- &c1, 1, &cUC, 1,
+ &c1, 1, sCh , 2,
RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
@@ -299,7 +301,11 @@ sal_Unicode SvParser::GetNextChar()
{
if( 1 == nChars && 0 == nInfo )
{
- c = cUC;
+ c = sal_uInt32( sCh[0] );
+ }
+ else if( 2 == nChars && 0 == nInfo )
+ {
+ c = rtl::combineSurrogates( sCh[0], sCh[1] );
}
else if( 0 != nChars || 0 != nInfo )
{
@@ -311,7 +317,7 @@ sal_Unicode SvParser::GetNextChar()
"there is a converted character, but an error" );
// There are still errors, but nothing we can
// do
- c = (sal_Unicode)'?';
+ c = (sal_uInt32)'?';
nChars = 1;
}
}
@@ -356,7 +362,7 @@ sal_Unicode SvParser::GetNextChar()
// There are still errors, so we use the first
// character and restart after that.
- c = (sal_Unicode)sBuffer[0];
+ c = reinterpret_cast<sal_uChar&>( sBuffer[0] );
rInput.SeekRel( -(nLen-1) );
nChars = 1;
}
@@ -378,7 +384,7 @@ sal_Unicode SvParser::GetNextChar()
"there is no converted character and no error" );
// #73398#: If the character could not be converted,
// because a conversion is not available, do no conversion at all.
- c = (sal_Unicode)c1;
+ c = reinterpret_cast<sal_uChar&>( c1 );
nChars = 1;
}
@@ -387,6 +393,10 @@ sal_Unicode SvParser::GetNextChar()
}
while( 0 == nChars && !bErr );
}
+
+ if ( ! rtl::isValidCodePoint( c ) )
+ c = (sal_uInt32) '?' ;
+
if( bErr )
{
if( ERRCODE_IO_PENDING == rInput.GetError() )
@@ -405,6 +415,7 @@ sal_Unicode SvParser::GetNextChar()
}
else
IncLinePos();
+
return c;
}