tdf#81129 Support reading non-BMP characters in HTML documents.

1. Allow character entity ( &#nnnn; ) to exceed 0xffff in HTMLParser::ScanText()
2. Return a character as sal_uInt32 ( utf32 ) instead of sal_Unicode ( utf16 ) from SvParser::GetNextChar().

Conflicts:
	sw/qa/extras/htmlexport/htmlexport.cxx

Change-Id: Ida455040970fae800f0f11471b27f53461fb78e4
Reviewed-on: https://gerrit.libreoffice.org/21152
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Mark Hung <marklh9@gmail.com>
author: Mark Hung <marklh9@gmail.com> 2015-12-27 00:46:49 +0800
committer: Mark Hung <marklh9@gmail.com> 2016-02-13 08:05:09 +0000
commit: 4647e778993250b8c9431e2890750916fb986ecc (patch)
tree: 99d285ec6a33aeca2d9df32d30d2aea801066a37 /svtools/source/svrtf
parent: 3596613153289dae204b5abdc7446b303021f597 (diff)
2 files changed, 23 insertions, 12 deletions
diff --git a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx
index f6f75eb73162..bdc73d363970 100644
--- a/svtools/source/svrtf/parrtf.cxx
+++ b/svtools/source/svrtf/parrtf.cxx
@@ -191,7 +191,7 @@ int SvRTFParser::_GetNextToken()
                                 // can be also \{, \}, \'88
                                 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
                                 {
-                                    sal_Unicode cAnsi = nNextCh;
+                                    sal_uInt32 cAnsi = nNextCh;
                                     while( 0xD == cAnsi )
                                         cAnsi = GetNextChar();
                                     while( 0xA == cAnsi )
@@ -382,7 +382,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak )
                 case '}':
                 case '{':
                 case '+':       // I found in a RTF file
-                    aStrBuffer.append(nNextCh);
+                    aStrBuffer.append(sal_Unicode(nNextCh));
                     break;
                 case '~':       // nonbreaking space
                     aStrBuffer.append(static_cast< sal_Unicode >(0xA0));
@@ -484,7 +484,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak )
             {
                 do {
                     // all other characters end up in the text
-                    aStrBuffer.append(nNextCh);
+                    aStrBuffer.appendUtf32(nNextCh);
 
                     if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
                     {
diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx
index b5c377b72ea0..b862e66766ca 100644
--- a/svtools/source/svrtf/svparser.cxx
+++ b/svtools/source/svrtf/svparser.cxx
@@ -22,6 +22,7 @@
 #include <tools/debug.hxx>
 #include <rtl/textcvt.h>
 #include <rtl/tencinfo.h>
+#include <rtl/character.hxx>
 
 #include <vector>
 
@@ -35,7 +36,7 @@ struct SvParser_Impl
     long            nTokenValue;        // extra value (RTF)
     bool        bTokenHasValue;     // indicates whether nTokenValue is valid
     int             nToken;             // actual Token
-    sal_Unicode     nNextCh;            // actual character
+    sal_uInt32      nNextCh;            // actual character
     int             nSaveToken;         // the token from Continue
 
     rtl_TextToUnicodeConverter hConv;
@@ -148,9 +149,9 @@ void SvParser::RereadLookahead()
     nNextCh = GetNextChar();
 }
 
-sal_Unicode SvParser::GetNextChar()
+sal_uInt32 SvParser::GetNextChar()
 {
-    sal_Unicode c = 0U;
+    sal_uInt32 c = 0U;
 
     // When reading multiple bytes, we don't have to care about the file
     // position when we run into the pending state. The file position is
@@ -257,7 +258,7 @@ sal_Unicode SvParser::GetNextChar()
                    )
                 {
                     // no convserion shall take place
-                    c = (sal_Unicode)c1;
+                    c = reinterpret_cast<sal_uChar&>( c1 );
                     nChars = 1;
                 }
                 else
@@ -280,6 +281,7 @@ sal_Unicode SvParser::GetNextChar()
                         // read enough characters.
                         if( pImplData->hContext != reinterpret_cast<rtl_TextToUnicodeContext>(1) )
                         {
+                            sal_Unicode sCh[2];
                             while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL) != 0 )
                             {
                                 rInput.ReadChar( c1 );
@@ -289,7 +291,7 @@ sal_Unicode SvParser::GetNextChar()
 
                                 nChars = rtl_convertTextToUnicode(
                                             pImplData->hConv, pImplData->hContext,
-                                            &c1, 1, &cUC, 1,
+                                            &c1, 1, sCh , 2,
                                             RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
                                             RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
                                             RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
@@ -299,7 +301,11 @@ sal_Unicode SvParser::GetNextChar()
                             {
                                 if( 1 == nChars && 0 == nInfo )
                                 {
-                                    c = cUC;
+                                    c = sal_uInt32( sCh[0] );
+                                }
+                                else if( 2 == nChars && 0 == nInfo )
+                                {
+                                    c = rtl::combineSurrogates( sCh[0], sCh[1] );
                                 }
                                 else if( 0 != nChars || 0 != nInfo )
                                 {
@@ -311,7 +317,7 @@ sal_Unicode SvParser::GetNextChar()
                                        "there is a converted character, but an error" );
                                     // There are still errors, but nothing we can
                                     // do
-                                    c = (sal_Unicode)'?';
+                                    c = (sal_uInt32)'?';
                                     nChars = 1;
                                 }
                             }
@@ -356,7 +362,7 @@ sal_Unicode SvParser::GetNextChar()
 
                                     // There are still errors, so we use the first
                                     // character and restart after that.
-                                    c = (sal_Unicode)sBuffer[0];
+                                    c = reinterpret_cast<sal_uChar&>( sBuffer[0] );
                                     rInput.SeekRel( -(nLen-1) );
                                     nChars = 1;
                                 }
@@ -378,7 +384,7 @@ sal_Unicode SvParser::GetNextChar()
                                 "there is no converted character and no error" );
                         // #73398#: If the character could not be converted,
                         // because a conversion is not available, do no conversion at all.
-                        c = (sal_Unicode)c1;
+                        c = reinterpret_cast<sal_uChar&>( c1 );
                         nChars = 1;
 
                     }
@@ -387,6 +393,10 @@ sal_Unicode SvParser::GetNextChar()
         }
         while( 0 == nChars  && !bErr );
     }
+
+    if ( ! rtl::isValidCodePoint( c ) )
+        c = (sal_uInt32) '?' ;
+
     if( bErr )
     {
         if( ERRCODE_IO_PENDING == rInput.GetError() )
@@ -405,6 +415,7 @@ sal_Unicode SvParser::GetNextChar()
     }
     else
         IncLinePos();
+
     return c;
 }
author	Mark Hung <marklh9@gmail.com>	2015-12-27 00:46:49 +0800
committer	Mark Hung <marklh9@gmail.com>	2016-02-13 08:05:09 +0000
commit	4647e778993250b8c9431e2890750916fb986ecc (patch)
tree	99d285ec6a33aeca2d9df32d30d2aea801066a37 /svtools/source/svrtf
parent	3596613153289dae204b5abdc7446b303021f597 (diff)