summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
author Mike Kaganski <mike.kaganski@collabora.com> 2021-12-10 12:01:19 +0200
committer Mike Kaganski <mike.kaganski@collabora.com> 2021-12-10 14:18:30 +0100
commit 1f1ce06a185abcbf0c533cb3aa288418ecaa5ef4 (patch)
tree 9c877abaf46f0daecb6aa2f6a77c6b03a8d936ce /tools
parent 16376cae68f4406ef9440bdcb7b9617de6a6f998 (diff)
Make BOM detection slightly more straightforward
Without taking system endianness and current stream endianness into account - just read and check single bytes.

Change-Id: I9273d8f403caad7adb5e11cecc04e326919dad1f
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/126595
Tested-by: Jenkins
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
Diffstat (limited to 'tools')
-rw-r--r-- tools/source/stream/stream.cxx 66
1 files changed, 34 insertions, 32 deletions
diff --git a/tools/source/stream/stream.cxx b/tools/source/stream/stream.cxx
index d42cefdf63cf..016c8b67e891 100644
--- a/tools/source/stream/stream.cxx
+++ b/tools/source/stream/stream.cxx
@@ -718,52 +718,54 @@ void SvStream::StartReadingUnicodeText( rtl_TextEncoding eReadBomCharSet )
eReadBomCharSet == RTL_TEXTENCODING_UTF8))
return; // nothing to read
- bool bTryUtf8 = false;
- sal_uInt16 nFlag(0);
- sal_sSize nBack = sizeof(nFlag);
- ReadUInt16( nFlag );
+ const sal_uInt64 nOldPos = Tell();
+ bool bGetBack = true;
+ unsigned char nFlag(0);
+ ReadUChar( nFlag );
switch ( nFlag )
{
- case 0xfeff :
- // native UTF-16
+ case 0xfe: // UTF-16BE?
if ( eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
eReadBomCharSet == RTL_TEXTENCODING_UNICODE)
- nBack = 0;
+ {
+ ReadUChar(nFlag);
+ if (nFlag == 0xff)
+ {
+ SetEndian(SvStreamEndian::BIG);
+ bGetBack = false;
+ }
+ }
break;
- case 0xfffe :
- // swapped UTF-16
+ case 0xff: // UTF-16LE?
if ( eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
eReadBomCharSet == RTL_TEXTENCODING_UNICODE)
{
- SetEndian( m_nEndian == SvStreamEndian::BIG ? SvStreamEndian::LITTLE : SvStreamEndian::BIG );
- nBack = 0;
+ ReadUChar(nFlag);
+ if (nFlag == 0xfe)
+ {
+ SetEndian(SvStreamEndian::LITTLE);
+ bGetBack = false;
+ }
}
break;
- case 0xefbb :
- if (m_nEndian == SvStreamEndian::BIG &&
- (eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
- eReadBomCharSet == RTL_TEXTENCODING_UTF8))
- bTryUtf8 = true;
- break;
- case 0xbbef :
- if (m_nEndian == SvStreamEndian::LITTLE &&
- (eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
- eReadBomCharSet == RTL_TEXTENCODING_UTF8))
- bTryUtf8 = true;
+ case 0xef: // UTF-8?
+ if ( eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
+ eReadBomCharSet == RTL_TEXTENCODING_UTF8)
+ {
+ ReadUChar(nFlag);
+ if (nFlag == 0xbb)
+ {
+ ReadUChar(nFlag);
+ if (nFlag == 0xbf)
+ bGetBack = false; // it is UTF-8
+ }
+ }
break;
default:
; // nothing
}
- if (bTryUtf8)
- {
- unsigned char nChar(0);
- nBack += sizeof(nChar);
- ReadUChar( nChar );
- if (nChar == 0xbf)
- nBack = 0; // it is UTF-8
- }
- if (nBack)
- SeekRel( -nBack ); // no BOM, pure data
+ if (bGetBack)
+ Seek(nOldPos); // no BOM, pure data
}
sal_uInt64 SvStream::SeekRel(sal_Int64 const nPos)