diff options
author | Mike Kaganski <mike.kaganski@collabora.com> | 2021-12-10 12:01:19 +0200 |
---|---|---|
committer | Mike Kaganski <mike.kaganski@collabora.com> | 2021-12-10 14:18:30 +0100 |
commit | 1f1ce06a185abcbf0c533cb3aa288418ecaa5ef4 (patch) | |
tree | 9c877abaf46f0daecb6aa2f6a77c6b03a8d936ce /tools | |
parent | 16376cae68f4406ef9440bdcb7b9617de6a6f998 (diff) |
Make BOM detection slightly more straightforward
Detect the BOM without taking the system endianness or the current stream
endianness into account: just read and check single bytes.
Change-Id: I9273d8f403caad7adb5e11cecc04e326919dad1f
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/126595
Tested-by: Jenkins
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
Diffstat (limited to 'tools')
-rw-r--r-- | tools/source/stream/stream.cxx | 66 |
1 files changed, 34 insertions, 32 deletions
diff --git a/tools/source/stream/stream.cxx b/tools/source/stream/stream.cxx index d42cefdf63cf..016c8b67e891 100644 --- a/tools/source/stream/stream.cxx +++ b/tools/source/stream/stream.cxx @@ -718,52 +718,54 @@ void SvStream::StartReadingUnicodeText( rtl_TextEncoding eReadBomCharSet ) eReadBomCharSet == RTL_TEXTENCODING_UTF8)) return; // nothing to read - bool bTryUtf8 = false; - sal_uInt16 nFlag(0); - sal_sSize nBack = sizeof(nFlag); - ReadUInt16( nFlag ); + const sal_uInt64 nOldPos = Tell(); + bool bGetBack = true; + unsigned char nFlag(0); + ReadUChar( nFlag ); switch ( nFlag ) { - case 0xfeff : - // native UTF-16 + case 0xfe: // UTF-16BE? if ( eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW || eReadBomCharSet == RTL_TEXTENCODING_UNICODE) - nBack = 0; + { + ReadUChar(nFlag); + if (nFlag == 0xff) + { + SetEndian(SvStreamEndian::BIG); + bGetBack = false; + } + } break; - case 0xfffe : - // swapped UTF-16 + case 0xff: // UTF-16LE? if ( eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW || eReadBomCharSet == RTL_TEXTENCODING_UNICODE) { - SetEndian( m_nEndian == SvStreamEndian::BIG ? SvStreamEndian::LITTLE : SvStreamEndian::BIG ); - nBack = 0; + ReadUChar(nFlag); + if (nFlag == 0xfe) + { + SetEndian(SvStreamEndian::LITTLE); + bGetBack = false; + } } break; - case 0xefbb : - if (m_nEndian == SvStreamEndian::BIG && - (eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW || - eReadBomCharSet == RTL_TEXTENCODING_UTF8)) - bTryUtf8 = true; - break; - case 0xbbef : - if (m_nEndian == SvStreamEndian::LITTLE && - (eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW || - eReadBomCharSet == RTL_TEXTENCODING_UTF8)) - bTryUtf8 = true; + case 0xef: // UTF-8? 
+ if ( eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW || + eReadBomCharSet == RTL_TEXTENCODING_UTF8) + { + ReadUChar(nFlag); + if (nFlag == 0xbb) + { + ReadUChar(nFlag); + if (nFlag == 0xbf) + bGetBack = false; // it is UTF-8 + } + } break; default: ; // nothing } - if (bTryUtf8) - { - unsigned char nChar(0); - nBack += sizeof(nChar); - ReadUChar( nChar ); - if (nChar == 0xbf) - nBack = 0; // it is UTF-8 - } - if (nBack) - SeekRel( -nBack ); // no BOM, pure data + if (bGetBack) + Seek(nOldPos); // no BOM, pure data } sal_uInt64 SvStream::SeekRel(sal_Int64 const nPos) |