summary | refs | log | tree | commit | diff
path: root/sal/textenc/tcvtutf8.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'sal/textenc/tcvtutf8.cxx')
-rw-r--r-- sal/textenc/tcvtutf8.cxx | 73
1 files changed, 49 insertions, 24 deletions
diff --git a/sal/textenc/tcvtutf8.cxx b/sal/textenc/tcvtutf8.cxx
index c1a6949c01c5..4943f6987a29 100644
--- a/sal/textenc/tcvtutf8.cxx
+++ b/sal/textenc/tcvtutf8.cxx
@@ -30,6 +30,7 @@
struct ImplUtf8ToUnicodeContext
{
sal_uInt32 nUtf32;
+ int nBytes;
int nShift;
bool bCheckBom;
};
@@ -65,18 +66,9 @@ sal_Size ImplConvertUtf8ToUnicode(
sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
{
- /*
- This function is very liberal with the UTF-8 input. Accepted are:
- - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
- - surrogates (e.g., ED A0 80 to represent U+D800)
- - encodings with up to six bytes (everything outside the range
- U+0000..10FFFF is considered "undefined")
- The first two of these points allow this routine to translate from both
- RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
- */
-
bool bJavaUtf8 = pData != nullptr;
sal_uInt32 nUtf32 = 0;
+ int nBytes;
int nShift = -1;
bool bCheckBom = true;
sal_uInt32 nInfo = 0;
@@ -88,19 +80,22 @@ sal_Size ImplConvertUtf8ToUnicode(
if (pContext != nullptr)
{
nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
+ nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
}
while (pSrcBufPtr < pSrcBufEnd)
{
- bool bUndefined = false;
bool bConsume = true;
sal_uInt32 nChar = *pSrcBufPtr++;
if (nShift < 0)
+ // Allow (illegal) 5 and 6 byte sequences, so they are read as a
+ // single individual bad character:
if (nChar <= 0x7F)
{
nUtf32 = nChar;
+ nBytes = 1;
goto transform;
}
else if (nChar <= 0xBF)
@@ -108,26 +103,31 @@ sal_Size ImplConvertUtf8ToUnicode(
else if (nChar <= 0xDF)
{
nUtf32 = (nChar & 0x1F) << 6;
+ nBytes = 2;
nShift = 0;
}
else if (nChar <= 0xEF)
{
nUtf32 = (nChar & 0x0F) << 12;
+ nBytes = 3;
nShift = 6;
}
else if (nChar <= 0xF7)
{
nUtf32 = (nChar & 0x07) << 18;
+ nBytes = 4;
nShift = 12;
}
else if (nChar <= 0xFB)
{
nUtf32 = (nChar & 0x03) << 24;
+ nBytes = 5;
nShift = 18;
}
else if (nChar <= 0xFD)
{
nUtf32 = (nChar & 0x01) << 30;
+ nBytes = 6;
nShift = 24;
}
else
@@ -154,28 +154,52 @@ sal_Size ImplConvertUtf8ToUnicode(
continue;
transform:
- if (!bCheckBom || nUtf32 != 0xFEFF
+ if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
|| (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
|| bJavaUtf8)
{
+ switch (nBytes) {
+ case 1:
+ if (bJavaUtf8 && nUtf32 == 0) {
+ goto bad_input;
+ }
+ break;
+ case 2:
+ if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
+ goto bad_input;
+ }
+ break;
+ case 3:
+ if (nUtf32 < 0x800
+ || (!bJavaUtf8
+ && (rtl::isHighSurrogate(nUtf32)
+ || rtl::isLowSurrogate(nUtf32))))
+ {
+ goto bad_input;
+ }
+ break;
+ case 4:
+ if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
+ || bJavaUtf8)
+ {
+ goto bad_input;
+ }
+ break;
+ default:
+ goto bad_input;
+ }
if (nUtf32 <= 0xFFFF)
if (pDestBufPtr != pDestBufEnd)
*pDestBufPtr++ = (sal_Unicode) nUtf32;
else
goto no_output;
- else if (rtl::isUnicodeCodePoint(nUtf32))
- if (pDestBufEnd - pDestBufPtr >= 2)
- {
- *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
- *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
- }
- else
- goto no_output;
- else
+ else if (pDestBufEnd - pDestBufPtr >= 2)
{
- bUndefined = true;
- goto bad_input;
+ *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
+ *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
}
+ else
+ goto no_output;
}
nShift = -1;
bCheckBom = false;
@@ -183,7 +207,7 @@ sal_Size ImplConvertUtf8ToUnicode(
bad_input:
switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
- bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
+ false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
&nInfo))
{
case sal::detail::textenc::BAD_INPUT_STOP:
@@ -238,6 +262,7 @@ sal_Size ImplConvertUtf8ToUnicode(
if (pContext != nullptr)
{
static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
+ static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
}