diff options
Diffstat (limited to 'sal/textenc/tcvtutf8.cxx')
-rw-r--r-- | sal/textenc/tcvtutf8.cxx | 73 |
1 files changed, 49 insertions, 24 deletions
diff --git a/sal/textenc/tcvtutf8.cxx b/sal/textenc/tcvtutf8.cxx index c1a6949c01c5..4943f6987a29 100644 --- a/sal/textenc/tcvtutf8.cxx +++ b/sal/textenc/tcvtutf8.cxx @@ -30,6 +30,7 @@ struct ImplUtf8ToUnicodeContext { sal_uInt32 nUtf32; + int nBytes; int nShift; bool bCheckBom; }; @@ -65,18 +66,9 @@ sal_Size ImplConvertUtf8ToUnicode( sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes) { - /* - This function is very liberal with the UTF-8 input. Accepted are: - - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041) - - surrogates (e.g., ED A0 80 to represent U+D800) - - encodings with up to six bytes (everything outside the range - U+0000..10FFFF is considered "undefined") - The first two of these points allow this routine to translate from both - RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8. - */ - bool bJavaUtf8 = pData != nullptr; sal_uInt32 nUtf32 = 0; + int nBytes; int nShift = -1; bool bCheckBom = true; sal_uInt32 nInfo = 0; @@ -88,19 +80,22 @@ sal_Size ImplConvertUtf8ToUnicode( if (pContext != nullptr) { nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32; + nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes; nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift; bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom; } while (pSrcBufPtr < pSrcBufEnd) { - bool bUndefined = false; bool bConsume = true; sal_uInt32 nChar = *pSrcBufPtr++; if (nShift < 0) + // Allow (illegal) 5 and 6 byte sequences, so they are read as a + // single individual bad character: if (nChar <= 0x7F) { nUtf32 = nChar; + nBytes = 1; goto transform; } else if (nChar <= 0xBF) @@ -108,26 +103,31 @@ sal_Size ImplConvertUtf8ToUnicode( else if (nChar <= 0xDF) { nUtf32 = (nChar & 0x1F) << 6; + nBytes = 2; nShift = 0; } else if (nChar <= 0xEF) { nUtf32 = (nChar & 0x0F) << 12; + nBytes = 3; nShift = 6; } else if (nChar <= 0xF7) { nUtf32 = (nChar & 0x07) << 18; + nBytes = 4; nShift = 12; } else if (nChar <= 0xFB) { nUtf32 = (nChar & 0x03) << 24; + nBytes = 5; nShift = 18; } else if (nChar <= 0xFD) { nUtf32 = (nChar & 0x01) << 30; + nBytes = 6; nShift = 24; } else @@ -154,28 +154,52 @@ sal_Size ImplConvertUtf8ToUnicode( continue; transform: - if (!bCheckBom || nUtf32 != 0xFEFF + if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0 || bJavaUtf8) { + switch (nBytes) { + case 1: + if (bJavaUtf8 && nUtf32 == 0) { + goto bad_input; + } + break; + case 2: + if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) { + goto bad_input; + } + break; + case 3: + if (nUtf32 < 0x800 + || (!bJavaUtf8 + && (rtl::isHighSurrogate(nUtf32) + || rtl::isLowSurrogate(nUtf32)))) + { + goto bad_input; + } + break; + case 4: + if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32) + || bJavaUtf8) + { + goto bad_input; + } + break; + default: + goto bad_input; + } if (nUtf32 <= 0xFFFF) if (pDestBufPtr != pDestBufEnd) *pDestBufPtr++ = (sal_Unicode) nUtf32; else goto no_output; - else if (rtl::isUnicodeCodePoint(nUtf32)) - if (pDestBufEnd - pDestBufPtr >= 2) - { - *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32); - *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32); - } - else - goto no_output; - else + else if (pDestBufEnd - pDestBufPtr >= 2) { - bUndefined = true; - goto bad_input; + *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32); + *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32); } + else + goto no_output; } nShift = -1; bCheckBom = false; @@ -183,7 +207,7 @@ sal_Size ImplConvertUtf8ToUnicode( bad_input: switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion( - bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd, + false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo)) { case sal::detail::textenc::BAD_INPUT_STOP: @@ -238,6 +262,7 @@ sal_Size ImplConvertUtf8ToUnicode( if (pContext != nullptr) { static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32; + static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes; static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift; static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom; } |