summary | refs | log | tree | commit | diff
path: root/sal
diff options
context:
space:
mode:
Diffstat (limited to 'sal')
-rw-r--r-- sal/qa/rtl/textenc/rtl_textcvt.cxx 332
-rw-r--r-- sal/textenc/tcvtutf8.cxx 73
2 files changed, 352 insertions, 53 deletions
diff --git a/sal/qa/rtl/textenc/rtl_textcvt.cxx b/sal/qa/rtl/textenc/rtl_textcvt.cxx
index d698bc22cd74..3c36852bebfc 100644
--- a/sal/qa/rtl/textenc/rtl_textcvt.cxx
+++ b/sal/qa/rtl/textenc/rtl_textcvt.cxx
@@ -453,6 +453,8 @@ public:
void testComplexCut();
+ void testInvalidUtf8();
+
void testSRCBUFFERTOSMALL();
void testMime();
@@ -465,6 +467,7 @@ public:
CPPUNIT_TEST(testSingleByte);
CPPUNIT_TEST(testComplex);
CPPUNIT_TEST(testComplexCut);
+ CPPUNIT_TEST(testInvalidUtf8);
CPPUNIT_TEST(testSRCBUFFERTOSMALL);
CPPUNIT_TEST(testMime);
CPPUNIT_TEST(testWindows);
@@ -2330,35 +2333,6 @@ void Test::testComplex() {
true,
false,
RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
- { RTL_TEXTENCODING_UTF8,
- RTL_CONSTASCII_STRINGPARAM(
- "\xC0\x80\xE0\x80\x81\xF0\x80\x80\x82\xF8\x80\x80\x80\x83"
- "\xFC\x80\x80\x80\x80\x84"),
- { 0x0000,0x0001,0x0002,0x0003,0x0004 },
- 5,
- false,
- true,
- false,
- false,
- RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
- { RTL_TEXTENCODING_UTF8,
- RTL_CONSTASCII_STRINGPARAM("\xED\xA1\x89\xED\xB4\x93"),
- { 0xD849,0xDD13 },
- 2,
- false,
- true,
- false,
- false,
- RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
- { RTL_TEXTENCODING_UTF8,
- RTL_CONSTASCII_STRINGPARAM("\xED\xA1\x89\x41"),
- { 0xD849,0x0041 },
- 2,
- false,
- true,
- false,
- false,
- RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
// Test Java UTF-8:
@@ -2664,6 +2638,306 @@ void Test::testComplexCut() {
#endif
}
+void Test::testInvalidUtf8() { // each stanza feeds one malformed input to a fresh converter and checks output, info flags and bytes consumed
+ // UTF-8, invalid bytes (80, BF, FE, FF with no lead byte): each byte becomes its own U+FFFD:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\x80\xBF\xFE\xFF"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(4), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD\uFFFD\uFFFD\uFFFD"),
+ OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, non-shortest two-byte sequence (C0 80 would decode to U+0000): one U+FFFD, both bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xC0\x80"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(2), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, cut two-byte sequence (lead byte C0 alone, no FLUSH flag): no output, SRCBUFFERTOSMALL reported:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xC0"), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(0), size);
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL, info);
+ CPPUNIT_ASSERT(converted <= 1);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, non-shortest three-byte sequence (E0 9F BF would decode to U+07FF): one U+FFFD, three bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x9F\xBF"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(3), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, cut three-byte sequence (E0 80 with the third byte missing, no FLUSH flag): no output, SRCBUFFERTOSMALL reported:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x80"), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(0), size);
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL, info);
+ CPPUNIT_ASSERT(converted <= 2);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, cut three-byte sequence followed by more (E0 80 then '.'): one U+FFFD, then the '.' is converted normally:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x80."), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(2), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD."), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(3), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, surrogates (ED A0 80 = U+D800, ED B0 80 = U+DC00): each three-byte form becomes one U+FFFD:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr,
+ RTL_CONSTASCII_STRINGPARAM("\xED\xA0\x80\xED\xB0\x80"), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(2), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(6), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, non-shortest four-byte sequence (F0 8F BF BF would decode to U+FFFF): one U+FFFD, four bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xF0\x8F\xBF\xBF"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, too-large four-byte sequence (F4 90 80 80 would decode to 0x110000): one U+FFFD, four bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xF4\x90\x80\x80"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, five-byte sequence (lead byte FB): the whole sequence becomes a single U+FFFD, five bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr,
+ RTL_CONSTASCII_STRINGPARAM("\xFB\xBF\xBF\xBF\xBF"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(5), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, six-byte sequence (lead byte FD): the whole sequence becomes a single U+FFFD, six bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr,
+ RTL_CONSTASCII_STRINGPARAM("\xFD\xBF\xBF\xBF\xBF\xBF"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(6), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // Java UTF-8, U+0000 (a literal single 00 byte is rejected in this encoding): one U+FFFD, one byte consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_JAVA_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\0"), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // Java UTF-8, U+10000 (the regular four-byte form is rejected in this encoding): one U+FFFD, four bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_JAVA_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM(u8"\U00010000"), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+}
+
void Test::testSRCBUFFERTOSMALL() {
rtl_TextToUnicodeConverter cv = rtl_createTextToUnicodeConverter(
RTL_TEXTENCODING_EUC_JP);
diff --git a/sal/textenc/tcvtutf8.cxx b/sal/textenc/tcvtutf8.cxx
index c1a6949c01c5..4943f6987a29 100644
--- a/sal/textenc/tcvtutf8.cxx
+++ b/sal/textenc/tcvtutf8.cxx
@@ -30,6 +30,7 @@
struct ImplUtf8ToUnicodeContext
{
sal_uInt32 nUtf32;
+ int nBytes; // byte length (1-6) of the sequence being decoded, set from its lead byte; NOTE(review): confirm the context create/reset code initializes this field
int nShift;
bool bCheckBom;
};
@@ -65,18 +66,9 @@ sal_Size ImplConvertUtf8ToUnicode(
sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
{
- /*
- This function is very liberal with the UTF-8 input. Accepted are:
- - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
- - surrogates (e.g., ED A0 80 to represent U+D800)
- - encodings with up to six bytes (everything outside the range
- U+0000..10FFFF is considered "undefined")
- The first two of these points allow this routine to translate from both
- RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
- */
-
bool bJavaUtf8 = pData != nullptr;
sal_uInt32 nUtf32 = 0;
+ int nBytes = 0; // initialized because bad_input reads it (nBytes != 1) even when no lead byte has set it, e.g. input starting with a stray 0x80..0xBF byte and no context
int nShift = -1;
bool bCheckBom = true;
sal_uInt32 nInfo = 0;
@@ -88,19 +80,22 @@ sal_Size ImplConvertUtf8ToUnicode(
if (pContext != nullptr)
{
nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
+ nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
}
while (pSrcBufPtr < pSrcBufEnd)
{
- bool bUndefined = false;
bool bConsume = true;
sal_uInt32 nChar = *pSrcBufPtr++;
if (nShift < 0)
+ // Allow (illegal) 5 and 6 byte sequences, so they are read as a
+ // single individual bad character:
if (nChar <= 0x7F)
{
nUtf32 = nChar;
+ nBytes = 1;
goto transform;
}
else if (nChar <= 0xBF)
@@ -108,26 +103,31 @@ sal_Size ImplConvertUtf8ToUnicode(
else if (nChar <= 0xDF)
{
nUtf32 = (nChar & 0x1F) << 6;
+ nBytes = 2;
nShift = 0;
}
else if (nChar <= 0xEF)
{
nUtf32 = (nChar & 0x0F) << 12;
+ nBytes = 3;
nShift = 6;
}
else if (nChar <= 0xF7)
{
nUtf32 = (nChar & 0x07) << 18;
+ nBytes = 4;
nShift = 12;
}
else if (nChar <= 0xFB)
{
nUtf32 = (nChar & 0x03) << 24;
+ nBytes = 5;
nShift = 18;
}
else if (nChar <= 0xFD)
{
nUtf32 = (nChar & 0x01) << 30;
+ nBytes = 6;
nShift = 24;
}
else
@@ -154,28 +154,52 @@ sal_Size ImplConvertUtf8ToUnicode(
continue;
transform:
- if (!bCheckBom || nUtf32 != 0xFEFF
+ if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
|| (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
|| bJavaUtf8)
{
+ switch (nBytes) { // validate the decoded value against the length of the sequence it came from
+ case 1: // single byte: only a literal 0x00 in Java UTF-8 mode is rejected
+ if (bJavaUtf8 && nUtf32 == 0) {
+ goto bad_input;
+ }
+ break;
+ case 2: // reject values below 0x80 (non-shortest form), except a two-byte U+0000 in Java UTF-8 mode
+ if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
+ goto bad_input;
+ }
+ break;
+ case 3: // reject values below 0x800 (non-shortest form) and, outside Java UTF-8 mode, surrogates
+ if (nUtf32 < 0x800
+ || (!bJavaUtf8
+ && (rtl::isHighSurrogate(nUtf32)
+ || rtl::isLowSurrogate(nUtf32))))
+ {
+ goto bad_input;
+ }
+ break;
+ case 4: // reject values below 0x10000, values failing rtl::isUnicodeCodePoint, and every four-byte sequence in Java UTF-8 mode
+ if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
+ || bJavaUtf8)
+ {
+ goto bad_input;
+ }
+ break;
+ default: // five- and six-byte sequences are always rejected
+ goto bad_input;
+ }
if (nUtf32 <= 0xFFFF)
if (pDestBufPtr != pDestBufEnd)
*pDestBufPtr++ = (sal_Unicode) nUtf32;
else
goto no_output;
- else if (rtl::isUnicodeCodePoint(nUtf32))
- if (pDestBufEnd - pDestBufPtr >= 2)
- {
- *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
- *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
- }
- else
- goto no_output;
- else
+ else if (pDestBufEnd - pDestBufPtr >= 2)
{
- bUndefined = true;
- goto bad_input;
+ *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
+ *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
}
+ else
+ goto no_output;
}
nShift = -1;
bCheckBom = false;
@@ -183,7 +207,7 @@ sal_Size ImplConvertUtf8ToUnicode(
bad_input:
switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
- bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
+ false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
&nInfo))
{
case sal::detail::textenc::BAD_INPUT_STOP:
@@ -238,6 +262,7 @@ sal_Size ImplConvertUtf8ToUnicode(
if (pContext != nullptr)
{
static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
+ static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
}