2 files changed, 352 insertions, 53 deletions
diff --git a/sal/qa/rtl/textenc/rtl_textcvt.cxx b/sal/qa/rtl/textenc/rtl_textcvt.cxx
index d698bc22cd74..3c36852bebfc 100644
--- a/sal/qa/rtl/textenc/rtl_textcvt.cxx
+++ b/sal/qa/rtl/textenc/rtl_textcvt.cxx
@@ -453,6 +453,8 @@ public:
 
     void testComplexCut();
 
+    void testInvalidUtf8();
+
     void testSRCBUFFERTOSMALL();
 
     void testMime();
@@ -465,6 +467,7 @@ public:
     CPPUNIT_TEST(testSingleByte);
     CPPUNIT_TEST(testComplex);
     CPPUNIT_TEST(testComplexCut);
+    CPPUNIT_TEST(testInvalidUtf8);
     CPPUNIT_TEST(testSRCBUFFERTOSMALL);
     CPPUNIT_TEST(testMime);
     CPPUNIT_TEST(testWindows);
@@ -2330,35 +2333,6 @@ void Test::testComplex() {
               true,
               false,
               RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
-            { RTL_TEXTENCODING_UTF8,
-              RTL_CONSTASCII_STRINGPARAM(
-                  "\xC0\x80\xE0\x80\x81\xF0\x80\x80\x82\xF8\x80\x80\x80\x83"
-                  "\xFC\x80\x80\x80\x80\x84"),
-              { 0x0000,0x0001,0x0002,0x0003,0x0004 },
-              5,
-              false,
-              true,
-              false,
-              false,
-              RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
-            { RTL_TEXTENCODING_UTF8,
-              RTL_CONSTASCII_STRINGPARAM("\xED\xA1\x89\xED\xB4\x93"),
-              { 0xD849,0xDD13 },
-              2,
-              false,
-              true,
-              false,
-              false,
-              RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
-            { RTL_TEXTENCODING_UTF8,
-              RTL_CONSTASCII_STRINGPARAM("\xED\xA1\x89\x41"),
-              { 0xD849,0x0041 },
-              2,
-              false,
-              true,
-              false,
-              false,
-              RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
 
             // Test Java UTF-8:
 
@@ -2664,6 +2638,306 @@ void Test::testComplexCut() {
 #endif
 }
 
+void Test::testInvalidUtf8() {
+    // UTF-8, four invalid bytes, each replaced by its own U+FFFD:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\x80\xBF\xFE\xFF"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(4), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD\uFFFD\uFFFD\uFFFD"),
+            OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, non-shortest two-byte form of U+0000 (C0 80) -> one U+FFFD:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xC0\x80"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(2), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, lone lead byte C0, no FLUSH -> SRCBUFFERTOSMALL, no output:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xC0"), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(0), size);
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL, info);
+        CPPUNIT_ASSERT(converted <= 1);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, non-shortest three-byte form of U+07FF (E0 9F BF) -> one U+FFFD:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x9F\xBF"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(3), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, two bytes of a three-byte sequence, no FLUSH -> no output:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x80"), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(0), size);
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL, info);
+        CPPUNIT_ASSERT(converted <= 2);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, cut three-byte sequence followed by '.' -> U+FFFD, then '.':
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x80."), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(2), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD."), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(3), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, three-byte forms of U+D800 and U+DC00 -> one U+FFFD each:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr,
+            RTL_CONSTASCII_STRINGPARAM("\xED\xA0\x80\xED\xB0\x80"), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(2), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(6), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, non-shortest four-byte form of U+FFFF (F0 8F BF BF) -> U+FFFD:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xF0\x8F\xBF\xBF"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, four-byte encoding of 0x110000 (above U+10FFFF) -> U+FFFD:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xF4\x90\x80\x80"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, five-byte sequence, consumed whole -> a single U+FFFD:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr,
+            RTL_CONSTASCII_STRINGPARAM("\xFB\xBF\xBF\xBF\xBF"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(5), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // UTF-8, six-byte sequence, consumed whole -> a single U+FFFD:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr,
+            RTL_CONSTASCII_STRINGPARAM("\xFD\xBF\xBF\xBF\xBF\xBF"),
+            buf, TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(6), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // Java UTF-8, U+0000 as a single 00 byte is rejected -> U+FFFD:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_JAVA_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\0"), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+    // Java UTF-8, four-byte encoding of U+10000 is rejected -> U+FFFD:
+    {
+        auto const converter = rtl_createTextToUnicodeConverter(
+            RTL_TEXTENCODING_JAVA_UTF8);
+        CPPUNIT_ASSERT(converter != nullptr);
+        sal_Unicode buf[TEST_STRING_SIZE];
+        sal_uInt32 info;
+        sal_Size converted;
+        auto const size = rtl_convertTextToUnicode(
+            converter, nullptr, RTL_CONSTASCII_STRINGPARAM(u8"\U00010000"), buf,
+            TEST_STRING_SIZE,
+            (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+             | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+             | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+            &info, &converted);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+        CPPUNIT_ASSERT_EQUAL(
+            OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+        CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+        CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+        rtl_destroyTextToUnicodeConverter(converter);
+    }
+}
+
 void Test::testSRCBUFFERTOSMALL() {
     rtl_TextToUnicodeConverter cv = rtl_createTextToUnicodeConverter(
         RTL_TEXTENCODING_EUC_JP);
diff --git a/sal/textenc/tcvtutf8.cxx b/sal/textenc/tcvtutf8.cxx
index c1a6949c01c5..4943f6987a29 100644
--- a/sal/textenc/tcvtutf8.cxx
+++ b/sal/textenc/tcvtutf8.cxx
@@ -30,6 +30,7 @@
 struct ImplUtf8ToUnicodeContext
 {
     sal_uInt32 nUtf32;
+    int nBytes; // byte count of the sequence in progress, from its lead byte
     int nShift;
     bool bCheckBom;
 };
@@ -65,18 +66,9 @@ sal_Size ImplConvertUtf8ToUnicode(
     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
 {
-    /*
-       This function is very liberal with the UTF-8 input.  Accepted are:
-       - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
-       - surrogates (e.g., ED A0 80 to represent U+D800)
-       - encodings with up to six bytes (everything outside the range
-         U+0000..10FFFF is considered "undefined")
-       The first two of these points allow this routine to translate from both
-       RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
-      */
-
     bool bJavaUtf8 = pData != nullptr;
     sal_uInt32 nUtf32 = 0;
+    int nBytes = 0; // no lead byte seen yet; bad_input reads this value
     int nShift = -1;
     bool bCheckBom = true;
     sal_uInt32 nInfo = 0;
@@ -88,19 +80,22 @@ sal_Size ImplConvertUtf8ToUnicode(
     if (pContext != nullptr)
     {
         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
+        nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
     }
 
     while (pSrcBufPtr < pSrcBufEnd)
     {
-        bool bUndefined = false;
         bool bConsume = true;
         sal_uInt32 nChar = *pSrcBufPtr++;
         if (nShift < 0)
+            // Allow (illegal) 5 and 6 byte sequences, so they are read as a
+            // single individual bad character:
             if (nChar <= 0x7F)
             {
                 nUtf32 = nChar;
+                nBytes = 1;
                 goto transform;
             }
             else if (nChar <= 0xBF)
@@ -108,26 +103,31 @@ sal_Size ImplConvertUtf8ToUnicode(
             else if (nChar <= 0xDF)
             {
                 nUtf32 = (nChar & 0x1F) << 6;
+                nBytes = 2;
                 nShift = 0;
             }
             else if (nChar <= 0xEF)
             {
                 nUtf32 = (nChar & 0x0F) << 12;
+                nBytes = 3;
                 nShift = 6;
             }
             else if (nChar <= 0xF7)
             {
                 nUtf32 = (nChar & 0x07) << 18;
+                nBytes = 4;
                 nShift = 12;
             }
             else if (nChar <= 0xFB)
             {
                 nUtf32 = (nChar & 0x03) << 24;
+                nBytes = 5;
                 nShift = 18;
             }
             else if (nChar <= 0xFD)
             {
                 nUtf32 = (nChar & 0x01) << 30;
+                nBytes = 6;
                 nShift = 24;
             }
             else
@@ -154,28 +154,52 @@ sal_Size ImplConvertUtf8ToUnicode(
         continue;
 
     transform:
-        if (!bCheckBom || nUtf32 != 0xFEFF
+        if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
             || bJavaUtf8)
         {
+            switch (nBytes) { // check decoded value against encoded length
+            case 1: // Java UTF-8 rejects U+0000 encoded as a single byte
+                if (bJavaUtf8 && nUtf32 == 0) {
+                    goto bad_input;
+                }
+                break;
+            case 2: // non-shortest form, except two-byte U+0000 in Java UTF-8
+                if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
+                    goto bad_input;
+                }
+                break;
+            case 3: // non-shortest form, or (outside Java UTF-8) a surrogate
+                if (nUtf32 < 0x800
+                    || (!bJavaUtf8
+                        && (rtl::isHighSurrogate(nUtf32)
+                            || rtl::isLowSurrogate(nUtf32))))
+                {
+                    goto bad_input;
+                }
+                break;
+            case 4: // non-shortest, not a code point, or any use in Java UTF-8
+                if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
+                    || bJavaUtf8)
+                {
+                    goto bad_input;
+                }
+                break;
+            default: // any other length (five or six bytes) is rejected
+                goto bad_input;
+            }
             if (nUtf32 <= 0xFFFF)
                 if (pDestBufPtr != pDestBufEnd)
                     *pDestBufPtr++ = (sal_Unicode) nUtf32;
                 else
                     goto no_output;
-            else if (rtl::isUnicodeCodePoint(nUtf32))
-                if (pDestBufEnd - pDestBufPtr >= 2)
-                {
-                    *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
-                    *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
-                }
-                else
-                    goto no_output;
-            else
+            else if (pDestBufEnd - pDestBufPtr >= 2)
             {
-                bUndefined = true;
-                goto bad_input;
+                *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
+                *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
             }
+            else
+                goto no_output;
         }
         nShift = -1;
         bCheckBom = false;
@@ -183,7 +207,7 @@ sal_Size ImplConvertUtf8ToUnicode(
 
     bad_input:
         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
-                    bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
+                    false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
                     &nInfo))
         {
         case sal::detail::textenc::BAD_INPUT_STOP:
@@ -238,6 +262,7 @@ sal_Size ImplConvertUtf8ToUnicode(
     if (pContext != nullptr)
     {
         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
+        static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
     }