diff options
author | Stephan Bergmann <sbergman@redhat.com> | 2019-09-04 09:36:03 +0200 |
---|---|---|
committer | Stephan Bergmann <sbergman@redhat.com> | 2019-09-04 13:02:11 +0200 |
commit | ca6ddfcc9385f1c31531eae31dfa81a9dda246f0 (patch) | |
tree | 554bf51eb5de4b9bd29ec1cbe966bf693da25f99 /sal | |
parent | c6ad32e03de01525a863171ed58df05e89e9f105 (diff) |
[API CHANGE] rtl_convertTextToUnicode behavior upon erroneous input
<http://udk.openoffice.org/cpp/man/spec/textconversion.html> specifies for
FLAGS_UNDEFINED_ERROR, FLAGS_MBUNDEFINED_ERROR, and FLAGS_INVALID_ERROR: "Read
past the [erroneous] code in the input buffer [...]". But the actual behavior of
rtl_convertTextToUnicode for the various rtl_TextEncoding values has been
inconsistent. Some erroneous input (mostly single-byte UNDEFINED and INVALID
ones) has not been consumed at all, some (multi-byte MBUNDEFINED and INVALID)
has been consumed partly, and some has been consumed fully as required.
However, at least since 8dd4265b9ddbd7786b6237676909eae5b540da0e "CWS-TOOLING:
integrate CWS hb18", Custom8BitToUnicode in sw/source/filter/ww8/ww8par.cxx
appears to rely on the broken behavior of not consuming erroneous input. (It
reads the chunk of valid input with e.g. some RTL_TEXTENCODING_MS_125x that
happens to exhibit the broken behavior of not consuming erroneous input, then
wants to try to re-read the erroneous input with RTL_TEXTENCODING_MS_1252. For
example, opening sw/qa/core/data/ww8/pass/forcepoint50-grfanchor-1.doc triggers
that code. For whatever reason, the am_faksas.dot attached to
<https://bz.apache.org/ooo/show_bug.cgi?id=9240#c1> "Do not show lithuanian
letter 'Š'" appears to not, or at least no longer, trigger that code.)
Therefore, it would be useful to have a mode in which rtl_convertTextToUnicode
does not consume erroneous input. (And I plan on doing changes in
sal/osl/unx/file* that would benefit from that behavior, too.) But changing
rtl_convertTextToUnicode to generally not consume erroneous input would not be
feasible: If calls do not set RTL_TEXTTOUNICODE_FLAGS_FLUSH, part of an
erroneous input can already have been consumed by a previous call, so the
current call cannot undo that.
But a change that looks like it can work is to change the behavior only if
RTL_TEXTTOUNICODE_FLAGS_FLUSH is set. In that case we can at least not consume
the part of an erroneous input that has not yet been consumed by a previous call
(which would necessarily have been done with RTL_TEXTTOUNICODE_FLAGS_FLUSH
unset). The expectation is that code that relies on the don't-consume behavior
will do only single calls with RTL_TEXTTOUNICODE_FLAGS_FLUSH set (so reliably
not consume the complete erroneous input), while other code (which might do
calls in a loop) will not care whether erroneous input has been consumed,
anyway. This can be considered a mild form of behavioral API CHANGE (but note
that the old implementation didn't exhibit the requested behavior anyway).
So all implementations of rtl_convertTextToUnicode for the various
rtl_TextEncoding values have been adapted to the new behavior. The only
exceptions are ImplDummyToUnicode (sal/textenc/textcvt.cxx), which is a special
case anyway used by RTL_TEXTENCODING_DONTKNOW, and two out of three places
(marked with a "TODO" each) in ImplUTF7ToUnicode (sal/textenc/tcvtutf7.cxx),
where it is hard to retrofit the expected behavior, and RTL_TEXTENCODING_UTF7 is
probably not relevant for the use cases relying on the don't-consume behavior,
anyway.
Whether a similar change should be done for rtl_convertUnicodeToText can be
examined later.
Change-Id: I1ac2c4cfd99e2a0eca219f9a3855ef110b254855
Reviewed-on: https://gerrit.libreoffice.org/78584
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
Diffstat (limited to 'sal')
-rw-r--r-- | sal/qa/rtl/textenc/rtl_textcvt.cxx | 2 | ||||
-rw-r--r-- | sal/textenc/convertbig5hkscs.cxx | 22 | ||||
-rw-r--r-- | sal/textenc/converteuctw.cxx | 22 | ||||
-rw-r--r-- | sal/textenc/convertgb18030.cxx | 32 | ||||
-rw-r--r-- | sal/textenc/convertisciidevangari.cxx | 19 | ||||
-rw-r--r-- | sal/textenc/convertiso2022cn.cxx | 23 | ||||
-rw-r--r-- | sal/textenc/convertiso2022jp.cxx | 18 | ||||
-rw-r--r-- | sal/textenc/convertiso2022kr.cxx | 17 | ||||
-rw-r--r-- | sal/textenc/convertsimple.cxx | 3 | ||||
-rw-r--r-- | sal/textenc/convertsinglebytetobmpunicode.cxx | 3 | ||||
-rw-r--r-- | sal/textenc/tcvtmb.cxx | 34 | ||||
-rw-r--r-- | sal/textenc/tcvtutf7.cxx | 17 | ||||
-rw-r--r-- | sal/textenc/tcvtutf8.cxx | 15 |
13 files changed, 195 insertions, 32 deletions
diff --git a/sal/qa/rtl/textenc/rtl_textcvt.cxx b/sal/qa/rtl/textenc/rtl_textcvt.cxx index 6b5a7e55fe21..2f5359b32c77 100644 --- a/sal/qa/rtl/textenc/rtl_textcvt.cxx +++ b/sal/qa/rtl/textenc/rtl_textcvt.cxx @@ -165,7 +165,7 @@ void testSingleByteCharSet(SingleByteCharSet const & rSet) { CPPUNIT_ASSERT_EQUAL(sal_Size(0), nSize); CPPUNIT_ASSERT_EQUAL(nExpectedInfo, nInfo); - CPPUNIT_ASSERT_EQUAL(sal_Size(0), nConverted); + CPPUNIT_ASSERT_EQUAL(sal_Size(1), nConverted); rtl_destroyTextToUnicodeContext(aConverter, aContext); rtl_destroyTextToUnicodeConverter(aConverter); diff --git a/sal/textenc/convertbig5hkscs.cxx b/sal/textenc/convertbig5hkscs.cxx index 3d66bdfcc432..eaed0c7a36d2 100644 --- a/sal/textenc/convertbig5hkscs.cxx +++ b/sal/textenc/convertbig5hkscs.cxx @@ -82,6 +82,7 @@ sal_Size ImplConvertBig5HkscsToUnicode(void const * pData, sal_Size nConverted = 0; sal_Unicode * pDestBufPtr = pDestBuf; sal_Unicode * pDestBufEnd = pDestBuf + nDestChars; + sal_Size startOfCurrentChar = 0; if (pContext) nRow = static_cast< ImplBig5HkscsToUnicodeContext * >(pContext)->m_nRow; @@ -92,9 +93,10 @@ sal_Size ImplConvertBig5HkscsToUnicode(void const * pData, sal_uInt32 nChar = *reinterpret_cast<unsigned char const *>(pSrcBuf++); if (nRow == 0) if (nChar < 0x80) - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = static_cast<sal_Unicode>(nChar); - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; else if (nChar >= 0x81 && nChar <= 0xFE) nRow = nChar; @@ -202,13 +204,15 @@ sal_Size ImplConvertBig5HkscsToUnicode(void const * pData, *pDestBufPtr++ = static_cast<sal_Unicode>(pBig5Hkscs2001Data[ nOffset + (nChar - nFirst)]); + startOfCurrentChar = nConverted + 1; } else goto no_output; else - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode); - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; nRow = 0; } @@ -226,10 +230,16 @@ sal_Size 
ImplConvertBig5HkscsToUnicode(void const * pData, { case sal::detail::textenc::BAD_INPUT_STOP: nRow = 0; + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++nConverted; + } else { + nConverted = startOfCurrentChar; + } break; case sal::detail::textenc::BAD_INPUT_CONTINUE: nRow = 0; + startOfCurrentChar = nConverted + 1; continue; case sal::detail::textenc::BAD_INPUT_NO_OUTPUT: @@ -256,6 +266,10 @@ sal_Size ImplConvertBig5HkscsToUnicode(void const * pData, &nInfo)) { case sal::detail::textenc::BAD_INPUT_STOP: + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) { + nConverted = startOfCurrentChar; + } + [[fallthrough]]; case sal::detail::textenc::BAD_INPUT_CONTINUE: nRow = 0; break; diff --git a/sal/textenc/converteuctw.cxx b/sal/textenc/converteuctw.cxx index 87becd9b11ec..abc214402636 100644 --- a/sal/textenc/converteuctw.cxx +++ b/sal/textenc/converteuctw.cxx @@ -93,6 +93,7 @@ sal_Size ImplConvertEucTwToUnicode(void const * pData, sal_Size nConverted = 0; sal_Unicode * pDestBufPtr = pDestBuf; sal_Unicode * pDestBufEnd = pDestBuf + nDestChars; + sal_Size startOfCurrentChar = 0; if (pContext) { @@ -109,9 +110,10 @@ sal_Size ImplConvertEucTwToUnicode(void const * pData, { case IMPL_EUC_TW_TO_UNICODE_STATE_0: if (nChar < 0x80) - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = static_cast<sal_Unicode>(nChar); - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; else if (nChar >= 0xA1 && nChar <= 0xFE) { @@ -210,13 +212,15 @@ sal_Size ImplConvertEucTwToUnicode(void const * pData, *pDestBufPtr++ = static_cast<sal_Unicode>(pCns116431992Data[ nOffset + (nChar - nFirst)]); + startOfCurrentChar = nConverted + 1; } else goto no_output; else - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode); - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; } else @@ -234,10 +238,16 @@ sal_Size ImplConvertEucTwToUnicode(void const * pData, 
{ case sal::detail::textenc::BAD_INPUT_STOP: eState = IMPL_EUC_TW_TO_UNICODE_STATE_0; + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++nConverted; + } else { + nConverted = startOfCurrentChar; + } break; case sal::detail::textenc::BAD_INPUT_CONTINUE: eState = IMPL_EUC_TW_TO_UNICODE_STATE_0; + startOfCurrentChar = nConverted + 1; continue; case sal::detail::textenc::BAD_INPUT_NO_OUTPUT: @@ -264,6 +274,10 @@ sal_Size ImplConvertEucTwToUnicode(void const * pData, &nInfo)) { case sal::detail::textenc::BAD_INPUT_STOP: + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) { + nConverted = startOfCurrentChar; + } + [[fallthrough]]; case sal::detail::textenc::BAD_INPUT_CONTINUE: eState = IMPL_EUC_TW_TO_UNICODE_STATE_0; break; diff --git a/sal/textenc/convertgb18030.cxx b/sal/textenc/convertgb18030.cxx index 87e814674bb7..9aa64970421b 100644 --- a/sal/textenc/convertgb18030.cxx +++ b/sal/textenc/convertgb18030.cxx @@ -86,6 +86,7 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData, sal_Size nConverted = 0; sal_Unicode * pDestBufPtr = pDestBuf; sal_Unicode * pDestBufEnd = pDestBuf + nDestChars; + sal_Size startOfCurrentChar = 0; if (pContext) { @@ -101,9 +102,10 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData, { case IMPL_GB_18030_TO_UNICODE_STATE_0: if (nChar < 0x80) - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = static_cast<sal_Unicode>(nChar); - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; else if (nChar == 0x80) goto bad_input; @@ -130,9 +132,10 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData, { nCode = nCode * 190 + (nChar <= 0x7E ? 
nChar - 0x40 : nChar - 0x80 + 63); - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = pGb18030Data[nCode]; - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; eState = IMPL_GB_18030_TO_UNICODE_STATE_0; } @@ -170,6 +173,7 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData, = static_cast<sal_Unicode>(ImplGetHighSurrogate(nCode)); *pDestBufPtr++ = static_cast<sal_Unicode>(ImplGetLowSurrogate(nCode)); + startOfCurrentChar = nConverted + 1; } else goto no_output; @@ -184,24 +188,26 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData, goto bad_input; else if (nCode < pRange->m_nFirstLinear) { - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = pGb18030Data[ pRange->m_nNonRangeDataIndex + (nCode - nFirstNonRange)]; - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; break; } else if (nCode < pRange->m_nPastLinear) { - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = static_cast<sal_Unicode>(pRange->m_nFirstUnicode + (nCode - pRange-> m_nFirstLinear)); - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; break; } @@ -226,10 +232,16 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData, { case sal::detail::textenc::BAD_INPUT_STOP: eState = IMPL_GB_18030_TO_UNICODE_STATE_0; + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++nConverted; + } else { + nConverted = startOfCurrentChar; + } break; case sal::detail::textenc::BAD_INPUT_CONTINUE: eState = IMPL_GB_18030_TO_UNICODE_STATE_0; + startOfCurrentChar = nConverted + 1; continue; case sal::detail::textenc::BAD_INPUT_NO_OUTPUT: @@ -256,6 +268,10 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData, &nInfo)) { case sal::detail::textenc::BAD_INPUT_STOP: + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) { + nConverted = startOfCurrentChar; + } + [[fallthrough]]; case sal::detail::textenc::BAD_INPUT_CONTINUE: eState = 
IMPL_GB_18030_TO_UNICODE_STATE_0; break; diff --git a/sal/textenc/convertisciidevangari.cxx b/sal/textenc/convertisciidevangari.cxx index 16c453fb8487..9e6583119252 100644 --- a/sal/textenc/convertisciidevangari.cxx +++ b/sal/textenc/convertisciidevangari.cxx @@ -96,6 +96,7 @@ sal_Size IsciiDevanagariToUnicode::convert( sal_Size nConverted = 0; sal_Unicode* pDestBufPtr = pDestBuf; sal_Unicode* pDestBufEnd = pDestBuf + nDestChars; + sal_Size startOfCurrentChar = 0; while (nConverted < nSrcBytes) { @@ -180,6 +181,10 @@ sal_Size IsciiDevanagariToUnicode::convert( } } + ++nConverted; + if (bDouble) + ++nConverted; + if (bNormal) cChar = IsciiDevanagariMap[nIn]; @@ -190,20 +195,24 @@ sal_Size IsciiDevanagariToUnicode::convert( BadInputConversionAction eAction = handleBadInputTextToUnicodeConversion( bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo); - if (eAction == BAD_INPUT_CONTINUE) + if (eAction == BAD_INPUT_CONTINUE) { + startOfCurrentChar = nConverted; continue; - if (eAction == BAD_INPUT_STOP) + } + if (eAction == BAD_INPUT_STOP) { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) { + nConverted = startOfCurrentChar; + } break; + } assert(eAction == BAD_INPUT_NO_OUTPUT); nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL; break; } - ++nConverted; - if (bDouble) - ++nConverted; *pDestBufPtr++ = cChar; m_cPrevChar = bNormal ? 
nIn : 0; + startOfCurrentChar = nConverted; } if (pInfo) diff --git a/sal/textenc/convertiso2022cn.cxx b/sal/textenc/convertiso2022cn.cxx index e931d68f1f35..76aee21f04b4 100644 --- a/sal/textenc/convertiso2022cn.cxx +++ b/sal/textenc/convertiso2022cn.cxx @@ -124,6 +124,7 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData, sal_Size nConverted = 0; sal_Unicode * pDestBufPtr = pDestBuf; sal_Unicode * pDestBufEnd = pDestBuf + nDestChars; + sal_Size startOfCurrentChar = 0; if (pContext) { @@ -149,9 +150,10 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData, else if (nChar == 0x1B) // ESC eState = IMPL_ISO_2022_CN_TO_UNICODE_STATE_ESC; else if (nChar < 0x80) - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = static_cast<sal_Unicode>(nChar); - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; else { @@ -203,6 +205,7 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData, { *pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode); eState = IMPL_ISO_2022_CN_TO_UNICODE_STATE_SO; + startOfCurrentChar = nConverted + 1; } else goto no_output; @@ -332,13 +335,15 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData, *pDestBufPtr++ = static_cast<sal_Unicode>(pCns116431992Data[ nOffset + (nChar - nFirst)]); + startOfCurrentChar = nConverted + 1; } else goto no_output; else - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode); - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; } else @@ -358,11 +363,17 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData, case sal::detail::textenc::BAD_INPUT_STOP: eState = IMPL_ISO_2022_CN_TO_UNICODE_STATE_ASCII; b116431 = false; + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++nConverted; + } else { + nConverted = startOfCurrentChar; + } break; case sal::detail::textenc::BAD_INPUT_CONTINUE: eState = IMPL_ISO_2022_CN_TO_UNICODE_STATE_ASCII; b116431 = false; + 
startOfCurrentChar = nConverted + 1; continue; case sal::detail::textenc::BAD_INPUT_NO_OUTPUT: @@ -389,6 +400,10 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData, &nInfo)) { case sal::detail::textenc::BAD_INPUT_STOP: + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) { + nConverted = startOfCurrentChar; + } + [[fallthrough]]; case sal::detail::textenc::BAD_INPUT_CONTINUE: eState = IMPL_ISO_2022_CN_TO_UNICODE_STATE_ASCII; b116431 = false; diff --git a/sal/textenc/convertiso2022jp.cxx b/sal/textenc/convertiso2022jp.cxx index f0eb5eb9a936..565c09ab36f5 100644 --- a/sal/textenc/convertiso2022jp.cxx +++ b/sal/textenc/convertiso2022jp.cxx @@ -94,6 +94,7 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData, sal_Size nConverted = 0; sal_Unicode * pDestBufPtr = pDestBuf; sal_Unicode * pDestBufEnd = pDestBuf + nDestChars; + sal_Size startOfCurrentChar = 0; if (pContext) { @@ -111,9 +112,10 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData, if (nChar == 0x1B) // ESC eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC; else if (nChar < 0x80) - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = static_cast<sal_Unicode>(nChar); - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; else { @@ -139,6 +141,7 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData, break; } *pDestBufPtr++ = static_cast<sal_Unicode>(nChar); + startOfCurrentChar = nConverted + 1; } else goto no_output; @@ -178,6 +181,7 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData, { *pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode); eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208; + startOfCurrentChar = nConverted + 1; } else goto no_output; @@ -248,10 +252,16 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData, { case sal::detail::textenc::BAD_INPUT_STOP: eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++nConverted; + } else { + nConverted = 
startOfCurrentChar; + } break; case sal::detail::textenc::BAD_INPUT_CONTINUE: eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; + startOfCurrentChar = nConverted + 1; continue; case sal::detail::textenc::BAD_INPUT_NO_OUTPUT: @@ -278,6 +288,10 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData, &nInfo)) { case sal::detail::textenc::BAD_INPUT_STOP: + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) { + nConverted = startOfCurrentChar; + } + [[fallthrough]]; case sal::detail::textenc::BAD_INPUT_CONTINUE: eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; break; diff --git a/sal/textenc/convertiso2022kr.cxx b/sal/textenc/convertiso2022kr.cxx index c65bc8597414..6169969812d2 100644 --- a/sal/textenc/convertiso2022kr.cxx +++ b/sal/textenc/convertiso2022kr.cxx @@ -100,6 +100,7 @@ sal_Size ImplConvertIso2022KrToUnicode(void const * pData, sal_Size nConverted = 0; sal_Unicode * pDestBufPtr = pDestBuf; sal_Unicode * pDestBufEnd = pDestBuf + nDestChars; + sal_Size startOfCurrentChar = 0; if (pContext) { @@ -119,9 +120,10 @@ sal_Size ImplConvertIso2022KrToUnicode(void const * pData, else if (nChar == 0x1B) // ESC eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC; else if (nChar < 0x80) - if (pDestBufPtr != pDestBufEnd) + if (pDestBufPtr != pDestBufEnd) { *pDestBufPtr++ = static_cast<sal_Unicode>(nChar); - else + startOfCurrentChar = nConverted + 1; + } else goto no_output; else { @@ -159,6 +161,7 @@ sal_Size ImplConvertIso2022KrToUnicode(void const * pData, { *pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode); eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001; + startOfCurrentChar = nConverted + 1; } else goto no_output; @@ -211,10 +214,16 @@ sal_Size ImplConvertIso2022KrToUnicode(void const * pData, { case sal::detail::textenc::BAD_INPUT_STOP: eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII; + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++nConverted; + } else { + nConverted = startOfCurrentChar; + } break; case sal::detail::textenc::BAD_INPUT_CONTINUE: 
eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII; + startOfCurrentChar = nConverted + 1; continue; case sal::detail::textenc::BAD_INPUT_NO_OUTPUT: @@ -241,6 +250,10 @@ sal_Size ImplConvertIso2022KrToUnicode(void const * pData, &nInfo)) { case sal::detail::textenc::BAD_INPUT_STOP: + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) { + nConverted = startOfCurrentChar; + } + [[fallthrough]]; case sal::detail::textenc::BAD_INPUT_CONTINUE: eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII; break; diff --git a/sal/textenc/convertsimple.cxx b/sal/textenc/convertsimple.cxx index 386c77ba80a8..9e4950edbac1 100644 --- a/sal/textenc/convertsimple.cxx +++ b/sal/textenc/convertsimple.cxx @@ -540,6 +540,9 @@ sal_Size sal::detail::textenc::convertCharToUnicode( *pInfo |= RTL_TEXTTOUNICODE_INFO_UNDEFINED; if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR ) { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++pSrcBuf; + } *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; break; } diff --git a/sal/textenc/convertsinglebytetobmpunicode.cxx b/sal/textenc/convertsinglebytetobmpunicode.cxx index 3458668a1f57..b883411cbffa 100644 --- a/sal/textenc/convertsinglebytetobmpunicode.cxx +++ b/sal/textenc/convertsinglebytetobmpunicode.cxx @@ -58,6 +58,9 @@ sal_Size rtl_textenc_convertSingleByteToBmpUnicode( &infoFlags)) { case sal::detail::textenc::BAD_INPUT_STOP: + if ((flags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++converted; + } break; case sal::detail::textenc::BAD_INPUT_CONTINUE: diff --git a/sal/textenc/tcvtmb.cxx b/sal/textenc/tcvtmb.cxx index 6faa56b7c362..4e990cc1e91e 100644 --- a/sal/textenc/tcvtmb.cxx +++ b/sal/textenc/tcvtmb.cxx @@ -43,6 +43,7 @@ sal_Size ImplDBCSToUnicode( const void* pData, SAL_UNUSED_PARAMETER void*, const ImplDBCSToUniLeadTab* pLeadTab = pConvertData->mpToUniLeadTab; sal_Unicode* pEndDestBuf; const char* pEndSrcBuf; + char const * startOfCurrentChar = pSrcBuf; *pInfo = 0; pEndDestBuf = pDestBuf+nDestChars; @@ 
-65,12 +66,18 @@ sal_Size ImplDBCSToUnicode( const void* pData, SAL_UNUSED_PARAMETER void*, *pInfo |= RTL_TEXTTOUNICODE_INFO_UNDEFINED; if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR ) { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++pSrcBuf; + } else { + pSrcBuf = startOfCurrentChar; + } *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; break; } if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_IGNORE ) { pSrcBuf++; + startOfCurrentChar = pSrcBuf; continue; } cConv = ImplGetUndefinedUnicodeChar(cLead, nFlags); @@ -158,12 +165,18 @@ sal_Size ImplDBCSToUnicode( const void* pData, SAL_UNUSED_PARAMETER void*, *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++pSrcBuf; + } else { + pSrcBuf = startOfCurrentChar; + } *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; break; } if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE ) { pSrcBuf++; + startOfCurrentChar = pSrcBuf; continue; } cConv = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; @@ -176,12 +189,18 @@ sal_Size ImplDBCSToUnicode( const void* pData, SAL_UNUSED_PARAMETER void*, *pInfo |= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED; if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR ) { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++pSrcBuf; + } else { + pSrcBuf = startOfCurrentChar; + } *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; break; } if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE ) { pSrcBuf++; + startOfCurrentChar = pSrcBuf; continue; } cConv = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; @@ -197,6 +216,7 @@ sal_Size ImplDBCSToUnicode( const void* pData, SAL_UNUSED_PARAMETER void*, *pDestBuf = cConv; pDestBuf++; pSrcBuf++; + 
startOfCurrentChar = pSrcBuf; } *pSrcCvtBytes = nSrcBytes - (pEndSrcBuf-pSrcBuf); @@ -372,6 +392,7 @@ sal_Size ImplEUCJPToUnicode( const void* pData, const ImplEUCJPConvertData* pConvertData = static_cast<const ImplEUCJPConvertData*>(pData); sal_Unicode* pEndDestBuf; const char* pEndSrcBuf; + char const * startOfCurrentChar = pSrcBuf; *pInfo = 0; pEndDestBuf = pDestBuf+nDestChars; @@ -471,18 +492,29 @@ sal_Size ImplEUCJPToUnicode( const void* pData, *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++pSrcBuf; + } else { + pSrcBuf = startOfCurrentChar; + } *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; break; } if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE ) { pSrcBuf++; + startOfCurrentChar = pSrcBuf; continue; } cConv = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; } else { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++pSrcBuf; + } else { + pSrcBuf = startOfCurrentChar; + } *pInfo |= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED; if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR ) { @@ -492,6 +524,7 @@ sal_Size ImplEUCJPToUnicode( const void* pData, if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE ) { pSrcBuf++; + startOfCurrentChar = pSrcBuf; continue; } cConv = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; @@ -508,6 +541,7 @@ sal_Size ImplEUCJPToUnicode( const void* pData, *pDestBuf = cConv; pDestBuf++; pSrcBuf++; + startOfCurrentChar = pSrcBuf; } *pSrcCvtBytes = nSrcBytes - (pEndSrcBuf-pSrcBuf); diff --git a/sal/textenc/tcvtutf7.cxx b/sal/textenc/tcvtutf7.cxx index dd97b213750b..b09ccd47f1d8 100644 --- a/sal/textenc/tcvtutf7.cxx +++ b/sal/textenc/tcvtutf7.cxx @@ -254,6 +254,13 @@ sal_Size ImplUTF7ToUnicode( SAL_UNUSED_PARAMETER const void*, void* pContext, *pInfo |= 
RTL_TEXTTOUNICODE_INFO_INVALID; if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + if (!bEnd) { + ++pSrcBuf; + } + } else { + //TODO: move pSrcBuf back to a reasonable starting place + } *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; break; } @@ -303,6 +310,13 @@ sal_Size ImplUTF7ToUnicode( SAL_UNUSED_PARAMETER const void*, void* pContext, *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + if (!bEnd) { + ++pSrcBuf; + } + } else { + //TODO: move pSrcBuf back to a reasonable starting place + } *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; break; } @@ -344,6 +358,9 @@ sal_Size ImplUTF7ToUnicode( SAL_UNUSED_PARAMETER const void*, void* pContext, *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++pSrcBuf; + } *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; break; } diff --git a/sal/textenc/tcvtutf8.cxx b/sal/textenc/tcvtutf8.cxx index 72b336b9ded4..a2430ff7c075 100644 --- a/sal/textenc/tcvtutf8.cxx +++ b/sal/textenc/tcvtutf8.cxx @@ -76,6 +76,7 @@ sal_Size ImplConvertUtf8ToUnicode( unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes; sal_Unicode * pDestBufPtr = pDestBuf; sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars; + unsigned char const * startOfCurrentChar = pSrcBufPtr; if (pContext != nullptr) { @@ -200,6 +201,7 @@ sal_Size ImplConvertUtf8ToUnicode( } nShift = -1; bCheckBom = false; + startOfCurrentChar = pSrcBufPtr; continue; bad_input: @@ -210,8 +212,12 @@ sal_Size ImplConvertUtf8ToUnicode( case sal::detail::textenc::BAD_INPUT_STOP: nShift = -1; bCheckBom = false; - if (!bConsume) - --pSrcBufPtr; + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + if (!bConsume) + --pSrcBufPtr; 
+ } else { + pSrcBufPtr = startOfCurrentChar; + } break; case sal::detail::textenc::BAD_INPUT_CONTINUE: @@ -219,6 +225,7 @@ sal_Size ImplConvertUtf8ToUnicode( bCheckBom = false; if (!bConsume) --pSrcBufPtr; + startOfCurrentChar = pSrcBufPtr; continue; case sal::detail::textenc::BAD_INPUT_NO_OUTPUT: @@ -245,6 +252,10 @@ sal_Size ImplConvertUtf8ToUnicode( &nInfo)) { case sal::detail::textenc::BAD_INPUT_STOP: + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) { + pSrcBufPtr = startOfCurrentChar; + } + [[fallthrough]]; case sal::detail::textenc::BAD_INPUT_CONTINUE: nShift = -1; bCheckBom = false; |