diff options
author | Mike Kaganski <mike.kaganski@collabora.com> | 2023-05-19 14:01:02 +0300 |
---|---|---|
committer | Mike Kaganski <mike.kaganski@collabora.com> | 2023-05-20 13:31:21 +0200 |
commit | 075ecc1c31199d0fd0f930cf1b803b04a3b17ce8 (patch) | |
tree | e34866de1da909795b8a6a54b7ac679b11857c58 /editeng | |
parent | f692b5c5e4e9c57fb69d0054c654486a737d19bd (diff) |
tdf#155407: fix the second replacement in FnChgToEnEmDash
It was broken from the beginning. The second replacement could
look into a wrong string when checking if the characters around
the "--" are eligible; it could use obsolete indices in the
document, ignoring the previous replacement that changed the
length of the text.
This also replaces a use of char* to hold Unicode codepoints to
pass to lcl_IsInAsciiArr, with an array of sal_Unicode (because
all the checked values fit into it).
Change-Id: I949630abc564fc0875be0b92228846497bb1a022
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/152002
Tested-by: Jenkins
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
Diffstat (limited to 'editeng')
-rw-r--r-- | editeng/source/misc/svxacorr.cxx | 102 |
1 files changed, 58 insertions, 44 deletions
diff --git a/editeng/source/misc/svxacorr.cxx b/editeng/source/misc/svxacorr.cxx index dfb1e6c0d726..318da6536df9 100644 --- a/editeng/source/misc/svxacorr.cxx +++ b/editeng/source/misc/svxacorr.cxx @@ -100,11 +100,13 @@ constexpr OUStringLiteral pXMLImplWordStart_ExcptLstStr = u"WordExceptList.xml"; constexpr OUStringLiteral pXMLImplCplStt_ExcptLstStr = u"SentenceExceptList.xml"; constexpr OUStringLiteral pXMLImplAutocorr_ListStr = u"DocumentList.xml"; -const char +// tdf#54409 check also typographical quotation marks in the case of skipped ASCII quotation marks +// Curious, why these \u0083\u0084\u0089\u0091\u0092\u0093\u0094 are handled as "begin characters"? +constexpr std::u16string_view /* also at these beginnings - Brackets and all kinds of begin characters */ - sImplSttSkipChars[] = "\"\'([{\x83\x84\x89\x91\x92\x93\x94", + sImplSttSkipChars = u"\"'([{\u2018\u2019\u201a\u201b\u201c\u201d\u201e\u201f\u0083\u0084\u0089\u0091\u0092\u0093\u0094", /* also at these ends - Brackets and all kinds of begin characters */ - sImplEndSkipChars[] = "\"\')]}\x83\x84\x89\x91\x92\x93\x94"; + sImplEndSkipChars = u"\"')]}\u2018\u2019\u201a\u201b\u201c\u201d\u201e\u201f\u0083\u0084\u0089\u0091\u0092\u0093\u0094"; static OUString EncryptBlockName_Imp(std::u16string_view rName); @@ -171,20 +173,12 @@ static bool lcl_IsSymbolChar( CharClass const & rCC, const OUString& rTxt, return false; } -static bool lcl_IsInAsciiArr( const char* pArr, const sal_Unicode c ) +static bool lcl_IsInArr(std::u16string_view arr, const sal_uInt32 c) { - // tdf#54409 check also typographical quotation marks in the case of skipped ASCII quotation marks - if ( 0x2018 <= c && c <= 0x201F && (pArr == sImplSttSkipChars || pArr == sImplEndSkipChars) ) - return true; - - bool bRet = false; - for( ; *pArr; ++pArr ) - if( *pArr == c ) - { - bRet = true; - break; - } - return bRet; + for (const auto c1 : arr) + if (c1 == c) + return true; + return false; } SvxAutoCorrDoc::~SvxAutoCorrDoc() @@ -312,6 +306,8 @@ 
ACFlags SvxAutoCorrect::GetDefaultFlags() constexpr sal_Unicode cEmDash = 0x2014; constexpr sal_Unicode cEnDash = 0x2013; +constexpr OUStringLiteral sEmDash(u"\u2014"); +constexpr OUStringLiteral sEnDash(u"\u2013"); constexpr sal_Unicode cApostrophe = 0x2019; constexpr sal_Unicode cLeftDoubleAngleQuote = 0xAB; constexpr sal_Unicode cRightDoubleAngleQuote = 0xBB; @@ -485,10 +481,10 @@ bool SvxAutoCorrect::FnChgOrdinalNumber( CharClass& rCC = GetCharClass(eLang); for (; nSttPos < nEndPos; ++nSttPos) - if (!lcl_IsInAsciiArr(sImplSttSkipChars, rTxt[nSttPos])) + if (!lcl_IsInArr(sImplSttSkipChars, rTxt[nSttPos])) break; for (; nSttPos < nEndPos; --nEndPos) - if (!lcl_IsInAsciiArr(sImplEndSkipChars, rTxt[nEndPos - 1])) + if (!lcl_IsInArr(sImplEndSkipChars, rTxt[nEndPos - 1])) break; @@ -558,6 +554,7 @@ bool SvxAutoCorrect::FnChgToEnEmDash( // rTxt may refer to the frame text that will change in the calls to rDoc.Delete / rDoc.Insert; // keep a local copy for later use OUString aOrigTxt = rTxt; + sal_Int32 nFirstReplacementTextLengthChange = 0; // replace " - " or " --" with "enDash" if( 1 < nSttPos && 1 <= nEndPos - nSttPos ) @@ -570,7 +567,7 @@ bool SvxAutoCorrect::FnChgToEnEmDash( '-' == rTxt[ nSttPos+1 ]) { sal_Int32 n; - for( n = nSttPos+2; n < nEndPos && lcl_IsInAsciiArr( + for( n = nSttPos+2; n < nEndPos && lcl_IsInArr( sImplSttSkipChars,(cCh = rTxt[ n ])); ++n ) ; @@ -578,7 +575,7 @@ bool SvxAutoCorrect::FnChgToEnEmDash( // found: " --[<AnySttChars>][A-z0-9] if( rCC.isLetterNumeric( OUString(cCh) ) ) { - for( n = nSttPos-1; n && lcl_IsInAsciiArr( + for( n = nSttPos-1; n && lcl_IsInArr( sImplEndSkipChars,(cCh = rTxt[ --n ])); ) ; @@ -586,7 +583,8 @@ bool SvxAutoCorrect::FnChgToEnEmDash( if( rCC.isLetterNumeric( OUString(cCh) )) { rDoc.Delete( nSttPos, nSttPos + 2 ); - rDoc.Insert( nSttPos, bAlwaysUseEmDash ? OUString(cEmDash) : OUString(cEnDash) ); + rDoc.Insert( nSttPos, bAlwaysUseEmDash ? 
sEmDash : sEnDash ); + nFirstReplacementTextLengthChange = -1; // 2 ch -> 1 ch bRet = true; } } @@ -605,7 +603,7 @@ bool SvxAutoCorrect::FnChgToEnEmDash( } if( ' ' == cCh ) { - for( n = nSttPos; n < nEndPos && lcl_IsInAsciiArr( + for( n = nSttPos; n < nEndPos && lcl_IsInArr( sImplSttSkipChars,(cCh = rTxt[ n ])); ++n ) ; @@ -614,14 +612,15 @@ bool SvxAutoCorrect::FnChgToEnEmDash( if( rCC.isLetterNumeric( OUString(cCh) ) ) { cCh = ' '; - for( n = nTmpPos-1; n && lcl_IsInAsciiArr( + for( n = nTmpPos-1; n && lcl_IsInArr( sImplEndSkipChars,(cCh = rTxt[ --n ])); ) ; // found: "[A-z0-9][<AnyEndChars>] - [<AnySttChars>][A-z0-9] if( rCC.isLetterNumeric( OUString(cCh) )) { rDoc.Delete( nTmpPos, nTmpPos + nLen ); - rDoc.Insert( nTmpPos, bAlwaysUseEmDash ? OUString(cEmDash) : OUString(cEnDash) ); + rDoc.Insert( nTmpPos, bAlwaysUseEmDash ? sEmDash : sEnDash ); + nFirstReplacementTextLengthChange = 1 - nLen; // nLen ch -> 1 ch bRet = true; } } @@ -635,20 +634,35 @@ bool SvxAutoCorrect::FnChgToEnEmDash( bool bEnDash = (eLang == LANGUAGE_HUNGARIAN || eLang == LANGUAGE_FINNISH); if( 4 <= nEndPos - nSttPos ) { - OUString sTmp( aOrigTxt.subView( nSttPos, nEndPos - nSttPos ) ); - sal_Int32 nFndPos = sTmp.indexOf("--"); - if( nFndPos != -1 && nFndPos && - nFndPos + 2 < sTmp.getLength() && - ( rCC.isLetterNumeric( sTmp, nFndPos - 1 ) || - lcl_IsInAsciiArr( sImplEndSkipChars, aOrigTxt[ nFndPos - 1 ] )) && - ( rCC.isLetterNumeric( sTmp, nFndPos + 2 ) || - lcl_IsInAsciiArr( sImplSttSkipChars, aOrigTxt[ nFndPos + 2 ] ))) + std::u16string_view sTmpView( aOrigTxt.subView( nSttPos, nEndPos - nSttPos ) ); + size_t nFndPos = sTmpView.find(u"--"); + if (nFndPos > 0 && nFndPos < sTmpView.size() - 2) { - nSttPos = nSttPos + nFndPos; - rDoc.Delete( nSttPos, nSttPos + 2 ); - rDoc.Insert( nSttPos, (bEnDash || (rCC.isDigit( sTmp, nFndPos - 1 ) && - rCC.isDigit( sTmp, nFndPos + 2 )) ? OUString(cEnDash) : OUString(cEmDash)) ); - bRet = true; + // Use proper codepoints. 
Currently, CharClass::isLetterNumeric is broken, it + // uses the index *both* as code unit index (when checking it as ASCII), *and* + // as code point index (when passes to css::i18n::XCharacterClassification). + // Oh well... Anyway, single-codepoint strings will workaround it. + sal_Int32 nStart = nSttPos + nFndPos; + sal_uInt32 chStart = aOrigTxt.iterateCodePoints(&nStart, -1); + OUString sStart(&chStart, 1); + // No idea why sImplEndSkipChars is checked at start + if (rCC.isLetterNumeric(sStart, 0) || lcl_IsInArr(sImplEndSkipChars, chStart)) + { + sal_Int32 nEnd = nSttPos + nFndPos + 2; + sal_uInt32 chEnd = aOrigTxt.iterateCodePoints(&nEnd, 1); + OUString sEnd(&chEnd, 1); + // No idea why sImplSttSkipChars is checked at end + if (rCC.isLetterNumeric(sEnd, 0) || lcl_IsInArr(sImplSttSkipChars, chEnd)) + { + nSttPos = nSttPos + nFndPos + nFirstReplacementTextLengthChange; + rDoc.Delete(nSttPos, nSttPos + 2); + rDoc.Insert(nSttPos, + (bEnDash || (rCC.isDigit(sStart, 0) && rCC.isDigit(sEnd, 0)) + ? sEnDash + : sEmDash)); + bRet = true; + } + } } } return bRet; @@ -885,7 +899,7 @@ void SvxAutoCorrect::FnCapitalStartSentence( SvxAutoCorrDoc& rDoc, } else if (pWordStt && !rCC.isDigit(aText, pStr - pStart)) { - if( (lcl_IsInAsciiArr( "-'", *pStr ) || *pStr == cApostrophe) && // These characters are allowed in words + if( (lcl_IsInArr( u"-'", *pStr ) || *pStr == cApostrophe) && // These characters are allowed in words pWordStt - 1 == pStr && // Installation at beginning of paragraph. 
Replaced < by <= (#i38971#) (pStart + 1) <= pStr && @@ -922,7 +936,7 @@ void SvxAutoCorrect::FnCapitalStartSentence( SvxAutoCorrDoc& rDoc, // Only capitalize, if string before specified characters is long enough if( *pDelim && 2 >= pDelim - pWordStt && - lcl_IsInAsciiArr( ".-)>", *pDelim ) ) + lcl_IsInArr( u".-)>", *pDelim ) ) return; // tdf#59666 don't capitalize single Greek letters (except in Greek texts) @@ -1350,7 +1364,7 @@ void SvxAutoCorrect::DoAutoCorrect( SvxAutoCorrDoc& rDoc, const OUString& rTxt, { sal_Unicode cPrev = rTxt[ nInsPos-1 ]; bSttQuote = NonFieldWordDelim(cPrev) || - lcl_IsInAsciiArr( "([{", cPrev ) || + lcl_IsInArr( u"([{", cPrev ) || ( cEmDash == cPrev ) || ( cEnDash == cPrev ); // tdf#38394 use opening quotation mark << in French l'<<word>> @@ -1570,11 +1584,11 @@ void SvxAutoCorrect::DoAutoCorrect( SvxAutoCorrDoc& rDoc, const OUString& rTxt, { sal_Int32 nCapLttrPos1 = nCapLttrPos, nInsPos1 = nInsPos; while( nCapLttrPos1 < nInsPos && - lcl_IsInAsciiArr( sImplSttSkipChars, rTxt[ nCapLttrPos1 ] ) + lcl_IsInArr( sImplSttSkipChars, rTxt[ nCapLttrPos1 ] ) ) ++nCapLttrPos1; while( nCapLttrPos1 < nInsPos1 && nInsPos1 && - lcl_IsInAsciiArr( sImplEndSkipChars, rTxt[ nInsPos1-1 ] ) + lcl_IsInArr( sImplEndSkipChars, rTxt[ nInsPos1-1 ] ) ) --nInsPos1; @@ -1767,7 +1781,7 @@ OUString SvxAutoCorrect::GetPrevAutoCorrWord(SvxAutoCorrDoc const& rDoc, const O if( !nPos && !IsWordDelim( rTxt[ 0 ])) --nCapLttrPos; // Beginning of paragraph and no Blank! - while( lcl_IsInAsciiArr( sImplSttSkipChars, rTxt[ nCapLttrPos ]) ) + while( lcl_IsInArr( sImplSttSkipChars, rTxt[ nCapLttrPos ]) ) if( ++nCapLttrPos >= nEnd ) return sRet; @@ -1923,7 +1937,7 @@ OUString EncryptBlockName_Imp(std::u16string_view rName) aName.append('#').append(rName); for (size_t nLen = rName.size(), nPos = 1; nPos < nLen; ++nPos) { - if (lcl_IsInAsciiArr( "!/:.\\", aName[nPos])) + if (lcl_IsInArr( u"!/:.\\", aName[nPos])) aName[nPos] &= 0x0f; } return aName.makeStringAndClear(); |