tdf#155407: fix the second replacement in FnChgToEnEmDash

It was broken from the beginning. The second replacement could look into a wrong string when checking if the characters around the "--" are eligible; it could use obsolete indices in the document, ignoring the previous replacement that changed the lendth of the text. This also replaces a use of char* to hold Unicode codepoints to pass to lcl_IsInAsciiArr, with an array of sal_Unicode (because all the checked values fit into it). Change-Id: I949630abc564fc0875be0b92228846497bb1a022 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/152002 Tested-by: Jenkins Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
author: Mike Kaganski <mike.kaganski@collabora.com> 2023-05-19 14:01:02 +0300
committer: Mike Kaganski <mike.kaganski@collabora.com> 2023-05-20 13:31:21 +0200
commit: 075ecc1c31199d0fd0f930cf1b803b04a3b17ce8 (patch)
tree: e34866de1da909795b8a6a54b7ac679b11857c58 /editeng
parent: f692b5c5e4e9c57fb69d0054c654486a737d19bd (diff)
1 files changed, 58 insertions, 44 deletions
diff --git a/editeng/source/misc/svxacorr.cxx b/editeng/source/misc/svxacorr.cxx
index dfb1e6c0d726..318da6536df9 100644
--- a/editeng/source/misc/svxacorr.cxx
+++ b/editeng/source/misc/svxacorr.cxx
@@ -100,11 +100,13 @@ constexpr OUStringLiteral pXMLImplWordStart_ExcptLstStr = u"WordExceptList.xml";
 constexpr OUStringLiteral pXMLImplCplStt_ExcptLstStr = u"SentenceExceptList.xml";
 constexpr OUStringLiteral pXMLImplAutocorr_ListStr = u"DocumentList.xml";
 
-const char
+// tdf#54409 check also typographical quotation marks in the case of skipped ASCII quotation marks
+// Curious, why these \u0083\u0084\u0089\u0091\u0092\u0093\u0094 are handled as "begin characters"?
+constexpr std::u16string_view
     /* also at these beginnings - Brackets and all kinds of begin characters */
-    sImplSttSkipChars[] = "\"\'([{\x83\x84\x89\x91\x92\x93\x94",
+    sImplSttSkipChars = u"\"'([{\u2018\u2019\u201a\u201b\u201c\u201d\u201e\u201f\u0083\u0084\u0089\u0091\u0092\u0093\u0094",
     /* also at these ends - Brackets and all kinds of begin characters */
-    sImplEndSkipChars[] = "\"\')]}\x83\x84\x89\x91\x92\x93\x94";
+    sImplEndSkipChars = u"\"')]}\u2018\u2019\u201a\u201b\u201c\u201d\u201e\u201f\u0083\u0084\u0089\u0091\u0092\u0093\u0094";
 
 static OUString EncryptBlockName_Imp(std::u16string_view rName);
 
@@ -171,20 +173,12 @@ static bool lcl_IsSymbolChar( CharClass const & rCC, const OUString& rTxt,
     return false;
 }
 
-static bool lcl_IsInAsciiArr( const char* pArr, const sal_Unicode c )
+static bool lcl_IsInArr(std::u16string_view arr, const sal_uInt32 c)
 {
-    // tdf#54409 check also typographical quotation marks in the case of skipped ASCII quotation marks
-    if ( 0x2018 <= c && c <= 0x201F && (pArr == sImplSttSkipChars || pArr == sImplEndSkipChars) )
-        return true;
-
-    bool bRet = false;
-    for( ; *pArr; ++pArr )
-        if( *pArr == c )
-        {
-            bRet = true;
-            break;
-        }
-    return bRet;
+    for (const auto c1 : arr)
+        if (c1 == c)
+            return true;
+    return false;
 }
 
 SvxAutoCorrDoc::~SvxAutoCorrDoc()
@@ -312,6 +306,8 @@ ACFlags SvxAutoCorrect::GetDefaultFlags()
 
 constexpr sal_Unicode cEmDash = 0x2014;
 constexpr sal_Unicode cEnDash = 0x2013;
+constexpr OUStringLiteral sEmDash(u"\u2014");
+constexpr OUStringLiteral sEnDash(u"\u2013");
 constexpr sal_Unicode cApostrophe = 0x2019;
 constexpr sal_Unicode cLeftDoubleAngleQuote = 0xAB;
 constexpr sal_Unicode cRightDoubleAngleQuote = 0xBB;
@@ -485,10 +481,10 @@ bool SvxAutoCorrect::FnChgOrdinalNumber(
         CharClass& rCC = GetCharClass(eLang);
 
         for (; nSttPos < nEndPos; ++nSttPos)
-            if (!lcl_IsInAsciiArr(sImplSttSkipChars, rTxt[nSttPos]))
+            if (!lcl_IsInArr(sImplSttSkipChars, rTxt[nSttPos]))
                 break;
         for (; nSttPos < nEndPos; --nEndPos)
-            if (!lcl_IsInAsciiArr(sImplEndSkipChars, rTxt[nEndPos - 1]))
+            if (!lcl_IsInArr(sImplEndSkipChars, rTxt[nEndPos - 1]))
                 break;
 
 
@@ -558,6 +554,7 @@ bool SvxAutoCorrect::FnChgToEnEmDash(
     // rTxt may refer to the frame text that will change in the calls to rDoc.Delete / rDoc.Insert;
     // keep a local copy for later use
     OUString aOrigTxt = rTxt;
+    sal_Int32 nFirstReplacementTextLengthChange = 0;
 
     // replace " - " or " --" with "enDash"
     if( 1 < nSttPos && 1 <= nEndPos - nSttPos )
@@ -570,7 +567,7 @@ bool SvxAutoCorrect::FnChgToEnEmDash(
                 '-' == rTxt[ nSttPos+1 ])
             {
                 sal_Int32 n;
-                for( n = nSttPos+2; n < nEndPos && lcl_IsInAsciiArr(
+                for( n = nSttPos+2; n < nEndPos && lcl_IsInArr(
                             sImplSttSkipChars,(cCh = rTxt[ n ]));
                         ++n )
                     ;
@@ -578,7 +575,7 @@ bool SvxAutoCorrect::FnChgToEnEmDash(
                 // found: " --[<AnySttChars>][A-z0-9]
                 if( rCC.isLetterNumeric( OUString(cCh) ) )
                 {
-                    for( n = nSttPos-1; n && lcl_IsInAsciiArr(
+                    for( n = nSttPos-1; n && lcl_IsInArr(
                             sImplEndSkipChars,(cCh = rTxt[ --n ])); )
                         ;
 
@@ -586,7 +583,8 @@ bool SvxAutoCorrect::FnChgToEnEmDash(
                     if( rCC.isLetterNumeric( OUString(cCh) ))
                     {
                         rDoc.Delete( nSttPos, nSttPos + 2 );
-                        rDoc.Insert( nSttPos, bAlwaysUseEmDash ? OUString(cEmDash) : OUString(cEnDash) );
+                        rDoc.Insert( nSttPos, bAlwaysUseEmDash ? sEmDash : sEnDash );
+                        nFirstReplacementTextLengthChange = -1; // 2 ch -> 1 ch
                         bRet = true;
                     }
                 }
@@ -605,7 +603,7 @@ bool SvxAutoCorrect::FnChgToEnEmDash(
             }
             if( ' ' == cCh )
             {
-                for( n = nSttPos; n < nEndPos && lcl_IsInAsciiArr(
+                for( n = nSttPos; n < nEndPos && lcl_IsInArr(
                             sImplSttSkipChars,(cCh = rTxt[ n ]));
                         ++n )
                     ;
@@ -614,14 +612,15 @@ bool SvxAutoCorrect::FnChgToEnEmDash(
                 if( rCC.isLetterNumeric( OUString(cCh) ) )
                 {
                     cCh = ' ';
-                    for( n = nTmpPos-1; n && lcl_IsInAsciiArr(
+                    for( n = nTmpPos-1; n && lcl_IsInArr(
                             sImplEndSkipChars,(cCh = rTxt[ --n ])); )
                             ;
                     // found: "[A-z0-9][<AnyEndChars>] - [<AnySttChars>][A-z0-9]
                     if( rCC.isLetterNumeric( OUString(cCh) ))
                     {
                         rDoc.Delete( nTmpPos, nTmpPos + nLen );
-                        rDoc.Insert( nTmpPos, bAlwaysUseEmDash ? OUString(cEmDash) : OUString(cEnDash) );
+                        rDoc.Insert( nTmpPos, bAlwaysUseEmDash ? sEmDash : sEnDash );
+                        nFirstReplacementTextLengthChange = 1 - nLen; // nLen ch -> 1 ch
                         bRet = true;
                     }
                 }
@@ -635,20 +634,35 @@ bool SvxAutoCorrect::FnChgToEnEmDash(
     bool bEnDash = (eLang == LANGUAGE_HUNGARIAN || eLang == LANGUAGE_FINNISH);
     if( 4 <= nEndPos - nSttPos )
     {
-        OUString sTmp( aOrigTxt.subView( nSttPos, nEndPos - nSttPos ) );
-        sal_Int32 nFndPos = sTmp.indexOf("--");
-        if( nFndPos != -1 && nFndPos &&
-            nFndPos + 2 < sTmp.getLength() &&
-            ( rCC.isLetterNumeric( sTmp, nFndPos - 1 ) ||
-              lcl_IsInAsciiArr( sImplEndSkipChars, aOrigTxt[ nFndPos - 1 ] )) &&
-            ( rCC.isLetterNumeric( sTmp, nFndPos + 2 ) ||
-            lcl_IsInAsciiArr( sImplSttSkipChars, aOrigTxt[ nFndPos + 2 ] )))
+        std::u16string_view sTmpView( aOrigTxt.subView( nSttPos, nEndPos - nSttPos ) );
+        size_t nFndPos = sTmpView.find(u"--");
+        if (nFndPos > 0 && nFndPos < sTmpView.size() - 2)
         {
-            nSttPos = nSttPos + nFndPos;
-            rDoc.Delete( nSttPos, nSttPos + 2 );
-            rDoc.Insert( nSttPos, (bEnDash || (rCC.isDigit( sTmp, nFndPos - 1 ) &&
-                rCC.isDigit( sTmp, nFndPos + 2 )) ? OUString(cEnDash) : OUString(cEmDash)) );
-            bRet = true;
+            // Use proper codepoints. Currently, CharClass::isLetterNumeric is broken, it
+            // uses the index *both* as code unit index (when checking it as ASCII), *and*
+            // as code point index (when passes to css::i18n::XCharacterClassification).
+            // Oh well... Anyway, single-codepoint strings will workaround it.
+            sal_Int32 nStart = nSttPos + nFndPos;
+            sal_uInt32 chStart = aOrigTxt.iterateCodePoints(&nStart, -1);
+            OUString sStart(&chStart, 1);
+            // No idea why sImplEndSkipChars is checked at start
+            if (rCC.isLetterNumeric(sStart, 0) || lcl_IsInArr(sImplEndSkipChars, chStart))
+            {
+                sal_Int32 nEnd = nSttPos + nFndPos + 2;
+                sal_uInt32 chEnd = aOrigTxt.iterateCodePoints(&nEnd, 1);
+                OUString sEnd(&chEnd, 1);
+                // No idea why sImplSttSkipChars is checked at end
+                if (rCC.isLetterNumeric(sEnd, 0) || lcl_IsInArr(sImplSttSkipChars, chEnd))
+                {
+                    nSttPos = nSttPos + nFndPos + nFirstReplacementTextLengthChange;
+                    rDoc.Delete(nSttPos, nSttPos + 2);
+                    rDoc.Insert(nSttPos,
+                                (bEnDash || (rCC.isDigit(sStart, 0) && rCC.isDigit(sEnd, 0))
+                                     ? sEnDash
+                                     : sEmDash));
+                    bRet = true;
+                }
+            }
         }
     }
     return bRet;
@@ -885,7 +899,7 @@ void SvxAutoCorrect::FnCapitalStartSentence( SvxAutoCorrDoc& rDoc,
         }
         else if (pWordStt && !rCC.isDigit(aText, pStr - pStart))
         {
-            if( (lcl_IsInAsciiArr( "-'", *pStr ) || *pStr == cApostrophe) && // These characters are allowed in words
+            if( (lcl_IsInArr( u"-'", *pStr ) || *pStr == cApostrophe) && // These characters are allowed in words
                 pWordStt - 1 == pStr &&
                 // Installation at beginning of paragraph. Replaced < by <= (#i38971#)
                 (pStart + 1) <= pStr &&
@@ -922,7 +936,7 @@ void SvxAutoCorrect::FnCapitalStartSentence( SvxAutoCorrDoc& rDoc,
 
     // Only capitalize, if string before specified characters is long enough
     if( *pDelim && 2 >= pDelim - pWordStt &&
-        lcl_IsInAsciiArr( ".-)>", *pDelim ) )
+        lcl_IsInArr( u".-)>", *pDelim ) )
         return;
 
     // tdf#59666 don't capitalize single Greek letters (except in Greek texts)
@@ -1350,7 +1364,7 @@ void SvxAutoCorrect::DoAutoCorrect( SvxAutoCorrDoc& rDoc, const OUString& rTxt,
                 {
                     sal_Unicode cPrev = rTxt[ nInsPos-1 ];
                     bSttQuote = NonFieldWordDelim(cPrev) ||
-                        lcl_IsInAsciiArr( "([{", cPrev ) ||
+                        lcl_IsInArr( u"([{", cPrev ) ||
                         ( cEmDash == cPrev ) ||
                         ( cEnDash == cPrev );
                     // tdf#38394 use opening quotation mark << in French l'<<word>>
@@ -1570,11 +1584,11 @@ void SvxAutoCorrect::DoAutoCorrect( SvxAutoCorrDoc& rDoc, const OUString& rTxt,
             {
                 sal_Int32 nCapLttrPos1 = nCapLttrPos, nInsPos1 = nInsPos;
                 while( nCapLttrPos1 < nInsPos &&
-                        lcl_IsInAsciiArr( sImplSttSkipChars, rTxt[ nCapLttrPos1 ] )
+                        lcl_IsInArr( sImplSttSkipChars, rTxt[ nCapLttrPos1 ] )
                         )
                         ++nCapLttrPos1;
                 while( nCapLttrPos1 < nInsPos1 && nInsPos1 &&
-                        lcl_IsInAsciiArr( sImplEndSkipChars, rTxt[ nInsPos1-1 ] )
+                        lcl_IsInArr( sImplEndSkipChars, rTxt[ nInsPos1-1 ] )
                         )
                         --nInsPos1;
 
@@ -1767,7 +1781,7 @@ OUString SvxAutoCorrect::GetPrevAutoCorrWord(SvxAutoCorrDoc const& rDoc, const O
     if( !nPos && !IsWordDelim( rTxt[ 0 ]))
         --nCapLttrPos;          // Beginning of paragraph and no Blank!
 
-    while( lcl_IsInAsciiArr( sImplSttSkipChars, rTxt[ nCapLttrPos ]) )
+    while( lcl_IsInArr( sImplSttSkipChars, rTxt[ nCapLttrPos ]) )
         if( ++nCapLttrPos >= nEnd )
             return sRet;
 
@@ -1923,7 +1937,7 @@ OUString EncryptBlockName_Imp(std::u16string_view rName)
     aName.append('#').append(rName);
     for (size_t nLen = rName.size(), nPos = 1; nPos < nLen; ++nPos)
     {
-        if (lcl_IsInAsciiArr( "!/:.\\", aName[nPos]))
+        if (lcl_IsInArr( u"!/:.\\", aName[nPos]))
             aName[nPos] &= 0x0f;
     }
     return aName.makeStringAndClear();
author	Mike Kaganski <mike.kaganski@collabora.com>	2023-05-19 14:01:02 +0300
committer	Mike Kaganski <mike.kaganski@collabora.com>	2023-05-20 13:31:21 +0200
commit	075ecc1c31199d0fd0f930cf1b803b04a3b17ce8 (patch)
tree	e34866de1da909795b8a6a54b7ac679b11857c58 /editeng
parent	f692b5c5e4e9c57fb69d0054c654486a737d19bd (diff)