tdf#161737 i18npool: fix fake spelling alarms with NNBSP

Fix word breaking by excluding the narrow no-break space (NNBSP) at the end of words for spell checking. This was a problem e.g. for French, where an (automatically? or manually) inserted narrow no-break space is used to get correct typography before exclamation and question marks, and also after and before guillemets (if the OpenType/Graphite font doesn't have this feature).

Regression from commit 44699b3de37f07090ac6fee1cd97aa76036e9700 "tdf#49885 BreakIterator rule upgrades".

Note: this also fixes the problem that digits separated by an NNBSP thousand separator weren't handled by spell checking, raising false spelling alarms when "Check words with numbers" was enabled in Tools->Options->Languages and Locales->Writing Aids. (TODO: in the case of thousand separators, remove the NNBSP in the linguistic module or in the spell checking dictionaries, to allow checking numbers with thousand separators and with the correct suffix.)

Change-Id: I36e10add7e0ba840f207a375ccc8668dbfef9572
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/169618
Tested-by: Jenkins
Reviewed-by: László Németh <nemeth@numbertext.org>
(cherry picked from commit 6e002da1615b52cda4e9331e87878458b1fe9677)
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/169593
Reviewed-by: Christian Lohmaier <lohmaier+LibreOffice@googlemail.com>
author: László Németh <nemeth@numbertext.org> 2024-06-27 10:06:03 +0200
committer: Christian Lohmaier <lohmaier+LibreOffice@googlemail.com> 2024-07-08 17:09:02 +0200
commit: fc2bba731459b5ba2ed88fc8212f90b6ae08c15a (patch)
tree: d46581b263ef538f383075eda55369e9b84fbb71
parent: 0c3b1fec87b1d1f32832e2265918f68f93e2aca7 (diff)
2 files changed, 31 insertions, 1 deletions
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index e790c17e1155..6fbde026f565 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -992,6 +992,36 @@ void TestBreakIterator::testWordBoundaries()
         CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
         CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
     }
+
+    //  tdf#161737: narrow no-break space at the end of words resulted in spelling mistakes
+    {
+        aLocale.Language = "en";
+        aLocale.Country = "US";
+
+        OUString aTest(u"L’espace fine insécable\u202F!"_ustr);
+        aBounds
+            = m_xBreak->getWordBoundary(aTest, 14, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos);
+        // This was 24 (word + NNBSP)
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds.endPos);
+    }
+
+    //  tdf#161737: narrow no-break space between digits resulted in spelling mistakes
+    //  as a quick fix, limit NNBSP as word-part character only for editing, and not for spell checking
+    //  TODO: remove NNBSP by the linguistic module or by the spell checking dictionaries to allow
+    //  checking numbers with thousand separators and with the correct suffix
+    {
+        aLocale.Language = "en";
+        aLocale.Country = "US";
+
+        OUString aTest(u"1\u202F000\u202F000"_ustr);
+        aBounds
+            = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+        // This was 0 (word + NNBSP)
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+        // This was 8 (word + NNBSP)
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+    }
 }
 
 void TestBreakIterator::testSentenceBoundaries()
diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt
index f804b0eec214..deeec7dd659e 100644
--- a/i18npool/source/breakiterator/data/dict_word.txt
+++ b/i18npool/source/breakiterator/data/dict_word.txt
@@ -54,7 +54,7 @@ $Double_Quote       = [\p{Word_Break = Double_Quote}];
 $MidNumLet          = [\p{Word_Break = MidNumLet}];
 $MidNum             = [\p{Word_Break = MidNum}];
 $Numeric            = [\p{Word_Break = Numeric}];
-$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}-[:name = NARROW NO-BREAK SPACE:]];
 $WSegSpace          = [\p{Word_Break = WSegSpace}];
 $Extended_Pict      = [\p{Extended_Pictographic}];
author	László Németh <nemeth@numbertext.org>	2024-06-27 10:06:03 +0200
committer	Christian Lohmaier <lohmaier+LibreOffice@googlemail.com>	2024-07-08 17:09:02 +0200
commit	fc2bba731459b5ba2ed88fc8212f90b6ae08c15a (patch)
tree	d46581b263ef538f383075eda55369e9b84fbb71
parent	0c3b1fec87b1d1f32832e2265918f68f93e2aca7 (diff)