diff options
author | Jonathan Clark <jonathan@libreoffice.org> | 2024-12-02 16:03:43 -0700 |
---|---|---|
committer | Christian Lohmaier <lohmaier+LibreOffice@googlemail.com> | 2024-12-10 16:00:44 +0100 |
commit | 497ca77d329aef04f30d74f1ddcea67ac62712f5 (patch) | |
tree | d234209c16754d53943be299bb2daafb09c752f9 | |
parent | 1338f45542a017b136a1c5b347ea24f7583015a4 (diff) |
tdf#162912 i18npool: Updated CJK BreakIterator to use custom rules
Regression from commit 14c6cde779d64596eab0f4d3f32f181ce2243929:
"tdf#49885 Updated CJK BreakIterator to use ICU"
Previously, languages requiring dictionary-based break iterators were
handled by instantiating a stock ICU break iterator as a special case.
tdf#49885 upgraded our custom rules to support passthrough for
dictionary-based breaking, so this special case is no longer necessary.
Change-Id: Iebb06de82eb511946e5b220e5dc414440838b03c
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177713
Tested-by: Jenkins
Reviewed-by: Jonathan Clark <jonathan@libreoffice.org>
Signed-off-by: Xisco Fauli <xiscofauli@libreoffice.org>
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177754
(cherry picked from commit 10ee7d30f7c1c8c9b80155341c2bf1639ca21d5f)
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177764
Tested-by: Christian Lohmaier <lohmaier+LibreOffice@googlemail.com>
Reviewed-by: Christian Lohmaier <lohmaier+LibreOffice@googlemail.com>
Reviewed-by: Michael Stahl <michael.stahl@allotropia.de>
-rw-r--r-- | i18npool/qa/cppunit/test_breakiterator.cxx | 249 |
-rw-r--r-- | i18npool/source/breakiterator/breakiterator_unicode.cxx | 12 |
2 files changed, 249 insertions, 12 deletions
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index 81df29e5950e..8853416f5677 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -52,6 +52,7 @@ public: void testHebrewGereshGershaim(); void testLegacySurrogatePairs(); void testWordCount(); + void testDictionaryIteratorLanguages(); CPPUNIT_TEST_SUITE(TestBreakIterator); CPPUNIT_TEST(testLineBreaking); @@ -76,6 +77,7 @@ public: CPPUNIT_TEST(testHebrewGereshGershaim); CPPUNIT_TEST(testLegacySurrogatePairs); CPPUNIT_TEST(testWordCount); + CPPUNIT_TEST(testDictionaryIteratorLanguages); CPPUNIT_TEST_SUITE_END(); private: @@ -1597,6 +1599,25 @@ void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > co CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); } + + { + // tdf#162912: Double-clicking should only select one Basic identifier + static constexpr OUString aTest = u"ThisComponent.CurrentSelection"_ustr; + + aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + + aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos); + + aBounds = xBreak->getWordBoundary(aTest, 15, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + } } void TestBreakIterator::testJapanese() @@ -1960,7 +1981,233 @@ void TestBreakIterator::testWordCount() const OUString str = u"Wordの様にワード数をするのにTest\n植松町"_ustr; - CPPUNIT_ASSERT_EQUAL(7, count_words_fn(str, aLocale)); + CPPUNIT_ASSERT_EQUAL(8, count_words_fn(str, aLocale)); + } +} + +void 
TestBreakIterator::testDictionaryIteratorLanguages() +{ + // Thai + { + lang::Locale aLocale{ "th", "TH", "" }; + + const OUString aStr = u"รอนานหรือเปล่า"_ustr; + + i18n::Boundary aBounds; + + aBounds + = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), 
aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); + } + + // Japanese + { + lang::Locale aLocale{ "ja", "JP", "" }; + + const OUString aStr = u"通産省工業技術院北海道"_ustr; + + i18n::Boundary aBounds; + + aBounds + = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 9, aLocale, 
i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + 
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos); + } + + // Chinese + { + lang::Locale aLocale{ "zh", "CN", "" }; + + const OUString aStr = u"很高兴认识你"_ustr; + + i18n::Boundary aBounds; + + aBounds + = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true); + 
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); } } diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx index 9b47c433f296..a0fe58aae43e 100644 --- a/i18npool/source/breakiterator/breakiterator_unicode.cxx +++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx @@ -74,16 +74,6 @@ class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator }; -bool locale_requires_dictionary_iterator(const css::lang::Locale& rLocale) -{ - return rLocale.Language == "bo" || // Tibetan - rLocale.Language == "dz" || // Dzongkha - rLocale.Language == "ja" || // Japanese - 
rLocale.Language == "km" || // Khmer - rLocale.Language == "lo" || // Lao - rLocale.Language == "th" || // Thai - rLocale.Language == "zh"; // Chinese -} } // loading ICU breakiterator on demand. @@ -189,7 +179,7 @@ void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocal rbi.reset(); } } - else if(!locale_requires_dictionary_iterator(rLocale)) + else { // language;rule (not langtag, unless we'd actually load such) OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8()); |