summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jonathan Clark <jonathan@libreoffice.org> 2024-12-02 16:03:43 -0700
committer Christian Lohmaier <lohmaier+LibreOffice@googlemail.com> 2024-12-10 16:00:44 +0100
commit 497ca77d329aef04f30d74f1ddcea67ac62712f5 (patch)
tree d234209c16754d53943be299bb2daafb09c752f9
parent 1338f45542a017b136a1c5b347ea24f7583015a4 (diff)
tdf#162912 i18npool: Updated CJK BreakIterator to use custom rules
Regression from commit 14c6cde779d64596eab0f4d3f32f181ce2243929:
"tdf#49885 Updated CJK BreakIterator to use ICU"

Previously, languages requiring dictionary-based break iterators were
handled by instantiating a stock ICU break iterator as a special case.

tdf#49885 upgraded our custom rules to support passthrough for
dictionary-based breaking, so this special case is no longer necessary.

Change-Id: Iebb06de82eb511946e5b220e5dc414440838b03c
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177713
Tested-by: Jenkins
Reviewed-by: Jonathan Clark <jonathan@libreoffice.org>
Signed-off-by: Xisco Fauli <xiscofauli@libreoffice.org>
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177754
(cherry picked from commit 10ee7d30f7c1c8c9b80155341c2bf1639ca21d5f)
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177764
Tested-by: Christian Lohmaier <lohmaier+LibreOffice@googlemail.com>
Reviewed-by: Christian Lohmaier <lohmaier+LibreOffice@googlemail.com>
Reviewed-by: Michael Stahl <michael.stahl@allotropia.de>
-rw-r--r-- i18npool/qa/cppunit/test_breakiterator.cxx 249
-rw-r--r-- i18npool/source/breakiterator/breakiterator_unicode.cxx 12
2 files changed, 249 insertions, 12 deletions
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index 81df29e5950e..8853416f5677 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -52,6 +52,7 @@ public:
void testHebrewGereshGershaim();
void testLegacySurrogatePairs();
void testWordCount();
+ void testDictionaryIteratorLanguages();
CPPUNIT_TEST_SUITE(TestBreakIterator);
CPPUNIT_TEST(testLineBreaking);
@@ -76,6 +77,7 @@ public:
CPPUNIT_TEST(testHebrewGereshGershaim);
CPPUNIT_TEST(testLegacySurrogatePairs);
CPPUNIT_TEST(testWordCount);
+ CPPUNIT_TEST(testDictionaryIteratorLanguages);
CPPUNIT_TEST_SUITE_END();
private:
@@ -1597,6 +1599,25 @@ void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > co
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
}
+
+ {
+ // tdf#162912: Double-clicking should only select one Basic identifier
+ static constexpr OUString aTest = u"ThisComponent.CurrentSelection"_ustr;
+
+ aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
+
+ aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos);
+
+ aBounds = xBreak->getWordBoundary(aTest, 15, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
+ }
}
void TestBreakIterator::testJapanese()
@@ -1960,7 +1981,233 @@ void TestBreakIterator::testWordCount()
const OUString str = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
- CPPUNIT_ASSERT_EQUAL(7, count_words_fn(str, aLocale));
+ CPPUNIT_ASSERT_EQUAL(8, count_words_fn(str, aLocale));
+ }
+}
+
+void TestBreakIterator::testDictionaryIteratorLanguages()
+{
+ // Thai
+ {
+ lang::Locale aLocale{ "th", "TH", "" };
+
+ const OUString aStr = u"รอนานหรือเปล่า"_ustr;
+
+ i18n::Boundary aBounds;
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
+ }
+
+ // Japanese
+ {
+ lang::Locale aLocale{ "ja", "JP", "" };
+
+ const OUString aStr = u"通産省工業技術院北海道"_ustr;
+
+ i18n::Boundary aBounds;
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
+ }
+
+ // Chinese
+ {
+ lang::Locale aLocale{ "zh", "CN", "" };
+
+ const OUString aStr = u"很高兴认识你"_ustr;
+
+ i18n::Boundary aBounds;
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+ aBounds
+ = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::ANY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
}
}
diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx
index 9b47c433f296..a0fe58aae43e 100644
--- a/i18npool/source/breakiterator/breakiterator_unicode.cxx
+++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx
@@ -74,16 +74,6 @@ class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
};
-bool locale_requires_dictionary_iterator(const css::lang::Locale& rLocale)
-{
- return rLocale.Language == "bo" || // Tibetan
- rLocale.Language == "dz" || // Dzongkha
- rLocale.Language == "ja" || // Japanese
- rLocale.Language == "km" || // Khmer
- rLocale.Language == "lo" || // Lao
- rLocale.Language == "th" || // Thai
- rLocale.Language == "zh"; // Chinese
-}
}
// loading ICU breakiterator on demand.
@@ -189,7 +179,7 @@ void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocal
rbi.reset();
}
}
- else if(!locale_requires_dictionary_iterator(rLocale))
+ else
{
// language;rule (not langtag, unless we'd actually load such)
OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());