diff options
author | Caolán McNamara <caolanm@redhat.com> | 2011-07-19 09:21:10 +0100 |
---|---|---|
committer | Caolán McNamara <caolanm@redhat.com> | 2011-07-19 09:32:43 +0100 |
commit | bf355b97ee4b53e38975f0e1847eda5b3e05f920 (patch) | |
tree | 019e24d4b7b803e01d61601865621d555d74e755 /i18npool | |
parent | b2262e8be35d51cc18afeb60486f19576a81adc0 (diff) |
Resolves: fdo#38095 half&full width forms need to remain asian
Diffstat (limited to 'i18npool')
-rw-r--r-- | i18npool/qa/cppunit/test_breakiterator.cxx | 51 | ||||
-rw-r--r-- | i18npool/source/breakiterator/breakiteratorImpl.cxx | 200 |
2 files changed, 191 insertions, 60 deletions
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index fc81cc9860fb..7740cf9188f6 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -57,11 +57,13 @@ public: void testLineBreaking(); void testGraphemeIteration(); void testWeak(); + void testAsian(); CPPUNIT_TEST_SUITE(TestBreakIterator); CPPUNIT_TEST(testLineBreaking); CPPUNIT_TEST(testGraphemeIteration); CPPUNIT_TEST(testWeak); + CPPUNIT_TEST(testAsian); CPPUNIT_TEST_SUITE_END(); private: @@ -104,7 +106,7 @@ void TestBreakIterator::testGraphemeIteration() aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("IN")); { - sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF }; + const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF }; ::rtl::OUString aTest1(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA)); sal_Int32 nDone=0; @@ -118,7 +120,7 @@ void TestBreakIterator::testGraphemeIteration() } { - sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF }; + const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF }; ::rtl::OUString aTest1(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI)); sal_Int32 nDone=0; @@ -132,7 +134,7 @@ void TestBreakIterator::testGraphemeIteration() } { - sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF }; + const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF }; ::rtl::OUString aTest1(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA)); sal_Int32 nDone=0; @@ -156,10 +158,12 @@ void TestBreakIterator::testWeak() aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US")); { - sal_Unicode WEAKS[] = + const sal_Unicode WEAKS[] = { 0x0001, 0x0002, 0x0020, 0x00A0, + 0x2150, 0x215F, //Number Forms, fractions + 0x2160, 0x2180, //Number Forms, roman numerals 0x2200, 0x22FF, //Mathematical Operators 0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A 0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B @@ -184,6 +188,45 @@ void TestBreakIterator::testWeak() } } +//A test to ensure that certain ranges and codepoints that are categorized as +//asian remain as asian, so that existing docs that depend on this don't silently +//change font for those asian chars. +//See https://bugs.freedesktop.org/show_bug.cgi?id=38095 +void TestBreakIterator::testAsian() +{ + lang::Locale aLocale; + aLocale.Language = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("en")); + aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US")); + + { + const sal_Unicode ASIANS[] = + { + //some typical CJK chars + 0x4E00, 0x62FF, + //The full HalfWidth and FullWidth block has historically been + //designated as taking the CJK font :-( + //HalfWidth and FullWidth forms of ASCII 0-9, categorized under + //UAX24 as "Common" i.e. by that logic WEAK + 0xFF10, 0xFF19, + //HalfWidth and FullWidth forms of ASCII A-z, categorized under + //UAX25 as "Latin", i.e. by that logic LATIN + 0xFF21, 0xFF5A + }; + ::rtl::OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS)); + + for (sal_Int32 i = 0; i < aAsians.getLength(); ++i) + { + sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i); + rtl::OStringBuffer aMsg; + aMsg.append(RTL_CONSTASCII_STRINGPARAM("Char 0x")); + aMsg.append(static_cast<sal_Int32>(aAsians.getStr()[i]), 16); + aMsg.append(RTL_CONSTASCII_STRINGPARAM(" should have been asian")); + CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(), + nScript == i18n::ScriptType::ASIAN); + } + } +} + TestBreakIterator::TestBreakIterator() { m_xContext = cppu::defaultBootstrap_InitialComponentContext(); diff --git a/i18npool/source/breakiterator/breakiteratorImpl.cxx b/i18npool/source/breakiterator/breakiteratorImpl.cxx index 3cc974870c3d..9c6d09d7d6ec 100644 --- a/i18npool/source/breakiterator/breakiteratorImpl.cxx +++ b/i18npool/source/breakiterator/breakiteratorImpl.cxx @@ -443,67 +443,155 @@ sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/, return 0; } -static sal_Int16 scriptTypes[] = { - ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, - ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, -// 15 - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX, - ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, -// 30 - ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, -// 45 - ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, - ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, -// 60 - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, -// 75 - ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, -// 90 - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, -// 105 - ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, -// 120 - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, -// 135 - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, - ScriptType::COMPLEX, - ScriptType::WEAK}; - -#define scriptListCount SAL_N_ELEMENTS(scriptTypes) +namespace +{ + //See unicode/uscript.h + static sal_Int16 scriptTypes[] = + { + ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, + ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, + // 15 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX, + ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, + // 30 + ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + // 45 + ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, + ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + // 60 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, + // 75 + ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + // 90 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, + // 105 + ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, + // 120 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + // 135 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, + ScriptType::WEAK + }; + +# define scriptTypesCount SAL_N_ELEMENTS(scriptTypes) + + sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar) + { + sal_Int16 nRet; + int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT); + if (script < 0) + nRet = ScriptType::WEAK; + else if (static_cast<size_t>(script) >= SAL_N_ELEMENTS(scriptTypes)) + nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild + else + nRet = scriptTypes[script]; + return nRet; + } + + struct UBlock2Script + { + UBlockCode from; + UBlockCode to; + sal_Int16 script; + }; + + static UBlock2Script scriptList[] = + { + {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK}, + {UBLOCK_BASIC_LATIN, UBLOCK_ARMENIAN, ScriptType::LATIN}, + {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX}, + {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN}, + {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN}, + {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX}, + {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN}, + {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX}, + {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN}, + {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK}, + {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN}, + {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN}, + {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX}, + {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN}, + {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX}, + {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN}, + {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN}, + {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN}, + {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN} + }; + + #define scriptListCount SAL_N_ELEMENTS(scriptTypes) + + //always sets rScriptType + // + //returns true for characters historically explicitly assigned to + //latin/weak/asian + // + //returns false for characters that historically implicitly assigned to + //weak as unknown + bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 &rScriptType) + { + bool bKnown = true; + //handle specific characters always as weak: + // 0x01 - this breaks a word + // 0x02 - this can be inside a word + // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char. + if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar) + rScriptType = ScriptType::WEAK; + // workaround for Coptic + else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar) + rScriptType = ScriptType::LATIN; + else + { + UBlockCode block=ublock_getCode(currentChar); + size_t i = 0; + while (i < scriptListCount) + { + if (block <= scriptList[i].to) + break; + ++i; + } + if (i < scriptListCount && block >= scriptList[i].from) + rScriptType = scriptList[i].script; + else + { + rScriptType = ScriptType::WEAK; + bKnown = false; + } + } + return bKnown; + } +} sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar) { - static sal_uInt32 lastChar = 0; - static sal_Int16 nRet = 0; + static sal_uInt32 lastChar = 0; + static sal_Int16 nRet = 0; - if (currentChar != lastChar) { - lastChar = currentChar; + if (currentChar != lastChar) + { + lastChar = currentChar; - int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT); - if (script < 0) - nRet = ScriptType::WEAK; - else if (static_cast<size_t>(script) >= SAL_N_ELEMENTS(scriptTypes)) - nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild - else - nRet = scriptTypes[script]; - } - return nRet; + if (!getCompatibilityScriptClassByBlock(currentChar, nRet)) + nRet = getScriptClassByUAX24Script(currentChar); + } + + return nRet; } static inline sal_Bool operator == (const Locale& l1, const Locale& l2) { |