diff options
author | Martin Hosken <martin_hosken@sil.org> | 2011-03-04 19:57:29 +0700 |
---|---|---|
committer | Martin Hosken <martin_hosken@sil.org> | 2011-03-10 22:40:13 +0700 |
commit | d83e115bae521fcf193978496d62dbc677a1d6ef (patch) | |
tree | 197739d3bcccb00a5c213257ea83f2ed64f79f82 /i18npool | |
parent | 24dca3d2ea9582d3d48ecee55a943d1e41af902f (diff) |
break graphemes on new uax#29 updates, use ICU for script identification
Diffstat (limited to 'i18npool')
-rw-r--r-- | i18npool/source/breakiterator/breakiteratorImpl.cxx | 62 | ||||
-rw-r--r-- | i18npool/source/breakiterator/data/char.txt | 120 |
2 files changed, 166 insertions, 16 deletions
diff --git a/i18npool/source/breakiterator/breakiteratorImpl.cxx b/i18npool/source/breakiterator/breakiteratorImpl.cxx index 540bb63c68e6..bf3cab15e602 100644 --- a/i18npool/source/breakiterator/breakiteratorImpl.cxx +++ b/i18npool/source/breakiterator/breakiteratorImpl.cxx @@ -473,31 +473,61 @@ static UBlock2Script scriptList[] = { #define scriptListCount sizeof (scriptList) / sizeof (UBlock2Script) +static sal_Int16 scriptTypes[] = { + ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, + ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, +// 15 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX, + ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, +// 30 + ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, +// 45 + ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, + ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, +// 60 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, +// 75 + ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, +// 90 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, +// 105 + ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, +// 120 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, + ScriptType::WEAK}; + +#define scriptTypesCount sizeof(scriptTypes) / sizeof(sal_Int16) + sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar) { static sal_uInt32 lastChar = 0; static sal_Int16 nRet = 0; + sal_uInt32 script; if (currentChar != lastChar) { lastChar = currentChar; - //JP 21.9.2001: handle specific characters - always as weak - // definition of 1 - this breaks a word - // 2 - this can be inside a word - // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char. - if( 1 == currentChar || 2 == currentChar || 0x20 == currentChar || 0xA0 == currentChar) + script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT); + if (script < 0) nRet = ScriptType::WEAK; - // workaround for Coptic - else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar) - nRet = ScriptType::LATIN; - else { - UBlockCode block=ublock_getCode(currentChar); - sal_uInt16 i; - for ( i = 0; i < scriptListCount; i++) { - if (block <= scriptList[i].to) break; - } - nRet=(i < scriptListCount && block >= scriptList[i].from) ? scriptList[i].script : ScriptType::WEAK; - } + else if (script >= scriptTypesCount) + nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild + else + nRet = scriptTypes[script]; } return nRet; } diff --git a/i18npool/source/breakiterator/data/char.txt b/i18npool/source/breakiterator/data/char.txt new file mode 100644 index 000000000000..c298b73dc77c --- /dev/null +++ b/i18npool/source/breakiterator/data/char.txt @@ -0,0 +1,120 @@ +# +# Copyright (C) 2002-2009, International Business Machines Corporation and others. +# All Rights Reserved. +# +# file: char.txt +# +# ICU Character Break Rules, also known as Grapheme Cluster Boundaries +# See Unicode Standard Annex #29. +# These rules are based on TR29 Revision 13, for Unicode Version 5.1 +# Modifications to SpacingMark and Prepend by M. Hosken. +# + +# +# Character Class Definitions. +# +$CR = [\p{Grapheme_Cluster_Break = CR}]; +$LF = [\p{Grapheme_Cluster_Break = LF}]; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; +$Extend = [\p{Grapheme_Cluster_Break = Extend}]; +$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; +# True Indic wants to move by syllables. Break up SpacingMark. This based on Unicode 6.0 data +# In effect it is [\p{Grapheme_Cluster_Break = SpacingMark} - \u0E30 \u0E32 \u0E45 \u0EB0 \u0EB2 \u102B \u102C \u1038 +# \u1062-\u1064 \u1067-\u106D \u1083 \u1087-\u108C \u108F \u109A-\u109C \u19B0-\u19B4 \u19B8-\u19C0 \u19C8 \u19C9 +# \u1A61 \u1A63 \u1A64 \u1BE7 \u1BEA-\u1BEC \u1BEE \u1BF2 \u1BF3 \uAA7B +$IndicSpacing = [\u0903 \u093B \u093E-\u0940 \u0949-\u094C \u094E \u094F \u0982 \u0983 \u09BF \u09C0 \u09C7 \u09C8 \u09CB \u09CC \u0A03 \u0A3E-\u0A40 \u0A83 \u0ABE-\u0AC0 \u0AC9 \u0ACB \u0ACC \u0B02 \u0B03 \u0B40 \u0B47 \u0B48 \u0B4B-\u0B4C \u0BBF \u0BC1 \u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0C01-\u0C03 \u0C41-\u0C44 \u0C82 \u0C83 \u0CBE \u0CC0 \u0CC1 \u0CC3 \u0CC4 \u0CC7 \u0CC8 \u0CCA \u0CCB \u0D02 \u0D03 \u0D3F \u0D40 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D82 \u0D83 \u0DD0 \u0DD1 \u0DD8-\u0DDE \u0F3E \u0F3F \u0F7F \u1923-\u1926 \u1929-\u192B \u1930 \u1931 \u1933-\u1938 \u1A19-\u1A1B \u1B04 \u1B35 \u1B3B \u1B3D-\u1B41 \u1B43 \u1B44 \u1B82 \u1BA1 \u1BA6 \u1BA7 \u1BAA \u1C24-\u1C2B \u1C34 \u1C35 \u1CE1 \u1CF2 \uA880 \uA881 \uA8B4-\uA8C3 \uA952 \uA953 \uA983 \uA9B4 \uA9B5 \uA9BA \uA9BB \uA9BD-\uA9C0 \uAA2F \uAA30 \uAA33 \uAA34 \uABE3 \uABE4 \uABE6 \uABE7 \uABE9 \uABEA \uABEC \u11000 \u11002 \u11082 \u110B0-\u110B2 \u110B7 \u100B8 \u1D166 \u1D16D]; +# SEAsian (Thai, Lao, Burmese, Tai Lue, Tai Tham, Batak) are cluster based not syllable based +$SEASpacing = [\u0E33 \u0EB3 \u1031 \u103B \u103C \u1056 \u1057 \u1084 \u17B6 \u17BE-\u17C5 \u17C7 \u17C8 \u19B5-\u19B7 \u19BA \u1A55 \u1A57 \u1A6D-\u1A72 \uA823 \uA824 \uA827 \uAA4D]; +$BengaliLetter = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1]; +$BengaliSignVirama = \u09CD; +$GujaratiLetter = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1]; +$GujaratiSignVirama = \u0ACD; +$DevanagariLetter = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F]; +$DevanagariSignVirama = \u094D; +$KannadaLetter = [\u0C85-\u0CB9 \u0CDE-\u0CE1]; +$KannadaSignVirama = \u0CCD; +$MalayalamLetter = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F]; +$MalayalamSignVirama = \u0D4D; +$OriyaLetter = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71]; +$OriyaSignVirama = \u0B4D; +$GurmukhiLetter = [\u0A05-\u0A39 \u0A59-\u0A5E]; +$GurmukhiSignVirama = \u0A4D; +$TamilLetter = [\u0B85-\u0BB9]; +$TamilSignVirama = \u0BCD; +$TeluguLetter = [\u0C05-\u0C39 \u0C58-\u0C61]; +$TeluguSignVirama = \u0C4D; + +# +# Korean Syllable Definitions +# +$L = [\p{Grapheme_Cluster_Break = L}]; +$V = [\p{Grapheme_Cluster_Break = V}]; +$T = [\p{Grapheme_Cluster_Break = T}]; + +$LV = [\p{Grapheme_Cluster_Break = LV}]; +$LVT = [\p{Grapheme_Cluster_Break = LVT}]; + + +## ------------------------------------------------- +!!chain; + +!!forward; + +$CR $LF; + +$BengaliLetter ($BengaliSignVirama $BengaliLetter?)+; +$GujaratiLetter ($GujaratiSignVirama $GujaratiLetter?)+; +$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+; +$KannadaLetter ($KannadaSignVirama $KannadaLetter?)+; +$MalayalamLetter ($MalayalamSignVirama $MalayalamLetter?)+; +$OriyaLetter ($OriyaSignVirama $OriyaLetter?)+; +$GurmukhiLetter ($GurmukhiSignVirama $GurmukhiLetter?)+; +$TamilLetter ($TamilSignVirama $TamilLetter?)+; +$TeluguLetter ($TeluguSignVirama $TeluguLetter?)+; + +$L ($L | $V | $LV | $LVT); +($LV | $V) ($V | $T); +($LVT | $T) $T; + +[^$Control $CR $LF] $Extend; + +[^$Control $CR $LF] $IndicSpacing; +[^$Control $CR $LF] $SEASpacing; +#[^$Control $CR $LF] $SpacingMark; +# $Prepend [^$Control $CR $LF]; + + +## ------------------------------------------------- + +!!reverse; +$LF $CR; +($BengaliLetter? $BengaliSignVirama)+ $BengaliLetter; +($GujaratiLetter? $GujaratiSignVirama)+ $GujaratiLetter; +($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter; +($KannadaLetter? $KannadaSignVirama)+ $KannadaLetter; +($MalayalamLetter? $MalayalamSignVirama)+ $MalayalamLetter; +($OriyaLetter? $OriyaSignVirama)+ $OriyaLetter; +($GurmukhiLetter? $GurmukhiSignVirama)+ $GurmukhiLetter; +($TamilLetter? $TamilSignVirama)+ $TamilLetter; +($TeluguLetter? $TeluguSignVirama)+ $TeluguLetter; +($L | $V | $LV | $LVT) $L; +($V | $T) ($LV | $V); +$T ($LVT | $T); + +$Extend [^$Control $CR $LF]; +$IndicSpacing [^$Control $CR $LF]; +$SEASpacing [^$Control $CR $LF]; +#$SpacingMark [^$Control $CR $LF]; +# [^$Control $CR $LF] $Prepend; + + +## ------------------------------------------------- + +!!safe_reverse; + + +## ------------------------------------------------- + +!!safe_forward; + |