diff options
author | Kurt Zenker <kz@openoffice.org> | 2008-04-02 08:49:09 +0000 |
---|---|---|
committer | Kurt Zenker <kz@openoffice.org> | 2008-04-02 08:49:09 +0000 |
commit | bedef98c24ef9ada6aaffe9bc5284d9759a31a9a (patch) | |
tree | ebc5050ba6375476665a49f6dc775f9a8b183e19 /i18npool | |
parent | 59144104b3f91a2e6ed816f0bde0fdb91ea218d7 (diff) |
INTEGRATION: CWS i18n40 (1.2.314); FILE MERGED
2008/03/19 06:30:23 khong 1.2.314.2: #i80815# count dash like MS Word
2008/03/15 07:32:44 khong 1.2.314.1: #i80815# count punctuation as word
Diffstat (limited to 'i18npool')
-rw-r--r-- | i18npool/source/breakiterator/data/count_word.txt | 34 |
1 files changed, 12 insertions, 22 deletions
diff --git a/i18npool/source/breakiterator/data/count_word.txt b/i18npool/source/breakiterator/data/count_word.txt index dd0799a61878..e94ef1bb95e5 100644 --- a/i18npool/source/breakiterator/data/count_word.txt +++ b/i18npool/source/breakiterator/data/count_word.txt @@ -22,21 +22,16 @@ $Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND M [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; +$dash = \u002d; + $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] + [:P:] [:S:] [:LineBreak = Numeric:] + - $dash - $Katakana - [:Script = Thai:] - [:Script = Lao:] - [:Script = Hiragana:]]; -$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]]; - -$MidNumLet = [:name = FULL STOP:]; - -$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet]; -$Numeric = [:LineBreak = Numeric:]; - - # # Character Class Definitions. # The names are those from TR29. @@ -70,20 +65,12 @@ $Format = [[:Cf:]]; # "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. # $ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidNumLetEx = $MidNumLet $Extend*; -$MidLetterEx = $MidLetter $Extend*; $KatakanaEx = $Katakana $Extend*; $FormatEx = $Format $Extend*; -$word_pad=[[:P:][:S:][:Z:][:C:]]; - # # Numbers. Rules 8, 11, 12 form the TR. # -$NumberSequence = $NumericEx ($FormatEx* ($MidNumEx | $MidNumLetEx)? $FormatEx* $NumericEx)*; -$NumberSequence $word_pad* {100}; # # Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 @@ -91,8 +78,11 @@ $NumberSequence $word_pad* {100}; # - may include both letters and numbers. # - may include MideLetter, MidNumber punctuation. # -$LetterSequence = $ALetterEx ($FormatEx* ($MidLetterEx | $MidNumLetEx)? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $word_pad* {200}; +$LetterSequence = $ALetterEx ($FormatEx* $ALetterEx)*; # rules #6, #7 +$LetterSequence {200}; + +$ALetterEx* $dash+ {200}; +$ALetterEx* ($dash $LetterSequence)+ $dash* {200}; # # Do not break between Katakana. Rule #13. @@ -105,14 +95,14 @@ $KatakanaEx ($FormatEx* $KatakanaEx)* {300}; # Separated from the "Everything Else" rule, below, only so that they # can be tagged with a return value. TODO: is this what we want? # -# [:IDEOGRAPHIC:] $Extend* $word_pad* {400}; +# [:IDEOGRAPHIC:] $Extend* {400}; # # Everything Else, with no tag. # Non-Control chars combine with $Extend (combining) chars. # Controls are do not. # -[^$Control [:Ideographic:]] $Extend* $word_pad*; +[^$Control [:Ideographic:]] $Extend*; $CR $LF; # @@ -126,7 +116,7 @@ $CR $LF; # reaches something that can only be the start (and probably only) char in a "word". # A space or punctuation meets the test. # -$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format]; +$NonStarters = [$ALetter $Katakana $Extend $Format]; #!.*; ! ($NonStarters* | \n \r) .; |