INTEGRATION: CWS i18n40 (1.2.314); FILE MERGED

2008/03/19 06:30:23 khong 1.2.314.2: #i80815# count dash like MS Word
2008/03/15 07:32:44 khong 1.2.314.1: #i80815# count punctuation as word
author: Kurt Zenker <kz@openoffice.org> 2008-04-02 08:49:09 +0000
committer: Kurt Zenker <kz@openoffice.org> 2008-04-02 08:49:09 +0000
commit: bedef98c24ef9ada6aaffe9bc5284d9759a31a9a (patch)
tree: ebc5050ba6375476665a49f6dc775f9a8b183e19 /i18npool
parent: 59144104b3f91a2e6ed816f0bde0fdb91ea218d7 (diff)
1 files changed, 12 insertions, 22 deletions
diff --git a/i18npool/source/breakiterator/data/count_word.txt b/i18npool/source/breakiterator/data/count_word.txt
index dd0799a61878..e94ef1bb95e5 100644
--- a/i18npool/source/breakiterator/data/count_word.txt
+++ b/i18npool/source/breakiterator/data/count_word.txt
@@ -22,21 +22,16 @@ $Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND M
                                    [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
 
 
+$dash = \u002d;
+
 $ALetter   = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] 
+                           [:P:] [:S:] [:LineBreak = Numeric:]
+                           - $dash
                            - $Katakana
                            - [:Script = Thai:]
                            - [:Script = Lao:]
                            - [:Script = Hiragana:]];
                            
-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW PUNCTUATION GERSHAYIM:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];  
-              
-$MidNumLet = [:name = FULL STOP:];
-
-$MidNum    = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
-$Numeric   = [:LineBreak = Numeric:];
-
-
 #
 #  Character Class Definitions.
 #    The names are those from TR29.
@@ -70,20 +65,12 @@ $Format    = [[:Cf:]];
 #  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
 #
 $ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidNumLetEx  = $MidNumLet $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
 $KatakanaEx   = $Katakana  $Extend*;
 $FormatEx     = $Format    $Extend*;
 
-$word_pad=[[:P:][:S:][:Z:][:C:]];
-
 #
 #  Numbers.  Rules 8, 11, 12 form the TR.
 #
-$NumberSequence = $NumericEx ($FormatEx* ($MidNumEx | $MidNumLetEx)? $FormatEx* $NumericEx)*;
-$NumberSequence $word_pad* {100};
 
 #
 #  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
@@ -91,8 +78,11 @@ $NumberSequence $word_pad* {100};
 #     - may include both letters and numbers.
 #     - may include  MideLetter, MidNumber punctuation.
 #
-$LetterSequence = $ALetterEx ($FormatEx* ($MidLetterEx | $MidNumLetEx)? $FormatEx* $ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $word_pad* {200};
+$LetterSequence = $ALetterEx ($FormatEx* $ALetterEx)*;     # rules #6, #7
+$LetterSequence {200};
+
+$ALetterEx* $dash+ {200};
+$ALetterEx* ($dash $LetterSequence)+ $dash* {200};
 
 #
 #  Do not break between Katakana.   Rule #13.
@@ -105,14 +95,14 @@ $KatakanaEx ($FormatEx* $KatakanaEx)* {300};
 #                           Separated from the "Everything Else" rule, below, only so that they
 #                           can be tagged with a return value.   TODO:  is this what we want?
 #
-# [:IDEOGRAPHIC:] $Extend* $word_pad* {400};
+# [:IDEOGRAPHIC:] $Extend* {400};
 
 #
 #  Everything Else, with no tag.
 #                   Non-Control chars combine with $Extend (combining) chars.
 #                   Controls are do not.
 #
-[^$Control [:Ideographic:]] $Extend* $word_pad*;
+[^$Control [:Ideographic:]] $Extend*;
 $CR $LF;
 
 #
@@ -126,7 +116,7 @@ $CR $LF;
 #    reaches something that can only be the start (and probably only) char in a "word".
 #    A space or punctuation meets the test.
 #
-$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];
+$NonStarters = [$ALetter $Katakana $Extend $Format];
 
 #!.*;
 ! ($NonStarters* | \n \r) .;
author	Kurt Zenker <kz@openoffice.org>	2008-04-02 08:49:09 +0000
committer	Kurt Zenker <kz@openoffice.org>	2008-04-02 08:49:09 +0000
commit	bedef98c24ef9ada6aaffe9bc5284d9759a31a9a (patch)
tree	ebc5050ba6375476665a49f6dc775f9a8b183e19 /i18npool
parent	59144104b3f91a2e6ed816f0bde0fdb91ea218d7 (diff)