summaryrefslogtreecommitdiff
path: root/i18npool
diff options
context:
space:
mode:
authorKurt Zenker <kz@openoffice.org>2008-04-02 08:49:09 +0000
committerKurt Zenker <kz@openoffice.org>2008-04-02 08:49:09 +0000
commitbedef98c24ef9ada6aaffe9bc5284d9759a31a9a (patch)
treeebc5050ba6375476665a49f6dc775f9a8b183e19 /i18npool
parent59144104b3f91a2e6ed816f0bde0fdb91ea218d7 (diff)
INTEGRATION: CWS i18n40 (1.2.314); FILE MERGED
2008/03/19 06:30:23 khong 1.2.314.2: #i80815# count dash like MS Word 2008/03/15 07:32:44 khong 1.2.314.1: #i80815# count punctuation as word
Diffstat (limited to 'i18npool')
-rw-r--r--i18npool/source/breakiterator/data/count_word.txt34
1 files changed, 12 insertions, 22 deletions
diff --git a/i18npool/source/breakiterator/data/count_word.txt b/i18npool/source/breakiterator/data/count_word.txt
index dd0799a61878..e94ef1bb95e5 100644
--- a/i18npool/source/breakiterator/data/count_word.txt
+++ b/i18npool/source/breakiterator/data/count_word.txt
@@ -22,21 +22,16 @@ $Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND M
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+$dash = \u002d;
+
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
+ [:P:] [:S:] [:LineBreak = Numeric:]
+ - $dash
- $Katakana
- [:Script = Thai:]
- [:Script = Lao:]
- [:Script = Hiragana:]];
-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
- [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];
-
-$MidNumLet = [:name = FULL STOP:];
-
-$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
-$Numeric = [:LineBreak = Numeric:];
-
-
#
# Character Class Definitions.
# The names are those from TR29.
@@ -70,20 +65,12 @@ $Format = [[:Cf:]];
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
#
$ALetterEx = $ALetter $Extend*;
-$NumericEx = $Numeric $Extend*;
-$MidNumEx = $MidNum $Extend*;
-$MidNumLetEx = $MidNumLet $Extend*;
-$MidLetterEx = $MidLetter $Extend*;
$KatakanaEx = $Katakana $Extend*;
$FormatEx = $Format $Extend*;
-$word_pad=[[:P:][:S:][:Z:][:C:]];
-
#
# Numbers. Rules 8, 11, 12 from the TR.
#
-$NumberSequence = $NumericEx ($FormatEx* ($MidNumEx | $MidNumLetEx)? $FormatEx* $NumericEx)*;
-$NumberSequence $word_pad* {100};
#
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
@@ -91,8 +78,11 @@ $NumberSequence $word_pad* {100};
# - may include both letters and numbers.
# - may include MidLetter, MidNumber punctuation.
#
-$LetterSequence = $ALetterEx ($FormatEx* ($MidLetterEx | $MidNumLetEx)? $FormatEx* $ALetterEx)*; # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $word_pad* {200};
+$LetterSequence = $ALetterEx ($FormatEx* $ALetterEx)*; # rules #6, #7
+$LetterSequence {200};
+
+$ALetterEx* $dash+ {200};
+$ALetterEx* ($dash $LetterSequence)+ $dash* {200};
#
# Do not break between Katakana. Rule #13.
@@ -105,14 +95,14 @@ $KatakanaEx ($FormatEx* $KatakanaEx)* {300};
# Separated from the "Everything Else" rule, below, only so that they
# can be tagged with a return value. TODO: is this what we want?
#
-# [:IDEOGRAPHIC:] $Extend* $word_pad* {400};
+# [:IDEOGRAPHIC:] $Extend* {400};
#
# Everything Else, with no tag.
# Non-Control chars combine with $Extend (combining) chars.
# Controls do not.
#
-[^$Control [:Ideographic:]] $Extend* $word_pad*;
+[^$Control [:Ideographic:]] $Extend*;
$CR $LF;
#
@@ -126,7 +116,7 @@ $CR $LF;
# reaches something that can only be the start (and probably only) char in a "word".
# A space or punctuation meets the test.
#
-$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];
+$NonStarters = [$ALetter $Katakana $Extend $Format];
#!.*;
! ($NonStarters* | \n \r) .;