diff options
author | Jens-Heiner Rechtien <hr@openoffice.org> | 2003-11-07 14:14:53 +0000 |
---|---|---|
committer | Jens-Heiner Rechtien <hr@openoffice.org> | 2003-11-07 14:14:53 +0000 |
commit | 825b2bb5d875a9d94e2ee5549d7cfb78b0aed933 (patch) | |
tree | fef388bd3342d49a57990548b1dae87ac70e8730 | |
parent | 952477f8ec8d01ed243b77cb9a2def654ff905ed (diff) |
INTEGRATION: CWS i18n08 (1.1.2); FILE ADDED
2003/08/08 23:30:57 khong 1.1.2.1: #i17155# fix line breakiterator rule to make slash and hyphen as part of word when doing line break
-rw-r--r-- | i18npool/source/breakiterator/data/line.txt | 144 |
1 files changed, 144 insertions, 0 deletions
diff --git a/i18npool/source/breakiterator/data/line.txt b/i18npool/source/breakiterator/data/line.txt new file mode 100644 index 000000000000..fc8a38567faa --- /dev/null +++ b/i18npool/source/breakiterator/data/line.txt @@ -0,0 +1,144 @@ +# Copyright (c) 2002-2003 International Business Machines Corporation and +# others. All Rights Reserved. +# +# file: line.txt +# +# Line Breaking Rules +# Implement default line breaking as defined by Unicode TR 14. +# + + +# +# Character Classes defined by TR 14. +# + +$AI = [:LineBreak = Ambiguous:]; +$AL = [:LineBreak = Alphabetic:]; +$BA = [:LineBreak = Break_After:]; +$BB = [:LineBreak = Break_Before:]; +$BK = [:LineBreak = Mandatory_Break:]; +$B2 = [:LineBreak = Break_Both:]; +$CB = [:LineBreak = Contingent_Break:]; +$CL = [:LineBreak = Close_Punctuation:]; +$CM = [:LineBreak = Combining_Mark:]; +$CR = [:LineBreak = Carriage_Return:]; +$EX = [:LineBreak = Exclamation:]; +$GL = [:LineBreak = Glue:]; +$HY = [:LineBreak = Hyphen:]; +$ID = [:LineBreak = Ideographic:]; +$IN = [:LineBreak = Inseperable:]; +$IS = [:LineBreak = Infix_Numeric:]; +$LF = [:LineBreak = Line_Feed:]; +$NS = [:LineBreak = Nonstarter:]; +$NU = [:LineBreak = Numeric:]; +$OP = [:LineBreak = Open_Punctuation:]; +$PO = [:LineBreak = Postfix_Numeric:]; +$PR = [:LineBreak = Prefix_Numeric:]; +$QU = [:LineBreak = Quotation:]; +$SA = [:LineBreak = Complex_Context:]; +$SG = [:LineBreak = Surrogate:]; +$SP = [:LineBreak = Space:]; +$SY = [:LineBreak = Break_Symbols:]; +$XX = [:LineBreak = Unknown:]; +$ZW = [:LineBreak = ZWSpace:]; + + +# +# Character classes from TR 29. Needed for finding characters. +# +# +$Extend = [:Grapheme_Extend = TRUE:]; + + +# +# Rule LB1. By default, treat AI (characters with ambiguous east Asian width) and +# SA (South East Asian: Thai, Lao, Khmer) as $AL (Alphabetic) +# +$ALPlus = $AL | $AI | $SA; + +# +# Combining Marks. X $CM* behaves as if it were X. Rule LB6. 
+# +$ALcm = $ALPlus $CM*; +$IDcm = ($ID $CM* | $SP $CM+); +$NUcm = $NU $Extend*; +$HYcm = $HY $Extend*; +$QUcm = $QU $Extend*; +$POcm = $PO $Extend*; +$OPcm = $OP $Extend*; +$BAcm = $BA $Extend*; +$BBcm = $BB $Extend*; +$NScm = $NS $Extend*; +$GLcm = $GL $Extend*; +$B2cm = $B2 $Extend*; +$INcm = $IN $Extend*; + + +# New Lines. Always break after, never break before. +# Rule LB 3 +# +# Endings. NewLine or Zero Width Space, or both. Rules 4, 5 +# Because we never break before these things, $Endings +# appears at the end of line break rule. +# +$NLF = $BK | $CR | $LF | $CR $LF; +$Endings = $SP* $ZW* $NLF?; + + +# +# Openings Sequences that can precede Words, and that should not be separated from them. +# Rules LB 9, 10 +# +$Openings = (($QUcm $SP*)? $OPcm $SP*)*; + +# +# Closings Sequences that follow words, and that should not be separated from them, +# Rule LB 8, 11, 15 +$Closings = ($SP*( ($CL ($SP* $NScm)? | $EX | $IS ) $Extend*) | $BAcm | $HYcm | $NScm)*; + +# +# Words. Includes mixed Alpha-numerics. +# Rules 11a, 16, 17, 19, more or less. +# +$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+; +$Number = $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number 18 +$Word = (($IDcm | ($ALcm | $NUcm | $EX | $HY | $IS | $SY | $PR)+) ($POcm? | $INcm?)) ; # Alpha-numeric. 16, 17 +$Dashes = (($B2cm $SP*)*); # Dashes 11a + + + + + + + +$Word15 = ($BBcm* ($Word | $Number | $Dashes)? ($BAcm | $HYcm | $NScm)*) | # Rule 15. Stuff sticks around words. + [^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend* | # Allow characters that don't meet the + [^$BK $CR $LF $ZW $SP $GL ]; # more elaborate definitions for WORD + # to be glued. + +$GluedWord = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together. + # Rules 13, 14 + +# +# The actual rule, a combination of everything defined above. +# +$Openings $GluedWord $Closings $Endings; +# $GluedWord; + + + + + +# +# Reverse Rules. 
+# +# Back up to a hard break or a space that will cause a boundary. +# Not all spaces cause line breaks. $SpaceGlue represents a sequence +# containing a space that may inhibit a break from occurring. +# + +$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($Extend* $SP)) | (($Extend* $SP)+ $OP); +$ClumpingChars = [^$SP $BK $CR $LF]; + +!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR); + |