summaryrefslogtreecommitdiff
path: root/i18npool/source
diff options
context:
space:
mode:
authorCaolán McNamara <caolanm@redhat.com>2012-05-13 22:41:30 +0100
committerCaolán McNamara <caolanm@redhat.com>2012-05-13 22:46:43 +0100
commit20c24114143d6d38774b56a142fd4ae05094308e (patch)
treea89f8ebda95eae5cb3c1f44e206cba6654c95f8a /i18npool/source
parent347e345295f14d486ec4b175887a67478b34e7c7 (diff)
Resolves: fdo#49849 implement Unicode 6.1 hebrew line breaking rules
i.e. sync with svn diff -c 31071 http://source.icu-project.org/repos/icu/icu/trunk/source/data/brkitr/line.txt Change-Id: I I I41b3d02f1a0da3b83a9684f29d466660d96254c6
Diffstat (limited to 'i18npool/source')
-rw-r--r--i18npool/source/breakiterator/data/README12
-rw-r--r--i18npool/source/breakiterator/data/line.txt57
2 files changed, 47 insertions, 22 deletions
diff --git a/i18npool/source/breakiterator/data/README b/i18npool/source/breakiterator/data/README
new file mode 100644
index 000000000000..8d7598dab562
--- /dev/null
+++ b/i18npool/source/breakiterator/data/README
@@ -0,0 +1,12 @@
+The originals of these come from svn checkout
+http://source.icu-project.org/repos/icu/icu/trunk/source/data/brkitr they no
+longer appear in the icu tarballs, but are in icu's svn
+
+At various stages these copies have been customized and are not horribly out of
+sync. It unclear which diffs from the base versions are deliberate and which
+are now accidental :-(
+
+We need to review the various issues referenced in the commits that caused
+custimizations and see if they're still relevant or not, write regression tests
+for them, if any are still relavant then apply the changes back on top of the
+latest versions.
diff --git a/i18npool/source/breakiterator/data/line.txt b/i18npool/source/breakiterator/data/line.txt
index cbabee6f20af..91c8f3d4850d 100644
--- a/i18npool/source/breakiterator/data/line.txt
+++ b/i18npool/source/breakiterator/data/line.txt
@@ -61,11 +61,13 @@ $BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
$B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
+$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:] ;
$CM = [:LineBreak = Combining_Mark:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
$GL = [:LineBreak = Glue:];
+$HL = [:LineBreak = Hebrew_Letter:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
@@ -77,7 +79,7 @@ $JV = [:LineBreak = JV:];
$JT = [:LineBreak = JT:];
$LF = [:LineBreak = Line_Feed:];
$NL = [:LineBreak = Next_Line:];
-$NS = [:LineBreak = Nonstarter:];
+$NS = [[:LineBreak = Nonstarter:] $CJ];
$NU = [:LineBreak = Numeric:];
$OP = [[:LineBreak = Open_Punctuation:] - $DG];
$PO = [:LineBreak = Postfix_Numeric:];
@@ -118,6 +120,7 @@ $B2cm = $B2 $CM*;
$CLcm = $CL $CM*;
$EXcm = $EX $CM*;
$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
@@ -150,6 +153,7 @@ $B2 $CM+;
$CL $CM+;
$EX $CM+;
$GL $CM+;
+$HL $CM+;
$HY $CM+;
$H2 $CM+;
$H3 $CM+;
@@ -186,7 +190,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# so for this one case we need to manually list out longer sequences.
#
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
-$AL_FOLLOW_CM = [$CL $EX $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP];
+$AL_FOLLOW_CM = [$CL $EX $HL $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP];
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
@@ -320,8 +324,13 @@ $LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
+# LB 21a Don't break after Hebrew + Hyphen
+# HL (HY | BA) x
+#
+$HLcm ($HYcm | $BAcm) [^$CB]?;
+
# LB 22
-$ALcm $INcm;
+($ALcm | $HLcm) $INcm;
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$IDcm $INcm;
$INcm $INcm;
@@ -331,16 +340,18 @@ $NUcm $INcm;
# $LB 23
$IDcm $POcm;
$ALcm $NUcm; # includes $LB19
+$HLcm $NUcm;
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
+$NUcm $HLcm;
#
# LB 24
#
$PRcm $IDcm;
$ALcm $PRcm;
-$PRcm $ALcm;
-$POcm $ALcm;
+$PRcm ($ALcm | $HLcm);
+$POcm ($ALcm | $HLcm);
#
# LB 25 Numbers.
@@ -361,8 +372,8 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
# LB 28 Do not break between alphabetics
#
-$ALcm $ALcm;
-$CM+ $ALcm; # The $CM+ is from rule 10, and unattached CM is treated as AL
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $NUcm);
@@ -371,11 +382,9 @@ $IScm ($ALcm | $NUcm);
# Rule 30 Do not break between letters, numbers or ordinary symbols
# and opening or closing punctuation
#
-($ALcm | $NUcm) $OPcm;
+($ALcm | $HLcm | $NUcm) $OPcm;
$CM+ $OPcm;
-$CLcm ($ALcm | $NUcm);
-
-
+$CLcm ($ALcm | $HLcm | $NUcm);
#
# Reverse Rules.
@@ -391,6 +400,7 @@ $CM+ $B2;
$CM+ $CL;
$CM+ $EX;
$CM+ $GL;
+$CM+ $HL;
$CM+ $HY;
$CM+ $H2;
$CM+ $H3;
@@ -544,24 +554,25 @@ $CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
-
+# LB21a
+[^$CB] $CM* ($HY | $BA) $CM* $HL;
# LB 22
-$CM* $IN $CM* $ALPlus;
+$CM* $IN $CM* ($ALPlus | $HL);
$CM* $IN $CM* $ID;
$CM* $IN $CM* $IN;
$CM* $IN $CM* $NU;
# LB 23
$CM* $PO $CM* $ID;
-$CM* $NU $CM* $ALPlus;
-$CM* $ALPlus $CM* $NU;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
# LB 24
$CM* $ID $CM* $PR;
$CM* $PR $CM* $ALPlus;
-$CM* $ALPlus $CM* $PR;
-$CM* $ALPlus $CM* $PO;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PO;
$CM* $ALPlus $CM* ($IS | $SY | $HY)+ / $SP;
$CM* $NU+ $CM* $HY+ / $SP;
@@ -580,15 +591,14 @@ $CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
-$CM* $ALPlus $CM* $ALPlus;
-
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
$CM* ($NU | $ALPlus) $CM* $IS+ [^$SP];
# LB 30
-$CM* $OP $CM* ($NU | $ALPlus);
-$CM* ($NU | $ALPlus) $CM* ($CL | $SY)+ [^$SP];
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* ($CL | $SY)+ [^$SP];
## -------------------------------------------------
@@ -609,6 +619,9 @@ $SP+ $CM* $QU;
$SP+ $CM* $CL;
$SP+ $CM* $B2;
+# LB 21
+$CM* ($HY | $BA) $CM* $HL;
+
# LB 18
($CM* ($IS | $SY))+ $CM* $NU;
$CL $CM* ($NU | $IS | $SY);
@@ -629,6 +642,6 @@ $dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
-[$CM $OP $QU $CL $B2 $PR $HY $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $dictionary];
+[$CM $OP $QU $CL $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $BA $dictionary];
$dictionary $dictionary;