diff options
author | Caolán McNamara <caolanm@redhat.com> | 2014-05-12 17:07:21 +0100 |
---|---|---|
committer | Caolán McNamara <caolanm@redhat.com> | 2014-05-12 17:08:24 +0100 |
commit | 6e225b41f1ab3e6cac395b0c0c6db73414658625 (patch) | |
tree | 8c9c4ec2ea754c8e985e400f4bdc9e79eb16e9ff | |
parent | 3ddae832bbb71306a574c4e1087de0a0da318966 (diff) |
Resolves: fdo#55707 Word count incorrect if language is set to Finnish
Change-Id: I283dddaa4bd8baf05b90ce5f81d43b785021a3c4
-rw-r--r-- | i18npool/CustomTarget_breakiterator.mk | 2 | ||||
-rw-r--r-- | i18npool/qa/cppunit/test_breakiterator.cxx | 14 | ||||
-rw-r--r-- | i18npool/source/breakiterator/data/count_word_fi.txt | 134 | ||||
-rw-r--r-- | i18npool/source/breakiterator/data/dict_word_fi.txt | 147 |
4 files changed, 10 insertions, 287 deletions
diff --git a/i18npool/CustomTarget_breakiterator.mk b/i18npool/CustomTarget_breakiterator.mk index ce8a03877659..4aaf2e5a710e 100644 --- a/i18npool/CustomTarget_breakiterator.mk +++ b/i18npool/CustomTarget_breakiterator.mk @@ -48,9 +48,7 @@ endif i18npool_BRKTXTS := \ $(if $(i18npool_breakiterator_want_in),char_in.brk) \ char.brk \ - $(call gb_Helper_optional_locale,fi,count_word_fi.brk) \ count_word.brk \ - $(call gb_Helper_optional_locale,fi,dict_word_fi.brk) \ $(call gb_Helper_optional_locale,he,dict_word_he.brk) \ $(call gb_Helper_optional_locale,hu,dict_word_hu.brk) \ dict_word_nodash.brk \ diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index 36e3d3b808fc..ccea424cbb88 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -405,7 +405,7 @@ void TestBreakIterator::testWordBoundaries() } //See https://issues.apache.org/ooo/show_bug.cgi?id=85411 - for (int j = 0; j < 2; ++j) + for (int j = 0; j < 3; ++j) { switch (j) { @@ -417,6 +417,10 @@ void TestBreakIterator::testWordBoundaries() aLocale.Language = "ca"; aLocale.Country = "ES"; break; + case 2: + aLocale.Language = "fi"; + aLocale.Country = "FI"; + break; default: CPPUNIT_ASSERT(false); break; @@ -484,15 +488,16 @@ void TestBreakIterator::testWordBoundaries() } //See https://issues.apache.org/ooo/show_bug.cgi?id=58513 + //See https://bugs.freedesktop.org/show_bug.cgi?id=55707 { aLocale.Language = "fi"; aLocale.Country = "FI"; - OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi"); + OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n"); { sal_Int32 nPos = 0; - sal_Int32 aExpected[] = {12, 22, 25, 36}; + sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51}; size_t i = 0; do { @@ -507,7 +512,8 @@ void TestBreakIterator::testWordBoundaries() { sal_Int32 nPos = 0; - sal_Int32 aExpected[] = {0, 11, 12, 21, 22, 24, 25, 36}; + sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37, + 40, 41, 42, 43, 45, 46, 47, 50, 51}; size_t i = 0; do { diff --git a/i18npool/source/breakiterator/data/count_word_fi.txt b/i18npool/source/breakiterator/data/count_word_fi.txt deleted file mode 100644 index 0429edc44162..000000000000 --- a/i18npool/source/breakiterator/data/count_word_fi.txt +++ /dev/null @@ -1,134 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: count_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] [:name = HYPHEN-MINUS:] - - $Katakana - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = COLON:]]; - -$MidNumLet = [:name = FULL STOP:]; - -$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidNumLetEx = $MidNumLet $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$FormatEx = $Format $Extend*; - -$word_pad=[[:P:][:S:][:Z:][:C:]]; - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* ($MidNumEx | $MidNumLetEx)? $FormatEx* $NumericEx)*; -$NumberSequence $word_pad* {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* ($MidLetterEx | $MidNumLetEx)? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $word_pad* {200}; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -# [:IDEOGRAPHIC:] $Extend* $word_pad* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend* $word_pad*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format]; - -#!.*; -! ($NonStarters* | \n \r) .; - diff --git a/i18npool/source/breakiterator/data/dict_word_fi.txt b/i18npool/source/breakiterator/data/dict_word_fi.txt deleted file mode 100644 index 7026c992f1f9..000000000000 --- a/i18npool/source/breakiterator/data/dict_word_fi.txt +++ /dev/null @@ -1,147 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: dict_word_fi.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] [:name = HYPHEN-MINUS:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = COLON:]]; - -$SufixLetter = [:name= FULL STOP:]; - - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; - - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; - -[[:P:][:S:]]*; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; - -#!.*; -! ($NonStarters* | \n \r) .; - |