From e1ad946ef5db3f7c0a540207d0f0fd85799e3b66 Mon Sep 17 00:00:00 2001 From: Release Engineers Date: Thu, 6 Aug 2009 18:13:57 +0000 Subject: CWS-TOOLING: integrate CWS tl73 2009-07-31 15:29:33 +0200 tl r274535 : #i64400# dash/hyphen should not break words --- i18npool/inc/breakiterator_ctl.hxx | 1 + i18npool/inc/breakiterator_unicode.hxx | 6 +- i18npool/inc/xdictionary.hxx | 4 +- .../source/breakiterator/breakiterator_cjk.cxx | 4 +- .../source/breakiterator/breakiterator_ctl.cxx | 7 +- .../source/breakiterator/breakiterator_unicode.cxx | 29 ++-- i18npool/source/breakiterator/data/dict_word.txt | 3 +- .../source/breakiterator/data/dict_word_dash.txt | 148 ------------------- .../source/breakiterator/data/dict_word_nodash.txt | 147 +++++++++++++++++++ .../breakiterator/data/dict_word_prepostdash.txt | 157 +++++++++++++++++++++ i18npool/source/breakiterator/makefile.mk | 2 +- i18npool/source/breakiterator/xdictionary.cxx | 30 +++- i18npool/source/localedata/data/de_DE.xml | 9 +- i18npool/source/localedata/data/nl_NL.xml | 35 +++-- i18npool/source/localedata/data/pl_PL.xml | 7 - i18npool/source/localedata/data/sv_SE.xml | 7 + 16 files changed, 405 insertions(+), 191 deletions(-) delete mode 100644 i18npool/source/breakiterator/data/dict_word_dash.txt create mode 100644 i18npool/source/breakiterator/data/dict_word_nodash.txt create mode 100644 i18npool/source/breakiterator/data/dict_word_prepostdash.txt (limited to 'i18npool') diff --git a/i18npool/inc/breakiterator_ctl.hxx b/i18npool/inc/breakiterator_ctl.hxx index 9e753e8ae16f..3e687b5e8e68 100644 --- a/i18npool/inc/breakiterator_ctl.hxx +++ b/i18npool/inc/breakiterator_ctl.hxx @@ -58,6 +58,7 @@ protected: sal_Int32* nextCellIndex; sal_Int32* previousCellIndex; sal_Int32 cellIndexSize; + virtual void SAL_CALL makeIndex(const rtl::OUString& text, sal_Int32 pos) throw(com::sun::star::uno::RuntimeException); }; diff --git a/i18npool/inc/breakiterator_unicode.hxx b/i18npool/inc/breakiterator_unicode.hxx index 3a0720f27545..654df424b237 100644 --- a/i18npool/inc/breakiterator_unicode.hxx +++ b/i18npool/inc/breakiterator_unicode.hxx @@ -86,12 +86,16 @@ protected: const sal_Char *cBreakIterator, *wordRule, *lineRule; Boundary result; // for word break iterator - struct { + struct BI_Data { UnicodeString aICUText; icu::BreakIterator *aBreakIterator; + + BI_Data() : aICUText(), aBreakIterator(NULL) {} } character, word, sentence, line, *icuBI; + com::sun::star::lang::Locale aLocale; sal_Int16 aBreakType, aWordType; + void SAL_CALL loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale, sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char* name, const rtl::OUString& rText) throw(com::sun::star::uno::RuntimeException); }; diff --git a/i18npool/inc/xdictionary.hxx b/i18npool/inc/xdictionary.hxx index 32ffdbbfe377..04c5836aaaea 100644 --- a/i18npool/inc/xdictionary.hxx +++ b/i18npool/inc/xdictionary.hxx @@ -45,11 +45,13 @@ namespace com { namespace sun { namespace star { namespace i18n { // cache structure. struct WordBreakCache { - sal_Bool equals(const sal_Unicode *str, Boundary& boundary); // checking cached string sal_Int32 length; // contents length saved here. sal_Unicode *contents; // seperated segment contents. sal_Int32* wordboundary; // word boundaries in segments. sal_Int32 size; // size of wordboundary + + WordBreakCache(); + sal_Bool equals(const sal_Unicode *str, Boundary& boundary); // checking cached string }; class xdictionary diff --git a/i18npool/source/breakiterator/breakiterator_cjk.cxx b/i18npool/source/breakiterator/breakiterator_cjk.cxx index 3d684b8788df..3a44c02edc23 100644 --- a/i18npool/source/breakiterator/breakiterator_cjk.cxx +++ b/i18npool/source/breakiterator/breakiterator_cjk.cxx @@ -46,7 +46,9 @@ namespace com { namespace sun { namespace star { namespace i18n { // class BreakIterator_CJK // ----------------------------------------------------; -BreakIterator_CJK::BreakIterator_CJK() : dict(NULL) +BreakIterator_CJK::BreakIterator_CJK() : + dict( NULL ), + hangingCharacters() { cBreakIterator = "com.sun.star.i18n.BreakIterator_CJK"; } diff --git a/i18npool/source/breakiterator/breakiterator_ctl.cxx b/i18npool/source/breakiterator/breakiterator_ctl.cxx index a42014615f1a..0e1bee869cac 100644 --- a/i18npool/source/breakiterator/breakiterator_ctl.cxx +++ b/i18npool/source/breakiterator/breakiterator_ctl.cxx @@ -45,11 +45,14 @@ namespace com { namespace sun { namespace star { namespace i18n { /** * Constructor. */ -BreakIterator_CTL::BreakIterator_CTL() +BreakIterator_CTL::BreakIterator_CTL() : + cachedText(), + nextCellIndex( NULL ), + previousCellIndex( NULL ), + cellIndexSize( 512 ) { cBreakIterator = "com.sun.star.i18n.BreakIterator_CTL"; // to improve performance, alloc big enough memory in construct. - cellIndexSize = 512; nextCellIndex = (sal_Int32*) calloc(cellIndexSize, sizeof(sal_Int32)); previousCellIndex = (sal_Int32*) calloc(cellIndexSize, sizeof(sal_Int32)); memset(nextCellIndex, 0, cellIndexSize * sizeof(sal_Int32)); diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx index b832bb02e7e9..f0710c996858 100644 --- a/i18npool/source/breakiterator/breakiterator_unicode.cxx +++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx @@ -51,20 +51,23 @@ namespace com { namespace sun { namespace star { namespace i18n { #define ERROR ::com::sun::star::uno::RuntimeException() -#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode"; - - -BreakIterator_Unicode::BreakIterator_Unicode() +//#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode"; + + +BreakIterator_Unicode::BreakIterator_Unicode() : + cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name + wordRule( "word" ), + lineRule( "line" ), + result(), + character(), + word(), + sentence(), + line(), + icuBI( NULL ), + aLocale(), + aBreakType(), + aWordType() { - wordRule="word"; - lineRule="line"; - character.aBreakIterator=word.aBreakIterator=sentence.aBreakIterator=line.aBreakIterator=NULL; - character.aICUText=UnicodeString(); - word.aICUText=UnicodeString(); - sentence.aICUText=UnicodeString(); - line.aICUText=UnicodeString(); - cBreakIterator = ImplementName; - icuBI=NULL; } diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt index 4c5c80823041..367a82db9e6f 100644 --- a/i18npool/source/breakiterator/data/dict_word.txt +++ b/i18npool/source/breakiterator/data/dict_word.txt @@ -34,7 +34,8 @@ $ALetter = [\u0002 [:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCT $MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] ]; + [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] + [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] ]; $SufixLetter = [:name= FULL STOP:]; diff --git a/i18npool/source/breakiterator/data/dict_word_dash.txt b/i18npool/source/breakiterator/data/dict_word_dash.txt deleted file mode 100644 index 7f861c7911ca..000000000000 --- a/i18npool/source/breakiterator/data/dict_word_dash.txt +++ /dev/null @@ -1,148 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: dict_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [\u0002 [:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] - [:name = HYPHEN-MINUS:] [:name = EN DASH:] ]; - -$SufixLetter = [:name= FULL STOP:]; - - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; - - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; - -[[:P:][:S:]]*; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; - -#!.*; -! ($NonStarters* | \n \r) .; - diff --git a/i18npool/source/breakiterator/data/dict_word_nodash.txt b/i18npool/source/breakiterator/data/dict_word_nodash.txt new file mode 100644 index 000000000000..4c5c80823041 --- /dev/null +++ b/i18npool/source/breakiterator/data/dict_word_nodash.txt @@ -0,0 +1,147 @@ +# +# Copyright (C) 2002-2003, International Business Machines Corporation and others. +# All Rights Reserved. +# +# file: dict_word.txt +# +# ICU Word Break Rules +# See Unicode Standard Annex #29. +# These rules are based on Version 4.0.0, dated 2003-04-17 +# + + + +#################################################################################### +# +# Character class definitions from TR 29 +# +#################################################################################### +$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] + [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] + [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] + [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; + +$Ideographic = [:Ideographic:]; +$Hangul = [:Script = HANGUL:]; + +$ALetter = [\u0002 [:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] + - $Ideographic + - $Katakana + - $Hangul + - [:Script = Thai:] + - [:Script = Lao:] + - [:Script = Hiragana:]]; + +$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] + [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] + [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] ]; + +$SufixLetter = [:name= FULL STOP:]; + + +$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] + [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] + [:name = PRIME:]]; +$Numeric = [:LineBreak = Numeric:]; + + +$TheZWSP = \u200b; + +# +# Character Class Definitions. +# The names are those from TR29. +# +$CR = \u000d; +$LF = \u000a; +$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; +$Extend = [[:Grapheme_Extend = TRUE:]]; + + + + +#################################################################################### +# +# Word Break Rules. Definitions and Rules specific to word break begin Here. +# +#################################################################################### + +$Format = [[:Cf:] - $TheZWSP]; + + + +# Rule 3: Treat a grapheme cluster as if it were a single character. +# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters +# because we don't need to find the boundaries between adjacent syllables - +# they won't be word boundaries. +# + + +# +# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +# +$ALetterEx = $ALetter $Extend*; +$NumericEx = $Numeric $Extend*; +$MidNumEx = $MidNum $Extend*; +$MidLetterEx = $MidLetter $Extend*; +$SufixLetterEx= $SufixLetter $Extend*; +$KatakanaEx = $Katakana $Extend*; +$IdeographicEx= $Ideographic $Extend*; +$HangulEx = $Hangul $Extend*; +$FormatEx = $Format $Extend*; + + +# +# Numbers. Rules 8, 11, 12 form the TR. +# +$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; +$NumberSequence {100}; + +# +# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 +# - must include at least one letter. +# - may include both letters and numbers. +# - may include MideLetter, MidNumber punctuation. +# +$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 +($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; + +[[:P:][:S:]]*; + +# +# Do not break between Katakana. Rule #13. +# +$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; +[:Hiragana:] $Extend* {300}; + +# +# Ideographic Characters. Stand by themselves as words. +# Separated from the "Everything Else" rule, below, only so that they +# can be tagged with a return value. TODO: is this what we want? +# +$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; +$HangulEx ($FormatEx* $HangulEx)* {400}; + +# +# Everything Else, with no tag. +# Non-Control chars combine with $Extend (combining) chars. +# Controls are do not. +# +[^$Control [:Ideographic:]] $Extend*; +$CR $LF; + +# +# Reverse Rules. Back up over any of the chars that can group together. +# (Reverse rules do not need to be exact; they can back up too far, +# but must back up at least enough, and must stop on a boundary.) +# + +# NonStarters are the set of all characters that can appear at the 2nd - nth position of +# a word. (They may also be the first.) The reverse rule skips over these, until it +# reaches something that can only be the start (and probably only) char in a "word". +# A space or punctuation meets the test. +# +$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; + +#!.*; +! ($NonStarters* | \n \r) .; + diff --git a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt new file mode 100644 index 000000000000..1bf94451fae2 --- /dev/null +++ b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt @@ -0,0 +1,157 @@ +# +# Copyright (C) 2002-2003, International Business Machines Corporation and others. +# All Rights Reserved. +# +# file: dict_word.txt +# +# ICU Word Break Rules +# See Unicode Standard Annex #29. +# These rules are based on Version 4.0.0, dated 2003-04-17 +# + + + +#################################################################################### +# +# Character class definitions from TR 29 +# +#################################################################################### +$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] + [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] + [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] + [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; + +$Ideographic = [:Ideographic:]; +$Hangul = [:Script = HANGUL:]; + +# list of dashes or hyphens that should be accepted as part of the word if a single one of these +# pre- or postfixes a word. E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to +# be part of the word in order to have it properly spell checked etc. +$PrePostDashHyphen = [ [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] ]; + + +$ALetter = [\u0002 [:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] + - $Ideographic + - $Katakana + - $Hangul + - [:Script = Thai:] + - [:Script = Lao:] + - [:Script = Hiragana:]]; + +$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] + [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] + [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] + [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] ]; + +$SufixLetter = [:name= FULL STOP:]; + + +$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] + [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] + [:name = PRIME:]]; +$Numeric = [:LineBreak = Numeric:]; + + +$TheZWSP = \u200b; + +# +# Character Class Definitions. +# The names are those from TR29. +# +$CR = \u000d; +$LF = \u000a; +$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; +$Extend = [[:Grapheme_Extend = TRUE:]]; + + + + +#################################################################################### +# +# Word Break Rules. Definitions and Rules specific to word break begin Here. +# +#################################################################################### + +$Format = [[:Cf:] - $TheZWSP]; + + + +# Rule 3: Treat a grapheme cluster as if it were a single character. +# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters +# because we don't need to find the boundaries between adjacent syllables - +# they won't be word boundaries. +# + + +# +# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +# +$ALetterEx = $ALetter $Extend*; +$NumericEx = $Numeric $Extend*; +$MidNumEx = $MidNum $Extend*; +$MidLetterEx = $MidLetter $Extend*; +$SufixLetterEx= $SufixLetter $Extend*; +$KatakanaEx = $Katakana $Extend*; +$IdeographicEx= $Ideographic $Extend*; +$HangulEx = $Hangul $Extend*; +$FormatEx = $Format $Extend*; + + +# +# Numbers. Rules 8, 11, 12 form the TR. +# +$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; +$NumberSequence {100}; + +# +# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 +# - must include at least one letter. +# - may include both letters and numbers. +# - may include MideLetter, MidNumber punctuation. +# +# At most one leading or trailing dash/hyphen should be accepted as well. +# E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to +# be part of the word in order to have it properly spell checked etc. +$LetterSequence = $PrePostDashHyphen? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)* $PrePostDashHyphen?; # rules #6, #7 +($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; + +[[:P:][:S:]]*; + +# +# Do not break between Katakana. Rule #13. +# +$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; +[:Hiragana:] $Extend* {300}; + +# +# Ideographic Characters. Stand by themselves as words. +# Separated from the "Everything Else" rule, below, only so that they +# can be tagged with a return value. TODO: is this what we want? +# +$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; +$HangulEx ($FormatEx* $HangulEx)* {400}; + +# +# Everything Else, with no tag. +# Non-Control chars combine with $Extend (combining) chars. +# Controls are do not. +# +[^$Control [:Ideographic:]] $Extend*; +$CR $LF; + +# +# Reverse Rules. Back up over any of the chars that can group together. +# (Reverse rules do not need to be exact; they can back up too far, +# but must back up at least enough, and must stop on a boundary.) +# + +# NonStarters are the set of all characters that can appear at the 2nd - nth position of +# a word. (They may also be the first.) The reverse rule skips over these, until it +# reaches something that can only be the start (and probably only) char in a "word". +# A space or punctuation meets the test. +# +$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; + +#!.*; +! ($NonStarters* | \n \r) .; + diff --git a/i18npool/source/breakiterator/makefile.mk b/i18npool/source/breakiterator/makefile.mk index 7ba9cd3e753d..50e437e98b91 100644 --- a/i18npool/source/breakiterator/makefile.mk +++ b/i18npool/source/breakiterator/makefile.mk @@ -93,7 +93,7 @@ $(MISC)$/%_brk.c : $(MISC)$/%.brk # The output of gencmn generates warnings under Windows. We want to minimize the patches to external tools, # so the output (OpenOffice_icu_dat.c) is changed here to include a pragma to disable the warnings. # Output of gencmn is redirected to OpenOffice_icu_tmp.c with the -t switch. -$(MISC)$/OpenOffice_%.c : +$(MISC)$/OpenOffice_%.c : $(MY_BRK_BRKFILES:s/.brk/_brk.c/) $(WRAPCMD) $(GENCMN) -n OpenOffice -t tmp -S -d $(MISC) O $(mktmp $(subst,$(MISC)$/, $(MY_BRK_BRKFILES:t"\n"))) echo $(USQ)#ifdef _MSC_VER$(USQ) > $@ echo $(USQ)#pragma warning( disable : 4229 4668 )$(USQ) >> $@ diff --git a/i18npool/source/breakiterator/xdictionary.cxx b/i18npool/source/breakiterator/xdictionary.cxx index fb832f0b6468..f286dd2449ac 100644 --- a/i18npool/source/breakiterator/xdictionary.cxx +++ b/i18npool/source/breakiterator/xdictionary.cxx @@ -54,7 +54,21 @@ namespace com { namespace sun { namespace star { namespace i18n { extern "C" { static void SAL_CALL thisModule() {} } -xdictionary::xdictionary(const sal_Char *lang) +xdictionary::xdictionary(const sal_Char *lang) : + existMark( NULL ), + index1( NULL ), + index2( NULL ), + lenArray( NULL ), + dataArea( NULL ), + hModule( NULL ), + boundary(), + japaneseWordBreak( sal_False ) +#if USE_CELL_BOUNDARY_CODE + // For CTL breakiterator, where the word boundary should not be inside cell. + , + useCellBoundary( sal_False ), + cellBoundary( NULL ) +#endif { index1 = 0; #ifdef SAL_DLLPREFIX @@ -92,6 +106,7 @@ xdictionary::xdictionary(const sal_Char *lang) #if USE_CELL_BOUNDARY_CODE useCellBoundary = sal_False; + cellBoundary = NULL; #endif japaneseWordBreak = sal_False; } @@ -149,6 +164,19 @@ sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) { return 0; } + +/* + * c-tor + */ + +WordBreakCache::WordBreakCache() : + length( 0 ), + contents( NULL ), + wordboundary( NULL ), + size( 0 ) +{ +} + /* * Compare two unicode string, */ diff --git a/i18npool/source/localedata/data/de_DE.xml b/i18npool/source/localedata/data/de_DE.xml index 295b826a5e9f..a83b3dbb791d 100644 --- a/i18npool/source/localedata/data/de_DE.xml +++ b/i18npool/source/localedata/data/de_DE.xml @@ -328,7 +328,14 @@ - + + + dict_word_prepostdash + + + + + wahr falsch 1. Quartal diff --git a/i18npool/source/localedata/data/nl_NL.xml b/i18npool/source/localedata/data/nl_NL.xml index eedfdc146b5e..5a91c9c3e42e 100644 --- a/i18npool/source/localedata/data/nl_NL.xml +++ b/i18npool/source/localedata/data/nl_NL.xml @@ -360,20 +360,27 @@ - -waar -onwaar -1ste kwartaal -2de kwartaal -3de kwartaal -4de kwartaal -boven -onder -K1 -K2 -K3 -K4 - + + + dict_word_prepostdash + + + + + + waar + onwaar + 1ste kwartaal + 2de kwartaal + 3de kwartaal + 4de kwartaal + boven + onder + K1 + K2 + K3 + K4 + diff --git a/i18npool/source/localedata/data/pl_PL.xml b/i18npool/source/localedata/data/pl_PL.xml index 4119060ea7df..195689666247 100644 --- a/i18npool/source/localedata/data/pl_PL.xml +++ b/i18npool/source/localedata/data/pl_PL.xml @@ -326,13 +326,6 @@ - - - dict_word_dash - - - - prawda fałsz diff --git a/i18npool/source/localedata/data/sv_SE.xml b/i18npool/source/localedata/data/sv_SE.xml index 333690a0ef09..211f95c3e894 100644 --- a/i18npool/source/localedata/data/sv_SE.xml +++ b/i18npool/source/localedata/data/sv_SE.xml @@ -315,6 +315,13 @@ + + + dict_word_prepostdash + + + + sant falskt -- cgit