From e1ad946ef5db3f7c0a540207d0f0fd85799e3b66 Mon Sep 17 00:00:00 2001
From: Release Engineers <releng@openoffice.org>
Date: Thu, 6 Aug 2009 18:13:57 +0000
Subject: CWS-TOOLING: integrate CWS tl73 2009-07-31 15:29:33 +0200 tl  r274535
 : #i64400# dash/hyphen should not break words

---
 i18npool/inc/breakiterator_ctl.hxx                 |   1 +
 i18npool/inc/breakiterator_unicode.hxx             |   6 +-
 i18npool/inc/xdictionary.hxx                       |   4 +-
 .../source/breakiterator/breakiterator_cjk.cxx     |   4 +-
 .../source/breakiterator/breakiterator_ctl.cxx     |   7 +-
 .../source/breakiterator/breakiterator_unicode.cxx |  29 ++--
 i18npool/source/breakiterator/data/dict_word.txt   |   3 +-
 .../source/breakiterator/data/dict_word_dash.txt   | 148 -------------------
 .../source/breakiterator/data/dict_word_nodash.txt | 147 +++++++++++++++++++
 .../breakiterator/data/dict_word_prepostdash.txt   | 157 +++++++++++++++++++++
 i18npool/source/breakiterator/makefile.mk          |   2 +-
 i18npool/source/breakiterator/xdictionary.cxx      |  30 +++-
 i18npool/source/localedata/data/de_DE.xml          |   9 +-
 i18npool/source/localedata/data/nl_NL.xml          |  35 +++--
 i18npool/source/localedata/data/pl_PL.xml          |   7 -
 i18npool/source/localedata/data/sv_SE.xml          |   7 +
 16 files changed, 405 insertions(+), 191 deletions(-)
 delete mode 100644 i18npool/source/breakiterator/data/dict_word_dash.txt
 create mode 100644 i18npool/source/breakiterator/data/dict_word_nodash.txt
 create mode 100644 i18npool/source/breakiterator/data/dict_word_prepostdash.txt

(limited to 'i18npool')

diff --git a/i18npool/inc/breakiterator_ctl.hxx b/i18npool/inc/breakiterator_ctl.hxx
index 9e753e8ae16f..3e687b5e8e68 100644
--- a/i18npool/inc/breakiterator_ctl.hxx
+++ b/i18npool/inc/breakiterator_ctl.hxx
@@ -58,6 +58,7 @@ protected:
     sal_Int32* nextCellIndex;
     sal_Int32* previousCellIndex;
     sal_Int32 cellIndexSize;
+
     virtual void SAL_CALL makeIndex(const rtl::OUString& text, sal_Int32 pos) throw(com::sun::star::uno::RuntimeException);
 };
 
diff --git a/i18npool/inc/breakiterator_unicode.hxx b/i18npool/inc/breakiterator_unicode.hxx
index 3a0720f27545..654df424b237 100644
--- a/i18npool/inc/breakiterator_unicode.hxx
+++ b/i18npool/inc/breakiterator_unicode.hxx
@@ -86,12 +86,16 @@ protected:
     const sal_Char *cBreakIterator, *wordRule, *lineRule;
     Boundary result; // for word break iterator
 
-    struct {
+    struct BI_Data {
         UnicodeString aICUText;
         icu::BreakIterator *aBreakIterator;
+
+        BI_Data() : aICUText(), aBreakIterator(NULL) {}
     } character, word, sentence, line, *icuBI;
+
     com::sun::star::lang::Locale aLocale;
     sal_Int16 aBreakType, aWordType;
+
     void SAL_CALL loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
         sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char* name, const rtl::OUString& rText) throw(com::sun::star::uno::RuntimeException);
 };
diff --git a/i18npool/inc/xdictionary.hxx b/i18npool/inc/xdictionary.hxx
index 32ffdbbfe377..04c5836aaaea 100644
--- a/i18npool/inc/xdictionary.hxx
+++ b/i18npool/inc/xdictionary.hxx
@@ -45,11 +45,13 @@ namespace com { namespace sun { namespace star { namespace i18n {
 
 // cache structure.
 struct WordBreakCache {
-    sal_Bool equals(const sal_Unicode *str, Boundary& boundary);    // checking cached string
     sal_Int32 length;       // contents length saved here.
     sal_Unicode *contents;      // seperated segment contents.
     sal_Int32* wordboundary;        // word boundaries in segments.
     sal_Int32 size;         // size of wordboundary
+
+    WordBreakCache();
+    sal_Bool equals(const sal_Unicode *str, Boundary& boundary);    // checking cached string
 };
 
 class xdictionary
diff --git a/i18npool/source/breakiterator/breakiterator_cjk.cxx b/i18npool/source/breakiterator/breakiterator_cjk.cxx
index 3d684b8788df..3a44c02edc23 100644
--- a/i18npool/source/breakiterator/breakiterator_cjk.cxx
+++ b/i18npool/source/breakiterator/breakiterator_cjk.cxx
@@ -46,7 +46,9 @@ namespace com { namespace sun { namespace star { namespace i18n {
 //      class BreakIterator_CJK
 //      ----------------------------------------------------;
 
-BreakIterator_CJK::BreakIterator_CJK() : dict(NULL)
+BreakIterator_CJK::BreakIterator_CJK() :
+    dict( NULL ),
+    hangingCharacters()
 {
         cBreakIterator = "com.sun.star.i18n.BreakIterator_CJK";
 }
diff --git a/i18npool/source/breakiterator/breakiterator_ctl.cxx b/i18npool/source/breakiterator/breakiterator_ctl.cxx
index a42014615f1a..0e1bee869cac 100644
--- a/i18npool/source/breakiterator/breakiterator_ctl.cxx
+++ b/i18npool/source/breakiterator/breakiterator_ctl.cxx
@@ -45,11 +45,14 @@ namespace com { namespace sun { namespace star { namespace i18n {
 /**
  * Constructor.
  */
-BreakIterator_CTL::BreakIterator_CTL()
+BreakIterator_CTL::BreakIterator_CTL() :
+    cachedText(),
+    nextCellIndex( NULL ),
+    previousCellIndex( NULL ),
+    cellIndexSize( 512 )
 {
     cBreakIterator = "com.sun.star.i18n.BreakIterator_CTL";
     // to improve performance, alloc big enough memory in construct.
-    cellIndexSize = 512;
     nextCellIndex = (sal_Int32*) calloc(cellIndexSize, sizeof(sal_Int32));
     previousCellIndex = (sal_Int32*) calloc(cellIndexSize, sizeof(sal_Int32));
     memset(nextCellIndex, 0, cellIndexSize * sizeof(sal_Int32));
diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx
index b832bb02e7e9..f0710c996858 100644
--- a/i18npool/source/breakiterator/breakiterator_unicode.cxx
+++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx
@@ -51,20 +51,23 @@ namespace com { namespace sun { namespace star { namespace i18n {
 
 #define ERROR ::com::sun::star::uno::RuntimeException()
 
-#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
-
-
-BreakIterator_Unicode::BreakIterator_Unicode()
+//#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
+
+
+BreakIterator_Unicode::BreakIterator_Unicode() :
+    cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ),    // implementation name
+    wordRule( "word" ),
+    lineRule( "line" ),
+    result(),
+    character(),
+    word(),
+    sentence(),
+    line(),
+    icuBI( NULL ),
+    aLocale(),
+    aBreakType(),
+    aWordType()
 {
-        wordRule="word";
-        lineRule="line";
-        character.aBreakIterator=word.aBreakIterator=sentence.aBreakIterator=line.aBreakIterator=NULL;
-        character.aICUText=UnicodeString();
-        word.aICUText=UnicodeString();
-        sentence.aICUText=UnicodeString();
-        line.aICUText=UnicodeString();
-        cBreakIterator = ImplementName;
-        icuBI=NULL;
 }
 
 
diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt
index 4c5c80823041..367a82db9e6f 100644
--- a/i18npool/source/breakiterator/data/dict_word.txt
+++ b/i18npool/source/breakiterator/data/dict_word.txt
@@ -34,7 +34,8 @@ $ALetter   = [\u0002 [:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCT
                            
 $MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
               [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] ];  
+              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] 
+              [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] ];
 
 $SufixLetter = [:name= FULL STOP:];
               
diff --git a/i18npool/source/breakiterator/data/dict_word_dash.txt b/i18npool/source/breakiterator/data/dict_word_dash.txt
deleted file mode 100644
index 7f861c7911ca..000000000000
--- a/i18npool/source/breakiterator/data/dict_word_dash.txt
+++ /dev/null
@@ -1,148 +0,0 @@
-#
-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
-#       All Rights Reserved.
-#
-#   file:  dict_word.txt   
-#
-#   ICU Word Break Rules
-#      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
-#
-
-
-
-####################################################################################
-#
-#  Character class definitions from TR 29
-#
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-$ALetter   = [\u0002 [:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] 
-              [:name = HYPHEN-MINUS:] [:name = EN DASH:] ];
-
-$SufixLetter = [:name= FULL STOP:];
-              
-
-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
-             [:name = PRIME:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
-
-#
-#  Character Class Definitions.
-#    The names are those from TR29.
-#
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
-
-
-
-
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
-#
-####################################################################################
-
-$Format    = [[:Cf:] - $TheZWSP];
-
-
-
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent syllables -
-#          they won't be word boundaries.
-#
-
-
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
-
-
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
-
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
-
-[[:P:][:S:]]*;
-
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
-
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, only so that they
-#                           can be tagged with a return value.   TODO:  is this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
-
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
-
-#
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  too far,
-#                   but must back up at least enough, and must stop on a boundary.)
-#
-
-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
-#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
-#    reaches something that can only be the start (and probably only) char in a "word".
-#    A space or punctuation meets the test.
-#
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
-
-#!.*;
-! ($NonStarters* | \n \r) .;
-
diff --git a/i18npool/source/breakiterator/data/dict_word_nodash.txt b/i18npool/source/breakiterator/data/dict_word_nodash.txt
new file mode 100644
index 000000000000..4c5c80823041
--- /dev/null
+++ b/i18npool/source/breakiterator/data/dict_word_nodash.txt
@@ -0,0 +1,147 @@
+#
+#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+#       All Rights Reserved.
+#
+#   file:  dict_word_nodash.txt
+#
+#   ICU Word Break Rules
+#      See Unicode Standard Annex #29.
+#      These rules are based on Version 4.0.0, dated 2003-04-17
+#
+
+
+
+####################################################################################
+#
+#  Character class definitions from TR 29
+#
+####################################################################################
+$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+
+$Ideographic = [:Ideographic:];
+$Hangul = [:Script = HANGUL:];
+
+$ALetter   = [\u0002 [:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
+                           - $Ideographic
+                           - $Katakana
+                           - $Hangul
+                           - [:Script = Thai:]
+                           - [:Script = Lao:]
+                           - [:Script = Hiragana:]];
+                           
+$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
+              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
+              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] ];  
+
+$SufixLetter = [:name= FULL STOP:];
+              
+
+$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
+             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
+             [:name = PRIME:]];
+$Numeric   = [:LineBreak = Numeric:];
+
+
+$TheZWSP = \u200b;
+
+#
+#  Character Class Definitions.
+#    The names are those from TR29.
+#
+$CR         = \u000d;
+$LF         = \u000a;
+$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
+$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+
+
+
+
+####################################################################################
+#
+#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
+#
+####################################################################################
+
+$Format    = [[:Cf:] - $TheZWSP];
+
+
+
+# Rule 3:  Treat a grapheme cluster as if it were a single character.
+#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
+#          because we don't need to find the boundaries between adjacent syllables -
+#          they won't be word boundaries.
+#
+
+
+#
+#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
+#
+$ALetterEx    = $ALetter   $Extend*; 
+$NumericEx    = $Numeric   $Extend*;
+$MidNumEx     = $MidNum    $Extend*;
+$MidLetterEx  = $MidLetter $Extend*;
+$SufixLetterEx= $SufixLetter $Extend*;
+$KatakanaEx   = $Katakana  $Extend*;
+$IdeographicEx= $Ideographic  $Extend*;
+$HangulEx = $Hangul  $Extend*;
+$FormatEx     = $Format    $Extend*;
+
+
+#
+#  Numbers.  Rules 8, 11, 12 from the TR.
+#
+$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
+$NumberSequence {100};
+
+#
+#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
+#     - must include at least one letter. 
+#     - may include both letters and numbers.
+#     - may include  MidLetter, MidNum punctuation.
+#
+$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*;     # rules #6, #7
+($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
+
+[[:P:][:S:]]*;
+
+#
+#  Do not break between Katakana.   Rule #13.
+#
+$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
+[:Hiragana:] $Extend* {300};
+
+#
+#  Ideographic Characters.  Stand by themselves as words.
+#                           Separated from the "Everything Else" rule, below, only so that they
+#                           can be tagged with a return value.   TODO:  is this what we want?
+#
+$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
+$HangulEx ($FormatEx* $HangulEx)* {400};
+
+#
+#  Everything Else, with no tag.
+#                   Non-Control chars combine with $Extend (combining) chars.
+#                   Controls do not.
+#
+[^$Control [:Ideographic:]] $Extend*;
+$CR $LF;
+
+#
+#  Reverse Rules.   Back up over any of the chars that can group together.
+#                   (Reverse rules do not need to be exact; they can back up  too far,
+#                   but must back up at least enough, and must stop on a boundary.)
+#
+
+# NonStarters are the set of all characters that can appear at the 2nd - nth position of
+#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
+#    reaches something that can only be the start (and probably only) char in a "word".
+#    A space or punctuation meets the test.
+#
+$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
+
+#!.*;
+! ($NonStarters* | \n \r) .;
+
diff --git a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
new file mode 100644
index 000000000000..1bf94451fae2
--- /dev/null
+++ b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
@@ -0,0 +1,157 @@
+#
+#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+#       All Rights Reserved.
+#
+#   file:  dict_word_prepostdash.txt
+#
+#   ICU Word Break Rules
+#      See Unicode Standard Annex #29.
+#      These rules are based on Version 4.0.0, dated 2003-04-17
+#
+
+
+
+####################################################################################
+#
+#  Character class definitions from TR 29
+#
+####################################################################################
+$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+
+$Ideographic = [:Ideographic:];
+$Hangul = [:Script = HANGUL:];
+
+# list of dashes or hyphens that should be accepted as part of the word if a single one of these
+# pre- or postfixes a word. E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to
+# be part of the word in order to have it properly spell checked etc.
+$PrePostDashHyphen = [ [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] ];
+
+
+$ALetter   = [\u0002 [:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
+                           - $Ideographic
+                           - $Katakana
+                           - $Hangul
+                           - [:Script = Thai:]
+                           - [:Script = Lao:]
+                           - [:Script = Hiragana:]];
+                           
+$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
+              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
+              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] 
+              [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] ];
+
+$SufixLetter = [:name= FULL STOP:];
+              
+
+$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
+             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
+             [:name = PRIME:]];
+$Numeric   = [:LineBreak = Numeric:];
+
+
+$TheZWSP = \u200b;
+
+#
+#  Character Class Definitions.
+#    The names are those from TR29.
+#
+$CR         = \u000d;
+$LF         = \u000a;
+$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
+$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+
+
+
+
+####################################################################################
+#
+#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
+#
+####################################################################################
+
+$Format    = [[:Cf:] - $TheZWSP];
+
+
+
+# Rule 3:  Treat a grapheme cluster as if it were a single character.
+#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
+#          because we don't need to find the boundaries between adjacent syllables -
+#          they won't be word boundaries.
+#
+
+
+#
+#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
+#
+$ALetterEx    = $ALetter   $Extend*; 
+$NumericEx    = $Numeric   $Extend*;
+$MidNumEx     = $MidNum    $Extend*;
+$MidLetterEx  = $MidLetter $Extend*;
+$SufixLetterEx= $SufixLetter $Extend*;
+$KatakanaEx   = $Katakana  $Extend*;
+$IdeographicEx= $Ideographic  $Extend*;
+$HangulEx = $Hangul  $Extend*;
+$FormatEx     = $Format    $Extend*;
+
+
+#
+#  Numbers.  Rules 8, 11, 12 form the TR.
+#
+$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
+$NumberSequence {100};
+
+#
+#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
+#     - must include at least one letter. 
+#     - may include both letters and numbers.
+#     - may include  MidLetter, MidNum punctuation.
+#
+# At most one leading and at most one trailing dash/hyphen should be accepted as well.
+# E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to
+# be part of the word in order to have it properly spell checked etc.
+$LetterSequence = $PrePostDashHyphen? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)* $PrePostDashHyphen?;     # rules #6, #7
+($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
+
+[[:P:][:S:]]*;
+
+#
+#  Do not break between Katakana.   Rule #13.
+#
+$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
+[:Hiragana:] $Extend* {300};
+
+#
+#  Ideographic Characters.  Stand by themselves as words.
+#                           Separated from the "Everything Else" rule, below, only so that they
+#                           can be tagged with a return value.   TODO:  is this what we want?
+#
+$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
+$HangulEx ($FormatEx* $HangulEx)* {400};
+
+#
+#  Everything Else, with no tag.
+#                   Controls do not.
+#                   Controls are do not.
+#
+[^$Control [:Ideographic:]] $Extend*;
+$CR $LF;
+
+#
+#  Reverse Rules.   Back up over any of the chars that can group together.
+#                   (Reverse rules do not need to be exact; they can back up  too far,
+#                   but must back up at least enough, and must stop on a boundary.)
+#
+
+# NonStarters are the set of all characters that can appear at the 2nd - nth position of
+#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
+#    reaches something that can only be the start (and probably only) char in a "word".
+#    A space or punctuation meets the test.
+#
+$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
+
+#!.*;
+! ($NonStarters* | \n \r) .;
+
diff --git a/i18npool/source/breakiterator/makefile.mk b/i18npool/source/breakiterator/makefile.mk
index 7ba9cd3e753d..50e437e98b91 100644
--- a/i18npool/source/breakiterator/makefile.mk
+++ b/i18npool/source/breakiterator/makefile.mk
@@ -93,7 +93,7 @@ $(MISC)$/%_brk.c : $(MISC)$/%.brk
 # The output of gencmn generates warnings under Windows. We want to minimize the patches to external tools,
 # so the output (OpenOffice_icu_dat.c) is changed here to include a pragma to disable the warnings.
 # Output of gencmn is redirected to OpenOffice_icu_tmp.c with the -t switch.
-$(MISC)$/OpenOffice_%.c : 
+$(MISC)$/OpenOffice_%.c : $(MY_BRK_BRKFILES:s/.brk/_brk.c/)
     $(WRAPCMD) $(GENCMN) -n OpenOffice -t tmp -S -d $(MISC) O $(mktmp $(subst,$(MISC)$/, $(MY_BRK_BRKFILES:t"\n")))
     echo $(USQ)#ifdef _MSC_VER$(USQ) > $@
     echo $(USQ)#pragma warning( disable : 4229 4668 )$(USQ) >> $@
diff --git a/i18npool/source/breakiterator/xdictionary.cxx b/i18npool/source/breakiterator/xdictionary.cxx
index fb832f0b6468..f286dd2449ac 100644
--- a/i18npool/source/breakiterator/xdictionary.cxx
+++ b/i18npool/source/breakiterator/xdictionary.cxx
@@ -54,7 +54,21 @@ namespace com { namespace sun { namespace star { namespace i18n {
 
 extern "C" { static void SAL_CALL thisModule() {} }
 
-xdictionary::xdictionary(const sal_Char *lang)
+xdictionary::xdictionary(const sal_Char *lang) :
+    existMark( NULL ),
+    index1( NULL ),
+    index2( NULL ),
+    lenArray( NULL ),
+    dataArea( NULL ),
+    hModule( NULL ),
+    boundary(),
+    japaneseWordBreak( sal_False )
+#if USE_CELL_BOUNDARY_CODE
+    // For CTL breakiterator, where the word boundary should not be inside cell.
+    ,
+    useCellBoundary( sal_False ),
+    cellBoundary( NULL )
+#endif
 {
     index1 = 0;
 #ifdef SAL_DLLPREFIX
@@ -92,6 +106,7 @@ xdictionary::xdictionary(const sal_Char *lang)
 
 #if USE_CELL_BOUNDARY_CODE
         useCellBoundary = sal_False;
+        cellBoundary = NULL;
 #endif
         japaneseWordBreak = sal_False;
 }
@@ -149,6 +164,19 @@ sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
         return 0;
 }
 
+
+/*
+ * c-tor
+ */
+
+WordBreakCache::WordBreakCache() :
+    length( 0 ),
+    contents( NULL ),
+    wordboundary( NULL ),
+    size( 0 )
+{
+}
+
 /*
  * Compare two unicode string,
  */
diff --git a/i18npool/source/localedata/data/de_DE.xml b/i18npool/source/localedata/data/de_DE.xml
index 295b826a5e9f..a83b3dbb791d 100644
--- a/i18npool/source/localedata/data/de_DE.xml
+++ b/i18npool/source/localedata/data/de_DE.xml
@@ -328,7 +328,14 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
-    <ReservedWords>
+      <BreakIteratorRules>
+          <EditMode/>
+          <DictionaryMode>dict_word_prepostdash</DictionaryMode>
+          <WordCountMode/>
+          <CharacterMode/>
+          <LineMode/>
+      </BreakIteratorRules>
+      <ReservedWords>
       <trueWord>wahr</trueWord>
       <falseWord>falsch</falseWord>
       <quarter1Word>1. Quartal</quarter1Word>
diff --git a/i18npool/source/localedata/data/nl_NL.xml b/i18npool/source/localedata/data/nl_NL.xml
index eedfdc146b5e..5a91c9c3e42e 100644
--- a/i18npool/source/localedata/data/nl_NL.xml
+++ b/i18npool/source/localedata/data/nl_NL.xml
@@ -360,20 +360,27 @@
 </LC_CURRENCY>
 <LC_TRANSLITERATION ref="en_US"/>
 <LC_MISC>
-<ReservedWords>
-<trueWord>waar</trueWord>
-<falseWord>onwaar</falseWord>
-<quarter1Word>1ste kwartaal</quarter1Word>
-<quarter2Word>2de kwartaal</quarter2Word>
-<quarter3Word>3de kwartaal</quarter3Word>
-<quarter4Word>4de kwartaal</quarter4Word>
-<aboveWord>boven</aboveWord>
-<belowWord>onder</belowWord>
-<quarter1Abbreviation>K1</quarter1Abbreviation>
-<quarter2Abbreviation>K2</quarter2Abbreviation>
-<quarter3Abbreviation>K3</quarter3Abbreviation>
-<quarter4Abbreviation>K4</quarter4Abbreviation>
-</ReservedWords>
+  <BreakIteratorRules>
+    <EditMode/>
+    <DictionaryMode>dict_word_prepostdash</DictionaryMode>
+    <WordCountMode/>
+    <CharacterMode/>
+    <LineMode/>
+  </BreakIteratorRules>
+  <ReservedWords>
+    <trueWord>waar</trueWord>
+    <falseWord>onwaar</falseWord>
+    <quarter1Word>1ste kwartaal</quarter1Word>
+    <quarter2Word>2de kwartaal</quarter2Word>
+    <quarter3Word>3de kwartaal</quarter3Word>
+    <quarter4Word>4de kwartaal</quarter4Word>
+    <aboveWord>boven</aboveWord>
+    <belowWord>onder</belowWord>
+    <quarter1Abbreviation>K1</quarter1Abbreviation>
+    <quarter2Abbreviation>K2</quarter2Abbreviation>
+    <quarter3Abbreviation>K3</quarter3Abbreviation>
+    <quarter4Abbreviation>K4</quarter4Abbreviation>
+  </ReservedWords>
 </LC_MISC>
 <LC_NumberingLevel ref="en_US"/>
 <LC_OutLineNumberingLevel ref="en_US"/>
diff --git a/i18npool/source/localedata/data/pl_PL.xml b/i18npool/source/localedata/data/pl_PL.xml
index 4119060ea7df..195689666247 100644
--- a/i18npool/source/localedata/data/pl_PL.xml
+++ b/i18npool/source/localedata/data/pl_PL.xml
@@ -326,13 +326,6 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
-    <BreakIteratorRules>
-      <EditMode/>
-      <DictionaryMode>dict_word_dash</DictionaryMode>
-      <WordCountMode/>
-      <CharacterMode/>
-      <LineMode/>
-    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>prawda</trueWord>
       <falseWord>fałsz</falseWord>
diff --git a/i18npool/source/localedata/data/sv_SE.xml b/i18npool/source/localedata/data/sv_SE.xml
index 333690a0ef09..211f95c3e894 100644
--- a/i18npool/source/localedata/data/sv_SE.xml
+++ b/i18npool/source/localedata/data/sv_SE.xml
@@ -315,6 +315,13 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
+    <BreakIteratorRules>
+      <EditMode/>
+      <DictionaryMode>dict_word_prepostdash</DictionaryMode>
+      <WordCountMode/>
+      <CharacterMode/>
+      <LineMode/>
+    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>sant</trueWord>
       <falseWord>falskt</falseWord>
-- 
cgit