Fix applying external dict to icu, and khmer break engine fixes

Change-Id: Ib897e5fa5e80f75f501694dbf874aabd92253b25
Reviewed-on: https://gerrit.libreoffice.org/21247
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Martin Hosken <martin_hosken@sil.org>
author: Martin Hosken <martin_hosken@sil.org> 2016-01-08 16:41:52 +0700
committer: Martin Hosken <martin_hosken@sil.org> 2016-01-09 02:24:42 +0000
commit: 39b718dd655220110523b7013e65ea4f821aedf7 (patch)
tree: 996dd8606a7ffc85e5a6fa13914bb01015202891
parent: eba202b65accc0b4c4e08346c8e579eea58342f4 (diff)
1 files changed, 8 insertions, 8 deletions
diff --git a/external/icu/khmerbreakengine.patch b/external/icu/khmerbreakengine.patch
index 03e6079b19f0..ba3e392a27f3 100644
--- a/external/icu/khmerbreakengine.patch
+++ b/external/icu/khmerbreakengine.patch
@@ -14,8 +14,8 @@ index f1c874d..3ad1b3f 100644
      fTypes = breakTypes;
 +    fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status);
 +
-+    fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]]"), status);
-+    fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]]"), status);
++    fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]]\\u200C\\u200D\\u2060"), status);
++    fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]]\\u200C\\u200D\\u2060"), status);
 +    fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);
  }
  
@@ -473,7 +473,7 @@ index f1c874d..3ad1b3f 100644
          // Look ahead for possible suffixes if a dictionary word does not follow.
          // We do this in code rather than using a rule so that the heuristic
          // resynch continues to function. For example, one of the suffix characters
-@@ -828,51 +993,29 @@ foundBest:
+@@ -828,51 +993,28 @@ foundBest:
   * KhmerBreakEngine
   */
  
@@ -506,7 +506,7 @@ index f1c874d..3ad1b3f 100644
          setCharacters(fKhmerWordSet);
      }
      fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
-     fMarkSet.add(0x0020);
+-    fMarkSet.add(0x0020);
 -    fEndWordSet = fKhmerWordSet;
 -    fBeginWordSet.add(0x1780, 0x17B3);
 -    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
@@ -522,7 +522,7 @@ index f1c874d..3ad1b3f 100644
 -//    fSuffixSet.add(THAI_MAIYAMOK);
 +    fIgnoreSet.add(0x2060);         // WJ
 +    fIgnoreSet.add(0x200C, 0x200D); // ZWJ, ZWNJ
-+    fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:^M:]]"), status);
++    fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status);
 +    fPuncSet.applyPattern(UNICODE_STRING_SIMPLE("[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status);
  
      // Compact for caching.
@@ -750,7 +750,7 @@ index f1c874d..3ad1b3f 100644
 -                if (cuWordLength <= 0) {
 -                    wordsFound += 1;
 -                }
-+                } while (fMarkSet.contains(c));
++                } while (fMarkSet.contains(c) || fIgnoreSet.contains(c));
 +                values.setElementAt(BADSNLP, count);
 +                lengths.setElementAt(utext_getNativeIndex(text) - currix, count++);
 +            } else {
@@ -775,7 +775,7 @@ index f1c874d..3ad1b3f 100644
 -            else {
 -                // Back up to where we were for next iteration
 -                utext_setNativeIndex(text, current+cuWordLength);
-+            int32_t ln_j_i = ln + i;
++            int32_t ln_j_i = ln + i;   // yes really i!
 +            if (newSnlp < bestSnlp.elementAti(ln_j_i)) {
 +                if (v == BADSNLP) {
 +                    int32_t p = prev.elementAti(i);
@@ -1395,7 +1395,7 @@ index 816c82d..c637d70 100644
 +#$(MAINBUILDDIR)/khmerdict.stamp: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(BRKSRCDIR)/khmerdict.txt build-local
 +# 	$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(BRKSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
 +$(MAINBUILDDIR)/khmerdict.stamp: $(BRKSRCDIR)/khmerdict.dict build-local
-+	cp $< $(MAINBUILDDIR)
++	cp $< $(BRKBLDDIR)
 +	echo "timestamp" > $@
 
  ####################################################    CFU
author	Martin Hosken <martin_hosken@sil.org>	2016-01-08 16:41:52 +0700
committer	Martin Hosken <martin_hosken@sil.org>	2016-01-09 02:24:42 +0000
commit	39b718dd655220110523b7013e65ea4f821aedf7 (patch)
tree	996dd8606a7ffc85e5a6fa13914bb01015202891
parent	eba202b65accc0b4c4e08346c8e579eea58342f4 (diff)