summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Martin Hosken <martin_hosken@sil.org> 2016-01-08 16:41:52 +0700
committer Martin Hosken <martin_hosken@sil.org> 2016-01-09 02:24:42 +0000
commit 39b718dd655220110523b7013e65ea4f821aedf7 (patch)
tree 996dd8606a7ffc85e5a6fa13914bb01015202891
parent eba202b65accc0b4c4e08346c8e579eea58342f4 (diff)
Fix applying external dict to icu, and khmer break engine fixes
Change-Id: Ib897e5fa5e80f75f501694dbf874aabd92253b25
Reviewed-on: https://gerrit.libreoffice.org/21247
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Martin Hosken <martin_hosken@sil.org>
-rw-r--r-- external/icu/khmerbreakengine.patch 16
1 file changed, 8 insertions, 8 deletions
diff --git a/external/icu/khmerbreakengine.patch b/external/icu/khmerbreakengine.patch
index 03e6079b19f0..ba3e392a27f3 100644
--- a/external/icu/khmerbreakengine.patch
+++ b/external/icu/khmerbreakengine.patch
@@ -14,8 +14,8 @@ index f1c874d..3ad1b3f 100644
fTypes = breakTypes;
+ fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status);
+
-+ fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]]"), status);
-+ fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]]"), status);
++ fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]]\\u200C\\u200D\\u2060"), status);
++ fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]]\\u200C\\u200D\\u2060"), status);
+ fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);
}
@@ -473,7 +473,7 @@ index f1c874d..3ad1b3f 100644
// Look ahead for possible suffixes if a dictionary word does not follow.
// We do this in code rather than using a rule so that the heuristic
// resynch continues to function. For example, one of the suffix characters
-@@ -828,51 +993,29 @@ foundBest:
+@@ -828,51 +993,28 @@ foundBest:
* KhmerBreakEngine
*/
@@ -506,7 +506,7 @@ index f1c874d..3ad1b3f 100644
setCharacters(fKhmerWordSet);
}
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
- fMarkSet.add(0x0020);
+- fMarkSet.add(0x0020);
- fEndWordSet = fKhmerWordSet;
- fBeginWordSet.add(0x1780, 0x17B3);
- //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
@@ -522,7 +522,7 @@ index f1c874d..3ad1b3f 100644
-// fSuffixSet.add(THAI_MAIYAMOK);
+ fIgnoreSet.add(0x2060); // WJ
+ fIgnoreSet.add(0x200C, 0x200D); // ZWJ, ZWNJ
-+ fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:^M:]]"), status);
++ fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status);
+ fPuncSet.applyPattern(UNICODE_STRING_SIMPLE("[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status);
// Compact for caching.
@@ -750,7 +750,7 @@ index f1c874d..3ad1b3f 100644
- if (cuWordLength <= 0) {
- wordsFound += 1;
- }
-+ } while (fMarkSet.contains(c));
++ } while (fMarkSet.contains(c) || fIgnoreSet.contains(c));
+ values.setElementAt(BADSNLP, count);
+ lengths.setElementAt(utext_getNativeIndex(text) - currix, count++);
+ } else {
@@ -775,7 +775,7 @@ index f1c874d..3ad1b3f 100644
- else {
- // Back up to where we were for next iteration
- utext_setNativeIndex(text, current+cuWordLength);
-+ int32_t ln_j_i = ln + i;
++ int32_t ln_j_i = ln + i; // yes really i!
+ if (newSnlp < bestSnlp.elementAti(ln_j_i)) {
+ if (v == BADSNLP) {
+ int32_t p = prev.elementAti(i);
@@ -1395,7 +1395,7 @@ index 816c82d..c637d70 100644
+#$(MAINBUILDDIR)/khmerdict.stamp: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(BRKSRCDIR)/khmerdict.txt build-local
+# $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(BRKSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
+$(MAINBUILDDIR)/khmerdict.stamp: $(BRKSRCDIR)/khmerdict.dict build-local
-+ cp $< $(MAINBUILDDIR)
++ cp $< $(BRKBLDDIR)
+ echo "timestamp" > $@
#################################################### CFU