diff options
author | Eike Rathke <erack@redhat.com> | 2018-04-17 20:13:52 +0200 |
---|---|---|
committer | Eike Rathke <erack@redhat.com> | 2018-04-18 12:33:40 +0200 |
commit | f247f08e370626bbb427acd8f4a400fd875350a3 (patch) | |
tree | b00a3b211ba4707a938741be84cf804be525def5 /external/icu/icu4c-khmerbreakengine.patch.1 | |
parent | 71df42a8d5ec20646045ffb419b89b19fbdf4c02 (diff) |
Upgrade to ICU 61.1
Change-Id: I89c1c3d13d85decc72576744de2a16d20471d29d
Reviewed-on: https://gerrit.libreoffice.org/53064
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Eike Rathke <erack@redhat.com>
Diffstat (limited to 'external/icu/icu4c-khmerbreakengine.patch.1')
-rw-r--r-- | external/icu/icu4c-khmerbreakengine.patch.1 | 246 |
1 files changed, 124 insertions, 122 deletions
diff --git a/external/icu/icu4c-khmerbreakengine.patch.1 b/external/icu/icu4c-khmerbreakengine.patch.1 index 6b45b3743611..9f134dd961b1 100644 --- a/external/icu/icu4c-khmerbreakengine.patch.1 +++ b/external/icu/icu4c-khmerbreakengine.patch.1 @@ -1,16 +1,18 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp ---- icu.org/source/common/dictbe.cpp 2017-01-20 01:20:31.000000000 +0100 -+++ icu/source/common/dictbe.cpp 2017-04-21 23:14:23.845894374 +0200 -@@ -29,8 +29,17 @@ +--- icu.org/source/common/dictbe.cpp 2018-03-26 15:38:30.000000000 +0200 ++++ icu/source/common/dictbe.cpp 2018-04-17 17:55:38.620944919 +0200 +@@ -29,7 +29,19 @@ ****************************************************************** */ --DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { -+DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) : -+ clusterLimit(3) -+{ +-DictionaryBreakEngine::DictionaryBreakEngine() { ++DictionaryBreakEngine::DictionaryBreakEngine() ++ : fTypes(0), clusterLimit(0) { ++} ++ ++DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) ++ : fTypes(breakTypes), clusterLimit(3) { + UErrorCode status = U_ZERO_ERROR; - fTypes = breakTypes; + fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status); + + // note Skip Sets contain fIgnoreSet characters too. @@ -20,16 +22,7 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp } DictionaryBreakEngine::~DictionaryBreakEngine() { -@@ -92,7 +101,7 @@ - result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); - utext_setNativeIndex(text, current); - } -- -+ - return result; - } - -@@ -103,6 +112,169 @@ +@@ -76,6 +88,169 @@ fSet.compact(); } @@ -199,7 +192,7 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp /* ****************************************************************** * PossibleWord -@@ -130,35 +302,35 @@ +@@ -103,35 +278,35 @@ public: PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}; ~PossibleWord() {}; @@ -244,7 +237,7 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp // Dictionary leaves text after longest prefix, not longest word. Back up. if (count <= 0) { utext_setNativeIndex(text, start); -@@ -830,51 +1002,28 @@ +@@ -803,51 +978,28 @@ * KhmerBreakEngine */ @@ -265,7 +258,8 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp -static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; - KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) - : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), +- : DictionaryBreakEngine(), ++ : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), fDictionary(adoptDictionary) { - fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); @@ -301,13 +295,13 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp - fEndWordSet.compact(); - fBeginWordSet.compact(); -// fSuffixSet.compact(); -+ fIgnoreSet.compact(); -+ fBaseSet.compact(); -+ fPuncSet.compact(); ++ fIgnoreSet.compact(); ++ fBaseSet.compact(); ++ fPuncSet.compact(); } KhmerBreakEngine::~KhmerBreakEngine() { -@@ -886,180 +1035,204 @@ +@@ -859,180 +1011,204 @@ int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks ) const { @@ -350,17 +344,6 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp + foundBreaks.push(rangeEnd, status); + return foundBreaks.size() - wordsFound; + } -+ -+ scanStart = rangeStart; -+ scanWJ(text, scanStart, rangeEnd, before, after); -+ if (startZwsp || initAfter >= before) { -+ after = initAfter; -+ before = 0; -+ } -+ if (!endZwsp && after > finalBefore && after < rangeEnd) -+ endZwsp = true; -+ if (endZwsp && before > finalBefore) -+ before = finalBefore; - while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { - cuWordLength = 0; @@ -375,7 +358,17 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp - cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); - wordsFound += 1; - } -- ++ scanStart = rangeStart; ++ scanWJ(text, scanStart, rangeEnd, before, after); ++ if (startZwsp || initAfter >= before) { ++ after = initAfter; ++ before = 0; ++ } ++ if (!endZwsp && after > finalBefore && after < rangeEnd) ++ endZwsp = true; ++ if (endZwsp && before > finalBefore) ++ before = finalBefore; + - // If there was more than one, see which one can take us forward the most words - else if (candidates > 1) { - // If we're already at the end of the range, we're done @@ -390,22 +383,6 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp - words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); - wordsMatched = 2; - } -- -- // If we're already at the end of the range, we're done -- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { -- goto foundBest; -- } -- -- // See if any of the possible second words is followed by a third word -- do { -- // If we find a third word, stop right away -- if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { -- words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); -- goto foundBest; -- } -- } -- while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text)); -- } + utext_setNativeIndex(text, rangeStart); + int32_t numCodePts = rangeEnd - rangeStart; + // bestSnlp[i] is the snlp of the best segmentation of the first i @@ -415,7 +392,11 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp + for(int32_t i = 1; i <= numCodePts; i++) { + bestSnlp.addElement(kuint32max, status); + } -+ + +- // If we're already at the end of the range, we're done +- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { +- goto foundBest; +- } + // prev[i] is the index of the last code point in the previous word in + // the best segmentation of the first i characters. Note negative implies + // that the code point is part of an unknown word. @@ -423,7 +404,17 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp + for(int32_t i = 0; i <= numCodePts; i++) { + prev.addElement(kuint32max, status); + } -+ + +- // See if any of the possible second words is followed by a third word +- do { +- // If we find a third word, stop right away +- if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { +- words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); +- goto foundBest; +- } +- } +- while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text)); +- } + const int32_t maxWordSize = 20; + UVector32 values(maxWordSize, status); + values.setSize(maxWordSize); @@ -528,27 +519,17 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp - if (cuWordLength <= 0) { - wordsFound += 1; - } -- -- // Update the length with the passed-over characters -- cuWordLength += chars; -- } -- else { -- // Back up to where we were for next iteration -- utext_setNativeIndex(text, current+cuWordLength); + } while (fMarkSet.contains(c) || fIgnoreSet.contains(c)); + values.setElementAt(BADSNLP, count); + lengths.setElementAt(utext_getNativeIndex(text) - currix, count++); + } else { + values.setElementAt(BADSNLP, count); + lengths.setElementAt(1, count++); - } - } ++ } ++ } -- // Never stop before a combining mark. -- int32_t currPos; -- while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { -- utext_next32(text); -- cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; +- // Update the length with the passed-over characters +- cuWordLength += chars; + for (int32_t j = 0; j < count; j++) { + uint32_t v = values.elementAti(j); + int32_t newSnlp = bestSnlp.elementAti(i) + v; @@ -559,7 +540,10 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp + ++ln; + utext_next32(text); + c = utext_current32(text); -+ } + } +- else { +- // Back up to where we were for next iteration +- utext_setNativeIndex(text, current+cuWordLength); + int32_t ln_j_i = ln + i; // yes really i! + if (newSnlp < bestSnlp.elementAti(ln_j_i)) { + if (v == BADSNLP) { @@ -572,9 +556,38 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp + else + prev.setElementAt(i, ln_j_i); + bestSnlp.setElementAt(newSnlp, ln_j_i); -+ } + } } - +- // Never stop before a combining mark. +- int32_t currPos; +- while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { +- utext_next32(text); +- cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; ++ } ++ // Start pushing the optimal offset index into t_boundary (t for tentative). ++ // prev[numCodePts] is guaranteed to be meaningful. ++ // We'll first push in the reverse order, i.e., ++ // t_boundary[0] = numCodePts, and afterwards do a swap. ++ UVector32 t_boundary(numCodePts+1, status); ++ ++ int32_t numBreaks = 0; ++ // No segmentation found, set boundary to end of range ++ while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { ++ --numCodePts; ++ } ++ if (numCodePts < 0) { ++ t_boundary.addElement(numCodePts, status); ++ numBreaks++; ++ } else { ++ for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) { ++ if (i < 0) i = -i; ++ t_boundary.addElement(i, status); ++ numBreaks++; + } ++ U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0); ++ } + - // Look ahead for possible suffixes if a dictionary word does not follow. - // We do this in code rather than using a rule so that the heuristic - // resynch continues to function. For example, one of the suffix characters @@ -616,30 +629,6 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp - // Did we find a word on this iteration? If so, push it on the break stack - if (cuWordLength > 0) { - foundBreaks.push((current+cuWordLength), status); -+ } -+ // Start pushing the optimal offset index into t_boundary (t for tentative). -+ // prev[numCodePts] is guaranteed to be meaningful. -+ // We'll first push in the reverse order, i.e., -+ // t_boundary[0] = numCodePts, and afterwards do a swap. -+ UVector32 t_boundary(numCodePts+1, status); -+ -+ int32_t numBreaks = 0; -+ // No segmentation found, set boundary to end of range -+ while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { -+ --numCodePts; -+ } -+ if (numCodePts < 0) { -+ t_boundary.addElement(numCodePts, status); -+ numBreaks++; -+ } else { -+ for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) { -+ if (i < 0) i = -i; -+ t_boundary.addElement(i, status); -+ numBreaks++; -+ } -+ U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0); -+ } -+ + // Now that we're done, convert positions in t_boundary[] (indices in + // the normalized input string) back to indices in the original input UText + // while reversing t_boundary and pushing values to foundBreaks. @@ -669,38 +658,35 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp #if !UCONFIG_NO_NORMALIZATION diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h ---- icu.org/source/common/dictbe.h 2017-01-20 01:20:31.000000000 +0100 -+++ icu/source/common/dictbe.h 2017-04-21 23:14:23.845894374 +0200 -@@ -34,6 +34,15 @@ +--- icu.org/source/common/dictbe.h 2018-03-26 15:38:30.000000000 +0200 ++++ icu/source/common/dictbe.h 2018-04-17 14:55:33.307639865 +0200 +@@ -34,7 +34,8 @@ + * threads without synchronization.</p> */ class DictionaryBreakEngine : public LanguageBreakEngine { - private: -+ -+ /** -+ * <p>Default constructor.</p> -+ * -+ */ -+ DictionaryBreakEngine(); -+ +- private: + protected: + /** * The set of characters handled by this engine * @internal -@@ -48,11 +57,63 @@ +@@ -42,14 +43,84 @@ - uint32_t fTypes; + UnicodeSet fSet; + const int32_t WJ = 0x2060; + const int32_t ZWSP = 0x200B; + - /** -- * <p>Default constructor.</p> -- * ++ /** ++ * The break types it was constructed with ++ * @internal ++ */ ++ uint32_t fTypes; ++ ++ /** + * A Unicode set of all viramas + * @internal - */ -- DictionaryBreakEngine(); ++ */ + UnicodeSet fViramaSet; + + /** @@ -751,10 +737,26 @@ diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h + bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const; + void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const; + void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const; - ++ public: -@@ -83,7 +144,7 @@ + /** +- * <p>Constructor </p> ++ * <p>Default constructor.</p> ++ * + */ + DictionaryBreakEngine(); + + /** ++ * <p>Constructor with break types.</p> ++ */ ++ explicit DictionaryBreakEngine(uint32_t breakTypes); ++ ++ /** + * <p>Virtual destructor.</p> + */ + virtual ~DictionaryBreakEngine(); +@@ -68,7 +139,7 @@ * <p>Find any breaks within a run in the supplied text.</p> * * @param text A UText representing the text. The iterator is left at @@ -763,7 +765,7 @@ diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h * that starts from the first character in the range. * @param startPos The start of the run within the supplied text. * @param endPos The end of the run within the supplied text. -@@ -245,118 +306,120 @@ +@@ -218,118 +289,120 @@ }; @@ -997,8 +999,8 @@ diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h /******************************************************************* diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionarydata.cpp ---- icu.org/source/common/dictionarydata.cpp 2017-01-20 01:20:31.000000000 +0100 -+++ icu/source/common/dictionarydata.cpp 2017-04-21 23:14:23.846894372 +0200 +--- icu.org/source/common/dictionarydata.cpp 2018-03-26 15:38:30.000000000 +0200 ++++ icu/source/common/dictionarydata.cpp 2018-04-17 14:04:50.775567214 +0200 @@ -44,7 +44,7 @@ int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, @@ -1046,8 +1048,8 @@ diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionaryda if (values != NULL) { values[wordCount] = bt.getValue(); diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h ---- icu.org/source/common/dictionarydata.h 2017-01-20 01:20:31.000000000 +0100 -+++ icu/source/common/dictionarydata.h 2017-04-21 23:14:23.846894372 +0200 +--- icu.org/source/common/dictionarydata.h 2018-03-26 15:38:30.000000000 +0200 ++++ icu/source/common/dictionarydata.h 2018-04-17 14:04:50.775567214 +0200 @@ -21,6 +21,7 @@ #include "unicode/utext.h" #include "unicode/udata.h" @@ -1084,8 +1086,8 @@ diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata private: UChar32 transform(UChar32 c) const; diff -ur icu.org/source/data/Makefile.in icu/source/data/Makefile.in ---- icu.org/source/data/Makefile.in 2017-04-21 23:13:03.248087545 +0200 -+++ icu/source/data/Makefile.in 2017-04-21 23:14:23.846894372 +0200 +--- icu.org/source/data/Makefile.in 2018-04-17 12:28:37.098707466 +0200 ++++ icu/source/data/Makefile.in 2018-04-17 14:04:50.775567214 +0200 @@ -183,7 +183,7 @@ endif endif |