From 8578a1c9d167c19f1d8038fac5946b4b3cae305e Mon Sep 17 00:00:00 2001 From: Tor Lillqvist Date: Thu, 26 Nov 2020 15:47:26 +0200 Subject: tdf#138481: Trust the built-in break iterator character data in ICU Don't use our own char.txt. Quite possibly we wouldn't need to use the other data in the i18npool/source/breakiterator/data folder either. See the README file there for the sad details. Change-Id: I82923ae76407fdd3fa3642d818d43427fe4f5591 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/106632 Tested-by: Jenkins Reviewed-by: Tor Lillqvist --- i18npool/source/breakiterator/data/char.txt | 119 ---------------------------- 1 file changed, 119 deletions(-) delete mode 100644 i18npool/source/breakiterator/data/char.txt (limited to 'i18npool/source') diff --git a/i18npool/source/breakiterator/data/char.txt b/i18npool/source/breakiterator/data/char.txt deleted file mode 100644 index 3b9f801a7ace..000000000000 --- a/i18npool/source/breakiterator/data/char.txt +++ /dev/null @@ -1,119 +0,0 @@ -# -# Copyright (C) 2002-2009, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: char.txt -# -# ICU Character Break Rules, also known as Grapheme Cluster Boundaries -# See Unicode Standard Annex #29. -# These rules are based on TR29 Revision 13, for Unicode Version 5.1 -# Modifications to SpacingMark and Prepend by M. Hosken. -# - -# -# Character Class Definitions. -# -$CR = [\p{Grapheme_Cluster_Break = CR}]; -$LF = [\p{Grapheme_Cluster_Break = LF}]; -$Control = [\p{Grapheme_Cluster_Break = Control}]; -$Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; -$Extend = [\p{Grapheme_Cluster_Break = Extend}]; -$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; -# True Indic wants to move by syllables. Break up SpacingMark. This based on Unicode 6.0 data -# In effect it is [\p{Grapheme_Cluster_Break = SpacingMark} - \u0E30 \u0E32 \u0E45 \u0EB0 \u0EB2 \u102B \u102C \u1038 -# \u1062-\u1064 \u1067-\u106D \u1083 \u1087-\u108C \u108F \u109A-\u109C \u19B0-\u19B4 \u19B8-\u19C0 \u19C8 \u19C9 -# \u1A61 \u1A63 \u1A64 \u1BE7 \u1BEA-\u1BEC \u1BEE \u1BF2 \u1BF3 \uAA7B -$IndicSpacing = [\u0903 \u093B \u093E-\u0940 \u0949-\u094C \u094E \u094F \u0982 \u0983 \u09BF \u09C0 \u09C7 \u09C8 \u09CB \u09CC \u0A03 \u0A3E-\u0A40 \u0A83 \u0ABE-\u0AC0 \u0AC9 \u0ACB \u0ACC \u0B02 \u0B03 \u0B40 \u0B47 \u0B48 \u0B4B-\u0B4C \u0BBF \u0BC1 \u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0C01-\u0C03 \u0C41-\u0C44 \u0C82 \u0C83 \u0CBE \u0CC0 \u0CC1 \u0CC3 \u0CC4 \u0CC7 \u0CC8 \u0CCA \u0CCB \u0D02 \u0D03 \u0D3F \u0D40 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D82 \u0D83 \u0DD0 \u0DD1 \u0DD8-\u0DDE \u0F3E \u0F3F \u0F7F \u1923-\u1926 \u1929-\u192B \u1930 \u1931 \u1933-\u1938 \u1A19-\u1A1B \u1B04 \u1B35 \u1B3B \u1B3D-\u1B41 \u1B43 \u1B44 \u1B82 \u1BA1 \u1BA6 \u1BA7 \u1BAA \u1C24-\u1C2B \u1C34 \u1C35 \u1CE1 \u1CF2 \uA880 \uA881 \uA8B4-\uA8C3 \uA952 \uA953 \uA983 \uA9B4 \uA9B5 \uA9BA \uA9BB \uA9BD-\uA9C0 \uAA2F \uAA30 \uAA33 \uAA34 \uABE3 \uABE4 \uABE6 \uABE7 \uABE9 \uABEA \uABEC \U00011000 \U00011002 \U00011082 \U000110B0-\U000110B2 \U000110B7 \U000100B8 \U0001D166 \U0001D16D]; -# SEAsian (Thai, Lao, Burmese, Tai Lue, Tai Tham, Batak) are cluster based not syllable based -$SEASpacing = [\u0E33 \u0EB3 \u1031 \u103B \u103C \u1056 \u1057 \u1084 \u17B6 \u17BE-\u17C5 \u17C7 \u17C8 \u1A55 \u1A57 \u1A6D-\u1A72 \uA823 \uA824 \uA827 \uAA4D]; -$BengaliLetter = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1]; -$BengaliSignVirama = \u09CD; -$GujaratiLetter = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1]; -$GujaratiSignVirama = \u0ACD; -$DevanagariLetter = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F]; -$DevanagariSignVirama = \u094D; -$KannadaLetter = [\u0C85-\u0CB9 \u0CDE-\u0CE1]; -$KannadaSignVirama = \u0CCD; -$MalayalamLetter = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F]; -$MalayalamSignVirama = \u0D4D; -$OdiaLetter = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71]; -$OdiaSignVirama = \u0B4D; -$GurmukhiLetter = [\u0A05-\u0A39 \u0A59-\u0A5E]; -$GurmukhiSignVirama = \u0A4D; -$TamilKa = \u0B95; -$TamilSignVirama = \u0BCD; -$TamilSsa = \u0BB7; -$TeluguLetter = [\u0C05-\u0C39 \u0C58-\u0C61]; -$TeluguSignVirama = \u0C4D; - -# -# Korean Syllable Definitions -# -$L = [\p{Grapheme_Cluster_Break = L}]; -$V = [\p{Grapheme_Cluster_Break = V}]; -$T = [\p{Grapheme_Cluster_Break = T}]; - -$LV = [\p{Grapheme_Cluster_Break = LV}]; -$LVT = [\p{Grapheme_Cluster_Break = LVT}]; - - -## ------------------------------------------------- -!!chain; - -!!forward; - -$CR $LF; - -$BengaliLetter ($BengaliSignVirama $BengaliLetter?)+; -$GujaratiLetter ($GujaratiSignVirama $GujaratiLetter?)+; -$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+; -$KannadaLetter ($KannadaSignVirama $KannadaLetter?)+; -$MalayalamLetter ($MalayalamSignVirama $MalayalamLetter?)+; -$OdiaLetter ($OdiaSignVirama $OdiaLetter?)+; -$GurmukhiLetter ($GurmukhiSignVirama $GurmukhiLetter?)+; -$TamilKa $TamilSignVirama $TamilSsa; -$TeluguLetter ($TeluguSignVirama $TeluguLetter?)+; - -$L ($L | $V | $LV | $LVT); -($LV | $V) ($V | $T); -($LVT | $T) $T; - -[^$Control $CR $LF] $Extend; - -[^$Control $CR $LF] ($IndicSpacing | $SEASpacing); -#[^$Control $CR $LF] $SpacingMark; -# $Prepend [^$Control $CR $LF]; - - -## ------------------------------------------------- - -!!reverse; -$LF $CR; -($BengaliLetter? $BengaliSignVirama)+ $BengaliLetter; -($GujaratiLetter? $GujaratiSignVirama)+ $GujaratiLetter; -($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter; -($KannadaLetter? $KannadaSignVirama)+ $KannadaLetter; -($MalayalamLetter? $MalayalamSignVirama)+ $MalayalamLetter; -($OdiaLetter? $OdiaSignVirama)+ $OdiaLetter; -($GurmukhiLetter? $GurmukhiSignVirama)+ $GurmukhiLetter; -$TamilSsa $TamilSignVirama $TamilKa; -($TeluguLetter? $TeluguSignVirama)+ $TeluguLetter; -($L | $V | $LV | $LVT) $L; -($V | $T) ($LV | $V); -$T ($LVT | $T); - -$Extend [^$Control $CR $LF]; -($IndicSpacing | $SEASpacing) [^$Control $CR $LF]; -#$SpacingMark [^$Control $CR $LF]; -# [^$Control $CR $LF] $Prepend; - - -## ------------------------------------------------- - -!!safe_reverse; - - -## ------------------------------------------------- - -!!safe_forward; - -- cgit