diff options
Diffstat (limited to 'i18npool/source/breakiterator/data/dict_word_hu.txt')
-rw-r--r-- | i18npool/source/breakiterator/data/dict_word_hu.txt | 324 |
1 files changed, 185 insertions, 139 deletions
diff --git a/i18npool/source/breakiterator/data/dict_word_hu.txt b/i18npool/source/breakiterator/data/dict_word_hu.txt index b0a0276b36a8..88648e6e5716 100644 --- a/i18npool/source/breakiterator/data/dict_word_hu.txt +++ b/i18npool/source/breakiterator/data/dict_word_hu.txt @@ -1,176 +1,222 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: dict_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. - - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - - -# Fix spelling of a)-ban, b)-ben, when the letter is a reference -# resulting bad word breaking "ban" and "ben" -# (reference fields are not expanded in spell checking, yet, only -# for grammar checking). 
- -$PrefixLetter = [[:name = RIGHT PARENTHESIS:]]; - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:] - [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] - [:name = DIGIT ZERO:] - [:name = DIGIT ONE:] - [:name = DIGIT TWO:] - [:name = DIGIT THREE:] - [:name = DIGIT FOUR:] - [:name = DIGIT FIVE:] - [:name = DIGIT SIX:] - [:name = DIGIT SEVEN:] - [:name = DIGIT EIGHT:] - [:name = DIGIT NINE:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:] - [:name = EURO SIGN:] [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = EN DASH:] [:name = EM DASH:] - [:name = RIGHT DOUBLE QUOTATION MARK:] - [:name = LEFT PARENTHESIS:] - [:name = RIGHT PARENTHESIS:] - [:name = RIGHT SQUARE BRACKET:] - [:name = EXCLAMATION MARK:] - [:name = QUESTION MARK:] - [:name = FULL STOP:] [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]]; - -$SufixLetter = [:name= FULL STOP:]; - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION 
+### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - +$Han = [:Han:]; + +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; + +### BEGIN CUSTOMIZATION +### Unknown issue number: Dictionary words can contain hyphens +### tdf#49885: Sync custom BreakIterator rules with ICU originals +### - ICU is now more permissive about punctuation inside words. +### - For compatibility, exclude certain characters that were previously excluded. 
+### tdf#116072: Extend MidLetter in Hungarian word breaking +### i#56347: BreakIterator patch for Hungarian +### i#56348: Special chars in first pos not handled by spell checking for Hungarian + +$Symbols_hu = [[:name = PERCENT SIGN:] + [:name = PER MILLE SIGN:] + [:name = PER TEN THOUSAND SIGN:] + [:name = SECTION SIGN:] + [:name = DEGREE SIGN:] + [:name = EURO SIGN:] + [:name = HYPHEN-MINUS:] + [:name = EN DASH:] + [:name = EM DASH:]]; + +#$ALetter = [\p{Word_Break = ALetter}]; +$ALetter = [\p{Word_Break = ALetter} $Symbols_hu]; + +$IncludedML = [:name = HYPHEN-MINUS:]; +$ExcludedML = [[:name = COLON:] + [:name = GREEK ANO TELEIA:] + [:name = PRESENTATION FORM FOR VERTICAL COLON:] + [:name = SMALL COLON:] + [:name = FULLWIDTH COLON:]]; + +$IncludedML_hu = [[:name = RIGHT DOUBLE QUOTATION MARK:] + [:name = LEFT PARENTHESIS:] + [:name = RIGHT PARENTHESIS:] + [:name = RIGHT SQUARE BRACKET:] + [:name = EXCLAMATION MARK:] + [:name = QUESTION MARK:] + $Symbols_hu]; + +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML $IncludedML_hu]; + +### END CUSTOMIZATION + +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; + + +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. 
+ +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; + +# TODO: check if handling of katakana in dictionary makes rules incorrect/void + +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + + +## ------------------------------------------------- + +# Rule 3 - CR x LF +# +$CR $LF; -#################################################################################### +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Word Break Rules. Definitions and Rules specific to word break begin Here. +$ZWJ $Extended_Pict; + +# Rule 3d - Keep horizontal whitespace together. # -#################################################################################### +$WSegSpace $WSegSpace; -$Format = [[:Cf:] - $TheZWSP]; +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. +$ExFm = [$Extend $Format $ZWJ]; +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words, + # with no special rule status value. 
+$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +# rule 5 +# Do not break between most letters. # -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $PrefixLetter? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; -[[:P:][:S:]]*; +# rule 8 -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$Numeric $ExFm* $Numeric; -# -# Ideographic Characters. 
Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +# rule 9 -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanji, changed +# from 300 to 400. +# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. 
+# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | \n \r) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; |