summaryrefslogtreecommitdiff
path: root/i18npool/source/breakiterator/data/dict_word_hu.txt
diff options
context:
space:
mode:
Diffstat (limited to 'i18npool/source/breakiterator/data/dict_word_hu.txt')
-rw-r--r-- i18npool/source/breakiterator/data/dict_word_hu.txt 324
1 files changed, 185 insertions, 139 deletions
diff --git a/i18npool/source/breakiterator/data/dict_word_hu.txt b/i18npool/source/breakiterator/data/dict_word_hu.txt
index b0a0276b36a8..88648e6e5716 100644
--- a/i18npool/source/breakiterator/data/dict_word_hu.txt
+++ b/i18npool/source/breakiterator/data/dict_word_hu.txt
@@ -1,176 +1,222 @@
#
-# Copyright (C) 2002-2003, International Business Machines Corporation and others.
-# All Rights Reserved.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
#
-# file: dict_word.txt
+# file: word.txt
#
-# ICU Word Break Rules
+# ICU Word Break Rules
# See Unicode Standard Annex #29.
-# These rules are based on Version 4.0.0, dated 2003-04-17
+# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
#
+# Note: Updates to word.txt will usually need to be merged into
+# word_POSIX.txt also.
-
-
-####################################################################################
+##############################################################################
#
# Character class definitions from TR 29
#
-####################################################################################
-$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
- [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
- [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
- [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-
-# Fix spelling of a)-ban, b)-ben, when the letter is a reference
-# resulting bad word breaking "ban" and "ben"
-# (reference fields are not expanded in spell checking, yet, only
-# for grammar checking).
-
-$PrefixLetter = [[:name = RIGHT PARENTHESIS:]];
-
-$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
- [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
- [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:]
- [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:]
- [:name = DIGIT ZERO:]
- [:name = DIGIT ONE:]
- [:name = DIGIT TWO:]
- [:name = DIGIT THREE:]
- [:name = DIGIT FOUR:]
- [:name = DIGIT FIVE:]
- [:name = DIGIT SIX:]
- [:name = DIGIT SEVEN:]
- [:name = DIGIT EIGHT:]
- [:name = DIGIT NINE:]
- - $Ideographic
- - $Katakana
- - $Hangul
- - [:Script = Thai:]
- - [:Script = Lao:]
- - [:Script = Hiragana:]];
-
-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:]
- [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
- [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]
- [:name = EURO SIGN:] [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
- [:name = EN DASH:] [:name = EM DASH:]
- [:name = RIGHT DOUBLE QUOTATION MARK:]
- [:name = LEFT PARENTHESIS:]
- [:name = RIGHT PARENTHESIS:]
- [:name = RIGHT SQUARE BRACKET:]
- [:name = EXCLAMATION MARK:]
- [:name = QUESTION MARK:]
- [:name = FULL STOP:] [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]];
-
-$SufixLetter = [:name= FULL STOP:];
-
-$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
- [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
- [:name = PRIME:]];
-$Numeric = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
+##############################################################################
+
+### BEGIN CUSTOMIZATION
+### This file contains LibreOffice-specific rule customizations.
+###
+### To aid future maintainability:
+### - The change location should be bracketed by comments of this form.
+### - The original rule should be commented out, and the modified rule placed alongside.
+### - By doing this, maintainers can more easily compare to an upstream baseline.
+###
+### END CUSTOMIZATION
+
+!!chain;
+!!quoted_literals_only;
+
#
# Character Class Definitions.
-# The names are those from TR29.
#
-$CR = \u000d;
-$LF = \u000a;
-$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend = [[:Grapheme_Extend = TRUE:]];
-
-
+$Han = [:Han:];
+
+$CR = [\p{Word_Break = CR}];
+$LF = [\p{Word_Break = LF}];
+$Newline = [\p{Word_Break = Newline}];
+$Extend = [\p{Word_Break = Extend}-$Han];
+$ZWJ = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format = [\p{Word_Break = Format}];
+$Katakana = [\p{Word_Break = Katakana}];
+$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
+$Single_Quote = [\p{Word_Break = Single_Quote}];
+$Double_Quote = [\p{Word_Break = Double_Quote}];
+$MidNumLet = [\p{Word_Break = MidNumLet}];
+$MidNum = [\p{Word_Break = MidNum}];
+$Numeric = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$WSegSpace = [\p{Word_Break = WSegSpace}];
+$Extended_Pict = [\p{Extended_Pictographic}];
+
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Dictionary words can contain hyphens
+### tdf#49885: Sync custom BreakIterator rules with ICU originals
+### - ICU is now more permissive about punctuation inside words.
+### - For compatibility, exclude certain characters that were previously excluded.
+### tdf#116072: Extend MidLetter in Hungarian word breaking
+### i#56347: BreakIterator patch for Hungarian
+### i#56348: Special chars in first pos not handled by spell checking for Hungarian
+
+$Symbols_hu = [[:name = PERCENT SIGN:]
+ [:name = PER MILLE SIGN:]
+ [:name = PER TEN THOUSAND SIGN:]
+ [:name = SECTION SIGN:]
+ [:name = DEGREE SIGN:]
+ [:name = EURO SIGN:]
+ [:name = HYPHEN-MINUS:]
+ [:name = EN DASH:]
+ [:name = EM DASH:]];
+
+#$ALetter = [\p{Word_Break = ALetter}];
+$ALetter = [\p{Word_Break = ALetter} $Symbols_hu];
+
+$IncludedML = [:name = HYPHEN-MINUS:];
+$ExcludedML = [[:name = COLON:]
+ [:name = GREEK ANO TELEIA:]
+ [:name = PRESENTATION FORM FOR VERTICAL COLON:]
+ [:name = SMALL COLON:]
+ [:name = FULLWIDTH COLON:]];
+
+$IncludedML_hu = [[:name = RIGHT DOUBLE QUOTATION MARK:]
+ [:name = LEFT PARENTHESIS:]
+ [:name = RIGHT PARENTHESIS:]
+ [:name = RIGHT SQUARE BRACKET:]
+ [:name = EXCLAMATION MARK:]
+ [:name = QUESTION MARK:]
+ $Symbols_hu];
+
+# $MidLetter = [\p{Word_Break = MidLetter}];
+$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML $IncludedML_hu];
+
+### END CUSTOMIZATION
+
+$Hiragana = [:Hiragana:];
+$Ideographic = [\p{Ideographic}];
+
+
+# Dictionary character set, for triggering language-based break engines. Currently
+# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+# 5.0 or later as the definition of Complex_Context was corrected to include all
+# characters requiring dictionary break.
+
+$Control = [\p{Grapheme_Cluster_Break = Control}];
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji = [$Han $Hiragana $Katakana];
+$dictionaryCJK = [$KanaKanji $HangulSyllable];
+$dictionary = [$ComplexContext $dictionaryCJK];
+
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+
+
+## -------------------------------------------------
+
+# Rule 3 - CR x LF
+#
+$CR $LF;
-####################################################################################
+# Rule 3c Do not break within emoji zwj sequences.
+# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed.
#
-# Word Break Rules. Definitions and Rules specific to word break begin Here.
+$ZWJ $Extended_Pict;
+
+# Rule 3d - Keep horizontal whitespace together.
#
-####################################################################################
+$WSegSpace $WSegSpace;
-$Format = [[:Cf:] - $TheZWSP];
+# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
+# of a region of Text.
+$ExFm = [$Extend $Format $ZWJ];
+^$ExFm+; # This rule fires only when there are format or extend characters at the
+ # start of text, or immediately following another boundary. It groups them, in
+ # the event there is more than one.
-# Rule 3: Treat a grapheme cluster as if it were a single character.
-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
-# because we don't need to find the boundaries between adjacent syllables -
-# they won't be word boundaries.
-#
+[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words,
+ # with no special rule status value.
+$Numeric $ExFm* {100}; # This group of rules also attaches trailing format/extends, but
+$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character.
+$HangulSyllable {200};
+$Hebrew_Letter $ExFm* {200};
+$Katakana $ExFm* {400}; # note: these status values override those from rule 5
+$Hiragana $ExFm* {400}; # by virtue of being numerically larger.
+$Ideographic $ExFm* {400}; #
#
-# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
+# rule 5
+# Do not break between most letters.
#
-$ALetterEx = $ALetter $Extend*;
-$NumericEx = $Numeric $Extend*;
-$MidNumEx = $MidNum $Extend*;
-$MidLetterEx = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx = $Katakana $Extend*;
-$IdeographicEx= $Ideographic $Extend*;
-$HangulEx = $Hangul $Extend*;
-$FormatEx = $Format $Extend*;
+($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+# rule 6 and 7
+($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
-#
-# Numbers. Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
+# rule 7a
+$Hebrew_Letter $ExFm* $Single_Quote {200};
-#
-# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
-# - must include at least one letter.
-# - may include both letters and numbers.
-# - may include MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $PrefixLetter? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
+# rule 7b and 7c
+$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
-[[:P:][:S:]]*;
+# rule 8
-#
-# Do not break between Katakana. Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
+$Numeric $ExFm* $Numeric;
-#
-# Ideographic Characters. Stand by themselves as words.
-# Separated from the "Everything Else" rule, below, only so that they
-# can be tagged with a return value. TODO: is this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
+# rule 9
-#
-# Everything Else, with no tag.
-# Non-Control chars combine with $Extend (combining) chars.
-# Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
+($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric;
-#
-# Reverse Rules. Back up over any of the chars that can group together.
-# (Reverse rules do not need to be exact; they can back up too far,
-# but must back up at least enough, and must stop on a boundary.)
-#
+# rule 10
+
+$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 11 and 12
+
+$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
+
+# rule 13
+# to be consistent with $KanaKanji $KanaKanji, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$Katakana $ExFm* $Katakana {400};
+
+# rule 13a/b
+
+$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a)
+$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a)
+$Numeric $ExFm* $ExtendNumLet {100}; # (13a)
+$Katakana $ExFm* $ExtendNumLet {400}; # (13a)
+$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a)
+
+$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b)
+$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b)
+$ExtendNumLet $ExFm* $Numeric {100}; # (13b)
+$ExtendNumLet $ExFm* $Katakana {400}; # (13b)
-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
-# a word. (They may also be the first.) The reverse rule skips over these, until it
-# reaches something that can only be the start (and probably only) char in a "word".
-# A space or punctuation meets the test.
+# rules 15 - 17
+# Pairs of Regional Indicators stay together.
+# With incoming rule chaining disabled by ^, this rule will match exactly two of them.
+# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
#
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
+^$Regional_Indicator $ExFm* $Regional_Indicator;
-#!.*;
-! ($NonStarters* | \n \r) .;
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
+# Rule 999
+# Match a single code point if no other rule applies.
+.;