From f037207675010fdff2c1968a67fae5b0c2c34331 Mon Sep 17 00:00:00 2001 From: László Németh Date: Thu, 2 Nov 2017 09:51:36 +0100 Subject: fix spell checking issues using recent Hunspell patches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test: English words "Ian" and "item" are not allowed as "İan", "İtem" now. Patch list with commit ids in Hunspell repository: commit 66badb7449c2053c89456f11a7f71f3f5916b550 Extend dotless i and dotted I rules to Crimean Tatar language commit 88cf975c295e3ec808efb77bb1a2a031d77f0c89 Allow dotted I in dictionary, and disable bad capitalization commit 39b785a6b03b35cc8a27f43f6005dcaa432694e1 FORBIDDENWORD precedes BREAK commit 0f691abe68788d0a58e72ab66877a9f670cd2741 Remove forbidden words from dash suggestion list commit 15b2cde4f01706f0a648518a5cfc57394d015448 tdf#95024 fix compound handling for new Hungarian orthography commit de3ae6844af62300e473f7b7b66a56e54153b4b9 fix compound word part "pa:" Change-Id: Id12b5629b0c975464072b5b144743cbe40fe45a3 Reviewed-on: https://gerrit.libreoffice.org/44200 Tested-by: Jenkins Reviewed-by: Andras Timar --- ...d-I-in-dictionary-and-disable-bad-capital.patch | 55 ++++++++++++++++++ ...ess-i-and-dotted-I-rules-to-Crimean-Tatar.patch | 66 ++++++++++++++++++++++ .../0001-FORBIDDENWORD-precedes-BREAK.patch | 27 +++++++++ ...forbidden-words-from-dash-suggestion-list.patch | 29 ++++++++++ ...nd-handling-for-new-Hungarian-orthography.patch | 43 ++++++++++++++ .../hunspell/0001-fix-compound-word-part-pa.patch | 26 +++++++++ external/hunspell/UnpackedTarball_hunspell.mk | 6 ++ 7 files changed, 252 insertions(+) create mode 100644 external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch create mode 100644 external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch create mode 100644 external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch create mode 100644 
external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch create mode 100644 external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch create mode 100644 external/hunspell/0001-fix-compound-word-part-pa.patch (limited to 'external/hunspell') diff --git a/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch b/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch new file mode 100644 index 000000000000..b4b04385c935 --- /dev/null +++ b/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch @@ -0,0 +1,55 @@ +From 88cf975c295e3ec808efb77bb1a2a031d77f0c89 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= + +Date: Thu, 5 Oct 2017 12:24:02 +0200 +Subject: [PATCH] Allow dotted I in dictionary, and disable bad capitalization + of i. + +Dictionary words weren't recognized with dotted I, but dictionary +words with the letter i were recognized with dotted I, too. 
+--- + src/hunspell/hunspell.cxx | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx +index 1ef11df..5c98f8a 100644 +--- a/src/hunspell/hunspell.cxx ++++ b/src/hunspell/hunspell.cxx +@@ -562,11 +562,15 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) + } + } + case INITCAP: { +- ++ // handle special capitalization of dotted I ++ bool Idot = (utf8 && (unsigned char) scw[0] == 0xc4 && (unsigned char) scw[1] == 0xb0); + *info += SPELL_ORIGCAP; +- mkallsmall2(scw, sunicw); +- std::string u8buffer(scw); +- mkinitcap2(scw, sunicw); ++ if (captype == ALLCAP) { ++ mkallsmall2(scw, sunicw); ++ mkinitcap2(scw, sunicw); ++ if (Idot) ++ scw.replace(0, 1, "\xc4\xb0"); ++ } + if (captype == INITCAP) + *info += SPELL_INITCAP; + rv = checkword(scw, info, root); +@@ -581,9 +585,13 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) + } + if (rv && is_keepcase(rv) && (captype == ALLCAP)) + rv = NULL; +- if (rv) ++ if (rv || (Idot && langnum != LANG_az && langnum != LANG_tr && langnum != LANG_crh)) + break; + ++ mkallsmall2(scw, sunicw); ++ std::string u8buffer(scw); ++ mkinitcap2(scw, sunicw); ++ + rv = checkword(u8buffer, info, root); + if (abbv && !rv) { + u8buffer.push_back('.'); +-- +1.9.1 + diff --git a/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch b/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch new file mode 100644 index 000000000000..66cc78188521 --- /dev/null +++ b/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch @@ -0,0 +1,66 @@ +From 66badb7449c2053c89456f11a7f71f3f5916b550 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= + +Date: Thu, 5 Oct 2017 11:13:28 +0200 +Subject: [PATCH] Extend dotless i and dotted I rules to Crimean Tatar language +MIME-Version: 1.0 +Content-Type: text/plain; 
charset=UTF-8 +Content-Transfer-Encoding: 8bit + +to support its special casing of ı/I, i/İ. + +(Use + +LANG crh + +in the affix file to use this feature.) +--- + src/hunspell/csutil.cxx | 5 +++-- + src/hunspell/langnum.hxx | 1 + + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/src/hunspell/csutil.cxx b/src/hunspell/csutil.cxx +index df97b57..2980da7 100644 +--- a/src/hunspell/csutil.cxx ++++ b/src/hunspell/csutil.cxx +@@ -2401,6 +2401,7 @@ static struct lang_map lang2enc[] = + {{"ar", LANG_ar}, {"az", LANG_az}, + {"az_AZ", LANG_az}, // for back-compatibility + {"bg", LANG_bg}, {"ca", LANG_ca}, ++ {"crh", LANG_crh}, + {"cs", LANG_cs}, {"da", LANG_da}, + {"de", LANG_de}, {"el", LANG_el}, + {"en", LANG_en}, {"es", LANG_es}, +@@ -2458,7 +2459,7 @@ unsigned short unicodetoupper(unsigned short c, int langnum) { + // In Azeri and Turkish, I and i dictinct letters: + // There are a dotless lower case i pair of upper `I', + // and an upper I with dot pair of lower `i'. +- if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr))) ++ if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh))) + return 0x0130; + #ifdef OPENOFFICEORG + return static_cast(u_toupper(c)); +@@ -2475,7 +2476,7 @@ unsigned short unicodetolower(unsigned short c, int langnum) { + // In Azeri and Turkish, I and i dictinct letters: + // There are a dotless lower case i pair of upper `I', + // and an upper I with dot pair of lower `i'. 
+- if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr))) ++ if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh))) + return 0x0131; + #ifdef OPENOFFICEORG + return static_cast(u_tolower(c)); +diff --git a/src/hunspell/langnum.hxx b/src/hunspell/langnum.hxx +index a64d3d7..f09de40 100644 +--- a/src/hunspell/langnum.hxx ++++ b/src/hunspell/langnum.hxx +@@ -48,6 +48,7 @@ enum { + LANG_az = 100, // custom number + LANG_bg = 41, + LANG_ca = 37, ++ LANG_crh = 102, // custom number + LANG_cs = 42, + LANG_da = 45, + LANG_de = 49, +-- +1.9.1 + diff --git a/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch b/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch new file mode 100644 index 000000000000..6cad45d8a8bf --- /dev/null +++ b/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch @@ -0,0 +1,27 @@ +From 39b785a6b03b35cc8a27f43f6005dcaa432694e1 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= +Date: Mon, 9 Oct 2017 13:02:39 +0200 +Subject: [PATCH] FORBIDDENWORD precedes BREAK + +Now it's possible to forbid compound forms recognized by +BREAK word breaking. 
+--- + src/hunspell/hunspell.cxx | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx +index 5c98f8a..3fd0d16 100644 +--- a/src/hunspell/hunspell.cxx ++++ b/src/hunspell/hunspell.cxx +@@ -633,7 +633,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) + } + + // recursive breaking at break points +- if (!wordbreak.empty()) { ++ if (!wordbreak.empty() && !(*info & SPELL_FORBIDDEN)) { + + int nbr = 0; + wl = scw.size(); +-- +1.9.1 + diff --git a/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch b/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch new file mode 100644 index 000000000000..b0f8563371ed --- /dev/null +++ b/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch @@ -0,0 +1,29 @@ +From 0f691abe68788d0a58e72ab66877a9f670cd2741 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= +Date: Tue, 10 Oct 2017 11:58:43 +0200 +Subject: [PATCH] Remove forbidden words from dash suggestion list + +--- + src/hunspell/hunspell.cxx | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx +index 3fd0d16..76e61b1 100644 +--- a/src/hunspell/hunspell.cxx ++++ b/src/hunspell/hunspell.cxx +@@ -1069,7 +1069,11 @@ std::vector HunspellImpl::suggest(const std::string& word) { + wspace.append("-"); + wspace.append(scw.substr(dash_pos + 1)); + } +- insert_sug(slst, wspace); ++ int info = 0; ++ if (pAMgr && pAMgr->get_forbiddenword()) ++ checkword(wspace, &info, NULL); ++ if (!(info & SPELL_FORBIDDEN)) ++ insert_sug(slst, wspace); + } + nodashsug = 0; + } +-- +1.9.1 + diff --git a/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch b/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch new file mode 100644 index 000000000000..0bf52bdd95d4 --- /dev/null +++ 
b/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch @@ -0,0 +1,43 @@ +From 15b2cde4f01706f0a648518a5cfc57394d015448 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= +Date: Thu, 12 Oct 2017 16:47:57 +0200 +Subject: [PATCH] fix compound handling for new Hungarian orthography + +Extend partial fix in commit 42807f970ac2d65f0d13a7c57eb454b210e92240. +--- + src/hunspell/affixmgr.cxx | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx +index ffce7bb..ea0f0fc 100644 +--- a/src/hunspell/affixmgr.cxx ++++ b/src/hunspell/affixmgr.cxx +@@ -1990,6 +1990,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word, + std::string tmp(sfxappnd); + reverseword(tmp); + numsyllable -= get_syllable(tmp) + sfxextra; ++ } else { ++ numsyllable -= sfxextra; + } + + // + 1 word, if syllable number of the prefix > 1 (hungarian +@@ -2024,7 +2026,6 @@ struct hentry* AffixMgr::compound_check(const std::string& word, + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } +- + // second word is acceptable, as a word with prefix or/and suffix? 
+ // hungarian conventions: compounding is acceptable, + // when compound forms consist 2 word, otherwise +@@ -2553,6 +2554,8 @@ int AffixMgr::compound_check_morph(const char* word, + std::string tmp(sfxappnd); + reverseword(tmp); + numsyllable -= get_syllable(tmp) + sfxextra; ++ } else { ++ numsyllable -= sfxextra; + } + + // + 1 word, if syllable number of the prefix > 1 (hungarian +-- +1.9.1 + diff --git a/external/hunspell/0001-fix-compound-word-part-pa.patch b/external/hunspell/0001-fix-compound-word-part-pa.patch new file mode 100644 index 000000000000..152a9ff58a14 --- /dev/null +++ b/external/hunspell/0001-fix-compound-word-part-pa.patch @@ -0,0 +1,26 @@ +From de3ae6844af62300e473f7b7b66a56e54153b4b9 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= +Date: Mon, 16 Oct 2017 23:00:23 +0200 +Subject: [PATCH] fix compound word part "pa:" + +(regression in morphological analysis) +--- + src/hunspell/affixmgr.cxx | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx +index ea0f0fc..52c7fa5 100644 +--- a/src/hunspell/affixmgr.cxx ++++ b/src/hunspell/affixmgr.cxx +@@ -2608,7 +2608,7 @@ int AffixMgr::compound_check_morph(const char* word, + if (!m.empty()) { + result.push_back(MSEP_FLD); + result.append(MORPH_PART); +- result.append(word + 1); ++ result.append(word + i); + line_uniq_app(m, MSEP_REC); + result.append(m); + } +-- +1.9.1 + diff --git a/external/hunspell/UnpackedTarball_hunspell.mk b/external/hunspell/UnpackedTarball_hunspell.mk index 3bb7e5e42dc7..23d3aca47131 100644 --- a/external/hunspell/UnpackedTarball_hunspell.mk +++ b/external/hunspell/UnpackedTarball_hunspell.mk @@ -21,6 +21,12 @@ $(eval $(call gb_UnpackedTarball_set_patchlevel,hunspell,1)) $(eval $(call gb_UnpackedTarball_add_patches,hunspell, \ external/hunspell/0001-Revert-Remove-autotools-autogenerated-files.patch \ + 
external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch \ + external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch \ + external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch \ + external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch \ + external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch \ + external/hunspell/0001-fix-compound-word-part-pa.patch \ )) # vim: set noet sw=4 ts=4: -- cgit