summaryrefslogtreecommitdiff
path: root/external/hunspell
diff options
context:
space:
mode:
author László Németh <nemeth@numbertext.org> 2017-11-02 09:51:36 +0100
committer Andras Timar <andras.timar@collabora.com> 2017-11-06 17:50:04 +0100
commit f037207675010fdff2c1968a67fae5b0c2c34331 (patch)
tree da0e0d233368adc67063e024a72b2391db8a3d03 /external/hunspell
parent a7cd63df37144eba8544f6b10b83737fa0496461 (diff)
fix spell checking issues using recent Hunspell patches
Test: English words "Ian" and "item" are not allowed as "İan", "İtem" now. Patch list with commit ids in Hunspell repository: commit 66badb7449c2053c89456f11a7f71f3f5916b550 Extend dotless i and dotted I rules to Crimean Tatar language commit 88cf975c295e3ec808efb77bb1a2a031d77f0c89 Allow dotted I in dictionary, and disable bad capitalization commit 39b785a6b03b35cc8a27f43f6005dcaa432694e1 FORBIDDENWORD precedes BREAK commit 0f691abe68788d0a58e72ab66877a9f670cd2741 Remove forbidden words from dash suggestion list commit 15b2cde4f01706f0a648518a5cfc57394d015448 tdf#95024 fix compound handling for new Hungarian orthography commit de3ae6844af62300e473f7b7b66a56e54153b4b9 fix compound word part "pa:" Change-Id: Id12b5629b0c975464072b5b144743cbe40fe45a3 Reviewed-on: https://gerrit.libreoffice.org/44200 Tested-by: Jenkins <ci@libreoffice.org> Reviewed-by: Andras Timar <andras.timar@collabora.com>
Diffstat (limited to 'external/hunspell')
-rw-r--r-- external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch 55
-rw-r--r-- external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch 66
-rw-r--r-- external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch 27
-rw-r--r-- external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch 29
-rw-r--r-- external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch 43
-rw-r--r-- external/hunspell/0001-fix-compound-word-part-pa.patch 26
-rw-r--r-- external/hunspell/UnpackedTarball_hunspell.mk 6
7 files changed, 252 insertions, 0 deletions
diff --git a/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch b/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch
new file mode 100644
index 000000000000..b4b04385c935
--- /dev/null
+++ b/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch
@@ -0,0 +1,55 @@
+From 88cf975c295e3ec808efb77bb1a2a031d77f0c89 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?=
+ <laszlo.nemeth@collabora.com>
+Date: Thu, 5 Oct 2017 12:24:02 +0200
+Subject: [PATCH] Allow dotted I in dictionary, and disable bad capitalization
+ of i.
+
+Dictionary words weren't recognized with dotted I, but dictionary
+words with the letter i were recognized with dotted I, too.
+---
+ src/hunspell/hunspell.cxx | 18 +++++++++++++-----
+ 1 file changed, 13 insertions(+), 5 deletions(-)
+
+diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
+index 1ef11df..5c98f8a 100644
+--- a/src/hunspell/hunspell.cxx
++++ b/src/hunspell/hunspell.cxx
+@@ -562,11 +562,15 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
+ }
+ }
+ case INITCAP: {
+-
++ // handle special capitalization of dotted I
++ bool Idot = (utf8 && (unsigned char) scw[0] == 0xc4 && (unsigned char) scw[1] == 0xb0);
+ *info += SPELL_ORIGCAP;
+- mkallsmall2(scw, sunicw);
+- std::string u8buffer(scw);
+- mkinitcap2(scw, sunicw);
++ if (captype == ALLCAP) {
++ mkallsmall2(scw, sunicw);
++ mkinitcap2(scw, sunicw);
++ if (Idot)
++ scw.replace(0, 1, "\xc4\xb0");
++ }
+ if (captype == INITCAP)
+ *info += SPELL_INITCAP;
+ rv = checkword(scw, info, root);
+@@ -581,9 +585,13 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
+ }
+ if (rv && is_keepcase(rv) && (captype == ALLCAP))
+ rv = NULL;
+- if (rv)
++ if (rv || (Idot && langnum != LANG_az && langnum != LANG_tr && langnum != LANG_crh))
+ break;
+
++ mkallsmall2(scw, sunicw);
++ std::string u8buffer(scw);
++ mkinitcap2(scw, sunicw);
++
+ rv = checkword(u8buffer, info, root);
+ if (abbv && !rv) {
+ u8buffer.push_back('.');
+--
+1.9.1
+
diff --git a/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch b/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch
new file mode 100644
index 000000000000..66cc78188521
--- /dev/null
+++ b/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch
@@ -0,0 +1,66 @@
+From 66badb7449c2053c89456f11a7f71f3f5916b550 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?=
+ <laszlo.nemeth@collabora.com>
+Date: Thu, 5 Oct 2017 11:13:28 +0200
+Subject: [PATCH] Extend dotless i and dotted I rules to Crimean Tatar language
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+to support its special casing of ı/I, i/İ.
+
+(Use
+
+LANG crh
+
+in the affix file to use this feature.)
+---
+ src/hunspell/csutil.cxx | 5 +++--
+ src/hunspell/langnum.hxx | 1 +
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/src/hunspell/csutil.cxx b/src/hunspell/csutil.cxx
+index df97b57..2980da7 100644
+--- a/src/hunspell/csutil.cxx
++++ b/src/hunspell/csutil.cxx
+@@ -2401,6 +2401,7 @@ static struct lang_map lang2enc[] =
+ {{"ar", LANG_ar}, {"az", LANG_az},
+ {"az_AZ", LANG_az}, // for back-compatibility
+ {"bg", LANG_bg}, {"ca", LANG_ca},
++ {"crh", LANG_crh},
+ {"cs", LANG_cs}, {"da", LANG_da},
+ {"de", LANG_de}, {"el", LANG_el},
+ {"en", LANG_en}, {"es", LANG_es},
+@@ -2458,7 +2459,7 @@ unsigned short unicodetoupper(unsigned short c, int langnum) {
+ // In Azeri and Turkish, I and i dictinct letters:
+ // There are a dotless lower case i pair of upper `I',
+ // and an upper I with dot pair of lower `i'.
+- if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr)))
++ if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh)))
+ return 0x0130;
+ #ifdef OPENOFFICEORG
+ return static_cast<unsigned short>(u_toupper(c));
+@@ -2475,7 +2476,7 @@ unsigned short unicodetolower(unsigned short c, int langnum) {
+ // In Azeri and Turkish, I and i dictinct letters:
+ // There are a dotless lower case i pair of upper `I',
+ // and an upper I with dot pair of lower `i'.
+- if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr)))
++ if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh)))
+ return 0x0131;
+ #ifdef OPENOFFICEORG
+ return static_cast<unsigned short>(u_tolower(c));
+diff --git a/src/hunspell/langnum.hxx b/src/hunspell/langnum.hxx
+index a64d3d7..f09de40 100644
+--- a/src/hunspell/langnum.hxx
++++ b/src/hunspell/langnum.hxx
+@@ -48,6 +48,7 @@ enum {
+ LANG_az = 100, // custom number
+ LANG_bg = 41,
+ LANG_ca = 37,
++ LANG_crh = 102, // custom number
+ LANG_cs = 42,
+ LANG_da = 45,
+ LANG_de = 49,
+--
+1.9.1
+
diff --git a/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch b/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch
new file mode 100644
index 000000000000..6cad45d8a8bf
--- /dev/null
+++ b/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch
@@ -0,0 +1,27 @@
+From 39b785a6b03b35cc8a27f43f6005dcaa432694e1 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
+Date: Mon, 9 Oct 2017 13:02:39 +0200
+Subject: [PATCH] FORBIDDENWORD precedes BREAK
+
+Now it's possible to forbid compound forms recognized by
+BREAK word breaking.
+---
+ src/hunspell/hunspell.cxx | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
+index 5c98f8a..3fd0d16 100644
+--- a/src/hunspell/hunspell.cxx
++++ b/src/hunspell/hunspell.cxx
+@@ -633,7 +633,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
+ }
+
+ // recursive breaking at break points
+- if (!wordbreak.empty()) {
++ if (!wordbreak.empty() && !(*info & SPELL_FORBIDDEN)) {
+
+ int nbr = 0;
+ wl = scw.size();
+--
+1.9.1
+
diff --git a/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch b/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch
new file mode 100644
index 000000000000..b0f8563371ed
--- /dev/null
+++ b/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch
@@ -0,0 +1,29 @@
+From 0f691abe68788d0a58e72ab66877a9f670cd2741 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
+Date: Tue, 10 Oct 2017 11:58:43 +0200
+Subject: [PATCH] Remove forbidden words from dash suggestion list
+
+---
+ src/hunspell/hunspell.cxx | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
+index 3fd0d16..76e61b1 100644
+--- a/src/hunspell/hunspell.cxx
++++ b/src/hunspell/hunspell.cxx
+@@ -1069,7 +1069,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
+ wspace.append("-");
+ wspace.append(scw.substr(dash_pos + 1));
+ }
+- insert_sug(slst, wspace);
++ int info = 0;
++ if (pAMgr && pAMgr->get_forbiddenword())
++ checkword(wspace, &info, NULL);
++ if (!(info & SPELL_FORBIDDEN))
++ insert_sug(slst, wspace);
+ }
+ nodashsug = 0;
+ }
+--
+1.9.1
+
diff --git a/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch b/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch
new file mode 100644
index 000000000000..0bf52bdd95d4
--- /dev/null
+++ b/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch
@@ -0,0 +1,43 @@
+From 15b2cde4f01706f0a648518a5cfc57394d015448 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
+Date: Thu, 12 Oct 2017 16:47:57 +0200
+Subject: [PATCH] fix compound handling for new Hungarian orthography
+
+Extend partial fix in commit 42807f970ac2d65f0d13a7c57eb454b210e92240.
+---
+ src/hunspell/affixmgr.cxx | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
+index ffce7bb..ea0f0fc 100644
+--- a/src/hunspell/affixmgr.cxx
++++ b/src/hunspell/affixmgr.cxx
+@@ -1990,6 +1990,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
+ std::string tmp(sfxappnd);
+ reverseword(tmp);
+ numsyllable -= get_syllable(tmp) + sfxextra;
++ } else {
++ numsyllable -= sfxextra;
+ }
+
+ // + 1 word, if syllable number of the prefix > 1 (hungarian
+@@ -2024,7 +2026,6 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
+ (TESTAFF(rv->astr, compoundroot, rv->alen))) {
+ wordnum++;
+ }
+-
+ // second word is acceptable, as a word with prefix or/and suffix?
+ // hungarian conventions: compounding is acceptable,
+ // when compound forms consist 2 word, otherwise
+@@ -2553,6 +2554,8 @@ int AffixMgr::compound_check_morph(const char* word,
+ std::string tmp(sfxappnd);
+ reverseword(tmp);
+ numsyllable -= get_syllable(tmp) + sfxextra;
++ } else {
++ numsyllable -= sfxextra;
+ }
+
+ // + 1 word, if syllable number of the prefix > 1 (hungarian
+--
+1.9.1
+
diff --git a/external/hunspell/0001-fix-compound-word-part-pa.patch b/external/hunspell/0001-fix-compound-word-part-pa.patch
new file mode 100644
index 000000000000..152a9ff58a14
--- /dev/null
+++ b/external/hunspell/0001-fix-compound-word-part-pa.patch
@@ -0,0 +1,26 @@
+From de3ae6844af62300e473f7b7b66a56e54153b4b9 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
+Date: Mon, 16 Oct 2017 23:00:23 +0200
+Subject: [PATCH] fix compound word part "pa:"
+
+(regression in morphological analysis)
+---
+ src/hunspell/affixmgr.cxx | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
+index ea0f0fc..52c7fa5 100644
+--- a/src/hunspell/affixmgr.cxx
++++ b/src/hunspell/affixmgr.cxx
+@@ -2608,7 +2608,7 @@ int AffixMgr::compound_check_morph(const char* word,
+ if (!m.empty()) {
+ result.push_back(MSEP_FLD);
+ result.append(MORPH_PART);
+- result.append(word + 1);
++ result.append(word + i);
+ line_uniq_app(m, MSEP_REC);
+ result.append(m);
+ }
+--
+1.9.1
+
diff --git a/external/hunspell/UnpackedTarball_hunspell.mk b/external/hunspell/UnpackedTarball_hunspell.mk
index 3bb7e5e42dc7..23d3aca47131 100644
--- a/external/hunspell/UnpackedTarball_hunspell.mk
+++ b/external/hunspell/UnpackedTarball_hunspell.mk
@@ -21,6 +21,12 @@ $(eval $(call gb_UnpackedTarball_set_patchlevel,hunspell,1))
$(eval $(call gb_UnpackedTarball_add_patches,hunspell, \
external/hunspell/0001-Revert-Remove-autotools-autogenerated-files.patch \
+ external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch \
+ external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch \
+ external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch \
+ external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch \
+ external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch \
+ external/hunspell/0001-fix-compound-word-part-pa.patch \
))
# vim: set noet sw=4 ts=4: