in addition to that: configure.ac portion was fixed to not have unbalanced [] From d9f392dc35f75b1246862b2db8090e8d5b6ec068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= Date: Sun, 17 Jun 2018 17:21:01 +0200 Subject: [PATCH] recent Hunspell fixes for suggestion, spelling and analysis 6f976bf fix compiling on WIN32, use time.h and thread_local 24f0963 [morph] better time limitation for morphological analysis 8e6ceaa [spelling] tdf#118162 better time limitation for compounding 3f00ff3 [suggestion] tdf#118162 time limit for a HunspellImpl::suggest() call a1f9dfa [suggestion] tdf#118162 time limit for a SuggestMgr::suggest() call d70bf2d [spelling] optimize IGNORE to speed up dictionary loading 16b4900 [spelling] add time limit for compound word handling b0ded55 [suggestion] lower limit for doubletwochars b3a44fa [suggestion] limit longswapchar, lower limit for movechar a295af9 [morph] clean up for separators of morphological analysis ca5f629 [morph] add missing field separator for members with prefixes --- Makefile.in | 1 + configure.ac | 8 ++ src/hunspell/affentry.cxx | 12 +-- src/hunspell/affixmgr.cxx | 89 +++++++++++++------ src/hunspell/atypes.hxx | 10 +++ src/hunspell/csutil.hxx | 12 +++ src/hunspell/hashmgr.cxx | 2 +- src/hunspell/hunspell.cxx | 210 ++++++++++++++++++++++++++------------------ src/hunspell/hunvisapi.h | 12 ++- src/hunspell/hunvisapi.h.in | 12 ++- src/hunspell/suggestmgr.cxx | 72 +++++++++++---- src/hunspell/suggestmgr.hxx | 5 -- 12 files changed, 300 insertions(+), 145 deletions(-) diff --git a/Makefile.in b/Makefile.in index 06d933e..241f797 100644 --- a/Makefile.in +++ b/Makefile.in @@ -296,6 +296,7 @@ GMSGFMT = @GMSGFMT@ GMSGFMT_015 = @GMSGFMT_015@ GREP = @GREP@ HAVE_ASPRINTF = @HAVE_ASPRINTF@ +HAVE_CXX11 = @HAVE_CXX11@ HAVE_NEWLOCALE = @HAVE_NEWLOCALE@ HAVE_POSIX_PRINTF = @HAVE_POSIX_PRINTF@ HAVE_SNPRINTF = @HAVE_SNPRINTF@ diff --git a/configure.ac b/configure.ac index fb79d0d..2936107 100644 --- a/configure.ac 
+++ b/configure.ac @@ -16,6 +16,14 @@ HUNSPELL_VERSION_MINOR=`echo $VERSION | cut -d"." -f2` AC_SUBST(HUNSPELL_VERSION_MAJOR) AC_SUBST(HUNSPELL_VERSION_MINOR) +# check C++11 compiling environment for thread_local +# to handle time limits better also with threads +AS_CASE([$CXXFLAGS], + [*-std=c++11*], [HAVE_CXX11=1], + [HAVE_CXX11=0] + ) +AC_SUBST(HAVE_CXX11) + # Checks for programs. AC_PROG_CXX AC_PROG_CC diff --git a/src/hunspell/affentry.cxx b/src/hunspell/affentry.cxx index 4ef0c00..ffcdb21 100644 --- a/src/hunspell/affentry.cxx +++ b/src/hunspell/affentry.cxx @@ -399,28 +399,28 @@ std::string PfxEntry::check_morph(const char* word, ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || (contclass && TESTAFF(contclass, needflag, contclasslen)))) { if (morphcode) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(morphcode); } else result.append(getKey()); if (!HENTRY_FIND(he, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(HENTRY_WORD(he)); } // store the pointer of the hash entry if (HENTRY_DATA(he)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(he)); } else { // return with debug information char* flag = pmyMgr->encode_flag(getFlag()); - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_FLAG); result.append(flag); free(flag); } - result.append("\n"); + result.push_back(MSEP_REC); } he = he->next_homonym; } while (he); @@ -804,7 +804,7 @@ std::string SfxEntry::check_twosfx_morph(const char* word, if (!st.empty()) { if (ppfx->getMorph()) { result.append(ppfx->getMorph()); - result.append(" "); + result.push_back(MSEP_FLD); } result.append(st); mychomp(result); diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx index 2c540f2..1610ef0 100644 --- a/src/hunspell/affixmgr.cxx +++ b/src/hunspell/affixmgr.cxx @@ -72,6 +72,7 @@ #include #include #include +#include #include #include @@ -1014,7 +1015,7 @@ int 
AffixMgr::process_sfx_order() { // add flags to the result for dictionary debugging std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) { char* st = encode_flag(flag); - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_FLAG); if (st) { result.append(st); @@ -1594,6 +1595,17 @@ struct hentry* AffixMgr::compound_check(const std::string& word, int checked_prefix; + // add a time limit to handle possible + // combinatorical explosion of the overlapping words + + HUNSPELL_THREAD_LOCAL clock_t timelimit; + + if (wordnum == 0) + timelimit = clock(); + else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) { + timelimit = 0; + } + setcminmax(&cmin, &cmax, word.c_str(), len); st.assign(word); @@ -1618,6 +1630,9 @@ struct hentry* AffixMgr::compound_check(const std::string& word, do { // simplified checkcompoundpattern loop + if (timelimit == 0) + return 0; + if (scpd > 0) { for (; scpd <= checkcpdtable.size() && (checkcpdtable[scpd - 1].pattern3.empty() || @@ -2186,6 +2201,17 @@ int AffixMgr::compound_check_morph(const char* word, char affixed = 0; hentry** oldwords = words; + // add a time limit to handle possible + // combinatorical explosion of the overlapping words + + HUNSPELL_THREAD_LOCAL clock_t timelimit; + + if (wordnum == 0) + timelimit = clock(); + else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) { + timelimit = 0; + } + setcminmax(&cmin, &cmax, word, len); st.assign(word); @@ -2204,6 +2230,9 @@ int AffixMgr::compound_check_morph(const char* word, do { // onlycpdrule loop + if (timelimit == 0) + return 0; + oldnumsyllable = numsyllable; oldwordnum = wordnum; checked_prefix = 0; @@ -2245,6 +2274,9 @@ int AffixMgr::compound_check_morph(const char* word, rv = rv->next_homonym; } + if (timelimit == 0) + return 0; + if (rv) affixed = 0; @@ -2435,22 +2467,22 @@ int AffixMgr::compound_check_morph(const char* word, if (rv && words && words[wnum + 1]) { result.append(presult); - result.append(" "); + 
result.push_back(MSEP_FLD); result.append(MORPH_PART); result.append(word + i); if (complexprefixes && HENTRY_DATA(rv)) result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(HENTRY_WORD(rv)); } // store the pointer of the hash entry if (!complexprefixes && HENTRY_DATA(rv)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(rv)); } - result.append("\n"); + result.push_back(MSEP_REC); return 0; } @@ -2492,7 +2524,7 @@ int AffixMgr::compound_check_morph(const char* word, ((!checkcompounddup || (rv != rv_first)))) { // bad compound word result.append(presult); - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_PART); result.append(word + i); @@ -2500,17 +2532,17 @@ int AffixMgr::compound_check_morph(const char* word, if (complexprefixes) result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(HENTRY_WORD(rv)); } // store the pointer of the hash entry if (!complexprefixes) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(rv)); } } - result.append("\n"); + result.push_back(MSEP_REC); ok = 1; } @@ -2549,7 +2581,7 @@ int AffixMgr::compound_check_morph(const char* word, line_uniq_app(m, MSEP_REC); result.append(m); } - result.append("\n"); + result.push_back(MSEP_REC); ok = 1; } } @@ -2639,6 +2671,7 @@ int AffixMgr::compound_check_morph(const char* word, result.append(MORPH_PART); result.append(word + i); line_uniq_app(m, MSEP_REC); + result.push_back(MSEP_FLD); result.append(m); } result.push_back(MSEP_REC); @@ -2864,17 +2897,17 @@ std::string AffixMgr::suffix_check_twosfx_morph(const char* word, if (ppfx) { if (ppfx->getMorph()) { result.append(ppfx->getMorph()); - result.append(" "); + result.push_back(MSEP_FLD); } else debugflag(result, ppfx->getFlag()); } result.append(st); if 
(se->getMorph()) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(se->getMorph()); } else debugflag(result, se->getFlag()); - result.append("\n"); + result.push_back(MSEP_REC); } } se = se->getNext(); @@ -2899,12 +2932,12 @@ std::string AffixMgr::suffix_check_twosfx_morph(const char* word, result3.clear(); if (sptr->getMorph()) { - result3.append(" "); + result3.push_back(MSEP_FLD); result3.append(sptr->getMorph()); } else debugflag(result3, sptr->getFlag()); strlinecat(result2, result3); - result2.append("\n"); + result2.push_back(MSEP_REC); result.append(result2); } } @@ -2967,28 +3000,28 @@ std::string AffixMgr::suffix_check_morph(const char* word, if (ppfx) { if (ppfx->getMorph()) { result.append(ppfx->getMorph()); - result.append(" "); + result.push_back(MSEP_FLD); } else debugflag(result, ppfx->getFlag()); } if (complexprefixes && HENTRY_DATA(rv)) result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(HENTRY_WORD(rv)); } if (!complexprefixes && HENTRY_DATA(rv)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(rv)); } if (se->getMorph()) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(se->getMorph()); } else debugflag(result, se->getFlag()); - result.append("\n"); + result.push_back(MSEP_REC); rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } } @@ -3034,29 +3067,29 @@ std::string AffixMgr::suffix_check_morph(const char* word, if (ppfx) { if (ppfx->getMorph()) { result.append(ppfx->getMorph()); - result.append(" "); + result.push_back(MSEP_FLD); } else debugflag(result, ppfx->getFlag()); } if (complexprefixes && HENTRY_DATA(rv)) result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(HENTRY_WORD(rv)); } if (!complexprefixes && HENTRY_DATA(rv)) { - result.append(" "); 
+ result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(rv)); } if (sptr->getMorph()) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(sptr->getMorph()); } else debugflag(result, sptr->getFlag()); - result.append("\n"); + result.push_back(MSEP_REC); rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } sptr = sptr->getNextEQ(); @@ -3245,7 +3278,7 @@ std::string AffixMgr::morphgen(const char* ts, // use input suffix fields, if exist if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { mymorph.assign(morph); - mymorph.append(" "); + mymorph.push_back(MSEP_FLD); stemmorphcatpos = mymorph.size(); } else { stemmorphcatpos = std::string::npos; @@ -4557,7 +4590,7 @@ bool AffixMgr::parse_affix(const std::string& line, entry->appnd = std::string(start_piece, dash); std::string dash_str(dash + 1, iter); - if (!ignorechars.empty()) { + if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) { if (utf8) { remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); } else { @@ -4593,7 +4626,7 @@ bool AffixMgr::parse_affix(const std::string& line, } else { entry->appnd = std::string(start_piece, iter); - if (!ignorechars.empty()) { + if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) { if (utf8) { remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); } else { diff --git a/src/hunspell/atypes.hxx b/src/hunspell/atypes.hxx index f841523..38396db 100644 --- a/src/hunspell/atypes.hxx +++ b/src/hunspell/atypes.hxx @@ -95,6 +95,16 @@ static inline void HUNSPELL_WARNING(FILE*, const char*, ...) {} #define TESTAFF(a, b, c) (std::binary_search(a, a + c, b)) +// timelimit: max. ~1/4 sec (process time on Linux) for +// for a suggestion, including max. 
~1/10 sec for a case +// sensitive plain or compound word suggestion, within +// ~1/20 sec long time consuming suggestion functions +#define TIMELIMIT_GLOBAL (CLOCKS_PER_SEC / 4) +#define TIMELIMIT_SUGGESTION (CLOCKS_PER_SEC / 10) +#define TIMELIMIT (CLOCKS_PER_SEC / 20) +#define MINTIMER 100 +#define MAXPLUSTIMER 100 + struct guessword { char* word; bool allow; diff --git a/src/hunspell/csutil.hxx b/src/hunspell/csutil.hxx index 01c0a24..3397257 100644 --- a/src/hunspell/csutil.hxx +++ b/src/hunspell/csutil.hxx @@ -311,4 +311,16 @@ LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_FIND(struct hentry* h, return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL); } +// to avoid unnecessary string copies and Unicode conversions +// we simply check the ignored_chars characters in the word +// (in the case of UTF-8 encoded strings, "false" means +// "likely false", if ignored_chars characters are not ASCII) +inline bool has_no_ignored_chars(const std::string& word, + const std::string& ignored_chars) { + for (std::string::const_iterator it = ignored_chars.begin(), end = ignored_chars.end(); it != end; ++it) + if (word.find(*it) != std::string::npos) + return false; + return true; +} + #endif diff --git a/src/hunspell/hashmgr.cxx b/src/hunspell/hashmgr.cxx index 5183f02..7e843c3 100644 --- a/src/hunspell/hashmgr.cxx +++ b/src/hunspell/hashmgr.cxx @@ -190,7 +190,7 @@ int HashMgr::add_word(const std::string& in_word, std::string *word_copy = NULL; std::string *desc_copy = NULL; - if (!ignorechars.empty() || complexprefixes) { + if ((!ignorechars.empty() && !has_no_ignored_chars(in_word, ignorechars)) || complexprefixes) { word_copy = new std::string(in_word); if (!ignorechars.empty()) { diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx index d6e871f..0dcd748 100644 --- a/src/hunspell/hunspell.cxx +++ b/src/hunspell/hunspell.cxx @@ -71,6 +71,7 @@ #include #include #include +#include #include "affixmgr.hxx" #include "hunspell.hxx" @@ -101,7 +102,8 @@ public: 
bool spell(const std::string& word, int* info = NULL, std::string* root = NULL); bool spell_internal(const std::string& word, int* info = NULL, std::string* root = NULL); std::vector suggest(const std::string& word); - std::vector suggest_internal(const std::string& word); + std::vector suggest_internal(const std::string& word, + bool& capitalized, size_t& abbreviated, int& captype); const std::string& get_wordchars() const; const std::vector& get_wordchars_utf16() const; const std::string& get_dict_encoding() const; @@ -755,7 +757,7 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str int len; const char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL; - if (ignoredchars != NULL) { + if (ignoredchars != NULL && !has_no_ignored_chars(w, ignoredchars)) { w2.assign(w); if (utf8) { const std::vector& ignoredchars_utf16 = @@ -887,8 +889,83 @@ std::vector Hunspell::suggest(const std::string& word) { } std::vector HunspellImpl::suggest(const std::string& word) { - std::vector slst; - slst = suggest_internal(word); + bool capwords; + size_t abbv; + int captype; + std::vector slst = suggest_internal(word, capwords, abbv, captype); + // word reversing wrapper for complex prefixes + if (complexprefixes) { + for (size_t j = 0; j < slst.size(); ++j) { + if (utf8) + reverseword_utf(slst[j]); + else + reverseword(slst[j]); + } + } + + // capitalize + if (capwords) + for (size_t j = 0; j < slst.size(); ++j) { + mkinitcap(slst[j]); + } + + // expand suggestions with dot(s) + if (abbv && pAMgr && pAMgr->get_sugswithdots()) { + for (size_t j = 0; j < slst.size(); ++j) { + slst[j].append(word.substr(word.size() - abbv)); + } + } + + // remove bad capitalized and forbidden forms + if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { + switch (captype) { + case INITCAP: + case ALLCAP: { + size_t l = 0; + for (size_t j = 0; j < slst.size(); ++j) { + if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) { + std::string s; + 
std::vector w; + if (utf8) { + u8_u16(w, slst[j]); + } else { + s = slst[j]; + } + mkallsmall2(s, w); + if (spell(s)) { + slst[l] = s; + ++l; + } else { + mkinitcap2(s, w); + if (spell(s)) { + slst[l] = s; + ++l; + } + } + } else { + slst[l] = slst[j]; + ++l; + } + } + slst.resize(l); + } + } + } + + // remove duplications + size_t l = 0; + for (size_t j = 0; j < slst.size(); ++j) { + slst[l] = slst[j]; + for (size_t k = 0; k < l; ++k) { + if (slst[k] == slst[j]) { + --l; + break; + } + } + ++l; + } + slst.resize(l); + // output conversion RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; if (rl) { @@ -902,7 +979,8 @@ std::vector HunspellImpl::suggest(const std::string& word) { return slst; } -std::vector HunspellImpl::suggest_internal(const std::string& word) { +std::vector HunspellImpl::suggest_internal(const std::string& word, + bool& capwords, size_t& abbv, int& captype) { std::vector slst; int onlycmpdsug = 0; @@ -920,8 +998,8 @@ std::vector HunspellImpl::suggest_internal(const std::string& word) if (word.size() >= MAXWORDLEN) return slst; } - int captype = NOCAP; - size_t abbv = 0; + captype = NOCAP; + abbv = 0; size_t wl = 0; std::string scw; @@ -942,9 +1020,13 @@ std::vector HunspellImpl::suggest_internal(const std::string& word) return slst; } - int capwords = 0; + capwords = false; bool good = false; + HUNSPELL_THREAD_LOCAL clock_t timelimit; + // initialize in every suggestion call + timelimit = clock(); + // check capitalized form for FORCEUCASE if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) { int info = SPELL_ORIGCAP; @@ -959,26 +1041,36 @@ std::vector HunspellImpl::suggest_internal(const std::string& word) switch (captype) { case NOCAP: { good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; if (abbv) { std::string wspace(scw); wspace.push_back('.'); good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; } 
break; } case INITCAP: { - capwords = 1; + capwords = true; good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; std::string wspace(scw); mkallsmall2(wspace, sunicw); good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; break; } case HUHINITCAP: - capwords = 1; + capwords = true; case HUHCAP: { good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; // something.The -> something. The size_t dot_pos = scw.find('.'); if (dot_pos != std::string::npos) { @@ -1005,6 +1097,8 @@ std::vector HunspellImpl::suggest_internal(const std::string& word) wspace = scw; mkinitsmall2(wspace, sunicw); good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; } wspace = scw; mkallsmall2(wspace, sunicw); @@ -1012,11 +1106,15 @@ std::vector HunspellImpl::suggest_internal(const std::string& word) insert_sug(slst, wspace); size_t prevns = slst.size(); good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; if (captype == HUHINITCAP) { mkinitcap2(wspace, sunicw); if (spell(wspace.c_str())) insert_sug(slst, wspace); good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; } // aNew -> "a New" (instead of "a new") for (size_t j = prevns; j < slst.size(); ++j) { @@ -1044,10 +1142,14 @@ std::vector HunspellImpl::suggest_internal(const std::string& word) std::string wspace(scw); mkallsmall2(wspace, sunicw); good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str())) insert_sug(slst, wspace); mkinitcap2(wspace, sunicw); good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > 
timelimit + TIMELIMIT_GLOBAL) + return slst; for (size_t j = 0; j < slst.size(); ++j) { mkallcap(slst[j]); if (pAMgr && pAMgr->get_checksharps()) { @@ -1084,21 +1186,27 @@ std::vector HunspellImpl::suggest_internal(const std::string& word) switch (captype) { case NOCAP: { pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs, NOCAP); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; break; } case HUHINITCAP: - capwords = 1; + capwords = true; case HUHCAP: { std::string wspace(scw); mkallsmall2(wspace, sunicw); pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, HUHCAP); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; break; } case INITCAP: { - capwords = 1; + capwords = true; std::string wspace(scw); mkallsmall2(wspace, sunicw); pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, INITCAP); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; break; } case ALLCAP: { @@ -1106,6 +1214,8 @@ std::vector HunspellImpl::suggest_internal(const std::string& word) mkallsmall2(wspace, sunicw); size_t oldns = slst.size(); pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, ALLCAP); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; for (size_t j = oldns; j < slst.size(); ++j) { mkallcap(slst[j]); } @@ -1137,6 +1247,8 @@ std::vector HunspellImpl::suggest_internal(const std::string& word) std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos); if (!spell(chunk.c_str())) { std::vector nlst = suggest(chunk.c_str()); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; for (std::vector::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) { std::string wspace = scw.substr(0, prev_pos); wspace.append(*j); @@ -1160,80 +1272,6 @@ std::vector HunspellImpl::suggest_internal(const std::string& word) dash_pos = scw.size(); } } - - // word reversing wrapper for complex prefixes - if (complexprefixes) { - for (size_t j = 0; j < slst.size(); ++j) { - if (utf8) - reverseword_utf(slst[j]); - else - reverseword(slst[j]); - } - } - - // capitalize - if 
(capwords) - for (size_t j = 0; j < slst.size(); ++j) { - mkinitcap(slst[j]); - } - - // expand suggestions with dot(s) - if (abbv && pAMgr && pAMgr->get_sugswithdots()) { - for (size_t j = 0; j < slst.size(); ++j) { - slst[j].append(word.substr(word.size() - abbv)); - } - } - - // remove bad capitalized and forbidden forms - if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { - switch (captype) { - case INITCAP: - case ALLCAP: { - size_t l = 0; - for (size_t j = 0; j < slst.size(); ++j) { - if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) { - std::string s; - std::vector w; - if (utf8) { - u8_u16(w, slst[j]); - } else { - s = slst[j]; - } - mkallsmall2(s, w); - if (spell(s)) { - slst[l] = s; - ++l; - } else { - mkinitcap2(s, w); - if (spell(s)) { - slst[l] = s; - ++l; - } - } - } else { - slst[l] = slst[j]; - ++l; - } - } - slst.resize(l); - } - } - } - - // remove duplications - size_t l = 0; - for (size_t j = 0; j < slst.size(); ++j) { - slst[l] = slst[j]; - for (size_t k = 0; k < l; ++k) { - if (slst[k] == slst[j]) { - --l; - break; - } - } - ++l; - } - slst.resize(l); - return slst; } diff --git a/src/hunspell/hunvisapi.h b/src/hunspell/hunvisapi.h index eb2b348..8283017 100644 --- a/src/hunspell/hunvisapi.h +++ b/src/hunspell/hunvisapi.h @@ -3,7 +3,7 @@ #if defined(HUNSPELL_STATIC) # define LIBHUNSPELL_DLL_EXPORTED -#elif defined(_MSC_VER) +#elif defined(_WIN32) # if defined(BUILDING_LIBHUNSPELL) # define LIBHUNSPELL_DLL_EXPORTED __declspec(dllexport) # else @@ -15,4 +15,14 @@ # define LIBHUNSPELL_DLL_EXPORTED #endif +/* use thread_local, if it's possible, otherwise static */ + +#if defined(_WIN32) +# define HUNSPELL_THREAD_LOCAL thread_local +#elif 0 +# define HUNSPELL_THREAD_LOCAL thread_local +#else +# define HUNSPELL_THREAD_LOCAL static +#endif + #endif diff --git a/src/hunspell/hunvisapi.h.in b/src/hunspell/hunvisapi.h.in index a1020c8..85972dd 100644 --- a/src/hunspell/hunvisapi.h.in +++ b/src/hunspell/hunvisapi.h.in @@ 
-3,7 +3,7 @@ #if defined(HUNSPELL_STATIC) # define LIBHUNSPELL_DLL_EXPORTED -#elif defined(_MSC_VER) +#elif defined(_WIN32) # if defined(BUILDING_LIBHUNSPELL) # define LIBHUNSPELL_DLL_EXPORTED __declspec(dllexport) # else @@ -15,4 +15,14 @@ # define LIBHUNSPELL_DLL_EXPORTED #endif +/* use thread_local, if it's possible, otherwise static */ + +#if defined(_WIN32) +# define HUNSPELL_THREAD_LOCAL thread_local +#elif @HAVE_CXX11@ +# define HUNSPELL_THREAD_LOCAL thread_local +#else +# define HUNSPELL_THREAD_LOCAL static +#endif + #endif diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx index ade85af..d9fabca 100644 --- a/src/hunspell/suggestmgr.cxx +++ b/src/hunspell/suggestmgr.cxx @@ -72,6 +72,7 @@ #include #include #include +#include #include "suggestmgr.hxx" #include "htypes.hxx" @@ -79,6 +80,8 @@ const w_char W_VLINE = {'\0', '|'}; +#define MAX_CHAR_DISTANCE 4 + SuggestMgr::SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr) { // register affix manager and check in string of chars to // try when building candidate suggestions @@ -211,6 +214,11 @@ bool SuggestMgr::suggest(std::vector& slst, for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0) && !good_suggestion; cpdsuggest++) { + + HUNSPELL_THREAD_LOCAL clock_t timelimit; + // initialize both in non-compound and compound cycles + timelimit = clock(); + // limit compound suggestion if (cpdsuggest > 0) oldSug = slst.size(); @@ -233,12 +241,16 @@ bool SuggestMgr::suggest(std::vector& slst, if (slst.size() > i) good_suggestion = true; } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // perhaps we made chose the wrong char from a related set if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { mapchars(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // only suggest compound words when no other suggestion if ((cpdsuggest == 0) && (slst.size() > nsugorig)) 
@@ -251,6 +263,8 @@ bool SuggestMgr::suggest(std::vector& slst, else swapchar(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we swap the order of non adjacent chars by mistake if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { @@ -259,6 +273,8 @@ bool SuggestMgr::suggest(std::vector& slst, else longswapchar(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we just hit the wrong key in place of a good char (case and keyboard) if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { @@ -267,6 +283,8 @@ bool SuggestMgr::suggest(std::vector& slst, else badcharkey(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we add a char that should not be there if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { @@ -275,6 +293,8 @@ bool SuggestMgr::suggest(std::vector& slst, else extrachar(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we forgot a char if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { @@ -283,6 +303,8 @@ bool SuggestMgr::suggest(std::vector& slst, else forgotchar(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we move a char if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { @@ -291,6 +313,8 @@ bool SuggestMgr::suggest(std::vector& slst, else movechar(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we just hit the wrong key in place of a good char if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { @@ -299,6 +323,8 @@ bool SuggestMgr::suggest(std::vector& slst, else badchar(slst, word, cpdsuggest); } 
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we double two characters if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { @@ -307,6 +333,8 @@ bool SuggestMgr::suggest(std::vector& slst, else doubletwochars(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // perhaps we forgot to hit space and two words ran together // (dictionary word pairs have top priority here, so @@ -315,6 +343,8 @@ bool SuggestMgr::suggest(std::vector& slst, if (!cpdsuggest || (!nosplitsugs && slst.size() < oldSug + maxcpdsugs)) { good_suggestion = twowords(slst, word, cpdsuggest, good_suggestion); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; } // repeating ``for'' statement compounding support @@ -469,8 +499,11 @@ int SuggestMgr::replchars(std::vector& wlst, return wlst.size(); } -// perhaps we doubled two characters (pattern aba -> ababa, for example vacation -// -> vacacation) +// perhaps we doubled two characters +// (for example vacation -> vacacation) +// The recognized pattern with regex back-references: +// "(.)(.)\1\2\1" or "..(.)(.)\1\2" + int SuggestMgr::doubletwochars(std::vector& wlst, const char* word, int cpdsuggest) { @@ -481,7 +514,7 @@ int SuggestMgr::doubletwochars(std::vector& wlst, for (int i = 2; i < wl; i++) { if (word[i] == word[i - 2]) { state++; - if (state == 3) { + if (state == 3 || (state == 2 && i >= 4)) { std::string candidate(word, word + i - 1); candidate.insert(candidate.end(), word + i + 1, word + wl); testsug(wlst, candidate, cpdsuggest, NULL, NULL); @@ -494,8 +527,11 @@ int SuggestMgr::doubletwochars(std::vector& wlst, return wlst.size(); } -// perhaps we doubled two characters (pattern aba -> ababa, for example vacation -// -> vacacation) +// perhaps we doubled two characters +// (for example vacation -> vacacation) +// The recognized pattern with regex back-references: +// "(.)(.)\1\2\1" or 
"..(.)(.)\1\2" + int SuggestMgr::doubletwochars_utf(std::vector& wlst, const w_char* word, int wl, @@ -506,7 +542,7 @@ int SuggestMgr::doubletwochars_utf(std::vector& wlst, for (int i = 2; i < wl; i++) { if (word[i] == word[i - 2]) { state++; - if (state == 3) { + if (state == 3 || (state == 2 && i >= 4)) { std::vector candidate_utf(word, word + i - 1); candidate_utf.insert(candidate_utf.end(), word + i + 1, word + wl); std::string candidate; @@ -939,7 +975,8 @@ int SuggestMgr::longswapchar(std::vector& wlst, // try swapping not adjacent chars one by one for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) { for (std::string::iterator q = candidate.begin(); q < candidate.end(); ++q) { - if (std::abs(std::distance(q, p)) > 1) { + size_t distance = std::abs(std::distance(q, p)); + if (distance > 1 && distance <= MAX_CHAR_DISTANCE) { std::swap(*p, *q); testsug(wlst, candidate, cpdsuggest, NULL, NULL); std::swap(*p, *q); @@ -958,7 +995,8 @@ int SuggestMgr::longswapchar_utf(std::vector& wlst, // try swapping not adjacent chars for (std::vector::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) { for (std::vector::iterator q = candidate_utf.begin(); q < candidate_utf.end(); ++q) { - if (std::abs(std::distance(q, p)) > 1) { + size_t distance = std::abs(std::distance(q, p)); + if (distance > 1 && distance <= MAX_CHAR_DISTANCE) { std::swap(*p, *q); std::string candidate; u16_u8(candidate, candidate_utf); @@ -980,7 +1018,7 @@ int SuggestMgr::movechar(std::vector& wlst, // try moving a char for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) { - for (std::string::iterator q = p + 1; q < candidate.end() && std::distance(p, q) < 10; ++q) { + for (std::string::iterator q = p + 1; q < candidate.end() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) { std::swap(*q, *(q - 1)); if (std::distance(p, q) < 2) continue; // omit swap char @@ -990,7 +1028,7 @@ int SuggestMgr::movechar(std::vector& wlst, } for 
(std::string::reverse_iterator p = candidate.rbegin(), pEnd = candidate.rend() - 1; p != pEnd; ++p) { - for (std::string::reverse_iterator q = p + 1, qEnd = candidate.rend(); q != qEnd && std::distance(p, q) < 10; ++q) { + for (std::string::reverse_iterator q = p + 1, qEnd = candidate.rend(); q != qEnd && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) { std::swap(*q, *(q - 1)); if (std::distance(p, q) < 2) continue; // omit swap char @@ -1013,7 +1051,7 @@ int SuggestMgr::movechar_utf(std::vector& wlst, // try moving a char for (std::vector::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) { - for (std::vector::iterator q = p + 1; q < candidate_utf.end() && std::distance(p, q) < 10; ++q) { + for (std::vector::iterator q = p + 1; q < candidate_utf.end() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) { std::swap(*q, *(q - 1)); if (std::distance(p, q) < 2) continue; // omit swap char @@ -1025,7 +1063,7 @@ int SuggestMgr::movechar_utf(std::vector& wlst, } for (std::vector::reverse_iterator p = candidate_utf.rbegin(); p < candidate_utf.rend(); ++p) { - for (std::vector::reverse_iterator q = p + 1; q < candidate_utf.rend() && std::distance(p, q) < 10; ++q) { + for (std::vector::reverse_iterator q = p + 1; q < candidate_utf.rend() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) { std::swap(*q, *(q - 1)); if (std::distance(p, q) < 2) continue; // omit swap char @@ -1715,15 +1753,15 @@ std::string SuggestMgr::suggest_morph(const std::string& in_w) { TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) || TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen))) { if (!HENTRY_FIND(rv, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(w); } if (HENTRY_DATA(rv)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(rv)); } - result.append("\n"); + result.push_back(MSEP_REC); } rv = rv->next_homonym; } @@ -1779,7 +1817,7 @@ std::string SuggestMgr::suggest_hentry_gen(hentry* 
rv, const char* pattern) { HENTRY_DATA(rv), pattern, 0); if (!aff.empty()) { result.append(aff); - result.append("\n"); + result.push_back(MSEP_REC); } } @@ -1803,7 +1841,7 @@ std::string SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) { rv2->alen, HENTRY_DATA(rv2), pattern, 0); if (!aff.empty()) { result.append(aff); - result.append("\n"); + result.push_back(MSEP_REC); } } } diff --git a/src/hunspell/suggestmgr.hxx b/src/hunspell/suggestmgr.hxx index f0daf23..a435aac 100644 --- a/src/hunspell/suggestmgr.hxx +++ b/src/hunspell/suggestmgr.hxx @@ -78,11 +78,6 @@ #define MAXPHONSUGS 2 #define MAXCOMPOUNDSUGS 3 -// timelimit: max ~1/4 sec (process time on Linux) for a time consuming function -#define TIMELIMIT (CLOCKS_PER_SEC >> 2) -#define MINTIMER 100 -#define MAXPLUSTIMER 100 - #define NGRAM_LONGER_WORSE (1 << 0) #define NGRAM_ANY_MISMATCH (1 << 1) #define NGRAM_LOWERING (1 << 2) -- 2.7.4