From 43dcdfae40c9c37032ed5e92cd0634feb53b706d Mon Sep 17 00:00:00 2001 From: Mike Kaganski Date: Sat, 4 Mar 2023 12:46:56 +0300 Subject: tdf#153617: percent-encode the text sent to LanguageTool API. Change-Id: I0bb55c70f5602444440fca6e3c13b3d75418e49d Reviewed-on: https://gerrit.libreoffice.org/c/core/+/148236 Tested-by: Jenkins Reviewed-by: Mike Kaganski --- .../spellcheck/languagetool/languagetoolimp.cxx | 26 +++++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'lingucomponent') diff --git a/lingucomponent/source/spellcheck/languagetool/languagetoolimp.cxx b/lingucomponent/source/spellcheck/languagetool/languagetoolimp.cxx index 0856be3f741d..a153e7ac5ef0 100644 --- a/lingucomponent/source/spellcheck/languagetool/languagetoolimp.cxx +++ b/lingucomponent/source/spellcheck/languagetool/languagetoolimp.cxx @@ -43,6 +43,7 @@ #include #include #include +#include using namespace osl; using namespace com::sun::star; @@ -77,6 +78,20 @@ Sequence lcl_GetLineColorPropertyFromErrorId(const std::string& r Sequence aProperties{ comphelper::makePropertyValue("LineColor", aColor) }; return aProperties; } + +OString encodeTextForLanguageTool(const OUString& text) +{ + // Let's be a bit conservative. I don't find a good description what needs encoding (and in + // which way) at https://languagetool.org/http-api/; the "Try it out!" function shows that + // different cases are handled differently by the demo; some percent-encode the UTF-8 + // representation, like %D0%90 (for cyrillic А); some turn into entities like &#33; (for + // exclamation mark !); some other to things like \u0027 (for apostrophe '). 
+ static constexpr auto myCharClass + = rtl::createUriCharClass("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + return OUStringToOString( + rtl::Uri::encode(text, myCharClass.data(), rtl_UriEncodeStrict, RTL_TEXTENCODING_UTF8), + RTL_TEXTENCODING_ASCII_US); +} } LanguageToolGrammarChecker::LanguageToolGrammarChecker() @@ -224,14 +239,14 @@ ProofreadingResult SAL_CALL LanguageToolGrammarChecker::doProofreading( xRes.nBehindEndOfSentencePosition = std::min(xRes.nStartOfNextSentencePosition, aText.getLength()); - OUString langTag(aLocale.Language + "-" + aLocale.Country); - OString postData; + OString langTag(LanguageTag::convertToBcp47(aLocale, false).toUtf8()); + OString postData = encodeTextForLanguageTool(aText); if (rLanguageOpts.getRestProtocol() == sDuden) { std::stringstream aStream; boost::property_tree::ptree aTree; - aTree.put("text-language", langTag.toUtf8().getStr()); - aTree.put("text", aText.toUtf8().getStr()); + aTree.put("text-language", langTag.getStr()); + aTree.put("text", postData.getStr()); aTree.put("hyphenation", false); aTree.put("spellchecking-level", 3); aTree.put("correction-proposals", true); @@ -240,8 +255,7 @@ ProofreadingResult SAL_CALL LanguageToolGrammarChecker::doProofreading( } else { - postData = OUStringToOString(Concat2View("text=" + aText + "&language=" + langTag), - RTL_TEXTENCODING_UTF8); + postData = "text=" + postData + "&language=" + langTag; } if (auto cachedResult = mCachedResults.find(postData); cachedResult != mCachedResults.end()) -- cgit