diff options
author | Baole Fang <baole.fang@gmail.com> | 2023-04-25 00:33:01 -0400 |
---|---|---|
committer | Stephan Bergmann <sbergman@redhat.com> | 2023-04-27 15:27:33 +0200 |
commit | a772976f047882918d5386a3ef9226c4aa2aa118 (patch) | |
tree | 2a6a3f64bc2471fc9c68e18b554dde2ab8fa0330 /svl | |
parent | ec90dae4993b90e0c3a797ac9b43a076527e1a7d (diff) |
tdf#145925: Add DOI recognition
Detect DOI strings of the form "doi:10.*" and add a hyperlink to each of them.
This works the same way as URL recognition.
Change-Id: I3c4e78a110fd81ad7e727d5e9acee7e51127466a
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/150954
Tested-by: Jenkins
Reviewed-by: Heiko Tietze <heiko.tietze@documentfoundation.org>
Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
Diffstat (limited to 'svl')
-rw-r--r-- | svl/qa/unit/test_URIHelper.cxx | 54 | ||||
-rw-r--r-- | svl/source/misc/urihelper.cxx | 59 |
2 files changed, 113 insertions, 0 deletions
diff --git a/svl/qa/unit/test_URIHelper.cxx b/svl/qa/unit/test_URIHelper.cxx index eb5135cbe3c6..df9e5d5114b8 100644 --- a/svl/qa/unit/test_URIHelper.cxx +++ b/svl/qa/unit/test_URIHelper.cxx @@ -181,11 +181,14 @@ public: void testFindFirstURLInText(); + void testFindFirstDOIInText(); + void testResolveIdnaHost(); CPPUNIT_TEST_SUITE(Test); CPPUNIT_TEST(testNormalizedMakeRelative); CPPUNIT_TEST(testFindFirstURLInText); + CPPUNIT_TEST(testFindFirstDOIInText); CPPUNIT_TEST(testResolveIdnaHost); CPPUNIT_TEST(finish); CPPUNIT_TEST_SUITE_END(); @@ -398,6 +401,57 @@ void Test::testFindFirstURLInText() { } } +void Test::testFindFirstDOIInText() { + struct Data { + char const * input; + char const * result; + sal_Int32 begin; + sal_Int32 end; + }; + static Data const tests[] = { + { "doi:10.1000/182", "https://doi.org/10.1000/182", 0, 15 }, // valid doi suffix with only digits + { "doi:10.1038/nature03001", "https://doi.org/10.1038/nature03001", 0, 23 }, // valid doi suffix with alphanumeric characters + { "doi:10.1093/ajae/aaq063", "https://doi.org/10.1093/ajae/aaq063", 0, 23 }, // valid doi suffix with multiple slash + { "doi:10.1016/S0735-1097(98)00347-7", "https://doi.org/10.1016/S0735-1097(98)00347-7", 0, 33 }, // valid doi suffix with characters apart from alphanumeric + { "doi:10.109/ajae/aaq063", nullptr, 0, 0 }, // # of digits after doi;10. is not between 4 and 9 + { "doi:10.1234567890/ajae/aaq063", nullptr, 0, 0 }, // # of digits after doi;10. is not between 4 and 9 + { "doi:10.1093/ajae/aaq063/", nullptr, 0, 0 }, // nothing after slash + { "doi:10.1093", nullptr, 0, 0 }, // no slash + { "doi:11.1093/ajae/aaq063", nullptr, 0, 0 }, // doesn't begin with doi:10. 
+ }; + CharClass charClass( m_context, LanguageTag( css::lang::Locale("en", "US", ""))); + for (std::size_t i = 0; i < SAL_N_ELEMENTS(tests); ++i) { + OUString input(OUString::createFromAscii(tests[i].input)); + sal_Int32 begin = 0; + sal_Int32 end = input.getLength(); + OUString result( + URIHelper::FindFirstDOIInText(input, begin, end, charClass)); + bool ok = tests[i].result == nullptr + ? (result.getLength() == 0 && begin == input.getLength() + && end == input.getLength()) + : (result.equalsAscii(tests[i].result) && begin == tests[i].begin + && end == tests[i].end); + OString msg; + if (!ok) { + OStringBuffer buf; + buf.append(OString::Concat("\"") + + tests[i].input + + "\" -> "); + buf.append(tests[i].result == nullptr ? "none" : tests[i].result); + buf.append(" (" + + OString::number(tests[i].begin) + + ", " + + OString::number(tests[i].end) + + ")" + " != " + + OUStringToOString(result, RTL_TEXTENCODING_UTF8) + + " (" + OString::number(begin) + ", " + OString::number(end) +")"); + msg = buf.makeStringAndClear(); + } + CPPUNIT_ASSERT_MESSAGE(msg.getStr(), ok); + } +} + void Test::testResolveIdnaHost() { OUString input; diff --git a/svl/source/misc/urihelper.cxx b/svl/source/misc/urihelper.cxx index 6f121fba56d9..0043b7883a87 100644 --- a/svl/source/misc/urihelper.cxx +++ b/svl/source/misc/urihelper.cxx @@ -745,6 +745,65 @@ OUString URIHelper::FindFirstURLInText(OUString const & rText, return OUString(); } +OUString URIHelper::FindFirstDOIInText(OUString const & rText, + sal_Int32 & rBegin, + sal_Int32 & rEnd, + CharClass const & rCharClass) +{ + if (rBegin > rEnd || rEnd > rText.getLength()) + return OUString(); + + sal_Int32 start = 7; + sal_Int32 count = rEnd-rBegin; + OUString candidate(rText.subView(rBegin, count)); + // Match with regex "doi:10\.\d{4,9}\/[-._;()\/:a-zA-Z0-9]+" + if (candidate.startsWith("doi:10.")) + { + bool flag = true; + sal_Int32 digit = 0; + for (sal_Int32 i=start; i<count; i++) + { + sal_Unicode c = candidate[i]; + // Match 4 to 9 
digits before slash + if (digit >= 0) + { + if (digit>9) + { + flag = false; + break; + } + + if ( rCharClass.isDigit(candidate,i) ) + { + digit++; + } + else if (c=='/' && digit>=4 && i<count-1) + { + digit=-1; + } + else + { + flag = false; + break; + } + } + // Match [-._;()\/:a-zA-Z0-9] after slash + else if (!( rCharClass.isAlphaNumeric(candidate, i) || c == '.' || c == '-' || c=='_' || + c==';' || c=='(' || c==')' || c=='\\' || (c=='/' && i<count-1) || c==':')) + { + flag = false; + break; + } + } + if (flag && digit==-1) + { + return candidate.replaceFirst("doi:","https://doi.org/"); + } + } + rBegin = rEnd; + return OUString(); +} + OUString URIHelper::removePassword(OUString const & rURI, INetURLObject::EncodeMechanism eEncodeMechanism, INetURLObject::DecodeMechanism eDecodeMechanism, |