summaryrefslogtreecommitdiff
path: root/svl
diff options
context:
space:
mode:
authorBaole Fang <baole.fang@gmail.com>2023-04-25 00:33:01 -0400
committerStephan Bergmann <sbergman@redhat.com>2023-04-27 15:27:33 +0200
commita772976f047882918d5386a3ef9226c4aa2aa118 (patch)
tree2a6a3f64bc2471fc9c68e18b554dde2ab8fa0330 /svl
parentec90dae4993b90e0c3a797ac9b43a076527e1a7d (diff)
tdf#145925: Add DOI recognition
Detect DOI strings of the form "doi:10.*" and add a hyperlink to them. This works the same way as URL recognition. Change-Id: I3c4e78a110fd81ad7e727d5e9acee7e51127466a Reviewed-on: https://gerrit.libreoffice.org/c/core/+/150954 Tested-by: Jenkins Reviewed-by: Heiko Tietze <heiko.tietze@documentfoundation.org> Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
Diffstat (limited to 'svl')
-rw-r--r--svl/qa/unit/test_URIHelper.cxx54
-rw-r--r--svl/source/misc/urihelper.cxx59
2 files changed, 113 insertions, 0 deletions
diff --git a/svl/qa/unit/test_URIHelper.cxx b/svl/qa/unit/test_URIHelper.cxx
index eb5135cbe3c6..df9e5d5114b8 100644
--- a/svl/qa/unit/test_URIHelper.cxx
+++ b/svl/qa/unit/test_URIHelper.cxx
@@ -181,11 +181,14 @@ public:
void testFindFirstURLInText();
+ void testFindFirstDOIInText();
+
void testResolveIdnaHost();
CPPUNIT_TEST_SUITE(Test);
CPPUNIT_TEST(testNormalizedMakeRelative);
CPPUNIT_TEST(testFindFirstURLInText);
+ CPPUNIT_TEST(testFindFirstDOIInText);
CPPUNIT_TEST(testResolveIdnaHost);
CPPUNIT_TEST(finish);
CPPUNIT_TEST_SUITE_END();
@@ -398,6 +401,57 @@ void Test::testFindFirstURLInText() {
}
}
+void Test::testFindFirstDOIInText() { // Table-driven check of URIHelper::FindFirstDOIInText, searching each input over its full length.
+ struct Data {
+ char const * input; // text handed to FindFirstDOIInText
+ char const * result; // expected returned URL, or nullptr when no DOI must be recognized
+ sal_Int32 begin; // expected begin index after the call (compared only when result is non-null)
+ sal_Int32 end; // expected end index after the call (compared only when result is non-null)
+ };
+ static Data const tests[] = {
+ { "doi:10.1000/182", "https://doi.org/10.1000/182", 0, 15 }, // valid doi suffix with only digits
+ { "doi:10.1038/nature03001", "https://doi.org/10.1038/nature03001", 0, 23 }, // valid doi suffix with alphanumeric characters
+ { "doi:10.1093/ajae/aaq063", "https://doi.org/10.1093/ajae/aaq063", 0, 23 }, // valid doi suffix with multiple slashes
+ { "doi:10.1016/S0735-1097(98)00347-7", "https://doi.org/10.1016/S0735-1097(98)00347-7", 0, 33 }, // valid doi suffix with characters apart from alphanumeric
+ { "doi:10.109/ajae/aaq063", nullptr, 0, 0 }, // only 3 digits after "doi:10." (4 to 9 are required)
+ { "doi:10.1234567890/ajae/aaq063", nullptr, 0, 0 }, // 10 digits after "doi:10." (4 to 9 are required)
+ { "doi:10.1093/ajae/aaq063/", nullptr, 0, 0 }, // trailing slash with nothing after it
+ { "doi:10.1093", nullptr, 0, 0 }, // no slash
+ { "doi:11.1093/ajae/aaq063", nullptr, 0, 0 }, // doesn't begin with doi:10.
+ };
+ CharClass charClass( m_context, LanguageTag( css::lang::Locale("en", "US", ""))); // en-US character classification passed to the function under test
+ for (std::size_t i = 0; i < SAL_N_ELEMENTS(tests); ++i) {
+ OUString input(OUString::createFromAscii(tests[i].input));
+ sal_Int32 begin = 0; // search range starts at the first character...
+ sal_Int32 end = input.getLength(); // ...and ends at the end of the input
+ OUString result(
+ URIHelper::FindFirstDOIInText(input, begin, end, charClass));
+ bool ok = tests[i].result == nullptr // expected miss: empty result, begin and end both equal to the input length
+ ? (result.getLength() == 0 && begin == input.getLength()
+ && end == input.getLength())
+ : (result.equalsAscii(tests[i].result) && begin == tests[i].begin // expected hit: exact URL and exact indices
+ && end == tests[i].end);
+ OString msg;
+ if (!ok) { // the diagnostic text is built only for a failing case
+ OStringBuffer buf;
+ buf.append(OString::Concat("\"")
+ + tests[i].input
+ + "\" -> ");
+ buf.append(tests[i].result == nullptr ? "none" : tests[i].result);
+ buf.append(" ("
+ + OString::number(tests[i].begin)
+ + ", "
+ + OString::number(tests[i].end)
+ + ")"
+ " != "
+ + OUStringToOString(result, RTL_TEXTENCODING_UTF8)
+ + " (" + OString::number(begin) + ", " + OString::number(end) +")");
+ msg = buf.makeStringAndClear(); // format: "<input>" -> <expected> (b, e) != <actual> (b, e)
+ }
+ CPPUNIT_ASSERT_MESSAGE(msg.getStr(), ok);
+ }
+}
+
void Test::testResolveIdnaHost() {
OUString input;
diff --git a/svl/source/misc/urihelper.cxx b/svl/source/misc/urihelper.cxx
index 6f121fba56d9..0043b7883a87 100644
--- a/svl/source/misc/urihelper.cxx
+++ b/svl/source/misc/urihelper.cxx
@@ -745,6 +745,65 @@ OUString URIHelper::FindFirstURLInText(OUString const & rText,
return OUString();
}
+OUString URIHelper::FindFirstDOIInText(OUString const & rText, // Returns a https://doi.org/ URL if the whole range [rBegin, rEnd) of rText is "doi:10.<4-9 digits>/<suffix>", else an empty string.
+ sal_Int32 & rBegin, // in/out: start of the range; set to rEnd when no DOI is recognized
+ sal_Int32 & rEnd, // in: end (exclusive) of the range; never assigned in this function
+ CharClass const & rCharClass) // supplies the isDigit / isAlphaNumeric classification
+{
+ if (rBegin > rEnd || rEnd > rText.getLength()) // invalid range: return empty and leave rBegin untouched
+ return OUString();
+
+ sal_Int32 start = 7; // index just past the 7-character "doi:10." prefix
+ sal_Int32 count = rEnd-rBegin; // length of the range being examined
+ OUString candidate(rText.subView(rBegin, count));
+ // Match with regex "doi:10\.\d{4,9}\/[-._;()\/:a-zA-Z0-9]+"
+ if (candidate.startsWith("doi:10."))
+ {
+ bool flag = true; // becomes false as soon as a character violates the pattern
+ sal_Int32 digit = 0; // >= 0: number of prefix digits read so far; -1: the '/' separator has been consumed
+ for (sal_Int32 i=start; i<count; i++)
+ {
+ sal_Unicode c = candidate[i];
+ // Match 4 to 9 digits before slash
+ if (digit >= 0)
+ {
+ if (digit>9) // a 10th digit was already read: the prefix is too long
+ {
+ flag = false;
+ break;
+ }
+
+ if ( rCharClass.isDigit(candidate,i) )
+ {
+ digit++;
+ }
+ else if (c=='/' && digit>=4 && i<count-1) // '/' ends the prefix only after at least 4 digits and when it is not the last character
+ {
+ digit=-1; // switch to suffix matching
+ }
+ else
+ {
+ flag = false;
+ break;
+ }
+ }
+ // Match [-._;()\/:a-zA-Z0-9] after slash; a '/' is rejected when it is the last character. NOTE(review): the check also accepts a backslash, which the regex does not list if "\/" is read as an escaped slash - confirm intent.
+ else if (!( rCharClass.isAlphaNumeric(candidate, i) || c == '.' || c == '-' || c=='_' ||
+ c==';' || c=='(' || c==')' || c=='\\' || (c=='/' && i<count-1) || c==':'))
+ {
+ flag = false;
+ break;
+ }
+ }
+ if (flag && digit==-1) // accept only if no character was rejected and the '/' separator was seen
+ {
+ return candidate.replaceFirst("doi:","https://doi.org/"); // success: rBegin and rEnd are left unchanged
+ }
+ }
+ rBegin = rEnd; // no DOI recognized: signal it by moving rBegin to rEnd
+ return OUString();
+}
+
OUString URIHelper::removePassword(OUString const & rURI,
INetURLObject::EncodeMechanism eEncodeMechanism,
INetURLObject::DecodeMechanism eDecodeMechanism,