summaryrefslogtreecommitdiff
path: root/i18npool
diff options
context:
space:
mode:
author Herbert Dürr <hdu@apache.org> 2012-05-09 12:57:20 +0000
committer Fridrich Strba <fridrich@documentfoundation.org> 2013-03-12 13:18:50 +0000
commit d4b3e398f95225951f86d3e8f7474f7a0fb86b92 (patch)
tree c46e18adb6a6bac133b6cbf60bdbccd91b346660 /i18npool
parent 36a8b15679a3003091b8b9d819a9b290364bdf5f (diff)
various regex fixes squashed into one
i#118925# enhance textsearch's match-group references to work for look-ahead/look-behind (cherry picked from commit 3b83c404c56e5db5bab29ffee41f02822410d625) Conflicts: sw/source/core/crsr/findtxt.cxx (cherry picked from commit 9a93475d6eba53b2e1fba1585dbd11c94ea4b4a3) Conflicts: sw/source/core/crsr/findtxt.cxx i#120598 better emulation of regexp word-start and word-end operators The emulation of the word-start and word-end operators provided by the previous regexp engine can be approximated much better by using the ICU regexp engine's powerful look-around feature. Patch-by: Herbert Duerr Found-by: ldgolds33@yahoo.com (cherry picked from commit ec7ef30693f10315ce80a8f5d7325a0e40855e66) (cherry picked from commit 8c26876fea085a1bc847abba63dffa97a9499c1d) i#121482# fix attributed text search for regular expression patterns (cherry picked from commit e7fc662799e7e936753e24db8d6d3849c12b3ff4) (cherry picked from commit e6288a5d889da7db5bf23174f85c29ccfcaa44d5) i#121482# fix backwards regexp search for matches overlapping search start (cherry picked from commit 854f4ad6c57be62bd922df08f603d8bfb7b918a5) Additionally fixed unit test, searching backward shall not produce a different result from searching forward. (erAck) (cherry picked from commit b514f0ce86e85d9be269ddf2e797befbbf3423f1) i#121633# fix search for upper-case character classes in ignore-case regular expressions (cherry picked from commit b7ee1803453d3a766ce3a833892e1c208aacf8ff) (cherry picked from commit 531538892795bec909bb8baff6bbf9e19baf809e) i#121633# fix ignore-case problem caused by i18nsearch API mess (cherry picked from commit 7644ec176049937b588fe171a553c9a07b375792) (cherry picked from commit f932a3f1195290f9aa37b593190bd4c6ac5fe2f6) Change-Id: I44d6216b12f17d0560c4e8cf355937797ddeee2a Reviewed-on: https://gerrit.libreoffice.org/2678 Reviewed-by: Fridrich Strba <fridrich@documentfoundation.org> Tested-by: Fridrich Strba <fridrich@documentfoundation.org>
Diffstat (limited to 'i18npool')
-rw-r--r-- i18npool/qa/cppunit/test_textsearch.cxx 2
-rw-r--r-- i18npool/source/search/textsearch.cxx 37
2 files changed, 27 insertions, 12 deletions
diff --git a/i18npool/qa/cppunit/test_textsearch.cxx b/i18npool/qa/cppunit/test_textsearch.cxx
index c26550b950ca..d7a6c3349146 100644
--- a/i18npool/qa/cppunit/test_textsearch.cxx
+++ b/i18npool/qa/cppunit/test_textsearch.cxx
@@ -101,7 +101,7 @@ void TestTextSearch::testSearches()
sal_Int32 startPos = 2, endPos = 20 ;
OUString searchStr( "(ab)*a(c|d)+" );
sal_Int32 fStartRes = 10, fEndRes = 18 ;
- sal_Int32 bStartRes = 18, bEndRes = 14 ;
+ sal_Int32 bStartRes = 18, bEndRes = 10 ;
// set options
util::SearchOptions aOptions;
diff --git a/i18npool/source/search/textsearch.cxx b/i18npool/source/search/textsearch.cxx
index 314dd5b0e0b5..dceb4d759915 100644
--- a/i18npool/source/search/textsearch.cxx
+++ b/i18npool/source/search/textsearch.cxx
@@ -60,7 +60,7 @@ static sal_Int32 COMPLEX_TRANS_MASK_TMP =
TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
TransliterationModules_ignoreProlongedSoundMark_ja_JP;
static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH;
-static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK;
+static const sal_Int32 SIMPLE_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE);
// Above 2 transliteration is simple but need to take effect in
// complex transliteration
@@ -675,21 +675,30 @@ void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOp
// REG_NOSUB is not used anywhere => not implemented
// NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
// LEV_RELAXED is only used for SearchAlgorithm==Approximate
- // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it???
- if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0)
+ // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
+ // probably because the transliteration flag IGNORE_CASE handles it as well.
+ if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0
+ || (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0)
nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
UErrorCode nIcuErr = U_ZERO_ERROR;
// assumption: transliteration didn't mangle regexp control chars
IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength());
#ifndef DISABLE_WORDBOUND_EMULATION
// for convenience specific syntax elements of the old regex engine are emulated
- // by using regular word boundary matching \b to replace \< and \>
- static const IcuUniString aChevronPattern( "\\\\<|\\\\>", -1, IcuUniString::kInvariant);
- static const IcuUniString aChevronReplace( "\\\\b", -1, IcuUniString::kInvariant);
- static RegexMatcher aChevronMatcher( aChevronPattern, 0, nIcuErr);
- aChevronMatcher.reset( aIcuSearchPatStr);
- aIcuSearchPatStr = aChevronMatcher.replaceAll( aChevronReplace, nIcuErr);
- aChevronMatcher.reset();
+ // - by replacing \< with "word-break followed by a look-ahead word-char"
+ static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
+ static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
+ static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
+ aChevronMatcherB.reset( aIcuSearchPatStr);
+ aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
+ aChevronMatcherB.reset();
+ // - by replacing \> with "look-behind word-char followed by a word-break"
+ static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
+ static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
+ static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
+ aChevronMatcherE.reset( aIcuSearchPatStr);
+ aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
+ aChevronMatcherE.reset();
#endif
pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
if( nIcuErr)
@@ -769,9 +778,15 @@ SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
// find the last match
int nLastPos = 0;
+ int nFoundEnd = 0;
do {
nLastPos = pRegexMatcher->start( nIcuErr);
- } while( pRegexMatcher->find( nLastPos + 1, nIcuErr));
+ nFoundEnd = pRegexMatcher->end( nIcuErr);
+ if( nFoundEnd >= startPos)
+ break;
+ if( nFoundEnd == nLastPos)
+ ++nFoundEnd;
+ } while( pRegexMatcher->find( nFoundEnd, nIcuErr));
// find last match again to get its details
pRegexMatcher->find( nLastPos, nIcuErr);