diff options
author | Herbert Dürr <hdu@apache.org> | 2012-05-09 12:57:20 +0000 |
---|---|---|
committer | Fridrich Strba <fridrich@documentfoundation.org> | 2013-03-12 13:18:50 +0000 |
commit | d4b3e398f95225951f86d3e8f7474f7a0fb86b92 (patch) | |
tree | c46e18adb6a6bac133b6cbf60bdbccd91b346660 /i18npool | |
parent | 36a8b15679a3003091b8b9d819a9b290364bdf5f (diff) |
various regex fixes squashed into one
i#118925# enhance textsearch's match-group references
to work for look-ahead/look-behind
(cherry picked from commit 3b83c404c56e5db5bab29ffee41f02822410d625)
Conflicts:
sw/source/core/crsr/findtxt.cxx
(cherry picked from commit 9a93475d6eba53b2e1fba1585dbd11c94ea4b4a3)
Conflicts:
sw/source/core/crsr/findtxt.cxx
i#120598 better emulation of regexp word-start and word-end operators
The emulation of the word-start and word-end operators provided by
the previous regexp engine can be approximated much better
by using the ICU regexp engine's powerful look-around feature.
Patch-by: Herbert Duerr
Found-by: ldgolds33@yahoo.com
(cherry picked from commit ec7ef30693f10315ce80a8f5d7325a0e40855e66)
(cherry picked from commit 8c26876fea085a1bc847abba63dffa97a9499c1d)
i#121482# fix attributed text search for regular expression patterns
(cherry picked from commit e7fc662799e7e936753e24db8d6d3849c12b3ff4)
(cherry picked from commit e6288a5d889da7db5bf23174f85c29ccfcaa44d5)
i#121482# fix backwards regexp search for matches overlapping search start
(cherry picked from commit 854f4ad6c57be62bd922df08f603d8bfb7b918a5)
Additionally fixed the unit test: searching backward shall not produce a
different result from searching forward. (erAck)
(cherry picked from commit b514f0ce86e85d9be269ddf2e797befbbf3423f1)
i#121633# fix search for upper-case character classes
in ignore-case regular expressions
(cherry picked from commit b7ee1803453d3a766ce3a833892e1c208aacf8ff)
(cherry picked from commit 531538892795bec909bb8baff6bbf9e19baf809e)
i#121633# fix ignore-case problem caused by i18nsearch API mess
(cherry picked from commit 7644ec176049937b588fe171a553c9a07b375792)
(cherry picked from commit f932a3f1195290f9aa37b593190bd4c6ac5fe2f6)
Change-Id: I44d6216b12f17d0560c4e8cf355937797ddeee2a
Reviewed-on: https://gerrit.libreoffice.org/2678
Reviewed-by: Fridrich Strba <fridrich@documentfoundation.org>
Tested-by: Fridrich Strba <fridrich@documentfoundation.org>
Diffstat (limited to 'i18npool')
-rw-r--r-- | i18npool/qa/cppunit/test_textsearch.cxx | 2 | ||||
-rw-r--r-- | i18npool/source/search/textsearch.cxx | 37 |
2 files changed, 27 insertions, 12 deletions
diff --git a/i18npool/qa/cppunit/test_textsearch.cxx b/i18npool/qa/cppunit/test_textsearch.cxx index c26550b950ca..d7a6c3349146 100644 --- a/i18npool/qa/cppunit/test_textsearch.cxx +++ b/i18npool/qa/cppunit/test_textsearch.cxx @@ -101,7 +101,7 @@ void TestTextSearch::testSearches() sal_Int32 startPos = 2, endPos = 20 ; OUString searchStr( "(ab)*a(c|d)+" ); sal_Int32 fStartRes = 10, fEndRes = 18 ; - sal_Int32 bStartRes = 18, bEndRes = 14 ; + sal_Int32 bStartRes = 18, bEndRes = 10 ; // set options util::SearchOptions aOptions; diff --git a/i18npool/source/search/textsearch.cxx b/i18npool/source/search/textsearch.cxx index 314dd5b0e0b5..dceb4d759915 100644 --- a/i18npool/source/search/textsearch.cxx +++ b/i18npool/source/search/textsearch.cxx @@ -60,7 +60,7 @@ static sal_Int32 COMPLEX_TRANS_MASK_TMP = TransliterationModules_ignoreKiKuFollowedBySa_ja_JP | TransliterationModules_ignoreProlongedSoundMark_ja_JP; static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH; -static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK; +static const sal_Int32 SIMPLE_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE); // Above 2 transliteration is simple but need to take effect in // complex transliteration @@ -675,21 +675,30 @@ void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOp // REG_NOSUB is not used anywhere => not implemented // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute // LEV_RELAXED is only used for SearchAlgorithm==Approximate - // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it??? 
- if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0) + // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO + // probably because the transliteration flag IGNORE_CASE handles it as well. + if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0 + || (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0) nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; UErrorCode nIcuErr = U_ZERO_ERROR; // assumption: transliteration didn't mangle regexp control chars IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength()); #ifndef DISABLE_WORDBOUND_EMULATION // for conveniance specific syntax elements of the old regex engine are emulated - // by using regular word boundary matching \b to replace \< and \> - static const IcuUniString aChevronPattern( "\\\\<|\\\\>", -1, IcuUniString::kInvariant); - static const IcuUniString aChevronReplace( "\\\\b", -1, IcuUniString::kInvariant); - static RegexMatcher aChevronMatcher( aChevronPattern, 0, nIcuErr); - aChevronMatcher.reset( aIcuSearchPatStr); - aIcuSearchPatStr = aChevronMatcher.replaceAll( aChevronReplace, nIcuErr); - aChevronMatcher.reset(); + // - by replacing \< with "word-break followed by a look-ahead word-char" + static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant); + static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant); + static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr); + aChevronMatcherB.reset( aIcuSearchPatStr); + aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr); + aChevronMatcherB.reset(); + // - by replacing \> with "look-behind word-char followed by a word-break" + static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant); + static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant); + static RegexMatcher aChevronMatcherE( 
aChevronPatternE, 0, nIcuErr); + aChevronMatcherE.reset( aIcuSearchPatStr); + aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr); + aChevronMatcherE.reset(); #endif pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr); if( nIcuErr) @@ -769,9 +778,15 @@ SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, // find the last match int nLastPos = 0; + int nFoundEnd = 0; do { nLastPos = pRegexMatcher->start( nIcuErr); - } while( pRegexMatcher->find( nLastPos + 1, nIcuErr)); + nFoundEnd = pRegexMatcher->end( nIcuErr); + if( nFoundEnd >= startPos) + break; + if( nFoundEnd == nLastPos) + ++nFoundEnd; + } while( pRegexMatcher->find( nFoundEnd, nIcuErr)); // find last match again to get its details pRegexMatcher->find( nLastPos, nIcuErr); |