From f1a4663c819bf698f95a75b5a3319506c66f2778 Mon Sep 17 00:00:00 2001 From: Eike Rathke Date: Wed, 17 Feb 2016 19:26:51 +0100 Subject: implement '*' '?' '~' wildcard search, tdf#72196 Change-Id: Id6122a13423c37e91c9f7561e4d8e5c658d5530e --- i18npool/source/search/textsearch.cxx | 401 +++++++++++++++++++++++++++++++++- i18npool/source/search/textsearch.hxx | 14 ++ 2 files changed, 414 insertions(+), 1 deletion(-) (limited to 'i18npool') diff --git a/i18npool/source/search/textsearch.cxx b/i18npool/source/search/textsearch.cxx index 906fc7b9ad96..90bced1be8f8 100644 --- a/i18npool/source/search/textsearch.cxx +++ b/i18npool/source/search/textsearch.cxx @@ -36,6 +36,7 @@ #include #include #include +#include #include #ifdef _MSC_VER @@ -112,6 +113,8 @@ TextSearch::TextSearch(const Reference < XComponentContext > & rxContext) , pJumpTable2( nullptr ) , pRegexMatcher( nullptr ) , pWLD( nullptr ) + , mcWildcardEscapeChar('~') /* TODO: make this option available through API */ + , mbWildcardAllowSubstring(true) /* TODO: make this option available through API */ { SearchOptions2 aOpt; aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE; @@ -137,6 +140,8 @@ void TextSearch::setOptions2( const SearchOptions2& rOptions ) throw( RuntimeExc delete pWLD, pWLD = nullptr; delete pJumpTable, pJumpTable = nullptr; delete pJumpTable2, pJumpTable2 = nullptr; + maWildcardReversePattern.clear(); + maWildcardReversePattern2.clear(); // Create Transliteration class if( isSimpleTrans( aSrchPara.transliterateFlags) ) @@ -235,8 +240,12 @@ void TextSearch::setOptions2( const SearchOptions2& rOptions ) throw( RuntimeExc nLimit = pWLD->GetLimit(); break; + case SearchAlgorithms2::WILDCARD: + fnForward = &TextSearch::WildcardSrchFrwrd; + fnBackward = &TextSearch::WildcardSrchBkwrd; + break; + default: - case SearchAlgorithms2::WILDCARD: /* FIXME: temporary */ SAL_WARN("i18npool","TextSearch::setOptions2 - default what?"); // fallthru case SearchAlgorithms2::ABSOLUTE: @@ -1118,6 +1127,396 @@ SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr, } +namespace { +void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset ) +{ + rRes.subRegExpressions = 1; + rRes.startOffset.realloc(1); + rRes.endOffset.realloc(1); + rRes.startOffset[0] = nStartOffset; + rRes.endOffset[0] = nEndOffset; +} +} + +SearchResult TextSearch::WildcardSrchFrwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos ) + throw(RuntimeException) +{ + SearchResult aRes; + aRes.subRegExpressions = 0; // no match + sal_Int32 nStartOffset = nStartPos; + sal_Int32 nEndOffset = nEndPos; + + const sal_Int32 nStringLen = searchStr.getLength(); + + // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty + // string match with [0,0). + if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos || nStartPos > nStringLen || + (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos))) + return aRes; + + const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2); + const sal_Int32 nPatternLen = rPattern.getLength(); + + // Handle special cases empty pattern and/or string outside of the loop to + // not add performance penalties there and simplify. + if (nStartPos == nEndPos) + { + sal_Int32 i = 0; + while (i < nPatternLen && rPattern[i] == '*') + ++i; + if (i == nPatternLen) + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; + } + + // Empty pattern does not match any non-empty string. + if (!nPatternLen) + return aRes; + + bool bRewind = false; + sal_uInt32 cPattern = 0; + sal_Int32 nPattern = 0; + sal_Int32 nAfterFakePattern = nPattern; + if (mbWildcardAllowSubstring) + { + // Fake a leading '*' wildcard. + cPattern = '*'; + bRewind = true; + // Assume a non-'*' pattern character follows. If it is a '*' instead + // that will be handled in the loop by setting nPat. + sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern); + if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen) + rPattern.iterateCodePoints( &nAfterFakePattern); + } + + sal_Int32 nString = nStartPos, nPat = -1, nStr = -1; + sal_uInt32 cPatternAfterAsterisk = 0; + bool bEscaped = false, bEscapedAfterAsterisk = false; + + // The loop code tries to avoid costly calls to iterateCodePoints() when + // possible. + + do + { + if (bRewind) + { + // Reuse cPattern after '*', nPattern was correspondingly + // incremented to point behind cPattern. + bRewind = false; + } + else if (nPattern < nPatternLen) + { + // nPattern will be incremented by iterateCodePoints(). + cPattern = rPattern.iterateCodePoints( &nPattern); + if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen) + { + bEscaped = true; + cPattern = rPattern.iterateCodePoints( &nPattern); + } + } + else + { + // A trailing '*' is handled below. If the pattern is consumed and + // substring match allowed we're good. + if (mbWildcardAllowSubstring) + setWildcardMatch( aRes, nStartOffset, nString); + return aRes; + } + + if (cPattern == '*' && !bEscaped) + { + // '*' is one code unit, so not using iterateCodePoints() is ok. + while (nPattern < nPatternLen && rPattern[nPattern] == '*') + ++nPattern; + + if (nPattern >= nPatternLen) + { + // Last pattern is '*', remaining string matches. + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; + } + + // cPattern will be the next non-'*' character, nPattern + // incremented. + cPattern = rPattern.iterateCodePoints( &nPattern); + if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen) + { + bEscaped = true; + cPattern = rPattern.iterateCodePoints( &nPattern); + } + + cPatternAfterAsterisk = cPattern; + bEscapedAfterAsterisk = bEscaped; + nPat = nPattern; // Remember position of pattern behind '*', already incremented. + nStr = nString; // Remember the current string to be matched. + } + + if (nString >= nEndPos) + // Whatever follows in pattern, string will not match. + return aRes; + + // nString will be incremented by iterateCodePoints(). + sal_uInt32 cString = searchStr.iterateCodePoints( &nString); + + if ((cPattern != '?' || bEscaped) && cPattern != cString) + { + if (nPat == -1) + // Non-match already without any '*' pattern. + return aRes; + + bRewind = true; + nPattern = nPat; // Rewind pattern to character behind '*', already incremented. + cPattern = cPatternAfterAsterisk; + bEscaped = bEscapedAfterAsterisk; + searchStr.iterateCodePoints( &nStr); + nString = nStr; // Restore incremented remembered string position. + if (nPat == nAfterFakePattern) + { + // Next start offset will be the next character. + nStartOffset = nString; + } + } + else + { + // An unescaped '?' pattern matched any character, or characters + // matched. Reset only escaped state. + bEscaped = false; + } + } + while (nString < nEndPos); + + if (bRewind) + return aRes; + + // Eat trailing '*' pattern that matches anything, including nothing. + // '*' is one code unit, so not using iterateCodePoints() is ok. + while (nPattern < nPatternLen && rPattern[nPattern] == '*') + ++nPattern; + + if (nPattern == nPatternLen) + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; +} + +SearchResult TextSearch::WildcardSrchBkwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos ) + throw(RuntimeException) +{ + SearchResult aRes; + aRes.subRegExpressions = 0; // no match + + sal_Int32 nStartOffset = nStartPos; + sal_Int32 nEndOffset = nEndPos; + + const sal_Int32 nStringLen = searchStr.getLength(); + + // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty + // string match with (0,0]. + if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos || nEndPos > nStringLen || + (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos))) + return aRes; + + const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2); + sal_Int32 nPatternLen = rPattern.getLength(); + + // Handle special cases empty pattern and/or string outside of the loop to + // not add performance penalties there and simplify. + if (nStartPos == nEndPos) + { + sal_Int32 i = 0; + while (i < nPatternLen && rPattern[i] == '*') + ++i; + if (i == nPatternLen) + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; + } + + // Empty pattern does not match any non-empty string. + if (!nPatternLen) + return aRes; + + // Reverse escaped patterns to ease the handling of escapes, keeping escape + // and following character as one sequence in backward direction. + if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) || + (!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty())) + { + OUStringBuffer aPatternBuf( rPattern); + sal_Int32 nIndex = 0; + while (nIndex < nPatternLen) + { + const sal_Int32 nOld = nIndex; + const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex); + if (cu == mcWildcardEscapeChar) + { + if (nIndex < nPatternLen) + { + if (nIndex - nOld == 1) + { + // Simply move code units, we already memorized the one + // in 'cu'. + const sal_Int32 nOld2 = nIndex; + rPattern.iterateCodePoints( &nIndex); + for (sal_Int32 i=0; i < nIndex - nOld2; ++i) + aPatternBuf[nOld+i] = rPattern[nOld2+i]; + aPatternBuf[nIndex-1] = static_cast(cu); + } + else + { + // Copy the escape character code units first in the + // unlikely case that it would not be of BMP. + assert(nIndex - nOld == 2); // it's UTF-16, so.. + sal_Unicode buf[2]; + buf[0] = rPattern[nOld]; + buf[1] = rPattern[nOld+1]; + const sal_Int32 nOld2 = nIndex; + rPattern.iterateCodePoints( &nIndex); + for (sal_Int32 i=0; i < nIndex - nOld2; ++i) + aPatternBuf[nOld+i] = rPattern[nOld2+i]; + aPatternBuf[nIndex-2] = buf[0]; + aPatternBuf[nIndex-1] = buf[1]; + } + } + else + { + // Trailing escape would become leading escape, do what? + // Eliminate. + aPatternBuf.remove( nOld, nIndex - nOld); + } + } + } + if (bUsePrimarySrchStr) + maWildcardReversePattern = aPatternBuf.makeStringAndClear(); + else + maWildcardReversePattern2 = aPatternBuf.makeStringAndClear(); + } + const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2); + nPatternLen = rReversePattern.getLength(); + + bool bRewind = false; + sal_uInt32 cPattern = 0; + sal_Int32 nPattern = nPatternLen; + sal_Int32 nAfterFakePattern = nPattern; + if (mbWildcardAllowSubstring) + { + // Fake a trailing '*' wildcard. + cPattern = '*'; + bRewind = true; + // Assume a non-'*' pattern character follows. If it is a '*' instead + // that will be handled in the loop by setting nPat. + sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1); + if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0) + rReversePattern.iterateCodePoints( &nAfterFakePattern, -1); + } + + sal_Int32 nString = nStartPos, nPat = -1, nStr = -1; + sal_uInt32 cPatternAfterAsterisk = 0; + bool bEscaped = false, bEscapedAfterAsterisk = false; + + // The loop code tries to avoid costly calls to iterateCodePoints() when + // possible. + + do + { + if (bRewind) + { + // Reuse cPattern after '*', nPattern was correspondingly + // decremented to point before cPattern. + bRewind = false; + } + else if (nPattern > 0) + { + // nPattern will be decremented by iterateCodePoints(). + cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); + if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0) + { + bEscaped = true; + cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); + } + } + else + { + // A leading '*' is handled below. If the pattern is consumed and + // substring match allowed we're good. + if (mbWildcardAllowSubstring) + setWildcardMatch( aRes, nStartOffset, nString); + return aRes; + } + + if (cPattern == '*' && !bEscaped) + { + // '*' is one code unit, so not using iterateCodePoints() is ok. + while (nPattern > 0 && rReversePattern[nPattern-1] == '*') + --nPattern; + + if (nPattern <= 0) + { + // First pattern is '*', remaining string matches. + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; + } + + // cPattern will be the previous non-'*' character, nPattern + // decremented. + cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); + if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0) + { + bEscaped = true; + cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); + } + + cPatternAfterAsterisk = cPattern; + bEscapedAfterAsterisk = bEscaped; + nPat = nPattern; // Remember position of pattern before '*', already decremented. + nStr = nString; // Remember the current string to be matched. + } + + if (nString <= nEndPos) + // Whatever leads in pattern, string will not match. + return aRes; + + // nString will be decremented by iterateCodePoints(). + sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1); + + if ((cPattern != '?' || bEscaped) && cPattern != cString) + { + if (nPat == -1) + // Non-match already without any '*' pattern. + return aRes; + + bRewind = true; + nPattern = nPat; // Rewind pattern to character before '*', already decremented. + cPattern = cPatternAfterAsterisk; + bEscaped = bEscapedAfterAsterisk; + searchStr.iterateCodePoints( &nStr, -1); + nString = nStr; // Restore decremented remembered string position. + if (nPat == nAfterFakePattern) + { + // Next start offset will be this character (exclusive). + nStartOffset = nString; + } + } + else + { + // An unescaped '?' pattern matched any character, or characters + // matched. Reset only escaped state. + bEscaped = false; + } + } + while (nString > nEndPos); + + if (bRewind) + return aRes; + + // Eat leading '*' pattern that matches anything, including nothing. + // '*' is one code unit, so not using iterateCodePoints() is ok. + while (nPattern > 0 && rReversePattern[nPattern-1] == '*') + --nPattern; + + if (nPattern == 0) + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; +} + + static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n"; static uno::Sequence< OUString > getServiceName_Static() diff --git a/i18npool/source/search/textsearch.hxx b/i18npool/source/search/textsearch.hxx index ae189c29df63..49c1dfb24fef 100644 --- a/i18npool/source/search/textsearch.hxx +++ b/i18npool/source/search/textsearch.hxx @@ -106,6 +106,20 @@ class TextSearch: public cppu::WeakImplHelper sal_Int32 startPos, sal_Int32 endPos ) throw(css::uno::RuntimeException); + // Members and methods for the wildcard search + OUString maWildcardReversePattern; + OUString maWildcardReversePattern2; + sal_uInt32 mcWildcardEscapeChar; + bool mbWildcardAllowSubstring; + css::util::SearchResult SAL_CALL + WildcardSrchFrwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ) + throw(css::uno::RuntimeException); + css::util::SearchResult SAL_CALL + WildcardSrchBkwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ) + throw(css::uno::RuntimeException); + bool IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const; bool checkCTLStart, checkCTLEnd; -- cgit