summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEike Rathke <erack@redhat.com>2016-02-17 19:26:51 +0100
committerEike Rathke <erack@redhat.com>2016-02-17 21:17:08 +0100
commitf1a4663c819bf698f95a75b5a3319506c66f2778 (patch)
treeb080436b32403f181be98b3eb97eae1dfa24584b
parent649f74e21b6dc7117b542f490272897ac1d00566 (diff)
implement '*' '?' '~' wildcard search, tdf#72196
Change-Id: Id6122a13423c37e91c9f7561e4d8e5c658d5530e
-rw-r--r--i18npool/source/search/textsearch.cxx401
-rw-r--r--i18npool/source/search/textsearch.hxx14
2 files changed, 414 insertions, 1 deletions
diff --git a/i18npool/source/search/textsearch.cxx b/i18npool/source/search/textsearch.cxx
index 906fc7b9ad96..90bced1be8f8 100644
--- a/i18npool/source/search/textsearch.cxx
+++ b/i18npool/source/search/textsearch.cxx
@@ -36,6 +36,7 @@
#include <cppuhelper/factory.hxx>
#include <cppuhelper/supportsservice.hxx>
#include <cppuhelper/weak.hxx>
+#include <rtl/ustrbuf.hxx>
#include <sal/log.hxx>
#ifdef _MSC_VER
@@ -112,6 +113,8 @@ TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
, pJumpTable2( nullptr )
, pRegexMatcher( nullptr )
, pWLD( nullptr )
+ , mcWildcardEscapeChar('~') /* TODO: make this option available through API */
+ , mbWildcardAllowSubstring(true) /* TODO: make this option available through API */
{
SearchOptions2 aOpt;
aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
@@ -137,6 +140,8 @@ void TextSearch::setOptions2( const SearchOptions2& rOptions ) throw( RuntimeExc
delete pWLD, pWLD = nullptr;
delete pJumpTable, pJumpTable = nullptr;
delete pJumpTable2, pJumpTable2 = nullptr;
+ maWildcardReversePattern.clear();
+ maWildcardReversePattern2.clear();
// Create Transliteration class
if( isSimpleTrans( aSrchPara.transliterateFlags) )
@@ -235,8 +240,12 @@ void TextSearch::setOptions2( const SearchOptions2& rOptions ) throw( RuntimeExc
nLimit = pWLD->GetLimit();
break;
+ case SearchAlgorithms2::WILDCARD:
+ fnForward = &TextSearch::WildcardSrchFrwrd;
+ fnBackward = &TextSearch::WildcardSrchBkwrd;
+ break;
+
default:
- case SearchAlgorithms2::WILDCARD: /* FIXME: temporary */
SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
// fallthru
case SearchAlgorithms2::ABSOLUTE:
@@ -1118,6 +1127,396 @@ SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
}
+namespace {
+void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
+{
+ rRes.subRegExpressions = 1;
+ rRes.startOffset.realloc(1);
+ rRes.endOffset.realloc(1);
+ rRes.startOffset[0] = nStartOffset;
+ rRes.endOffset[0] = nEndOffset;
+}
+}
+
+SearchResult TextSearch::WildcardSrchFrwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
+ throw(RuntimeException)
+{
+ SearchResult aRes;
+ aRes.subRegExpressions = 0; // no match
+ sal_Int32 nStartOffset = nStartPos;
+ sal_Int32 nEndOffset = nEndPos;
+
+ const sal_Int32 nStringLen = searchStr.getLength();
+
+ // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
+ // string match with [0,0).
+ if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos || nStartPos > nStringLen ||
+ (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
+ return aRes;
+
+ const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
+ const sal_Int32 nPatternLen = rPattern.getLength();
+
+ // Handle special cases empty pattern and/or string outside of the loop to
+ // not add performance penalties there and simplify.
+ if (nStartPos == nEndPos)
+ {
+ sal_Int32 i = 0;
+ while (i < nPatternLen && rPattern[i] == '*')
+ ++i;
+ if (i == nPatternLen)
+ setWildcardMatch( aRes, nStartOffset, nEndOffset);
+ return aRes;
+ }
+
+ // Empty pattern does not match any non-empty string.
+ if (!nPatternLen)
+ return aRes;
+
+ bool bRewind = false;
+ sal_uInt32 cPattern = 0;
+ sal_Int32 nPattern = 0;
+ sal_Int32 nAfterFakePattern = nPattern;
+ if (mbWildcardAllowSubstring)
+ {
+ // Fake a leading '*' wildcard.
+ cPattern = '*';
+ bRewind = true;
+ // Assume a non-'*' pattern character follows. If it is a '*' instead
+ // that will be handled in the loop by setting nPat.
+ sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
+ if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
+ rPattern.iterateCodePoints( &nAfterFakePattern);
+ }
+
+ sal_Int32 nString = nStartPos, nPat = -1, nStr = -1;
+ sal_uInt32 cPatternAfterAsterisk = 0;
+ bool bEscaped = false, bEscapedAfterAsterisk = false;
+
+ // The loop code tries to avoid costly calls to iterateCodePoints() when
+ // possible.
+
+ do
+ {
+ if (bRewind)
+ {
+ // Reuse cPattern after '*', nPattern was correspondingly
+ // incremented to point behind cPattern.
+ bRewind = false;
+ }
+ else if (nPattern < nPatternLen)
+ {
+ // nPattern will be incremented by iterateCodePoints().
+ cPattern = rPattern.iterateCodePoints( &nPattern);
+ if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
+ {
+ bEscaped = true;
+ cPattern = rPattern.iterateCodePoints( &nPattern);
+ }
+ }
+ else
+ {
+ // A trailing '*' is handled below. If the pattern is consumed and
+ // substring match allowed we're good.
+ if (mbWildcardAllowSubstring)
+ setWildcardMatch( aRes, nStartOffset, nString);
+ return aRes;
+ }
+
+ if (cPattern == '*' && !bEscaped)
+ {
+ // '*' is one code unit, so not using iterateCodePoints() is ok.
+ while (nPattern < nPatternLen && rPattern[nPattern] == '*')
+ ++nPattern;
+
+ if (nPattern >= nPatternLen)
+ {
+ // Last pattern is '*', remaining string matches.
+ setWildcardMatch( aRes, nStartOffset, nEndOffset);
+ return aRes;
+ }
+
+ // cPattern will be the next non-'*' character, nPattern
+ // incremented.
+ cPattern = rPattern.iterateCodePoints( &nPattern);
+ if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
+ {
+ bEscaped = true;
+ cPattern = rPattern.iterateCodePoints( &nPattern);
+ }
+
+ cPatternAfterAsterisk = cPattern;
+ bEscapedAfterAsterisk = bEscaped;
+ nPat = nPattern; // Remember position of pattern behind '*', already incremented.
+ nStr = nString; // Remember the current string to be matched.
+ }
+
+ if (nString >= nEndPos)
+ // Whatever follows in pattern, string will not match.
+ return aRes;
+
+ // nString will be incremented by iterateCodePoints().
+ sal_uInt32 cString = searchStr.iterateCodePoints( &nString);
+
+ if ((cPattern != '?' || bEscaped) && cPattern != cString)
+ {
+ if (nPat == -1)
+ // Non-match already without any '*' pattern.
+ return aRes;
+
+ bRewind = true;
+ nPattern = nPat; // Rewind pattern to character behind '*', already incremented.
+ cPattern = cPatternAfterAsterisk;
+ bEscaped = bEscapedAfterAsterisk;
+ searchStr.iterateCodePoints( &nStr);
+ nString = nStr; // Restore incremented remembered string position.
+ if (nPat == nAfterFakePattern)
+ {
+ // Next start offset will be the next character.
+ nStartOffset = nString;
+ }
+ }
+ else
+ {
+ // An unescaped '?' pattern matched any character, or characters
+ // matched. Reset only escaped state.
+ bEscaped = false;
+ }
+ }
+ while (nString < nEndPos);
+
+ if (bRewind)
+ return aRes;
+
+ // Eat trailing '*' pattern that matches anything, including nothing.
+ // '*' is one code unit, so not using iterateCodePoints() is ok.
+ while (nPattern < nPatternLen && rPattern[nPattern] == '*')
+ ++nPattern;
+
+ if (nPattern == nPatternLen)
+ setWildcardMatch( aRes, nStartOffset, nEndOffset);
+ return aRes;
+}
+
+SearchResult TextSearch::WildcardSrchBkwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
+ throw(RuntimeException)
+{
+ SearchResult aRes;
+ aRes.subRegExpressions = 0; // no match
+
+ sal_Int32 nStartOffset = nStartPos;
+ sal_Int32 nEndOffset = nEndPos;
+
+ const sal_Int32 nStringLen = searchStr.getLength();
+
+ // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
+ // string match with (0,0].
+ if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos || nEndPos > nStringLen ||
+ (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
+ return aRes;
+
+ const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
+ sal_Int32 nPatternLen = rPattern.getLength();
+
+ // Handle special cases empty pattern and/or string outside of the loop to
+ // not add performance penalties there and simplify.
+ if (nStartPos == nEndPos)
+ {
+ sal_Int32 i = 0;
+ while (i < nPatternLen && rPattern[i] == '*')
+ ++i;
+ if (i == nPatternLen)
+ setWildcardMatch( aRes, nStartOffset, nEndOffset);
+ return aRes;
+ }
+
+ // Empty pattern does not match any non-empty string.
+ if (!nPatternLen)
+ return aRes;
+
+ // Reverse escaped patterns to ease the handling of escapes, keeping escape
+ // and following character as one sequence in backward direction.
+ if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
+ (!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty()))
+ {
+ OUStringBuffer aPatternBuf( rPattern);
+ sal_Int32 nIndex = 0;
+ while (nIndex < nPatternLen)
+ {
+ const sal_Int32 nOld = nIndex;
+ const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
+ if (cu == mcWildcardEscapeChar)
+ {
+ if (nIndex < nPatternLen)
+ {
+ if (nIndex - nOld == 1)
+ {
+ // Simply move code units, we already memorized the one
+ // in 'cu'.
+ const sal_Int32 nOld2 = nIndex;
+ rPattern.iterateCodePoints( &nIndex);
+ for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
+ aPatternBuf[nOld+i] = rPattern[nOld2+i];
+ aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
+ }
+ else
+ {
+ // Copy the escape character code units first in the
+ // unlikely case that it would not be of BMP.
+ assert(nIndex - nOld == 2); // it's UTF-16, so..
+ sal_Unicode buf[2];
+ buf[0] = rPattern[nOld];
+ buf[1] = rPattern[nOld+1];
+ const sal_Int32 nOld2 = nIndex;
+ rPattern.iterateCodePoints( &nIndex);
+ for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
+ aPatternBuf[nOld+i] = rPattern[nOld2+i];
+ aPatternBuf[nIndex-2] = buf[0];
+ aPatternBuf[nIndex-1] = buf[1];
+ }
+ }
+ else
+ {
+ // Trailing escape would become leading escape, do what?
+ // Eliminate.
+ aPatternBuf.remove( nOld, nIndex - nOld);
+ }
+ }
+ }
+ if (bUsePrimarySrchStr)
+ maWildcardReversePattern = aPatternBuf.makeStringAndClear();
+ else
+ maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
+ }
+ const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
+ nPatternLen = rReversePattern.getLength();
+
+ bool bRewind = false;
+ sal_uInt32 cPattern = 0;
+ sal_Int32 nPattern = nPatternLen;
+ sal_Int32 nAfterFakePattern = nPattern;
+ if (mbWildcardAllowSubstring)
+ {
+ // Fake a trailing '*' wildcard.
+ cPattern = '*';
+ bRewind = true;
+ // Assume a non-'*' pattern character follows. If it is a '*' instead
+ // that will be handled in the loop by setting nPat.
+ sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
+ if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
+ rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
+ }
+
+ sal_Int32 nString = nStartPos, nPat = -1, nStr = -1;
+ sal_uInt32 cPatternAfterAsterisk = 0;
+ bool bEscaped = false, bEscapedAfterAsterisk = false;
+
+ // The loop code tries to avoid costly calls to iterateCodePoints() when
+ // possible.
+
+ do
+ {
+ if (bRewind)
+ {
+ // Reuse cPattern after '*', nPattern was correspondingly
+ // decremented to point before cPattern.
+ bRewind = false;
+ }
+ else if (nPattern > 0)
+ {
+ // nPattern will be decremented by iterateCodePoints().
+ cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
+ if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
+ {
+ bEscaped = true;
+ cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
+ }
+ }
+ else
+ {
+ // A leading '*' is handled below. If the pattern is consumed and
+ // substring match allowed we're good.
+ if (mbWildcardAllowSubstring)
+ setWildcardMatch( aRes, nStartOffset, nString);
+ return aRes;
+ }
+
+ if (cPattern == '*' && !bEscaped)
+ {
+ // '*' is one code unit, so not using iterateCodePoints() is ok.
+ while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
+ --nPattern;
+
+ if (nPattern <= 0)
+ {
+ // First pattern is '*', remaining string matches.
+ setWildcardMatch( aRes, nStartOffset, nEndOffset);
+ return aRes;
+ }
+
+ // cPattern will be the previous non-'*' character, nPattern
+ // decremented.
+ cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
+ if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
+ {
+ bEscaped = true;
+ cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
+ }
+
+ cPatternAfterAsterisk = cPattern;
+ bEscapedAfterAsterisk = bEscaped;
+ nPat = nPattern; // Remember position of pattern before '*', already decremented.
+ nStr = nString; // Remember the current string to be matched.
+ }
+
+ if (nString <= nEndPos)
+ // Whatever leads in pattern, string will not match.
+ return aRes;
+
+ // nString will be decremented by iterateCodePoints().
+ sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);
+
+ if ((cPattern != '?' || bEscaped) && cPattern != cString)
+ {
+ if (nPat == -1)
+ // Non-match already without any '*' pattern.
+ return aRes;
+
+ bRewind = true;
+ nPattern = nPat; // Rewind pattern to character before '*', already decremented.
+ cPattern = cPatternAfterAsterisk;
+ bEscaped = bEscapedAfterAsterisk;
+ searchStr.iterateCodePoints( &nStr, -1);
+ nString = nStr; // Restore decremented remembered string position.
+ if (nPat == nAfterFakePattern)
+ {
+ // Next start offset will be this character (exclusive).
+ nStartOffset = nString;
+ }
+ }
+ else
+ {
+ // An unescaped '?' pattern matched any character, or characters
+ // matched. Reset only escaped state.
+ bEscaped = false;
+ }
+ }
+ while (nString > nEndPos);
+
+ if (bRewind)
+ return aRes;
+
+ // Eat leading '*' pattern that matches anything, including nothing.
+ // '*' is one code unit, so not using iterateCodePoints() is ok.
+ while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
+ --nPattern;
+
+ if (nPattern == 0)
+ setWildcardMatch( aRes, nStartOffset, nEndOffset);
+ return aRes;
+}
+
+
static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n";
static uno::Sequence< OUString > getServiceName_Static()
diff --git a/i18npool/source/search/textsearch.hxx b/i18npool/source/search/textsearch.hxx
index ae189c29df63..49c1dfb24fef 100644
--- a/i18npool/source/search/textsearch.hxx
+++ b/i18npool/source/search/textsearch.hxx
@@ -106,6 +106,20 @@ class TextSearch: public cppu::WeakImplHelper
sal_Int32 startPos, sal_Int32 endPos )
throw(css::uno::RuntimeException);
+ // Members and methods for the wildcard search
+ OUString maWildcardReversePattern;
+ OUString maWildcardReversePattern2;
+ sal_uInt32 mcWildcardEscapeChar;
+ bool mbWildcardAllowSubstring;
+ css::util::SearchResult SAL_CALL
+ WildcardSrchFrwrd( const OUString& searchStr,
+ sal_Int32 startPos, sal_Int32 endPos )
+ throw(css::uno::RuntimeException);
+ css::util::SearchResult SAL_CALL
+ WildcardSrchBkwrd( const OUString& searchStr,
+ sal_Int32 startPos, sal_Int32 endPos )
+ throw(css::uno::RuntimeException);
+
bool IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const;
bool checkCTLStart, checkCTLEnd;