try to search efficiently with a query with many items (tdf#133867)

Autofilter with large documents can create queries that have thousands of items. Searching all of those for every cell using the generic algorithm can be quite slow. First try an optimized search for this case that skips most of the complications and just tries to find in the query items an exact match for the cell. This significantly speeds up tdf#133867 and the attachment from comment #2 in tdf#136838.

Change-Id: I2bba18da6a101c76398d8c42c4306c53682899c1
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/125746
Tested-by: Jenkins
Reviewed-by: Luboš Luňák <l.lunak@collabora.com>
author: Luboš Luňák <l.lunak@collabora.com> 2021-11-23 20:48:02 +0100
committer: Luboš Luňák <l.lunak@collabora.com> 2021-11-25 14:26:42 +0100
commit: 1d0cdc461c43f0ce0eda4961311a972edf9e78e2 (patch)
tree: 44612f305f6544a71bf6d3bba2ce1ef526b2f18b /sc/source
parent: 02f3157f75b654ef7648efdc3004b3f326d5af40 (diff)
1 files changed, 99 insertions, 22 deletions
diff --git a/sc/source/core/data/table3.cxx b/sc/source/core/data/table3.cxx
index 49c075121b61..6012deb77540 100644
--- a/sc/source/core/data/table3.cxx
+++ b/sc/source/core/data/table3.cxx
@@ -2367,25 +2367,6 @@ class QueryEvaluator
         return false;
     }
 
-    bool isRealWildOrRegExp(const ScQueryEntry& rEntry) const
-    {
-        if (mrParam.eSearchType == utl::SearchParam::SearchType::Normal)
-            return false;
-
-        return isTextMatchOp(rEntry);
-    }
-
-    bool isTestWildOrRegExp(const ScQueryEntry& rEntry) const
-    {
-        if (!mpTestEqualCondition)
-            return false;
-
-        if (mrParam.eSearchType == utl::SearchParam::SearchType::Normal)
-            return false;
-
-        return (rEntry.eOp == SC_LESS_EQUAL || rEntry.eOp == SC_GREATER_EQUAL);
-    }
-
     void setupTransliteratorIfNeeded()
     {
         if (!mpTransliteration)
@@ -2414,6 +2395,25 @@ public:
     {
     }
 
+    bool isRealWildOrRegExp(const ScQueryEntry& rEntry) const
+    {
+        if (mrParam.eSearchType == utl::SearchParam::SearchType::Normal)
+            return false;
+
+        return isTextMatchOp(rEntry);
+    }
+
+    bool isTestWildOrRegExp(const ScQueryEntry& rEntry) const
+    {
+        if (!mpTestEqualCondition)
+            return false;
+
+        if (mrParam.eSearchType == utl::SearchParam::SearchType::Normal)
+            return false;
+
+        return (rEntry.eOp == SC_LESS_EQUAL || rEntry.eOp == SC_GREATER_EQUAL);
+    }
+
     static bool isQueryByValue(
         const ScQueryEntry::Item& rItem, ScRefCellValue& rCell)
     {
@@ -2937,7 +2937,7 @@ public:
 
 std::pair<bool,bool> validQueryProcessEntry(SCROW nRow, SCCOL nCol, SCTAB nTab, const ScQueryParam& rParam,
     ScRefCellValue& aCell, bool* pbTestEqualCondition, const ScInterpreterContext* pContext, QueryEvaluator& aEval,
-    const ScQueryEntry& rEntry )
+    const ScDocument& rDoc, const ScQueryEntry& rEntry )
 {
     std::pair<bool,bool> aRes(false, false);
     const ScQueryEntry::QueryItemsType& rItems = rEntry.GetQueryItems();
@@ -2952,10 +2952,87 @@ std::pair<bool,bool> validQueryProcessEntry(SCROW nRow, SCCOL nCol, SCTAB nTab,
         }
         return aRes;
     }
-    // Generic handling.
+    if( rEntry.eOp == SC_EQUAL && rItems.size() >= 10 )
+    {
+        // Fast path for queries with many items (autofilter does this): first look for an
+        // exact match among the items of the relevant type (ByValue here). If nothing
+        // matches, fall through to the code below, ending in the generic handling.
+        double value = 0;
+        bool valid = true;
+        // For ScQueryEntry::ByValue check that the cell either is a value or is a formula
+        // that has a value and is NOT an error (errors are compared as strings, so they
+        // must skip this numeric path). This is basically simplified isQueryByValue().
+        if( aCell.meType == CELLTYPE_VALUE )
+            value = aCell.mfValue;
+        else if (aCell.meType == CELLTYPE_FORMULA && aCell.mpFormula->GetErrCode() == FormulaError::NONE
+            && aCell.mpFormula->IsValue())
+        {
+            value = aCell.mpFormula->GetValue();
+        }
+        else
+            valid = false;
+        if(valid)
+        {
+            for (const auto& rItem : rItems)
+            {
+                // For speed don't bother comparing approximately here, usually there either
+                // will be an exact match or it wouldn't match anyway.
+                if (rItem.meType == ScQueryEntry::ByValue
+                    && value == rItem.mfVal)
+                {
+                    return std::make_pair(true, true);
+                }
+            }
+        }
+    }
     const svl::SharedString* cellSharedString = nullptr;
     OUString cellString;
     bool cellStringSet = false;
+    if( rEntry.eOp == SC_EQUAL && rItems.size() >= 10 )
+    {
+        // The same as above but for strings. Try to optimize the case when
+        // it's a svl::SharedString comparison (case sensitive or not).
+        // That happens when SC_EQUAL is used, whole cell matching is enabled,
+        // and a regexp is not wanted, see compareByString() above.
+        if(rDoc.GetDocOptions().IsMatchWholeCell()
+             && !aEval.isRealWildOrRegExp(rEntry) && !aEval.isTestWildOrRegExp(rEntry))
+        {
+            if(!cellStringSet)
+            {
+                cellString = aEval.getCellString(aCell, nRow, rEntry, pContext, &cellSharedString);
+                cellStringSet = true;
+            }
+            // For ScQueryEntry::ByString check that the cell is represented by a shared string,
+            // which means it's either a string cell or a formula error. This is not as
+            // generous as isQueryByString() but it should be enough and better be safe.
+            if(cellSharedString != nullptr)
+            {
+                if (rParam.bCaseSens)
+                {
+                    for (const auto& rItem : rItems)
+                    {
+                        if (rItem.meType == ScQueryEntry::ByString
+                            && cellSharedString->getData() == rItem.maString.getData())
+                        {
+                            return std::make_pair(true, true);
+                        }
+                    }
+                }
+                else
+                {
+                    for (const auto& rItem : rItems)
+                    {
+                        if (rItem.meType == ScQueryEntry::ByString
+                            && cellSharedString->getDataIgnoreCase() == rItem.maString.getDataIgnoreCase())
+                        {
+                            return std::make_pair(true, true);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // Generic handling.
     for (const auto& rItem : rItems)
     {
         if (rItem.meType == ScQueryEntry::ByTextColor)
@@ -3059,7 +3136,7 @@ bool ScTable::ValidQuery(
             aCell = GetCellValue(nCol, nRow);
 
         std::pair<bool,bool> aRes = validQueryProcessEntry(nRow, nCol, nTab, rParam, aCell,
-            pbTestEqualCondition, pContext, aEval, rEntry);
+            pbTestEqualCondition, pContext, aEval, rDocument, rEntry);
 
         if (nPos == -1)
         {
author	Luboš Luňák <l.lunak@collabora.com>	2021-11-23 20:48:02 +0100
committer	Luboš Luňák <l.lunak@collabora.com>	2021-11-25 14:26:42 +0100
commit	1d0cdc461c43f0ce0eda4961311a972edf9e78e2 (patch)
tree	44612f305f6544a71bf6d3bba2ce1ef526b2f18b /sc/source
parent	02f3157f75b654ef7648efdc3004b3f326d5af40 (diff)