Split formula group for OpenCL up into smaller bits

This will make it less demanding on low-end hardware, where the device driver is unresponsive for too long when an OpenCL kernel handling lots of data is executing. This makes Windows restart the driver, which is problematic. I tried several approaches to splitting, both at higher levels in sc and at the lowest level just before creating and executing the OpenCL kernel(s). This seems to be the most minimal and local approach. Doing it at the lower level would have required too much poking into our obscure OpenCL code, like passing an offset parameter to every kernel. Use a simple heuristic to find out whether to split. On the problematic low-end devices, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT is 4, while for more performant devices it is 1 or 8. Change-Id: If16d152710057b34d09ef0203960e1fbb9ac067f Reviewed-on: https://gerrit.libreoffice.org/18613 Reviewed-by: Michael Meeks <michael.meeks@collabora.com> Tested-by: Michael Meeks <michael.meeks@collabora.com>
author: Tor Lillqvist <tml@collabora.com> 2015-09-10 21:58:28 +0300
committer: Andras Timar <andras.timar@collabora.com> 2015-09-19 21:32:18 +0200
commit: f4bf1255c989cf51e04ebba8f2349da34ea7616b (patch)
tree: bbeb5fd5a4db65ff2bd0b6e3d208444a459ca66d
parent: 773d411a6ad399c943524c5dbe7800e7f00a8aeb (diff)
4 files changed, 123 insertions, 20 deletions
diff --git a/include/clew/clew.h b/include/clew/clew.h
index 94b6c29d9262..e5cfaf0836be 100644
--- a/include/clew/clew.h
+++ b/include/clew/clew.h
@@ -416,6 +416,7 @@ typedef struct _cl_image_format {
 
 // cl_device_info
 #define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
 #define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
 #define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
 #define CL_DEVICE_NAME                              0x102B
diff --git a/include/opencl/openclwrapper.hxx b/include/opencl/openclwrapper.hxx
index 75ecbc85a606..e3f967ee46c5 100644
--- a/include/opencl/openclwrapper.hxx
+++ b/include/opencl/openclwrapper.hxx
@@ -52,6 +52,7 @@ struct GPUEnv
     int mnCmdQueuePos;
     bool mnKhrFp64Flag;
     bool mnAmdFp64Flag;
+    cl_uint mnPreferredVectorWidthFloat;
 };
 
 extern OPENCL_DLLPUBLIC GPUEnv gpuEnv;
diff --git a/opencl/source/openclwrapper.cxx b/opencl/source/openclwrapper.cxx
index 5574d2c3fa09..9d03a2780220 100644
--- a/opencl/source/openclwrapper.cxx
+++ b/opencl/source/openclwrapper.cxx
@@ -501,6 +501,11 @@ bool initOpenCLRunEnv( GPUEnv *gpuInfo )
     gpuInfo->mnKhrFp64Flag = bKhrFp64;
     gpuInfo->mnAmdFp64Flag = bAmdFp64;
 
+    gpuInfo->mnPreferredVectorWidthFloat = 0;
+
+    clGetDeviceInfo(gpuInfo->mpArryDevsID[0], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint),
+                    &gpuInfo->mnPreferredVectorWidthFloat, NULL);
+
     return false;
 }
 
diff --git a/sc/source/core/data/formulacell.cxx b/sc/source/core/data/formulacell.cxx
index eb2b374e4b5a..b4ba29b39959 100644
--- a/sc/source/core/data/formulacell.cxx
+++ b/sc/source/core/data/formulacell.cxx
@@ -20,6 +20,7 @@
 #include <sal/config.h>
 
 #include <cassert>
+#include <cstdlib>
 
 #include "formulacell.hxx"
 #include "grouptokenconverter.hxx"
@@ -54,6 +55,7 @@
 #include "types.hxx"
 #include "scopetools.hxx"
 #include "refupdatecontext.hxx"
+#include <opencl/openclwrapper.hxx>
 #include <tokenstringcontext.hxx>
 #include <refhint.hxx>
 #include <listenerquery.hxx>
@@ -3770,6 +3772,36 @@ ScFormulaCell::CompareState ScFormulaCell::CompareByTokenArray( ScFormulaCell& r
     return bInvariant ? EqualInvariant : EqualRelativeRef;
 }
 
+namespace {
+
+// Split N items into P nearly equal-sized pieces, none larger than K (N and K must be positive).
+// Return value P is the number of pieces. A receives the number of pieces (0..P-1)
+// whose size is one larger than N/P (integer division); the other P-A pieces have size N/P.
+
+int splitup(int N, int K, int& A)
+{
+    assert(N > 0);
+    assert(K > 0);
+
+    A = 0;
+
+    if (N <= K) // Everything fits into a single piece.
+        return 1;
+
+    const int ideal_num_parts = N / K;
+    if (ideal_num_parts * K == N) // K divides N evenly: every piece has exactly K items.
+        return ideal_num_parts;
+
+    const int num_parts = ideal_num_parts + 1; // One extra piece absorbs what is left over.
+    const int nominal_part_size = N / num_parts;
+
+    A = N - num_parts * nominal_part_size; // Equals N % num_parts: that many pieces get one extra item.
+
+    return num_parts;
+}
+
+} // anonymous namespace
+
 bool ScFormulaCell::InterpretFormulaGroup()
 {
     if (!officecfg::Office::Common::Misc::UseOpenCL::get())
@@ -3805,30 +3837,94 @@ bool ScFormulaCell::InterpretFormulaGroup()
     if (mxGroup->mbInvariant && false)
         return InterpretInvariantFormulaGroup();
 
-    ScTokenArray aCode;
-    ScAddress aTopPos = aPos;
-    aTopPos.SetRow(mxGroup->mpTopCell->aPos.Row());
-    ScGroupTokenConverter aConverter(aCode, *pDocument, *this, mxGroup->mpTopCell->aPos);
-    std::vector<ScTokenArray*> aLoopControl;
-    if (!aConverter.convert(*pCode, aLoopControl))
-    {
-        SAL_INFO("sc.opencl", "conversion of group " << this << " failed, disabling");
-        mxGroup->meCalcState = sc::GroupCalcDisabled;
-        return false;
-    }
+    int nMaxGroupLength = INT_MAX;
+
+#ifdef WNT
+    // Heuristic: Certain old low-end OpenCL implementations don't
+    // work for us with too large group lengths. 1000 was determined
+    // empirically to be a good compromise. Looking at the preferred
+    // float vector width seems to be a way to detect these devices.
+    if (opencl::gpuEnv.mnPreferredVectorWidthFloat == 4)
+        nMaxGroupLength = 1000;
+#endif
+
+    if (std::getenv("SC_MAX_GROUP_LENGTH"))
+        nMaxGroupLength = std::atoi(std::getenv("SC_MAX_GROUP_LENGTH"));
+
+    int nNumOnePlus;
+    const int nNumParts = splitup(GetSharedLength(), nMaxGroupLength, nNumOnePlus);
 
-    // The converted code does not have RPN tokens yet.  The interpreter will
-    // generate them.
-    mxGroup->meCalcState = sc::GroupCalcRunning;
-    sc::FormulaGroupInterpreter *pInterpreter = sc::FormulaGroupInterpreter::getStatic();
-    if (pInterpreter == NULL ||
-        !pInterpreter->interpret(*pDocument, mxGroup->mpTopCell->aPos, mxGroup, aCode))
+    int nOffset = 0;
+    int nCurChunkSize;
+    ScAddress aOrigPos = mxGroup->mpTopCell->aPos;
+    for (int i = 0; i < nNumParts; i++, nOffset += nCurChunkSize)
     {
-        SAL_INFO("sc.opencl", "interpreting group " << mxGroup << " (state " << (int) mxGroup->meCalcState << ") failed, disabling");
-        mxGroup->meCalcState = sc::GroupCalcDisabled;
-        return false;
+        nCurChunkSize = GetSharedLength()/nNumParts + (i < nNumOnePlus ? 1 : 0);
+
+        ScFormulaCellGroupRef xGroup;
+
+        if (nNumParts == 1)
+            xGroup = mxGroup;
+        else
+        {
+            // Ugly hack
+            xGroup = new ScFormulaCellGroup();
+            xGroup->mpTopCell = mxGroup->mpTopCell;
+            xGroup->mpTopCell->aPos = aOrigPos;
+            xGroup->mpTopCell->aPos.IncRow(nOffset);
+            xGroup->mbInvariant = mxGroup->mbInvariant;
+            xGroup->mnLength = nCurChunkSize;
+            xGroup->mpCode = mxGroup->mpCode;
+        }
+
+        ScTokenArray aCode;
+        ScGroupTokenConverter aConverter(aCode, *pDocument, *this, xGroup->mpTopCell->aPos);
+        std::vector<ScTokenArray*> aLoopControl;
+        if (!aConverter.convert(*pCode, aLoopControl))
+        {
+            SAL_INFO("sc.opencl", "conversion of group " << this << " failed, disabling");
+            mxGroup->meCalcState = sc::GroupCalcDisabled;
+
+            // Undo the hack above
+            if (nNumParts > 1)
+            {
+                mxGroup->mpTopCell->aPos = aOrigPos;
+                xGroup->mpTopCell = NULL;
+                xGroup->mpCode = NULL;
+            }
+
+            return false;
+        }
+
+        // The converted code does not have RPN tokens yet.  The interpreter will
+        // generate them.
+        xGroup->meCalcState = mxGroup->meCalcState = sc::GroupCalcRunning;
+        sc::FormulaGroupInterpreter *pInterpreter = sc::FormulaGroupInterpreter::getStatic();
+        if (pInterpreter == NULL ||
+            !pInterpreter->interpret(*pDocument, xGroup->mpTopCell->aPos, xGroup, aCode))
+        {
+            SAL_INFO("sc.opencl", "interpreting group " << mxGroup << " (state " << (int) mxGroup->meCalcState << ") failed, disabling");
+            mxGroup->meCalcState = sc::GroupCalcDisabled;
+
+            // Undo the hack above
+            if (nNumParts > 1)
+            {
+                mxGroup->mpTopCell->aPos = aOrigPos;
+                xGroup->mpTopCell = NULL;
+                xGroup->mpCode = NULL;
+            }
+
+            return false;
+        }
+        if (nNumParts > 1)
+        {
+            xGroup->mpTopCell = NULL;
+            xGroup->mpCode = NULL;
+        }
     }
 
+    if (nNumParts > 1)
+        mxGroup->mpTopCell->aPos = aOrigPos;
     mxGroup->meCalcState = sc::GroupCalcEnabled;
     return true;
 }
author	Tor Lillqvist <tml@collabora.com>	2015-09-10 21:58:28 +0300
committer	Andras Timar <andras.timar@collabora.com>	2015-09-19 21:32:18 +0200
commit	f4bf1255c989cf51e04ebba8f2349da34ea7616b (patch)
tree	bbeb5fd5a4db65ff2bd0b6e3d208444a459ca66d
parent	773d411a6ad399c943524c5dbe7800e7f00a8aeb (diff)