summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuboš Luňák <l.lunak@collabora.com>2018-11-28 13:49:01 +0100
committerLuboš Luňák <l.lunak@collabora.com>2018-12-03 15:32:05 +0100
commit1b489b74fe28007749bec8a3ebd56901a7652734 (patch)
tree795981b71df991d284bca18fbf02a803ca58436d
parent95ff92118d28af20e1b3a2e5f0d5f8c6f79fb2a8 (diff)
restore functionality of OpenCL ParallelReductionVectorRef class
These functions were removed in 149a4d7566, although they quite clearly do serve a purpose, and then e9586cf0b5 removed the no longer used variables. Without this, ParallelReductionVectorRef class simply does not work. E.g. SUM(B1:B250) would not sum all the cells, but just one (testcase is Test::testFormulaRefUpdateRange with OpenCL forced). Change-Id: Iad1aeba2fbae2b4fc1d4236030adbb9ce6d972a1 Reviewed-on: https://gerrit.libreoffice.org/64235 Tested-by: Jenkins Reviewed-by: Luboš Luňák <l.lunak@collabora.com>
-rw-r--r--sc/source/core/opencl/formulagroupcl.cxx372
1 files changed, 372 insertions, 0 deletions
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index 71acf15e9590..4db3fb4ebe11 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -1221,6 +1221,7 @@ public:
std::shared_ptr<SlidingFunctionBase>& CodeGen, int index)
: Base(config, s, ft, index)
, mpCodeGen(CodeGen)
+ , mpClmem2(nullptr)
{
FormulaToken* t = ft->GetFormulaToken();
if (t->GetType() != formula::svDoubleVectorRef)
@@ -1229,6 +1230,198 @@ public:
bIsStartFixed = mpDVR->IsStartFixed();
bIsEndFixed = mpDVR->IsEndFixed();
}
+
+ /// Emit the definition for the auxiliary reduction kernel
+ virtual void GenSlidingWindowFunction( std::stringstream& ss )
+ {
+ if (!dynamic_cast<OpAverage*>(mpCodeGen.get()))
+ {
+ std::string name = Base::GetName();
+ ss << "__kernel void " << name;
+ ss << "_reduction(__global double* A, "
+ "__global double *result,int arrayLength,int windowSize){\n";
+ ss << " double tmp, current_result =" <<
+ mpCodeGen->GetBottom();
+ ss << ";\n";
+ ss << " int writePos = get_group_id(1);\n";
+ ss << " int lidx = get_local_id(0);\n";
+ ss << " __local double shm_buf[256];\n";
+ if (mpDVR->IsStartFixed())
+ ss << " int offset = 0;\n";
+ else // if (!mpDVR->IsStartFixed())
+ ss << " int offset = get_group_id(1);\n";
+ if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+ ss << " int end = windowSize;\n";
+ else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+ ss << " int end = offset + windowSize;\n";
+ else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+ ss << " int end = windowSize + get_group_id(1);\n";
+ else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+ ss << " int end = windowSize;\n";
+ ss << " end = min(end, arrayLength);\n";
+
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " int loop = arrayLength/512 + 1;\n";
+ ss << " for (int l=0; l<loop; l++){\n";
+ ss << " tmp = " << mpCodeGen->GetBottom() << ";\n";
+ ss << " int loopOffset = l*512;\n";
+ ss << " if((loopOffset + lidx + offset + 256) < end) {\n";
+ ss << " tmp = legalize(" << mpCodeGen->Gen2(
+ "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n";
+ ss << " tmp = legalize(" << mpCodeGen->Gen2(
+ "A[loopOffset + lidx + offset + 256]", "tmp") << ", tmp);\n";
+ ss << " } else if ((loopOffset + lidx + offset) < end)\n";
+ ss << " tmp = legalize(" << mpCodeGen->Gen2(
+ "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n";
+ ss << " shm_buf[lidx] = tmp;\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " for (int i = 128; i >0; i/=2) {\n";
+ ss << " if (lidx < i)\n";
+ ss << " shm_buf[lidx] = ";
+ // Special case count
+ if (dynamic_cast<OpCount*>(mpCodeGen.get()))
+ ss << "shm_buf[lidx] + shm_buf[lidx + i];\n";
+ else
+ ss << mpCodeGen->Gen2("shm_buf[lidx]", "shm_buf[lidx + i]") << ";\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " current_result =";
+ if (dynamic_cast<OpCount*>(mpCodeGen.get()))
+ ss << "current_result + shm_buf[0]";
+ else
+ ss << mpCodeGen->Gen2("current_result", "shm_buf[0]");
+ ss << ";\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " result[writePos] = current_result;\n";
+ ss << "}\n";
+ }
+ else
+ {
+ std::string name = Base::GetName();
+ /*sum reduction*/
+ ss << "__kernel void " << name << "_sum";
+ ss << "_reduction(__global double* A, "
+ "__global double *result,int arrayLength,int windowSize){\n";
+ ss << " double tmp, current_result =" <<
+ mpCodeGen->GetBottom();
+ ss << ";\n";
+ ss << " int writePos = get_group_id(1);\n";
+ ss << " int lidx = get_local_id(0);\n";
+ ss << " __local double shm_buf[256];\n";
+ if (mpDVR->IsStartFixed())
+ ss << " int offset = 0;\n";
+ else // if (!mpDVR->IsStartFixed())
+ ss << " int offset = get_group_id(1);\n";
+ if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+ ss << " int end = windowSize;\n";
+ else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+ ss << " int end = offset + windowSize;\n";
+ else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+ ss << " int end = windowSize + get_group_id(1);\n";
+ else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+ ss << " int end = windowSize;\n";
+ ss << " end = min(end, arrayLength);\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " int loop = arrayLength/512 + 1;\n";
+ ss << " for (int l=0; l<loop; l++){\n";
+ ss << " tmp = " << mpCodeGen->GetBottom() << ";\n";
+ ss << " int loopOffset = l*512;\n";
+ ss << " if((loopOffset + lidx + offset + 256) < end) {\n";
+ ss << " tmp = legalize(";
+ ss << "(A[loopOffset + lidx + offset]+ tmp)";
+ ss << ", tmp);\n";
+ ss << " tmp = legalize((A[loopOffset + lidx + offset + 256]+ tmp)";
+ ss << ", tmp);\n";
+ ss << " } else if ((loopOffset + lidx + offset) < end)\n";
+ ss << " tmp = legalize((A[loopOffset + lidx + offset] + tmp)";
+ ss << ", tmp);\n";
+ ss << " shm_buf[lidx] = tmp;\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " for (int i = 128; i >0; i/=2) {\n";
+ ss << " if (lidx < i)\n";
+ ss << " shm_buf[lidx] = ";
+ ss << "shm_buf[lidx] + shm_buf[lidx + i];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " current_result =";
+ ss << "current_result + shm_buf[0]";
+ ss << ";\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " result[writePos] = current_result;\n";
+ ss << "}\n";
+ /*count reduction*/
+ ss << "__kernel void " << name << "_count";
+ ss << "_reduction(__global double* A, "
+ "__global double *result,int arrayLength,int windowSize){\n";
+ ss << " double tmp, current_result =" <<
+ mpCodeGen->GetBottom();
+ ss << ";\n";
+ ss << " int writePos = get_group_id(1);\n";
+ ss << " int lidx = get_local_id(0);\n";
+ ss << " __local double shm_buf[256];\n";
+ if (mpDVR->IsStartFixed())
+ ss << " int offset = 0;\n";
+ else // if (!mpDVR->IsStartFixed())
+ ss << " int offset = get_group_id(1);\n";
+ if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+ ss << " int end = windowSize;\n";
+ else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+ ss << " int end = offset + windowSize;\n";
+ else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+ ss << " int end = windowSize + get_group_id(1);\n";
+ else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+ ss << " int end = windowSize;\n";
+ ss << " end = min(end, arrayLength);\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " int loop = arrayLength/512 + 1;\n";
+ ss << " for (int l=0; l<loop; l++){\n";
+ ss << " tmp = " << mpCodeGen->GetBottom() << ";\n";
+ ss << " int loopOffset = l*512;\n";
+ ss << " if((loopOffset + lidx + offset + 256) < end) {\n";
+ ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset])?tmp:tmp+1.0)";
+ ss << ", tmp);\n";
+ ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset+256])?tmp:tmp+1.0)";
+ ss << ", tmp);\n";
+ ss << " } else if ((loopOffset + lidx + offset) < end)\n";
+ ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset])?tmp:tmp+1.0)";
+ ss << ", tmp);\n";
+ ss << " shm_buf[lidx] = tmp;\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " for (int i = 128; i >0; i/=2) {\n";
+ ss << " if (lidx < i)\n";
+ ss << " shm_buf[lidx] = ";
+ ss << "shm_buf[lidx] + shm_buf[lidx + i];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " current_result =";
+ ss << "current_result + shm_buf[0];";
+ ss << ";\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " result[writePos] = current_result;\n";
+ ss << "}\n";
+ }
+
+ }
+
+ virtual std::string GenSlidingWindowDeclRef( bool ) const
+ {
+ std::stringstream ss;
+ if (!bIsStartFixed && !bIsEndFixed)
+ ss << Base::GetName() << "[i + gid0]";
+ else
+ ss << Base::GetName() << "[i]";
+ return ss.str();
+ }
+
/// Controls how the elements in the DoubleVectorRef are traversed
size_t GenReductionLoopHeader(
std::stringstream& ss, int nResultSize, bool& needBody )
@@ -1254,6 +1447,183 @@ public:
return nCurWindowSize;
}
+ virtual size_t Marshal( cl_kernel k, int argno, int w, cl_program mpProgram )
+ {
+ assert(Base::mpClmem == nullptr);
+
+ openclwrapper::KernelEnv kEnv;
+ openclwrapper::setKernelEnv(&kEnv);
+ cl_int err;
+ size_t nInput = mpDVR->GetArrayLength();
+ size_t nCurWindowSize = mpDVR->GetRefRowSize();
+ // create clmem buffer
+ if (mpDVR->GetArrays()[Base::mnIndex].mpNumericArray == nullptr)
+ throw Unhandled(__FILE__, __LINE__);
+ double* pHostBuffer = const_cast<double*>(
+ mpDVR->GetArrays()[Base::mnIndex].mpNumericArray);
+ size_t szHostBuffer = nInput * sizeof(double);
+ Base::mpClmem = clCreateBuffer(kEnv.mpkContext,
+ cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_USE_HOST_PTR,
+ szHostBuffer,
+ pHostBuffer, &err);
+ SAL_INFO("sc.opencl", "Created buffer " << Base::mpClmem << " size " << nInput << "*" << sizeof(double) << "=" << szHostBuffer << " using host buffer " << pHostBuffer);
+
+ mpClmem2 = clCreateBuffer(kEnv.mpkContext,
+ CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+ sizeof(double) * w, nullptr, nullptr);
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__);
+ SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << sizeof(double) << "*" << w << "=" << (sizeof(double)*w));
+
+ // reproduce the reduction function name
+ std::string kernelName;
+ if (!dynamic_cast<OpAverage*>(mpCodeGen.get()))
+ kernelName = Base::GetName() + "_reduction";
+ else
+ kernelName = Base::GetName() + "_sum_reduction";
+ cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
+ if (err != CL_SUCCESS)
+ throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__);
+ SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram);
+
+ // set kernel arg of reduction kernel
+ // TODO(Wei Wei): use unique name for kernel
+ cl_mem buf = Base::GetCLBuffer();
+ SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf);
+ err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
+ static_cast<void*>(&buf));
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+ SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2);
+ err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2);
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+ SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput);
+ err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast<void*>(&nInput));
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+ SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize);
+ err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast<void*>(&nCurWindowSize));
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+ // set work group size and execute
+ size_t global_work_size[] = { 256, static_cast<size_t>(w) };
+ size_t const local_work_size[] = { 256, 1 };
+ SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel);
+ err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr,
+ global_work_size, local_work_size, 0, nullptr, nullptr);
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__);
+ err = clFinish(kEnv.mpkCmdQueue);
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clFinish", err, __FILE__, __LINE__);
+ if (dynamic_cast<OpAverage*>(mpCodeGen.get()))
+ {
+ /*average need more reduction kernel for count computing*/
+ std::unique_ptr<double[]> pAllBuffer(new double[2 * w]);
+ double* resbuf = static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue,
+ mpClmem2,
+ CL_TRUE, CL_MAP_READ, 0,
+ sizeof(double) * w, 0, nullptr, nullptr,
+ &err));
+ if (err != CL_SUCCESS)
+ throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__);
+
+ for (int i = 0; i < w; i++)
+ pAllBuffer[i] = resbuf[i];
+ err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr);
+ if (err != CL_SUCCESS)
+ throw OpenCLError("clEnqueueUnmapMemObject", err, __FILE__, __LINE__);
+
+ kernelName = Base::GetName() + "_count_reduction";
+ redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
+ if (err != CL_SUCCESS)
+ throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__);
+ SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram);
+
+ // set kernel arg of reduction kernel
+ buf = Base::GetCLBuffer();
+ SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf);
+ err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
+ static_cast<void*>(&buf));
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+ SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2);
+ err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2);
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+ SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput);
+ err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast<void*>(&nInput));
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+ SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize);
+ err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast<void*>(&nCurWindowSize));
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+ // set work group size and execute
+ size_t global_work_size1[] = { 256, static_cast<size_t>(w) };
+ size_t const local_work_size1[] = { 256, 1 };
+ SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel);
+ err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr,
+ global_work_size1, local_work_size1, 0, nullptr, nullptr);
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__);
+ err = clFinish(kEnv.mpkCmdQueue);
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clFinish", err, __FILE__, __LINE__);
+ resbuf = static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue,
+ mpClmem2,
+ CL_TRUE, CL_MAP_READ, 0,
+ sizeof(double) * w, 0, nullptr, nullptr,
+ &err));
+ if (err != CL_SUCCESS)
+ throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__);
+ for (int i = 0; i < w; i++)
+ pAllBuffer[i + w] = resbuf[i];
+ err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr);
+ // FIXME: Is it intentional to not throw an OpenCLError even if the clEnqueueUnmapMemObject() fails?
+ if (CL_SUCCESS != err)
+ SAL_WARN("sc.opencl", "clEnqueueUnmapMemObject failed: " << openclwrapper::errorString(err));
+ if (mpClmem2)
+ {
+ err = clReleaseMemObject(mpClmem2);
+ SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl", "clReleaseMemObject failed: " << openclwrapper::errorString(err));
+ mpClmem2 = nullptr;
+ }
+ mpClmem2 = clCreateBuffer(kEnv.mpkContext,
+ cl_mem_flags(CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR,
+ w * sizeof(double) * 2, pAllBuffer.get(), &err);
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__);
+ SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << w << "*" << sizeof(double) << "=" << (w*sizeof(double)) << " copying host buffer " << pAllBuffer.get());
+ }
+ // set kernel arg
+ SAL_INFO("sc.opencl", "Kernel " << k << " arg " << argno << ": cl_mem: " << mpClmem2);
+ err = clSetKernelArg(k, argno, sizeof(cl_mem), &(mpClmem2));
+ if (CL_SUCCESS != err)
+ throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+ return 1;
+ }
+
+ ~ParallelReductionVectorRef()
+ {
+ if (mpClmem2)
+ {
+ cl_int err;
+ err = clReleaseMemObject(mpClmem2);
+ SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl", "clReleaseMemObject failed: " << openclwrapper::errorString(err));
+ mpClmem2 = nullptr;
+ }
+ }
+
size_t GetArrayLength() const { return mpDVR->GetArrayLength(); }
size_t GetWindowSize() const { return mpDVR->GetRefRowSize(); }
@@ -1267,6 +1637,8 @@ protected:
const formula::DoubleVectorRefToken* mpDVR;
// from parent nodes
std::shared_ptr<SlidingFunctionBase> mpCodeGen;
+ // controls whether to invoke the reduction kernel during marshaling or not
+ cl_mem mpClmem2;
};
class Reduction : public SlidingFunctionBase