diff options
author | Luboš Luňák <l.lunak@collabora.com> | 2018-11-28 13:49:01 +0100 |
---|---|---|
committer | Luboš Luňák <l.lunak@collabora.com> | 2018-12-03 15:32:05 +0100 |
commit | 1b489b74fe28007749bec8a3ebd56901a7652734 (patch) | |
tree | 795981b71df991d284bca18fbf02a803ca58436d | |
parent | 95ff92118d28af20e1b3a2e5f0d5f8c6f79fb2a8 (diff) |
restore functionality of OpenCL ParallelReductionVectorRef class
These functions were removed in 149a4d7566, although they quite clearly
do serve a purpose, and then e9586cf0b5 removed the no longer used
variables.
Without this, ParallelReductionVectorRef class simply does not work.
E.g. SUM(B1:B250) would not sum all the cells, but just one (testcase
is Test::testFormulaRefUpdateRange with OpenCL forced).
Change-Id: Iad1aeba2fbae2b4fc1d4236030adbb9ce6d972a1
Reviewed-on: https://gerrit.libreoffice.org/64235
Tested-by: Jenkins
Reviewed-by: Luboš Luňák <l.lunak@collabora.com>
-rw-r--r-- | sc/source/core/opencl/formulagroupcl.cxx | 372 |
1 files changed, 372 insertions, 0 deletions
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx index 71acf15e9590..4db3fb4ebe11 100644 --- a/sc/source/core/opencl/formulagroupcl.cxx +++ b/sc/source/core/opencl/formulagroupcl.cxx @@ -1221,6 +1221,7 @@ public: std::shared_ptr<SlidingFunctionBase>& CodeGen, int index) : Base(config, s, ft, index) , mpCodeGen(CodeGen) + , mpClmem2(nullptr) { FormulaToken* t = ft->GetFormulaToken(); if (t->GetType() != formula::svDoubleVectorRef) @@ -1229,6 +1230,198 @@ public: bIsStartFixed = mpDVR->IsStartFixed(); bIsEndFixed = mpDVR->IsEndFixed(); } + + /// Emit the definition for the auxiliary reduction kernel + virtual void GenSlidingWindowFunction( std::stringstream& ss ) + { + if (!dynamic_cast<OpAverage*>(mpCodeGen.get())) + { + std::string name = Base::GetName(); + ss << "__kernel void " << name; + ss << "_reduction(__global double* A, " + "__global double *result,int arrayLength,int windowSize){\n"; + ss << " double tmp, current_result =" << + mpCodeGen->GetBottom(); + ss << ";\n"; + ss << " int writePos = get_group_id(1);\n"; + ss << " int lidx = get_local_id(0);\n"; + ss << " __local double shm_buf[256];\n"; + if (mpDVR->IsStartFixed()) + ss << " int offset = 0;\n"; + else // if (!mpDVR->IsStartFixed()) + ss << " int offset = get_group_id(1);\n"; + if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = offset + windowSize;\n"; + else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = windowSize + get_group_id(1);\n"; + else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + ss << " end = min(end, arrayLength);\n"; + + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " int loop = arrayLength/512 + 1;\n"; + ss << " for (int l=0; l<loop; l++){\n"; + ss << " tmp = " << mpCodeGen->GetBottom() << ";\n"; + ss << " int loopOffset = l*512;\n"; + ss << " if((loopOffset + lidx + offset + 256) < end) {\n"; + ss << " tmp = legalize(" << mpCodeGen->Gen2( + "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n"; + ss << " tmp = legalize(" << mpCodeGen->Gen2( + "A[loopOffset + lidx + offset + 256]", "tmp") << ", tmp);\n"; + ss << " } else if ((loopOffset + lidx + offset) < end)\n"; + ss << " tmp = legalize(" << mpCodeGen->Gen2( + "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n"; + ss << " shm_buf[lidx] = tmp;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " for (int i = 128; i >0; i/=2) {\n"; + ss << " if (lidx < i)\n"; + ss << " shm_buf[lidx] = "; + // Special case count + if (dynamic_cast<OpCount*>(mpCodeGen.get())) + ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; + else + ss << mpCodeGen->Gen2("shm_buf[lidx]", "shm_buf[lidx + i]") << ";\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " current_result ="; + if (dynamic_cast<OpCount*>(mpCodeGen.get())) + ss << "current_result + shm_buf[0]"; + else + ss << mpCodeGen->Gen2("current_result", "shm_buf[0]"); + ss << ";\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " result[writePos] = current_result;\n"; + ss << "}\n"; + } + else + { + std::string name = Base::GetName(); + /*sum reduction*/ + ss << "__kernel void " << name << "_sum"; + ss << "_reduction(__global double* A, " + "__global double *result,int arrayLength,int windowSize){\n"; + ss << " double tmp, current_result =" << + mpCodeGen->GetBottom(); + ss << ";\n"; + ss << " int writePos = get_group_id(1);\n"; + ss << " int lidx = get_local_id(0);\n"; + ss << " __local double shm_buf[256];\n"; + if (mpDVR->IsStartFixed()) + ss << " int offset = 0;\n"; + else // if (!mpDVR->IsStartFixed()) + ss << " int offset = get_group_id(1);\n"; + if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = offset + windowSize;\n"; + else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = windowSize + get_group_id(1);\n"; + else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + ss << " end = min(end, arrayLength);\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " int loop = arrayLength/512 + 1;\n"; + ss << " for (int l=0; l<loop; l++){\n"; + ss << " tmp = " << mpCodeGen->GetBottom() << ";\n"; + ss << " int loopOffset = l*512;\n"; + ss << " if((loopOffset + lidx + offset + 256) < end) {\n"; + ss << " tmp = legalize("; + ss << "(A[loopOffset + lidx + offset]+ tmp)"; + ss << ", tmp);\n"; + ss << " tmp = legalize((A[loopOffset + lidx + offset + 256]+ tmp)"; + ss << ", tmp);\n"; + ss << " } else if ((loopOffset + lidx + offset) < end)\n"; + ss << " tmp = legalize((A[loopOffset + lidx + offset] + tmp)"; + ss << ", tmp);\n"; + ss << " shm_buf[lidx] = tmp;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " for (int i = 128; i >0; i/=2) {\n"; + ss << " if (lidx < i)\n"; + ss << " shm_buf[lidx] = "; + ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " current_result ="; + ss << "current_result + shm_buf[0]"; + ss << ";\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " result[writePos] = current_result;\n"; + ss << "}\n"; + /*count reduction*/ + ss << "__kernel void " << name << "_count"; + ss << "_reduction(__global double* A, " + "__global double *result,int arrayLength,int windowSize){\n"; + ss << " double tmp, current_result =" << + mpCodeGen->GetBottom(); + ss << ";\n"; + ss << " int writePos = get_group_id(1);\n"; + ss << " int lidx = get_local_id(0);\n"; + ss << " __local double shm_buf[256];\n"; + if (mpDVR->IsStartFixed()) + ss << " int offset = 0;\n"; + else // if (!mpDVR->IsStartFixed()) + ss << " int offset = get_group_id(1);\n"; + if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = offset + windowSize;\n"; + else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = windowSize + get_group_id(1);\n"; + else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + ss << " end = min(end, arrayLength);\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " int loop = arrayLength/512 + 1;\n"; + ss << " for (int l=0; l<loop; l++){\n"; + ss << " tmp = " << mpCodeGen->GetBottom() << ";\n"; + ss << " int loopOffset = l*512;\n"; + ss << " if((loopOffset + lidx + offset + 256) < end) {\n"; + ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset])?tmp:tmp+1.0)"; + ss << ", tmp);\n"; + ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset+256])?tmp:tmp+1.0)"; + ss << ", tmp);\n"; + ss << " } else if ((loopOffset + lidx + offset) < end)\n"; + ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset])?tmp:tmp+1.0)"; + ss << ", tmp);\n"; + ss << " shm_buf[lidx] = tmp;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " for (int i = 128; i >0; i/=2) {\n"; + ss << " if (lidx < i)\n"; + ss << " shm_buf[lidx] = "; + ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " current_result ="; + ss << "current_result + shm_buf[0];"; + ss << ";\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " result[writePos] = current_result;\n"; + ss << "}\n"; + } + + } + + virtual std::string GenSlidingWindowDeclRef( bool ) const + { + std::stringstream ss; + if (!bIsStartFixed && !bIsEndFixed) + ss << Base::GetName() << "[i + gid0]"; + else + ss << Base::GetName() << "[i]"; + return ss.str(); + } + /// Controls how the elements in the DoubleVectorRef are traversed size_t GenReductionLoopHeader( std::stringstream& ss, int nResultSize, bool& needBody ) @@ -1254,6 +1447,183 @@ public: return nCurWindowSize; } + virtual size_t Marshal( cl_kernel k, int argno, int w, cl_program mpProgram ) + { + assert(Base::mpClmem == nullptr); + + openclwrapper::KernelEnv kEnv; + openclwrapper::setKernelEnv(&kEnv); + cl_int err; + size_t nInput = mpDVR->GetArrayLength(); + size_t nCurWindowSize = mpDVR->GetRefRowSize(); + // create clmem buffer + if (mpDVR->GetArrays()[Base::mnIndex].mpNumericArray == nullptr) + throw Unhandled(__FILE__, __LINE__); + double* pHostBuffer = const_cast<double*>( + mpDVR->GetArrays()[Base::mnIndex].mpNumericArray); + size_t szHostBuffer = nInput * sizeof(double); + Base::mpClmem = clCreateBuffer(kEnv.mpkContext, + cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_USE_HOST_PTR, + szHostBuffer, + pHostBuffer, &err); + SAL_INFO("sc.opencl", "Created buffer " << Base::mpClmem << " size " << nInput << "*" << sizeof(double) << "=" << szHostBuffer << " using host buffer " << pHostBuffer); + + mpClmem2 = clCreateBuffer(kEnv.mpkContext, + CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + sizeof(double) * w, nullptr, nullptr); + if (CL_SUCCESS != err) + throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__); + SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << sizeof(double) << "*" << w << "=" << (sizeof(double)*w)); + + // reproduce the reduction function name + std::string kernelName; + if (!dynamic_cast<OpAverage*>(mpCodeGen.get())) + kernelName = Base::GetName() + "_reduction"; + else + kernelName = Base::GetName() + "_sum_reduction"; + cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); + if (err != CL_SUCCESS) + throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__); + SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram); + + // set kernel arg of reduction kernel + // TODO(Wei Wei): use unique name for kernel + cl_mem buf = Base::GetCLBuffer(); + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf); + err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), + static_cast<void*>(&buf)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2); + err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput); + err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast<void*>(&nInput)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize); + err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast<void*>(&nCurWindowSize)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + // set work group size and execute + size_t global_work_size[] = { 256, static_cast<size_t>(w) }; + size_t const local_work_size[] = { 256, 1 }; + SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel); + err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr, + global_work_size, local_work_size, 0, nullptr, nullptr); + if (CL_SUCCESS != err) + throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__); + err = clFinish(kEnv.mpkCmdQueue); + if (CL_SUCCESS != err) + throw OpenCLError("clFinish", err, __FILE__, __LINE__); + if (dynamic_cast<OpAverage*>(mpCodeGen.get())) + { + /*average need more reduction kernel for count computing*/ + std::unique_ptr<double[]> pAllBuffer(new double[2 * w]); + double* resbuf = static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue, + mpClmem2, + CL_TRUE, CL_MAP_READ, 0, + sizeof(double) * w, 0, nullptr, nullptr, + &err)); + if (err != CL_SUCCESS) + throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__); + + for (int i = 0; i < w; i++) + pAllBuffer[i] = resbuf[i]; + err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr); + if (err != CL_SUCCESS) + throw OpenCLError("clEnqueueUnmapMemObject", err, __FILE__, __LINE__); + + kernelName = Base::GetName() + "_count_reduction"; + redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); + if (err != CL_SUCCESS) + throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__); + SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram); + + // set kernel arg of reduction kernel + buf = Base::GetCLBuffer(); + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf); + err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), + static_cast<void*>(&buf)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2); + err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput); + err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast<void*>(&nInput)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize); + err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast<void*>(&nCurWindowSize)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + // set work group size and execute + size_t global_work_size1[] = { 256, static_cast<size_t>(w) }; + size_t const local_work_size1[] = { 256, 1 }; + SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel); + err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr, + global_work_size1, local_work_size1, 0, nullptr, nullptr); + if (CL_SUCCESS != err) + throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__); + err = clFinish(kEnv.mpkCmdQueue); + if (CL_SUCCESS != err) + throw OpenCLError("clFinish", err, __FILE__, __LINE__); + resbuf = static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue, + mpClmem2, + CL_TRUE, CL_MAP_READ, 0, + sizeof(double) * w, 0, nullptr, nullptr, + &err)); + if (err != CL_SUCCESS) + throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__); + for (int i = 0; i < w; i++) + pAllBuffer[i + w] = resbuf[i]; + err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr); + // FIXME: Is it intentional to not throw an OpenCLError even if the clEnqueueUnmapMemObject() fails? + if (CL_SUCCESS != err) + SAL_WARN("sc.opencl", "clEnqueueUnmapMemObject failed: " << openclwrapper::errorString(err)); + if (mpClmem2) + { + err = clReleaseMemObject(mpClmem2); + SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl", "clReleaseMemObject failed: " << openclwrapper::errorString(err)); + mpClmem2 = nullptr; + } + mpClmem2 = clCreateBuffer(kEnv.mpkContext, + cl_mem_flags(CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR, + w * sizeof(double) * 2, pAllBuffer.get(), &err); + if (CL_SUCCESS != err) + throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__); + SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << w << "*" << sizeof(double) << "=" << (w*sizeof(double)) << " copying host buffer " << pAllBuffer.get()); + } + // set kernel arg + SAL_INFO("sc.opencl", "Kernel " << k << " arg " << argno << ": cl_mem: " << mpClmem2); + err = clSetKernelArg(k, argno, sizeof(cl_mem), &(mpClmem2)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + return 1; + } + + ~ParallelReductionVectorRef() + { + if (mpClmem2) + { + cl_int err; + err = clReleaseMemObject(mpClmem2); + SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl", "clReleaseMemObject failed: " << openclwrapper::errorString(err)); + mpClmem2 = nullptr; + } + } + size_t GetArrayLength() const { return mpDVR->GetArrayLength(); } size_t GetWindowSize() const { return mpDVR->GetRefRowSize(); } @@ -1267,6 +1637,8 @@ protected: const formula::DoubleVectorRefToken* mpDVR; // from parent nodes std::shared_ptr<SlidingFunctionBase> mpCodeGen; + // controls whether to invoke the reduction kernel during marshaling or not + cl_mem mpClmem2; }; class Reduction : public SlidingFunctionBase |