diff options
author | Wei Wei <weiwei@multicorewareinc.com> | 2013-11-15 17:33:19 -0600 |
---|---|---|
committer | I-Jui (Ray) Sung <ray@multicorewareinc.com> | 2013-11-15 18:02:20 -0600 |
commit | 1e3bc2925c0ec1b03d6ae7cf3f281b0df3ec88d3 (patch) | |
tree | 1c9b6da1792ebc37dc9933fd9b410d8b5d220ace /sc | |
parent | 501bc66c780ab8fde801eeedc1f7c89762050713 (diff) |
GPU Calc: implemented parallel reduction for SUMIFS
For now only works for fixed and sliding fixed-sized windows.
Change-Id: I25e3f893a86d0e1723ae1e1633ffeeee93926b8d
Signed-off-by: I-Jui (Ray) Sung <ray@multicorewareinc.com>
Diffstat (limited to 'sc')
-rw-r--r-- | sc/source/core/opencl/formulagroupcl.cxx | 163 | ||||
-rw-r--r-- | sc/source/core/opencl/op_math.cxx | 225 | ||||
-rw-r--r-- | sc/source/core/opencl/op_math.hxx | 4 | ||||
-rw-r--r-- | sc/source/core/opencl/opbase.cxx | 43 | ||||
-rw-r--r-- | sc/source/core/opencl/opbase.hxx | 6 |
5 files changed, 365 insertions, 76 deletions
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx index e4b6bfbd4286..79c33ae09caa 100644 --- a/sc/source/core/opencl/formulagroupcl.cxx +++ b/sc/source/core/opencl/formulagroupcl.cxx @@ -419,42 +419,51 @@ public: bIsStartFixed = mpDVR->IsStartFixed(); bIsEndFixed = mpDVR->IsEndFixed(); } + virtual bool NeedParallelReduction(void) const + { + return GetWindowSize()> 100 && + ( (GetStartFixed() && GetEndFixed()) || + (!GetStartFixed() && !GetEndFixed()) ) ; + } virtual void GenSlidingWindowFunction(std::stringstream &ss) { - std::string name = Base::GetName(); - ss << "__kernel void "<<name; - ss << "_reduction(__global double* A, " - "__global double *result,int arrayLength,int windowSize){\n"; - ss << " double tmp, current_result = 0.0;\n"; - ss << " int writePos = get_group_id(1);\n"; - ss << " int offset = get_group_id(1);\n"; - ss << " int lidx = get_local_id(0);\n"; - ss << " __local double shm_buf[256];\n"; - ss << " if (arrayLength == windowSize)\n"; - ss << " offset = 0;\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " int loop = arrayLength/512 + 1;\n"; - ss << " for (int l=0; l<loop; l++){\n"; - ss << " tmp = 0.0;\n"; - ss << " int loopOffset = l*512;\n"; - ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n"; - ss << " tmp = A[loopOffset + lidx + offset] + " - "A[loopOffset + lidx + offset + 256];\n"; - ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n"; - ss << " tmp = A[loopOffset + lidx + offset];\n"; - ss << " shm_buf[lidx] = tmp;\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " for (int i = 128; i >0; i/=2) {\n"; - ss << " if (lidx < i)\n"; - ss << " shm_buf[lidx] += shm_buf[lidx + i];\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " }\n"; - ss << " if (lidx == 0)\n"; - ss << " current_result += shm_buf[0];\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " }\n"; - ss << " if (lidx == 0)\n"; - ss << " result[writePos] = current_result;\n"; - ss << "}\n"; + if (dynamic_cast<OpSum*>(mpCodeGen.get())) + { + std::string name = Base::GetName(); + ss << "__kernel void "<<name; + ss << "_reduction(__global double* A, " + "__global double *result,int arrayLength,int windowSize){\n"; + ss << " double tmp, current_result = 0.0;\n"; + ss << " int writePos = get_group_id(1);\n"; + ss << " int offset = get_group_id(1);\n"; + ss << " int lidx = get_local_id(0);\n"; + ss << " __local double shm_buf[256];\n"; + ss << " if (arrayLength == windowSize)\n"; + ss << " offset = 0;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " int loop = arrayLength/512 + 1;\n"; + ss << " for (int l=0; l<loop; l++){\n"; + ss << " tmp = 0.0;\n"; + ss << " int loopOffset = l*512;\n"; + ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n"; + ss << " tmp = A[loopOffset + lidx + offset] + " + "A[loopOffset + lidx + offset + 256];\n"; + ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n"; + ss << " tmp = A[loopOffset + lidx + offset];\n"; + ss << " shm_buf[lidx] = tmp;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " for (int i = 128; i >0; i/=2) {\n"; + ss << " if (lidx < i)\n"; + ss << " shm_buf[lidx] += shm_buf[lidx + i];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " current_result += shm_buf[0];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " result[writePos] = current_result;\n"; + ss << "}\n"; + } } @@ -573,11 +582,16 @@ public: if (CL_SUCCESS != err) throw OpenCLError(err); // reproduce the reduction function name - std::string kernelName = Base::GetName() + "_reduction"; + std::string kernelName; + if (dynamic_cast<OpSum*>(mpCodeGen.get())) + kernelName = Base::GetName() + "_reduction"; + else throw Unhandled(); + cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); if (err != CL_SUCCESS) throw OpenCLError(err); // set kernel arg of reduction kernel + // TODO(Wei Wei): use unique name for kernel err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), (void *)&(Base::mpClmem)); if (CL_SUCCESS != err) @@ -621,6 +635,14 @@ public: } } + size_t GetArrayLength(void) const {return mpDVR->GetArrayLength(); } + + size_t GetWindowSize(void) const {return mpDVR->GetRefRowSize(); } + + size_t GetStartFixed(void) const {return bIsStartFixed; } + + size_t GetEndFixed(void) const {return bIsEndFixed; } + protected: bool bIsStartFixed, bIsEndFixed; const formula::DoubleVectorRefToken *mpDVR; @@ -1001,6 +1023,75 @@ public: { i += (*it)->Marshal(k, argno + i, nVectorWidth, pProgram); } + if (OpSumIfs *OpSumCodeGen = dynamic_cast<OpSumIfs*>(mpCodeGen.get())) + { + assert(mpClmem == NULL); + // Obtain cl context + KernelEnv kEnv; + OpenclDevice::setKernelEnv(&kEnv); + cl_int err; + DynamicKernelSlidingArgument<DynamicKernelArgument> *slidingArgPtr = + dynamic_cast< DynamicKernelSlidingArgument<DynamicKernelArgument> *> + (mvSubArguments[0].get()); + cl_mem mpClmem2; + + if (OpSumCodeGen->NeedReductionKernel()) + { + assert(slidingArgPtr); + size_t nInput = slidingArgPtr -> GetArrayLength(); + size_t nCurWindowSize = slidingArgPtr -> GetWindowSize(); + std::vector<cl_mem> vclmem; + + for (SubArgumentsType::iterator it = mvSubArguments.begin(), e= mvSubArguments.end(); it!=e; + ++it) + { + vclmem.push_back((*it)->GetCLBuffer()); + } + mpClmem2 = clCreateBuffer(kEnv.mpkContext, CL_MEM_READ_WRITE, + sizeof(double)*nVectorWidth, NULL, &err); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + std::string kernelName = "SumIfs_reduction"; + cl_kernel redKernel = clCreateKernel(pProgram, kernelName.c_str(), &err); + if (err != CL_SUCCESS) + throw OpenCLError(err); + + // set kernel arg of reduction kernel + for (size_t j=0; j< vclmem.size(); j++){ + err = clSetKernelArg(redKernel, j, sizeof(cl_mem), + (void *)&vclmem[j]); + if (CL_SUCCESS != err) + throw OpenCLError(err); + } + err = clSetKernelArg(redKernel, vclmem.size(), sizeof(cl_mem), (void *)&mpClmem2); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + err = clSetKernelArg(redKernel, vclmem.size()+1, sizeof(cl_int), (void*)&nInput); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + err = clSetKernelArg(redKernel, vclmem.size()+2, sizeof(cl_int), (void*)&nCurWindowSize); + if (CL_SUCCESS != err) + throw OpenCLError(err); + // set work group size and execute + size_t global_work_size[] = {256, (size_t)nVectorWidth }; + size_t local_work_size[] = {256, 1}; + err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, NULL, + global_work_size, local_work_size, 0, NULL, NULL); + if (CL_SUCCESS != err) + throw OpenCLError(err); + err = clFinish(kEnv.mpkCmdQueue); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + // Pass mpClmem2 to the "real" kernel + err = clSetKernelArg(k, argno, sizeof(cl_mem), (void *)&mpClmem2); + if (CL_SUCCESS != err) + throw OpenCLError(err); + } + } return i; } diff --git a/sc/source/core/opencl/op_math.cxx b/sc/source/core/opencl/op_math.cxx index 5e5c7494bc2c..30eb759303f5 100644 --- a/sc/source/core/opencl/op_math.cxx +++ b/sc/source/core/opencl/op_math.cxx @@ -448,6 +448,142 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss, size_t nCurWindowSize = pCurDVR->GetArrayLength() < pCurDVR->GetRefRowSize() ? pCurDVR->GetArrayLength(): pCurDVR->GetRefRowSize() ; + + mNeedReductionKernel = vSubArguments[0]->NeedParallelReduction(); + if (mNeedReductionKernel) + { + // generate reduction functions + ss << "__kernel void "; + ss << "SumIfs_reduction( "; + for (unsigned i = 0; i < vSubArguments.size(); i++) + { + if (i) + ss << ","; + vSubArguments[i]->GenSlidingWindowDecl(ss); + } + ss << ", __global double *result,int arrayLength,int windowSize"; + + ss << ")\n{\n"; + ss << " double tmp =0;\n"; + ss << " int i ;\n"; + + GenTmpVariables(ss,vSubArguments); + ss << " double current_result = 0.0;\n"; + ss << " int writePos = get_group_id(1);\n"; + if (pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) + ss << " int offset = 0;\n"; + else if (!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) + ss << " int offset = get_group_id(1);\n"; + else + throw Unhandled(); + // actually unreachable + ss << " int lidx = get_local_id(0);\n"; + ss << " __local double shm_buf[256];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " int loop = arrayLength/512 + 1;\n"; + ss << " for (int l=0; l<loop; l++){\n"; + ss << " tmp = 0.0;\n"; + ss << " int loopOffset = l*512;\n"; + + ss << " int p1 = loopOffset + lidx + offset, p2 = p1 + 256;\n"; + ss << " if (p2 < min(offset + windowSize, arrayLength)) {\n"; + ss << " tmp0 = 0.0;\n"; + int mm=0; + std::string p1 = "p1"; + std::string p2 = "p2"; + for(unsigned j=1;j<vSubArguments.size();j+=2,mm++) + { + CheckSubArgumentIsNan2(ss,vSubArguments,j,p1); + CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1); + ss << ""; + ss <<" if(isequal("; + ss <<"tmp"; + ss <<j; + ss <<" , "; + ss << "tmp"; + ss << j+1; + ss << "))"; + ss << "{\n"; + } + CheckSubArgumentIsNan2(ss,vSubArguments,0,p1); + ss << " tmp += tmp0;\n"; + for(unsigned j=1;j<vSubArguments.size();j+=2,mm--) + { + for(int n = 0;n<mm+1;n++) + { + ss << " "; + } + ss<< "}\n\n"; + } + mm=0; + for(unsigned j=1;j<vSubArguments.size();j+=2,mm++) + { + CheckSubArgumentIsNan2(ss,vSubArguments,j,p2); + CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p2); + ss <<" if(isequal("; + ss <<"tmp"; + ss <<j; + ss <<" , "; + ss << "tmp"; + ss << j+1; + ss << ")){\n"; + } + CheckSubArgumentIsNan2(ss,vSubArguments,0,p2); + ss << " tmp += tmp0;\n"; + for(unsigned j=1;j< vSubArguments.size();j+=2,mm--) + { + for(int n = 0;n<mm+1;n++) + { + ss << " "; + } + ss<< "}\n"; + } + ss << " }\n"; + + ss << " else if (p1 < min(arrayLength, offset + windowSize)) {\n"; + mm=0; + for(unsigned j=1;j<vSubArguments.size();j+=2,mm++) + { + CheckSubArgumentIsNan2(ss,vSubArguments,j,p1); + CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1); + + ss <<" if(isequal("; + ss <<"tmp"; + ss <<j; + ss <<" , "; + ss << "tmp"; + ss << j+1; + ss << ")){\n"; + } + CheckSubArgumentIsNan2(ss,vSubArguments,0,p1); + ss << " tmp += tmp0;\n"; + for(unsigned j=1;j<vSubArguments.size();j+=2,mm--) + { + for(int n = 0;n<mm+1;n++) + { + ss << " "; + } + ss<< "}\n\n"; + } + + ss << " }\n"; + ss << " shm_buf[lidx] = tmp;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " for (int i = 128; i >0; i/=2) {\n"; + ss << " if (lidx < i)\n"; + ss << " shm_buf[lidx] += shm_buf[lidx + i];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " current_result += shm_buf[0];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + + ss << " if (lidx == 0)\n"; + ss << " result[writePos] = current_result;\n"; + ss << "}\n"; + }// finish generate reduction code + // generate functions as usual ss << "\ndouble " << sSymName; ss << "_"<< BinFuncName() <<"("; for (unsigned i = 0; i < vSubArguments.size(); i++) @@ -459,48 +595,57 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss, ss << ")\n {\n"; ss <<" int gid0=get_global_id(0);\n"; ss << " double tmp =0;\n"; - ss << " int i ;\n"; - GenTmpVariables(ss,vSubArguments); - ss << " for (i = "; - if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) { - ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n"; - } else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) { - ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n"; - } else { - ss << "0; i < "<< nCurWindowSize <<"; i++)\n"; - } - ss << " {\n"; - if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) - { - ss<< " int doubleIndex =i+gid0;\n"; - }else - { - ss<< " int doubleIndex =i;\n"; - } - ss<< " int singleIndex =gid0;\n"; - int m=0; - for(unsigned j=1;j<vSubArguments.size();j+=2,m++) - { - CheckSubArgumentIsNan(ss,vSubArguments,j); - CheckSubArgumentIsNan(ss,vSubArguments,j+1); - ss <<" if(isequal("; - ss <<"tmp"; - ss <<j; - ss <<" , "; - ss << "tmp"; - ss << j+1; - ss << ")){\n"; - } - CheckSubArgumentIsNan(ss,vSubArguments,0); - ss << " tmp += tmp0;\n"; - for(unsigned j=1;j<=vSubArguments.size();j+=2,m--) - { - for(int n = 0;n<m+1;n++) + if (!mNeedReductionKernel) + { + ss << " int i ;\n"; + GenTmpVariables(ss,vSubArguments); + ss << " for (i = "; + if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) { + ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n"; + } else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) { + ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n"; + } else { + ss << "0; i < "<< nCurWindowSize <<"; i++)\n"; + } + ss << " {\n"; + if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) { - ss << " "; + ss<< " int doubleIndex =i+gid0;\n"; + }else + { + ss<< " int doubleIndex =i;\n"; } - ss<< "}\n"; - } + ss<< " int singleIndex =gid0;\n"; + int m=0; + for(unsigned j=1;j<vSubArguments.size();j+=2,m++) + { + CheckSubArgumentIsNan(ss,vSubArguments,j); + CheckSubArgumentIsNan(ss,vSubArguments,j+1); + ss <<" if(isequal("; + ss <<"tmp"; + ss <<j; + ss <<" , "; + ss << "tmp"; + ss << j+1; + ss << ")){\n"; + } + CheckSubArgumentIsNan(ss,vSubArguments,0); + ss << " tmp += tmp0;\n"; + for(unsigned j=1;j<=vSubArguments.size();j+=2,m--) + { + for(int n = 0;n<m+1;n++) + { + ss << " "; + } + ss<< "}\n"; + } + } + if (mNeedReductionKernel) + { + ss << "tmp ="; + vSubArguments[0]->GenDeclRef(ss); + ss << "[gid0];\n"; + } ss << "return tmp;\n"; ss << "}"; } diff --git a/sc/source/core/opencl/op_math.hxx b/sc/source/core/opencl/op_math.hxx index 01cbc82fee40..7081b00b952f 100644 --- a/sc/source/core/opencl/op_math.hxx +++ b/sc/source/core/opencl/op_math.hxx @@ -33,9 +33,13 @@ public: class OpSumIfs: public CheckVariables { public: + OpSumIfs(void): CheckVariables(), mNeedReductionKernel(false) {} virtual void GenSlidingWindowFunction(std::stringstream &ss, const std::string sSymName, SubArguments &vSubArguments); virtual std::string BinFuncName(void) const { return "SumIfs"; } + bool NeedReductionKernel(void) const { return mNeedReductionKernel; } +protected: + bool mNeedReductionKernel; }; class OpCosh: public Normal { diff --git a/sc/source/core/opencl/opbase.cxx b/sc/source/core/opencl/opbase.cxx index 699833c3c285..07425dfe245e 100644 --- a/sc/source/core/opencl/opbase.cxx +++ b/sc/source/core/opencl/opbase.cxx @@ -156,6 +156,49 @@ void CheckVariables::CheckSubArgumentIsNan( std::stringstream & ss, ss << vSubArguments[i]->GenSlidingWindowDeclRef(); ss<<";\n"; } + +void CheckVariables::CheckSubArgumentIsNan2( std::stringstream & ss, + SubArguments &vSubArguments, int argumentNum, std::string p) +{ + int i = argumentNum; + if(vSubArguments[i]->GetFormulaToken()->GetType() == formula::svDouble) + { + ss <<" tmp"; + ss <<i; + ss << "="; + vSubArguments[i]->GenDeclRef(ss); + ss<<";\n"; + return; + } + +#ifdef ISNAN + ss<< " tmp"; + ss<< i; + ss<< "= fsum("; + vSubArguments[i]->GenDeclRef(ss); + if(vSubArguments[i]->GetFormulaToken()->GetType() == + formula::svDoubleVectorRef) + ss<<"["<< p.c_str()<< "]"; + else if(vSubArguments[i]->GetFormulaToken()->GetType() == + formula::svSingleVectorRef) + ss<<"[get_group_id(1)]"; + ss<<", 0);\n"; + return; +#endif + ss <<" tmp"; + ss <<i; + ss << "="; + vSubArguments[i]->GenDeclRef(ss); + if(vSubArguments[i]->GetFormulaToken()->GetType() == + formula::svDoubleVectorRef) + ss<<"["<< p.c_str()<< "]"; + else if(vSubArguments[i]->GetFormulaToken()->GetType() == + formula::svSingleVectorRef) + ss<<"[get_group_id(1)]"; + + ss<<";\n"; +} + void CheckVariables::CheckAllSubArgumentIsNan( std::stringstream & ss, SubArguments & vSubArguments) { diff --git a/sc/source/core/opencl/opbase.hxx b/sc/source/core/opencl/opbase.hxx index 41e45877e2e8..6b475df42a6b 100644 --- a/sc/source/core/opencl/opbase.hxx +++ b/sc/source/core/opencl/opbase.hxx @@ -104,6 +104,9 @@ public: virtual void DumpInlineFun(std::set<std::string>& , std::set<std::string>& ) const {} const std::string& GetName(void) const { return mSymName; } + cl_mem GetCLBuffer(void) const {return mpClmem; } + virtual bool NeedParallelReduction(void) const { return false; } + protected: const std::string mSymName; FormulaTreeNodeRef mFormulaTree; @@ -157,6 +160,9 @@ public: SubArguments &vSubArguments, int argumentNum); void CheckAllSubArgumentIsNan(std::stringstream &ss, SubArguments &vSubArguments); + // only check isNan + void CheckSubArgumentIsNan2(std::stringstream &ss, + SubArguments &vSubArguments, int argumentNum, std::string p); }; }} |