diff options
author | Haidong Lian <haidong@multicorewareinc.com> | 2013-08-30 15:35:17 -0400 |
---|---|---|
committer | Kohei Yoshida <kohei.yoshida@gmail.com> | 2013-08-30 15:58:59 -0400 |
commit | e791fbfc0435f4a9522288154132df2760ef14a2 (patch) | |
tree | c3de2be83a00830ebd8f9e55dd679e9083f9793a /sc | |
parent | ccf7b15c0a5776c6431fdcb0c0e2b0f3935ae3dc (diff) |
Patch for milestone1-0829-v4.
1. Add the parser based on RPN;
2. For test sample1 named "ground-water-daily.xls", using the compound formula
to do calculation;
Add the compound kernels:
Formulae include "AVERAGE,MAX and MIN".Compound formulae include "AVERAGE
-(+,*,/)","MAX -(+,*,/)" and "MIN -(+,*,/)";
3. For formulae which do not work in GPU, they'll work in CPU;
4. For compound operators(-,+,*,/), they'll be calculated one by one in GPU as
the sequence of RPN;
5. Add the start and end position to fit for the sliding window;
6. Modify kernels by using vector for AMD GPU.
Conflicts:
sc/source/core/opencl/formulagroupcl.cxx
sc/source/core/opencl/openclwrapper.cxx
Change-Id: I6157008575ce89ddd3e7bf552a87812474af4125
Diffstat (limited to 'sc')
-rw-r--r-- | sc/source/core/opencl/formulagroupcl.cxx | 1151 | ||||
-rw-r--r-- | sc/source/core/opencl/oclkernels.hxx | 239 | ||||
-rw-r--r-- | sc/source/core/opencl/openclwrapper.cxx | 1359 | ||||
-rw-r--r-- | sc/source/core/opencl/openclwrapper.hxx | 182 |
4 files changed, 2188 insertions, 743 deletions
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx index 274af4e442c8..a835c461abbb 100644 --- a/sc/source/core/opencl/formulagroupcl.cxx +++ b/sc/source/core/opencl/formulagroupcl.cxx @@ -19,6 +19,10 @@ #include "openclwrapper.hxx" +#define SRCDATASIZE 100 +#define SINGLEARRAYLEN 100 +#define DOUBLEARRAYLEN 100 +#define SVDOUBLELEN 100 namespace sc { // A single public entry point for a factory function: @@ -38,30 +42,632 @@ double getTimeDiff(const TimeValue& t1, const TimeValue& t2) }//dbg-t TimeValue aTimeBefore, aTimeAfter; /////////////////////////////////////// +class SourceData +{ + const double *mdpSrcPtr; + unsigned int mnDataSize; + const char *mcpFormulaName; + unsigned int mnCol; + int eOp; +public: + SourceData( const double *dpData, unsigned int nSize, uint nCol = 1,const char *cpFormulaName = NULL):mdpSrcPtr(dpData),mnDataSize(nSize),mcpFormulaName(cpFormulaName),mnCol(nCol) + { + } + SourceData():mdpSrcPtr(NULL),mnDataSize(0) + { + } + void setSrcPtr( const double *dpTmpDataPtr) + { + mdpSrcPtr = dpTmpDataPtr; + } + void setSrcSize( int nSize ) + { + mnDataSize = nSize; + } + const double * getDouleData() + { + return mdpSrcPtr; + } + unsigned int getDataSize() + { + return mnDataSize; + } + void print() + { + for( uint i=0; i<mnDataSize; i++ ) + printf( " The SourceData is %f and data size is %d\n",mdpSrcPtr[i],mnDataSize ); + } + void printFormula() + { + printf("--------The formulaname is %s and the eOp is %d---------\n",mcpFormulaName,eOp); + } + void setFormulaName(const char *cpFormulaName) + { + this->mcpFormulaName = cpFormulaName; + } + const char *getFormulaName() + { + return mcpFormulaName; + } + void seteOp(int op) + { + this->eOp = op; + } + int geteOp() + { + return eOp; + } + int getColNum() + { + return mnCol; + } + +}; class FormulaGroupInterpreterOpenCL : public FormulaGroupInterpreterSoftware { + SourceData *mSrcDataStack[SRCDATASIZE]; + unsigned int mnStackPointer,mnDoublePtrCount; + uint * mnpOclStartPos; + uint * mnpOclEndPos; + SingleVectorFormula *mSingleArray[SINGLEARRAYLEN]; + DoubleVectorFormula *mDoubleArray[DOUBLEARRAYLEN]; + double mdpSvdouble[SVDOUBLELEN]; + double *mdpSrcDoublePtr[SVDOUBLELEN]; + uint mnSingleCount; + uint mnDoubleCount; + uint mnSvDoubleCount; + uint mnOperatorGroup[100]; + uint mnOperatorCount; + char mcHostName[100]; + uint mnPositonLen; + size_t mnRowSize; public: FormulaGroupInterpreterOpenCL() : FormulaGroupInterpreterSoftware() { - OclCalc::InitEnv(); + mnStackPointer = 0; + mnpOclEndPos = NULL; + mnpOclStartPos = NULL; + mnSingleCount = 0; + mnDoubleCount = 0; + mnSvDoubleCount = 0; + mnOperatorCount = 0; + mnPositonLen = 0; + mnDoublePtrCount = 0; + OclCalc::initEnv(); } virtual ~FormulaGroupInterpreterOpenCL() { - OclCalc::ReleaseOpenclRunEnv(); + OclCalc::releaseOpenclRunEnv(); } - virtual ScMatrixRef inverseMatrix(const ScMatrix& rMat); - virtual bool interpret(ScDocument& rDoc, const ScAddress& rTopPos, - const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode); + virtual ScMatrixRef inverseMatrix( const ScMatrix& rMat ); + virtual bool interpret( ScDocument& rDoc, const ScAddress& rTopPos, + const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode ); + void collectDoublePointers( double *temp ) + { + if( mnDoublePtrCount < SRCDATASIZE ) + { + mdpSrcDoublePtr[mnDoublePtrCount++] = temp; + } + else + { + printf( "The mdpSrcDoublePtr is full now.\n" ); + double *dtmp = NULL; + if ( (dtmp = mdpSrcDoublePtr[--mnDoublePtrCount]) != NULL ) + { + free( dtmp ); + dtmp = NULL; + } + } + } + + void freeDoublePointers() + { + while( mnDoublePtrCount > 0 ) + { + double *dtmp = NULL; + if ( (dtmp = mdpSrcDoublePtr[--mnDoublePtrCount]) != NULL ) + { + free( dtmp ); + dtmp = NULL; + } + } + } + + + void srdDataPush( SourceData *temp ) + { + if( mnStackPointer < SRCDATASIZE ) + { + mSrcDataStack[mnStackPointer++] = temp; + } + else + printf( "The stack is full now.\n" ); + } + SourceData *srdDataPop( void ) + { + if( mnStackPointer <= 0 ) + { + printf( "The stack was empty\n" ); + return NULL; + } + return mSrcDataStack[--mnStackPointer]; + } + unsigned int getDataSize() + { + return mnStackPointer; + } + void printStackInfo() + { + printf( "/********The stack size is %d*********\\\n",mnStackPointer ); + for ( int i = mnStackPointer - 1; i >= 0; i-- ) + mSrcDataStack[i]->print(); + } + bool getPosition(const ScTokenArray& rCode,const ScFormulaCellGroupRef& xGroup,uint nRowSize,uint *&npOclStartPos,uint *&npOclEndPos,uint *nPositonLen); + bool chooseFunction(OclCalc &ocl_calc,double *&dpResult); + bool isStockHistory(); + bool isGroundWater(); +}; +bool FormulaGroupInterpreterOpenCL::getPosition(const ScTokenArray& rCode,const ScFormulaCellGroupRef& xGroup,uint nRowSize,uint *&npOclStartPos,uint *&npOclEndPos,uint *nPositonLen) +{ + uint nColPosition = 0; + ScTokenArray * rCodePos = rCode.Clone(); + static int nCountPosSize = nRowSize; + bool isAllocFormulaOclBuf = true; + for ( const formula::FormulaToken* p = rCodePos->First(); p; p = rCodePos->Next() ) + { + switch ( p->GetType() ) + { + case formula::svDoubleVectorRef: + { + nColPosition++; + break; + } + } + } + int nPositionSize = nColPosition * nRowSize; + npOclStartPos = (unsigned int*) malloc( nPositionSize * sizeof(unsigned int) ); + npOclEndPos = (unsigned int*) malloc( nPositionSize * sizeof(unsigned int) ); + if ( nCountPosSize < nPositionSize ) + { + nCountPosSize = nPositionSize; + isAllocFormulaOclBuf = false; + } + for ( sal_Int32 i = 0; i < xGroup->mnLength; ++i ) + { + ScTokenArray * rCodeTemp = rCode.Clone(); + int j = 0; + for ( const formula::FormulaToken* p = rCodeTemp->First(); p; p = rCodeTemp->Next() ) + { + switch (p->GetType()) + { + case formula::svDoubleVectorRef: + { + const formula::DoubleVectorRefToken* p2 = static_cast<const formula::DoubleVectorRefToken*>(p); + size_t nRowStart = p2->IsStartFixed() ? 0 : i; + size_t nRowEnd = p2->GetRefRowSize() - 1; + if (!p2->IsEndFixed()) + nRowEnd += i; + npOclStartPos[j*nRowSize+i] = nRowStart;//record the start position + npOclEndPos[j*nRowSize+i] = nRowEnd;//record the end position + j++; + } + } + } + } + *nPositonLen = nPositionSize; + //Now the pos array is 0 1 2 3 4 5 0 1 2 3 4 5; + return isAllocFormulaOclBuf; +} + +bool FormulaGroupInterpreterOpenCL::isStockHistory() +{ + bool isHistory = false; + if( (mnOperatorGroup[0]== 224) && (mnOperatorGroup[1]== 227) && (mnOperatorGroup[2]== 41) && (mnOperatorGroup[3]== 43) && (mnOperatorGroup[4]== 41) ) + { + strcpy(mcHostName,"OclOperationColumnN"); + isHistory = true; + } + else if( (mnOperatorGroup[0] == 226) && (mnOperatorGroup[1] == 42) ) + { + strcpy(mcHostName,"OclOperationColumnH"); + isHistory = true; + } + else if((mnOperatorGroup[0] == 213) && (mnOperatorGroup[1] == 43) && (mnOperatorGroup[2] == 42) ) + { + strcpy(mcHostName,"OclOperationColumnJ"); + isHistory = true; + } + return isHistory; +} + +bool FormulaGroupInterpreterOpenCL::isGroundWater() +{ + bool GroundWater=false; + + if((mnOperatorGroup[0] == ocAverage && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocMax && 1 == mnSingleCount )|| + (mnOperatorGroup[0] == ocMin && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocSub && mnSvDoubleCount==1)) + { + GroundWater = true; + } + return GroundWater; +} + +bool FormulaGroupInterpreterOpenCL::chooseFunction( OclCalc &ocl_calc, double *&dpResult ) +{ + const double * dpOclSrcData = NULL; + unsigned int nSrcDataSize = 0; + const double *dpLeftData = NULL; + const double *dpRightData = NULL; + if((mnOperatorGroup[0] == ocAverage && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocMax && 1 == mnSingleCount )|| + (mnOperatorGroup[0] == ocMin && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocSub && mnSvDoubleCount==1)) + { + double delta = 0.0; + const double *pArrayToSubtractOneElementFrom; + const double *pGroundWaterDataArray; + uint nSrcData = 0; + if( mnSvDoubleCount!=1 ) + { + pArrayToSubtractOneElementFrom= mSingleArray[0]->mdpInputLeftData; + pGroundWaterDataArray= mDoubleArray[0]->mdpInputData; + nSrcData = mDoubleArray[0]->mnInputDataSize; + } + else + { + pArrayToSubtractOneElementFrom= mSingleArray[0]->mdpInputLeftData; + pGroundWaterDataArray=NULL; + delta = mdpSvdouble[0]; + } + ocl_calc.oclGroundWaterGroup( mnOperatorGroup,mnOperatorCount,pGroundWaterDataArray,pArrayToSubtractOneElementFrom,nSrcData,mnRowSize,delta,mnpOclStartPos,mnpOclEndPos,dpResult); + } + else if( isStockHistory() ) + { + return false; + } + else if(((mnSvDoubleCount==0)&&(mnSingleCount==0)&&(mnDoubleCount==1)) && + ((mnOperatorGroup[0] == ocAverage)||(mnOperatorGroup[0] == ocMax)||(mnOperatorGroup[0] == ocMin))) + { + if(mnOperatorGroup[0] == ocAverage) + strcpy(mcHostName,"oclFormulaAverage"); + if(mnOperatorGroup[0] == ocMax) + strcpy(mcHostName,"oclFormulaMax"); + if(mnOperatorGroup[0] == ocMin) + strcpy(mcHostName,"oclFormulaMin"); + DoubleVectorFormula * doubleTemp = mDoubleArray[--mnDoubleCount]; + nSrcDataSize = doubleTemp->mnInputDataSize; + dpOclSrcData = doubleTemp->mdpInputData; + if ( ocl_calc.getOpenclState()) + { + if ( ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 ) + { + ocl_calc.createFormulaBuf64Bits( nSrcDataSize, mnRowSize ); + ocl_calc.mapAndCopy64Bits( dpOclSrcData,mnpOclStartPos,mnpOclEndPos,nSrcDataSize,mnRowSize ); + ocl_calc.oclHostFormulaStatistics64Bits( mcHostName, dpResult, mnRowSize ); + } + else + { + ocl_calc.createFormulaBuf32Bits( nSrcDataSize, mnPositonLen ); + ocl_calc.mapAndCopy32Bits( dpOclSrcData, mnpOclStartPos, mnpOclEndPos, nSrcDataSize, mnRowSize); + ocl_calc.oclHostFormulaStatistics32Bits( mcHostName, dpResult, mnRowSize ); + } + } + } + else if((mnSvDoubleCount==0)&&(mnSingleCount==1)&&(mnDoubleCount==0)) + { + dpLeftData = mSingleArray[0]->mdpInputLeftData; + dpRightData = mSingleArray[0]->mdpInputRightData; + if(mnOperatorGroup[0] == ocAdd) + strcpy(mcHostName,"oclSignedAdd"); + if(mnOperatorGroup[0] == ocSub) + strcpy(mcHostName,"oclSignedSub"); + if(mnOperatorGroup[0] == ocMul) + strcpy(mcHostName,"oclSignedMul"); + if(mnOperatorGroup[0] == ocDiv) + strcpy(mcHostName,"oclSignedDiv"); + if ( ocl_calc.getOpenclState()) + { + if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 ) + { + ocl_calc.createArithmeticOptBuf64Bits( mnRowSize ); + ocl_calc.mapAndCopy64Bits(dpLeftData,dpRightData,mnRowSize); + ocl_calc.oclHostArithmeticOperator64Bits( mcHostName,dpResult,mnRowSize ); + } + else + { + ocl_calc.createArithmeticOptBuf32Bits( mnRowSize ); + ocl_calc.mapAndCopy32Bits(dpLeftData,dpRightData,mnRowSize); + ocl_calc.oclHostArithmeticOperator32Bits( mcHostName,dpResult,mnRowSize ); + } + } + } + else if( (mnSingleCount>1) && (mnSvDoubleCount==0) && (mnDoubleCount==0) ) + { + const double* dpArray[100] = {}; + int j=0; + for( uint i = 0; i < mnSingleCount; i++ ) + { + dpArray[j++] = mSingleArray[i]->mdpInputLeftData; + if( NULL != mSingleArray[i]->mdpInputRightData ) + dpArray[j++] = mSingleArray[i]->mdpInputRightData; + } + double *dpMoreColArithmetic = (double *)malloc( sizeof(double) * j * mnRowSize ); + if( NULL == dpMoreColArithmetic ) + { + printf("Memory alloc error!\n"); + return false; + } + for( uint i = 0; i < j*mnRowSize; i++ ) + { + dpMoreColArithmetic[i] = dpArray[i/mnRowSize][i%mnRowSize]; + } + if ( ocl_calc.getOpenclState()) + { + if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 ) + { + ocl_calc.createMoreColArithmeticBuf64Bits( j * mnRowSize, mnOperatorCount ); + ocl_calc.mapAndCopyMoreColArithmetic64Bits( dpMoreColArithmetic, mnRowSize * j, mnOperatorGroup, mnOperatorCount ); + ocl_calc.oclMoreColHostArithmeticOperator64Bits( mnRowSize, mnOperatorCount, dpResult,mnRowSize ); + } + else + { + ocl_calc.createMoreColArithmeticBuf32Bits( j* mnRowSize, mnOperatorCount ); + ocl_calc.mapAndCopyMoreColArithmetic32Bits(dpMoreColArithmetic, mnRowSize * j, mnOperatorGroup, mnOperatorCount); + ocl_calc.oclMoreColHostArithmeticOperator32Bits( mnRowSize, mnOperatorCount, dpResult, mnRowSize ); + } + } + } + else + { + return false; + } + return true; +} + +class agency +{ +public: + double *calculate(int nOclOp,int rowSize,OclCalc &ocl_calc,uint *npOclStartPos,uint *npOclEndPos,FormulaGroupInterpreterOpenCL *formulaInterprt); }; -ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& rMat) +double * agency::calculate( int nOclOp,int rowSize,OclCalc &ocl_calc,uint *npOclStartPos,uint *npOclEndPos,FormulaGroupInterpreterOpenCL *formulaInterprt) +{ + const double *dpLeftData = NULL; + const double *dpRightData = NULL; + const double *dpOclSrcData=NULL; + if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 ) + { + switch( nOclOp ) + { + case ocAdd: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + SourceData *temp2 = formulaInterprt->srdDataPop(); + nDataSize = temp2->getDataSize(); + dpLeftData = temp2->getDouleData(); + dpRightData = temp->getDouleData(); + nDataSize = temp2->getDataSize(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc( sizeof(double) * nDataSize ); + memset(rResult,0,rowSize); + ocl_calc.oclHostArithmeticStash64Bits( "oclSignedAdd",dpLeftData,dpRightData,rResult,temp->getDataSize() ); + formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) ); + break; + } + case ocSub: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + SourceData *temp2 = formulaInterprt->srdDataPop(); + nDataSize = temp2->getDataSize(); + dpLeftData = temp2->getDouleData(); + dpRightData = temp->getDouleData(); + nDataSize = temp2->getDataSize(); + double *rResult = NULL; // Point to the output data from GPU + rResult = ( double * )malloc( sizeof(double) * nDataSize ); + memset( rResult,0,rowSize ); + ocl_calc.oclHostArithmeticStash64Bits( "oclSignedSub",dpLeftData,dpRightData,rResult,temp->getDataSize() ); + formulaInterprt->srdDataPush( new SourceData(rResult,nDataSize) ); + break; + } + case ocMul: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + SourceData *temp2 = formulaInterprt->srdDataPop(); + nDataSize = temp2->getDataSize(); + dpLeftData = temp2->getDouleData(); + dpRightData = temp->getDouleData(); + nDataSize = temp2->getDataSize(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc( sizeof(double) * nDataSize ); + memset( rResult,0,rowSize ); + ocl_calc.oclHostArithmeticStash64Bits( "oclSignedMul",dpLeftData,dpRightData,rResult,temp->getDataSize() ); + formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) ); + break; + } + case ocDiv: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + SourceData *temp2 = formulaInterprt->srdDataPop(); + nDataSize = temp2->getDataSize(); + dpLeftData = temp2->getDouleData(); + dpRightData = temp->getDouleData(); + nDataSize = temp2->getDataSize(); + double *rResult = NULL; // Point to the output data from GPU + rResult = ( double * )malloc( sizeof(double) * nDataSize ); + memset( rResult,0,rowSize ); + ocl_calc.oclHostArithmeticStash64Bits( "oclSignedDiv",dpLeftData,dpRightData,rResult,temp->getDataSize() ); + formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) ); + break; + } + case ocMax: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + nDataSize = temp->getDataSize(); + dpOclSrcData = temp->getDouleData(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc( sizeof(double) * rowSize ); + memset( rResult,0,rowSize ); + ocl_calc.oclHostFormulaStash64Bits( "oclFormulaMax",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize ); + formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) ); + break; + } + case ocMin: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + nDataSize = temp->getDataSize(); + dpOclSrcData = temp->getDouleData(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc( sizeof(double) * rowSize ); + memset( rResult,0,rowSize ); + ocl_calc.oclHostFormulaStash64Bits( "oclFormulaMin",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize ); + formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) ); + break; + } + case ocAverage: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + nDataSize = temp->getDataSize(); + dpOclSrcData = temp->getDouleData(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc( sizeof(double) * rowSize ); + memset( rResult,0,rowSize ); + ocl_calc.oclHostFormulaStash64Bits( "oclFormulaAverage",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize ); + formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) ); + break; + } + default: + fprintf( stderr,"No OpenCL function for this calculation.\n" ); + break; + } + } + else + { + switch( nOclOp ) + { + case ocAdd: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + SourceData *temp2 = formulaInterprt->srdDataPop(); + nDataSize = temp2->getDataSize(); + dpLeftData = temp2->getDouleData(); + dpRightData = temp->getDouleData(); + nDataSize = temp2->getDataSize(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc( sizeof(double) * nDataSize ); + memset(rResult,0,rowSize); + ocl_calc.oclHostArithmeticStash32Bits( "oclSignedAdd", dpLeftData, dpRightData, rResult, temp->getDataSize() ); + formulaInterprt->srdDataPush( new SourceData(rResult, nDataSize) ); + break; + } + case ocSub: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + SourceData *temp2 = formulaInterprt->srdDataPop(); + nDataSize = temp2->getDataSize(); + dpLeftData = temp2->getDouleData(); + dpRightData = temp->getDouleData(); + nDataSize = temp2->getDataSize(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc( sizeof(double) * nDataSize ); + memset( rResult, 0, rowSize ); + ocl_calc.oclHostArithmeticStash32Bits( "oclSignedSub", dpLeftData, dpRightData, rResult, temp->getDataSize() ); + formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) ); + break; + } + case ocMul: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + SourceData *temp2 = formulaInterprt->srdDataPop(); + nDataSize = temp2->getDataSize(); + dpLeftData = temp2->getDouleData(); + dpRightData = temp->getDouleData(); + nDataSize = temp2->getDataSize(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc(sizeof(double) * nDataSize ); + memset( rResult, 0, rowSize ); + ocl_calc.oclHostArithmeticStash32Bits( "oclSignedMul", dpLeftData, dpRightData, rResult, temp->getDataSize() ); + formulaInterprt->srdDataPush( new SourceData( rResult, nDataSize ) ); + break; + } + case ocDiv: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + SourceData *temp2 = formulaInterprt->srdDataPop(); + nDataSize = temp2->getDataSize(); + dpLeftData = temp2->getDouleData(); + dpRightData = temp->getDouleData(); + nDataSize = temp2->getDataSize(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc( sizeof(double) * nDataSize ); + memset( rResult, 0, rowSize ); + ocl_calc.oclHostArithmeticStash32Bits( "oclSignedDiv", dpLeftData, dpRightData, rResult, temp->getDataSize() ); + formulaInterprt->srdDataPush( new SourceData(rResult, nDataSize) ); + break; + } + case ocMax: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + nDataSize = temp->getDataSize(); + dpOclSrcData = temp->getDouleData(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc(sizeof(double) * nDataSize ); + memset(rResult,0,rowSize); + ocl_calc.oclHostFormulaStash32Bits( "oclFormulaMax", dpOclSrcData, npOclStartPos, npOclEndPos, rResult,nDataSize, rowSize ); + formulaInterprt->srdDataPush( new SourceData( rResult, rowSize ) ); + break; + } + case ocMin: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + nDataSize = temp->getDataSize(); + dpOclSrcData = temp->getDouleData(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc( sizeof(double) * nDataSize ); + memset( rResult, 0, rowSize ); + ocl_calc.oclHostFormulaStash32Bits( "oclFormulaMin", dpOclSrcData, npOclStartPos, npOclEndPos, rResult, nDataSize, rowSize ); + formulaInterprt->srdDataPush( new SourceData( rResult, rowSize) ); + break; + } + case ocAverage: + { + unsigned int nDataSize = 0; + SourceData *temp = formulaInterprt->srdDataPop(); + nDataSize = temp->getDataSize(); + dpOclSrcData = temp->getDouleData(); + double *rResult = NULL; // Point to the output data from GPU + rResult = (double *)malloc( sizeof(double) * nDataSize ); + memset( rResult, 0, rowSize); + ocl_calc.oclHostFormulaStash32Bits( "oclFormulaAverage", dpOclSrcData, npOclStartPos, npOclEndPos, rResult, nDataSize, rowSize ); + formulaInterprt->srdDataPush( new SourceData( rResult, rowSize) ); + break; + } + default: + fprintf(stderr,"No OpenCL function for this calculation.\n"); + break; + } + } + return NULL; +} + +ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix( const ScMatrix& rMat ) { SCSIZE nC, nR; - rMat.GetDimensions(nC, nR); - if (nC != nR || nC == 0) + rMat.GetDimensions( nC, nR ); + if ( nC != nR || nC == 0 ) // Input matrix must be square. Return an empty matrix on failure and // the caller will calculate it via CPU. return ScMatrixRef(); @@ -70,29 +676,30 @@ ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& rMat) // the last, chained together in a single array. std::vector<double> aDoubles; rMat.GetDoubleArray(aDoubles); + float * fpOclMatrixSrc = NULL; float * fpOclMatrixDst = NULL; double * dpOclMatrixSrc = NULL; double * dpOclMatrixDst = NULL; uint nMatrixSize = nC * nR; static OclCalc aOclCalc; - if ( aOclCalc.GetOpenclState() ) + if ( aOclCalc.getOpenclState() ) { if ( aOclCalc.gpuEnv.mnKhrFp64Flag == 1 || aOclCalc.gpuEnv.mnAmdFp64Flag == 1 ) { - aOclCalc.CreateBuffer64Bits( dpOclMatrixSrc, dpOclMatrixDst, nMatrixSize ); + aOclCalc.createBuffer64Bits( dpOclMatrixSrc, dpOclMatrixDst, nMatrixSize ); for ( uint i = 0; i < nC; i++ ) for ( uint j = 0; j < nR; j++ ) dpOclMatrixSrc[i*nC+j] = aDoubles[j*nR+i]; - aOclCalc.OclHostMatrixInverse64Bits( "oclFormulaMtxInv", dpOclMatrixSrc, dpOclMatrixDst,aDoubles, nR ); + aOclCalc.oclHostMatrixInverse64Bits( "oclFormulaMtxInv", dpOclMatrixSrc, dpOclMatrixDst,aDoubles, nR ); } else { - aOclCalc.CreateBuffer32Bits( fpOclMatrixSrc, fpOclMatrixDst, nMatrixSize ); + aOclCalc.createBuffer32Bits( fpOclMatrixSrc, fpOclMatrixDst, nMatrixSize ); for ( uint i = 0; i < nC; i++ ) for ( uint j = 0; j < nR; j++ ) fpOclMatrixSrc[i*nC+j] = (float) aDoubles[j*nR+i]; - aOclCalc.OclHostMatrixInverse32Bits( "oclFormulaMtxInv", fpOclMatrixSrc, fpOclMatrixDst, aDoubles, nR ); + aOclCalc.oclHostMatrixInverse32Bits( "oclFormulaMtxInv", fpOclMatrixSrc, fpOclMatrixDst, aDoubles, nR ); } } @@ -111,374 +718,242 @@ ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& rMat) #else // Another way is to put the values one column at a time. const double* p = &aDoubles[0]; - for (SCSIZE i = 0; i < nC; ++i) + for( SCSIZE i = 0; i < nC; ++i ) { - xInv->PutDouble(p, nR, i, 0); + xInv->PutDouble( p, nR, i, 0 ); p += nR; } #endif return xInv; } - -bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress& rTopPos, - const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode) +bool FormulaGroupInterpreterOpenCL::interpret( ScDocument& rDoc, const ScAddress& rTopPos, + const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode ) { - generateRPNCode(rDoc, rTopPos, rCode); - - size_t rowSize = xGroup->mnLength; - fprintf(stderr,"rowSize at begin is ...%ld.\n",(long)rowSize); + generateRPNCode( rDoc, rTopPos, rCode ); + mnRowSize = xGroup->mnLength; + fprintf( stderr,"mnRowSize at begin is ...%ld.\n",(long)mnRowSize ); // The row quantity can be gotten from p2->GetArrayLength() - uint nCount1 = 0, nCount2 = 0, nCount3 = 0; int nOclOp = 0; - double *rResult = NULL; // Point to the output data from GPU - rResult = (double *)malloc(sizeof(double) * rowSize*2);// For 2 columns(B,C) - if(NULL==rResult) - { - printf("malloc err\n"); - return false; - } - memset(rResult,0,rowSize); - float * fpOclSrcData = NULL; // Point to the input data from CPU - double * dpOclSrcData = NULL; - uint * npOclStartPos = NULL; // The first position for calculation,for example,the A1 in (=MAX(A1:A100)) - uint * npOclEndPos = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100)) - float * fpLeftData = NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData) - float * fpRightData = NULL; // Right input for binary operator(+,-,*,/),for example,(=leftData/rightData) - // The rightData can't be zero for "/" - double * dpLeftData = NULL; - double * dpRightData = NULL; - - float * fpSaveData=NULL; //It is a temp pointer point the preparing memory; - float * fpSumProMergeLfData = NULL; //It merge the more col to one col is the left operator - float * fpSumProMergeRtData = NULL; //It merge the more col to one col is the right operator - double * dpSaveData=NULL; - double * dpSumProMergeLfData = NULL; - double * dpSumProMergeRtData = NULL; - uint * npSumSize=NULL; //It is a array to save the matix sizt(col *row) - int nSumproductSize=0; //It is the merge array size - bool aIsAlloc=false; //It is a flag to judge the fpSumProMergeLfData existed - unsigned int nCountMatix=0; //It is a count to save the calculate times + const double * dpOclSrcData = NULL; + const double * dpBinaryData = NULL; static OclCalc ocl_calc; - bool isSumProduct=false; - if(ocl_calc.GetOpenclState()) + unsigned int nSrcDataSize = 0; + + const double *dpResult = NULL; + double *pResult = (double *)malloc(sizeof(double) * mnRowSize); + double *dpSvDouble = NULL; + bool isSample = false; + + mnSingleCount = 0; + mnDoubleCount = 0; + mnSvDoubleCount = 0; + mnOperatorCount = 0; + mnPositonLen = 0; + if ( ocl_calc.getOpenclState() ) { - // Don't know how large the size will be applied previously, so create them as the rowSize or 65536 - // Don't know which formulae will be used previously, so create buffers for different formulae used probably - if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1) + getPosition(rCode,xGroup,mnRowSize,mnpOclStartPos,mnpOclEndPos,&mnPositonLen); + const formula::FormulaToken* p = rCode.FirstRPN(); + + bool isSingle = false; + int nCountNum=0; + do { - ocl_calc.CreateBuffer64Bits(dpOclSrcData,npOclStartPos,npOclEndPos,rowSize); - ocl_calc.CreateBuffer64Bits(dpLeftData,dpRightData,rowSize); - } - else + if ( ocPush != p->GetOpCode()) + { + nOclOp = p->GetOpCode(); + mnOperatorGroup[mnOperatorCount++] = nOclOp; + } + else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() ) + { + mnSingleCount++; + } + if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() ) + { + mnSvDoubleCount++; + } + } while ( NULL != ( p = rCode.NextRPN() ) ); + if( isGroundWater() ) { - ocl_calc.CreateBuffer32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize); - ocl_calc.CreateBuffer32Bits(fpLeftData,fpRightData,rowSize); + isSample = true; } - //printf("pptrr is %d,%d,%d\n",fpOclSrcData,npOclStartPos,npOclEndPos); - } -/////////////////////////////////////////////////////////////////////////////////////////// - - // Until we implement group calculation for real, decompose the group into - // individual formula token arrays for individual calculation. - ScAddress aTmpPos = rTopPos; - for (sal_Int32 i = 0; i < xGroup->mnLength; ++i) - { - aTmpPos.SetRow(xGroup->mnStart + i); - ScTokenArray aCode2; - for (const formula::FormulaToken* p = rCode.First(); p; p = rCode.Next()) + mnOperatorCount = 0; + mnSingleCount = 0; + mnSvDoubleCount = 0; + p = rCode.FirstRPN(); + if(isSample) { - switch (p->GetType()) + do { - case formula::svSingleVectorRef: + if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() ) { - const formula::SingleVectorRefToken* p2 = static_cast<const formula::SingleVectorRefToken*>(p); - const double* pArray = p2->GetArray(); - aCode2.AddDouble(static_cast<size_t>(i) < p2->GetArrayLength() ? pArray[i] : 0.0); + mdpSvdouble[mnSvDoubleCount++] = p->GetDouble(); } - break; - case formula::svDoubleVectorRef: + else if( ocPush == p->GetOpCode() && formula::svDoubleVectorRef == p->GetType()) { - const formula::DoubleVectorRefToken* p2 = static_cast<const formula::DoubleVectorRefToken*>(p); - const std::vector<const double*>& rArrays = p2->GetArrays(); - size_t nColSize = rArrays.size(); - size_t nRowStart = p2->IsStartFixed() ? 0 : i; - size_t nRowEnd = p2->GetRefRowSize() - 1; - if (!p2->IsEndFixed()) - nRowEnd += i; - size_t nRowSize = nRowEnd - nRowStart + 1; - //store the a matix`s rowsize and colsize,use it to calculate the matix`s size - ocl_calc.nFormulaRowSize = nRowSize; - ocl_calc.nFormulaColSize = nColSize; - ScMatrixRef pMat(new ScMatrix(nColSize, nRowSize, 0.0)); - if(ocl_calc.GetOpenclState()) + const formula::DoubleVectorRefToken* pDvr = static_cast< const formula::DoubleVectorRefToken* >( p ); + const std::vector< const double* >& rArrays = pDvr->GetArrays(); + uint rArraysSize = rArrays.size(); + int nMoreColSize = 0; + DoubleVectorFormula *SvDoubleTemp = new DoubleVectorFormula(); + if( rArraysSize > 1 ) { - npOclStartPos[i] = nRowStart; // record the start position - npOclEndPos[i] = nRowEnd; // record the end position - } - int nTempOpcode; - const formula::FormulaToken* pTemp = p; - pTemp=aCode2.Next(); - nTempOpcode=pTemp->GetOpCode(); - while(1) - { - nTempOpcode=pTemp->GetOpCode(); - if(nTempOpcode!=ocOpen && nTempOpcode!=ocPush) - break; - pTemp=aCode2.Next(); - } - if((!aIsAlloc) && (ocl_calc.GetOpenclState())&& (nTempOpcode == ocSumProduct)) - { - //nColSize * rowSize is the data size , but except the the head of data will use less the nRowSize - //the other all use nRowSize times . and it must aligen so add nRowSize-1. - nSumproductSize = nRowSize+nColSize * rowSize*nRowSize-1; - if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1) - ocl_calc.CreateBuffer64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,nSumproductSize,rowSize); - else - ocl_calc.CreateBuffer32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,nSumproductSize,rowSize); - aIsAlloc = true; - isSumProduct=true; - } - if(isSumProduct) - { - if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1) - { - if(nCountMatix%2==0) - dpSaveData = dpSumProMergeLfData; - else - dpSaveData = dpSumProMergeRtData; - } - else + double *dpMoreColData = NULL; + for ( uint loop=0; loop < rArraysSize; loop++ ) { - if(nCountMatix%2==0) - fpSaveData = fpSumProMergeLfData; - else - fpSaveData = fpSumProMergeRtData; - } - } - for (size_t nCol = 0; nCol < nColSize; ++nCol) - { - const double* pArray = rArrays[nCol]; - if( NULL==pArray ) - { - fprintf(stderr,"Error: pArray is NULL!\n"); - free(rResult); - return false; - } - if(ocl_calc.GetOpenclState()) - { - for( size_t u=nRowStart; u<=nRowEnd; u++ ) - { - if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1) - { - dpOclSrcData[u] = pArray[u]; - //fprintf(stderr,"dpOclSrcData[%d] is %f.\n",u,dpOclSrcData[u]); - if(isSumProduct) - dpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = pArray[u]; - } - else - { - // Many video cards can't support double type in kernel, so need transfer the double to float - fpOclSrcData[u] = (float)pArray[u]; - //fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]); - if(isSumProduct) - fpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = (float)pArray[u]; - } - } - } - - for (size_t nRow = 0; nRow < nRowSize; ++nRow) - { - if (nRowStart + nRow < p2->GetArrayLength()) + dpOclSrcData = rArrays[loop]; + nSrcDataSize = pDvr->GetArrayLength(); + nMoreColSize += nSrcDataSize; + dpMoreColData = (double *) realloc(dpMoreColData,nMoreColSize * sizeof(double)); + for ( uint j = nMoreColSize - nSrcDataSize, i = 0; i < nSrcDataSize; i++, j++ ) { - double fVal = pArray[nRowStart+nRow]; - pMat->PutDouble(fVal, nCol, nRow); + dpMoreColData[j] = dpOclSrcData[i]; } } + dpOclSrcData = dpMoreColData; + nSrcDataSize = nMoreColSize; } - - ScMatrixToken aTok(pMat); - aCode2.AddToken(aTok); - if(isSumProduct) + else { - npSumSize[nCountMatix/2] =nRowSize*nColSize; - nCountMatix++; + dpOclSrcData = rArrays[0]; + nSrcDataSize = pDvr->GetArrayLength(); + SvDoubleTemp->mdpInputData = dpOclSrcData; + SvDoubleTemp->mnInputDataSize = nSrcDataSize; + SvDoubleTemp->mnInputStartPosition = mnpOclStartPos[nCountNum*mnRowSize]; + SvDoubleTemp->mnInputEndPosition = mnpOclEndPos[nCountNum*mnRowSize]; + SvDoubleTemp->mnInputStartOffset = mnpOclStartPos[nCountNum*mnRowSize+1]-mnpOclStartPos[nCountNum*mnRowSize]; + SvDoubleTemp->mnInputEndOffset = mnpOclEndPos[nCountNum*mnRowSize+1]-mnpOclEndPos[nCountNum*mnRowSize]; + mDoubleArray[mnDoubleCount++] = SvDoubleTemp; + nCountNum++; } } - break; - default: - aCode2.AddToken(*p); - } - } - - ScFormulaCell* pDest = rDoc.GetFormulaCell(aTmpPos); - if (!pDest) - { - free(rResult); - return false; + else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() ) + { + const formula::SingleVectorRefToken* pSvr = static_cast<const formula::SingleVectorRefToken*>( p ); + dpBinaryData = pSvr->GetArray(); + uint nArrayLen = pSvr->GetArrayLength(); + SingleVectorFormula *SignleTemp = new SingleVectorFormula() ; + if(isSingle) + { + SignleTemp = mSingleArray[--mnSingleCount]; + SignleTemp->mdpInputRightData = dpBinaryData; + SignleTemp->mnInputRightDataSize = nArrayLen; + SignleTemp->mnInputRightStartPosition = 0; + SignleTemp->mnInputRightOffset = 0; + isSingle = false; + } + else + { + SignleTemp = new SingleVectorFormula(); + SignleTemp->mdpInputLeftData = dpBinaryData; + SignleTemp->mnInputLeftDataSize = nArrayLen; + SignleTemp->mdpInputRightData = NULL; + SignleTemp->mnInputRightDataSize = 0; + SignleTemp->mnInputLeftStartPosition = 0; + SignleTemp->mnInputLeftOffset = 0; + isSingle = true; + } + mSingleArray[mnSingleCount++] = SignleTemp; + } + else + { + nOclOp = p->GetOpCode(); + mnOperatorGroup[mnOperatorCount++] = nOclOp; + } + } while ( NULL != ( p = rCode.NextRPN() ) ); + if ( !chooseFunction( ocl_calc, pResult ) ) + return false; + else + dpResult = pResult; } - if(ocl_calc.GetOpenclState()) + else { - const formula::FormulaToken *pCur = aCode2.First(); - aCode2.Reset(); - while( ( pCur = aCode2.Next() ) != NULL ) + agency aChooseAction; + + do { - OpCode eOp = pCur->GetOpCode(); - if(eOp==0) + if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() ) + { + dpSvDouble = (double *) malloc( sizeof(double ) * mnRowSize ); + double dTempValue = p->GetDouble(); + for ( uint i = 0; i < mnRowSize; i++ ) + dpSvDouble[i] = dTempValue; + srdDataPush( new SourceData( dpSvDouble, mnRowSize ) ); + collectDoublePointers( dpSvDouble ); + } + else if( ocPush == p->GetOpCode() && formula::svDoubleVectorRef == p->GetType()) { - if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1) + const formula::DoubleVectorRefToken* pDvr = static_cast< const formula::DoubleVectorRefToken* >( p ); + const std::vector< const double* >& rArrays = pDvr->GetArrays(); + unsigned int rArraysSize = rArrays.size(); + int nMoreColSize = 0; + if(rArraysSize > 1) { - if(nCount3%2==0) - dpLeftData[nCount1++] = pCur->GetDouble(); - else - dpRightData[nCount2++] = pCur->GetDouble(); - nCount3++; + double *dpMoreColData = NULL; + for( uint loop=0; loop < rArraysSize; loop++ ) + { + dpOclSrcData = rArrays[loop]; + nSrcDataSize = pDvr->GetArrayLength(); + nMoreColSize += nSrcDataSize; + dpMoreColData = (double *) realloc(dpMoreColData,nMoreColSize * sizeof(double)); + for(uint j=nMoreColSize-nSrcDataSize,i=0;i<nSrcDataSize;i++,j++) + { + dpMoreColData[j] = dpOclSrcData[i]; + } + } + dpOclSrcData = dpMoreColData; + nSrcDataSize = nMoreColSize; + collectDoublePointers( dpMoreColData ); } else { - if(nCount3%2==0) - fpLeftData[nCount1++] = (float)pCur->GetDouble(); - else - fpRightData[nCount2++] = (float)pCur->GetDouble(); - nCount3++; + dpOclSrcData = rArrays[0]; + nSrcDataSize = pDvr->GetArrayLength(); } + srdDataPush( new SourceData( dpOclSrcData,nSrcDataSize,rArraysSize ) ); } - else if( eOp!=ocOpen && eOp!=ocClose &&eOp != ocSep) - nOclOp = eOp; - -// if(count1>0){//dbg -// fprintf(stderr,"leftData is %f.\n",fpLeftData[count1-1]); -// count1--; -// } -// if(count2>0){//dbg -// fprintf(stderr,"rightData is %f.\n",fpRightData[count2-1]); -// count2--; -// } - } - } - - if(!getenv("SC_GPU")||!ocl_calc.GetOpenclState()) - { - //fprintf(stderr,"ccCPU flow...\n\n"); - generateRPNCode(rDoc, aTmpPos, aCode2); - ScInterpreter aInterpreter(pDest, &rDoc, aTmpPos, aCode2); - aInterpreter.Interpret(); - pDest->SetResultToken(aInterpreter.GetResultToken().get()); - pDest->ResetDirty(); - pDest->SetChanged(true); + else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() ) + { + const formula::SingleVectorRefToken* pSvr = static_cast<const formula::SingleVectorRefToken*>( p ); + dpBinaryData = pSvr->GetArray(); + nSrcDataSize = pSvr->GetArrayLength(); + srdDataPush( new SourceData( dpBinaryData, nSrcDataSize ) ); + } + else + { + nOclOp = p->GetOpCode(); + aChooseAction.calculate(nOclOp,mnRowSize,ocl_calc,mnpOclStartPos,mnpOclEndPos,this); + mnSingleCount = 0; + mnDoubleCount = 0; + mnSvDoubleCount = 0; + mnOperatorCount = 0; + mnPositonLen = 0; + } + } while ( NULL != ( p = rCode.NextRPN() ) ); + SourceData * sResult = srdDataPop(); + dpResult = sResult->getDouleData(); } - } // for loop end (xGroup->mnLength) - - // For GPU calculation - if(getenv("SC_GPU")&&ocl_calc.GetOpenclState()) - { - fprintf(stderr,"ggGPU flow...\n\n"); - printf(" oclOp is... %d\n",nOclOp); - osl_getSystemTime(&aTimeBefore); //timer - if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1) + rDoc.SetFormulaResults( rTopPos, dpResult, mnRowSize ); + freeDoublePointers(); + if ( pResult ) { - fprintf(stderr,"ggGPU double precision flow...\n\n"); - //double precision - switch(nOclOp) - { - case ocAdd: - ocl_calc.OclHostArithmeticOperator64Bits("oclSignedAdd",dpLeftData,dpRightData,rResult,nCount1); - break; - case ocSub: - ocl_calc.OclHostArithmeticOperator64Bits("oclSignedSub",dpLeftData,dpRightData,rResult,nCount1); - break; - case ocMul: - ocl_calc.OclHostArithmeticOperator64Bits("oclSignedMul",dpLeftData,dpRightData,rResult,nCount1); - break; - case ocDiv: - ocl_calc.OclHostArithmeticOperator64Bits("oclSignedDiv",dpLeftData,dpRightData,rResult,nCount1); - break; - case ocMax: - ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMax",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); - break; - case ocMin: - ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMin",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); - break; - case ocAverage: - ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaAverage",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); - break; - case ocSum: - ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaSum",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); - break; - case ocCount: - ocl_calc.OclHostFormulaCount64Bits(npOclStartPos,npOclEndPos,rResult,rowSize); - break; - case ocSumProduct: - ocl_calc.OclHostFormulaSumProduct64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,rResult,rowSize); - break; - default: - fprintf(stderr,"No OpenCL function for this calculation.\n"); - break; - } + free( pResult ); + pResult = NULL; } - else + if ( mnpOclStartPos ) { - fprintf(stderr,"ggGPU float precision flow...\n\n"); - //float precision - switch(nOclOp) - { - case ocAdd: - ocl_calc.OclHostArithmeticOperator32Bits("oclSignedAdd",fpLeftData,fpRightData,rResult,nCount1); - break; - case ocSub: - ocl_calc.OclHostArithmeticOperator32Bits("oclSignedSub",fpLeftData,fpRightData,rResult,nCount1); - break; - case ocMul: - ocl_calc.OclHostArithmeticOperator32Bits("oclSignedMul",fpLeftData,fpRightData,rResult,nCount1); - break; - case ocDiv: - ocl_calc.OclHostArithmeticOperator32Bits("oclSignedDiv",fpLeftData,fpRightData,rResult,nCount1); - break; - case ocMax: - ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMax",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); - break; - case ocMin: - ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMin",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); - break; - case ocAverage: - ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaAverage",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); - break; - case ocSum: - ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaSum",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize); - break; - case ocCount: - ocl_calc.OclHostFormulaCount32Bits(npOclStartPos,npOclEndPos,rResult,rowSize); - break; - case ocSumProduct: - ocl_calc.OclHostFormulaSumProduct32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,rResult,rowSize); - break; - default: - fprintf(stderr,"No OpenCL function for this calculation.\n"); - break; - } + free( mnpOclStartPos ); + mnpOclStartPos = NULL; } - - ///////////////////////////////////////////////////// - osl_getSystemTime(&aTimeAfter); - double diff = getTimeDiff(aTimeAfter, aTimeBefore); - //if (diff >= 1.0) + if ( mnpOclEndPos ) { - fprintf(stderr,"OpenCL,diff...%f.\n",diff); + free( mnpOclEndPos ); + mnpOclEndPos = NULL; } -///////////////////////////////////////////////////// - -//rResult[i]; -// for(sal_Int32 i = 0; i < rowSize; ++i){//dbg output results -// fprintf(stderr,"After GPU,rRsults[%d] is ...%f\n",i,rResult[i]); -// } - - // Insert the double data, in rResult[i] back into the document - rDoc.SetFormulaResults(rTopPos, rResult, xGroup->mnLength); - } - - free(rResult); - - return true; + return true; + } // getOpenclState() End + else + return false; } /// Special case of formula compiler for groundwatering @@ -489,11 +964,11 @@ public: FormulaGroupInterpreterSoftware() { fprintf(stderr,"\n\n ***** Groundwater Backend *****\n\n\n"); - OclCalc::InitEnv(); + OclCalc::initEnv(); } virtual ~FormulaGroupInterpreterGroundwater() { - OclCalc::ReleaseOpenclRunEnv(); + OclCalc::releaseOpenclRunEnv(); } virtual ScMatrixRef inverseMatrix(const ScMatrix& /* rMat */) { return ScMatrixRef(); } @@ -569,7 +1044,7 @@ bool FormulaGroupInterpreterGroundwater::interpretCL(ScDocument& rDoc, const ScA fprintf (stderr, "Calculate !"); - double *pResult = ocl_calc.OclSimpleDeltaOperation( eOp, pGroundWaterDataArray, + double *pResult = ocl_calc.oclSimpleDeltaOperation( eOp, pGroundWaterDataArray, pArrayToSubtractOneElementFrom, (size_t) xGroup->mnLength, delta ); RETURN_IF_FAIL(pResult != NULL, "buffer alloc / calculaton failed"); diff --git a/sc/source/core/opencl/oclkernels.hxx b/sc/source/core/opencl/oclkernels.hxx index bcd7db093a84..53917b3bf2a9 100644 --- a/sc/source/core/opencl/oclkernels.hxx +++ b/sc/source/core/opencl/oclkernels.hxx @@ -7,8 +7,8 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#ifndef _OCL_KERNEL_H_ -#define _OCL_KERNEL_H_ +#ifndef SC_OCLKERNELS_HXX +#define SC_OCLKERNELS_HXX #ifndef USE_EXTERNAL_KERNEL #define KERNEL( ... )# __VA_ARGS__ @@ -24,6 +24,97 @@ const char *kernel_src = KERNEL( \n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n \n#else\n \n#endif\n +inline fp_t oclAverage( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray) +{ + uint start = startArray[id]; + uint end = endArray[id]; + fp_t fSum = 0.0; + fp_t zero[16] = {0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f}; + fp_t16 vSum=vload16(0,zero); + fp_t16 ptr; + __global fp_t *p = values; + p+= start; + + for(int i = 0; i < (end - start + 1)/16; ++i) + { + ptr=vload16(0,p); + vSum += ptr; + p+=16; + } + int lastData = (end-start+1)%16; + for(int i = 0; i <lastData; i++) + { + fSum += *p; + p+=1; + } + vSum.s01234567 = vSum.s01234567+vSum.s89abcdef; + vSum.s0123 = vSum.s0123+vSum.s4567; + vSum.s01 = vSum.s01+vSum.s23; + vSum.s0 = vSum.s0+vSum.s1; + fSum = vSum.s0+fSum; + fp_t fVal = fSum/(end-start+1); + return fVal; +} +inline fp_t oclMax( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray) +{ + uint start = startArray[id]; + uint end = endArray[id]; + fp_t fMax = values[start]; + fp_t zero[16] = {fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax}; + fp_t16 vMax=vload16(0,zero); + //Max + fp_t16 ptr; + __global fp_t *p = values; + p+= start; + for(int i = 0; i < (end - start + 1)/16; ++i) + { + ptr=vload16(0,p); + vMax = fmax(vMax,ptr); + p+=16; + } + int lastData = (end-start+1)%16; + for(int i = 0; i <lastData; i++) + { + fMax = fmax(fMax,*p); + p+=1; + } + vMax.s01234567 = fmax(vMax.s01234567, vMax.s89abcdef); + vMax.s0123 = fmax(vMax.s0123, vMax.s4567); + vMax.s01 = fmax(vMax.s01, vMax.s23); + vMax.s0 = fmax(vMax.s0, vMax.s1); + fMax = fmax(vMax.s0, fMax); + return fMax; +} +inline fp_t oclMin( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray) +{ + uint start = startArray[id]; + uint end = endArray[id]; + fp_t fMin = values[start]; + fp_t zero[16] = {fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin}; + fp_t16 vMin=vload16(0,zero); + //Min + fp_t16 ptr; + __global fp_t *p = values; + p+= start; + for(int i = 0; i < (end - start + 1)/16; ++i) + { + ptr=vload16(0,p); + vMin = fmin(vMin,ptr); + p+=16; + } + int lastData = (end-start+1)%16; + for(int i = 0; i <lastData; i++) + { + fMin = fmin(fMin,*p); + p+=1; + } + vMin.s01234567 = fmin(vMin.s01234567, vMin.s89abcdef); + vMin.s0123 = fmin(vMin.s0123, vMin.s4567); + vMin.s01 = fmin(vMin.s01, vMin.s23); + vMin.s0 = fmin(vMin.s0, vMin.s1); + fMin = fmin(vMin.s0, fMin); + return fMin; +} __kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData) { @@ -31,7 +122,6 @@ __kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global otData[id] = ltData[id] + rtData[id]; } - __kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData) { const unsigned int id = get_global_id(0); @@ -41,39 +131,31 @@ __kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global __kernel void oclSignedMul(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData) { int id = get_global_id(0); - otData[id] =ltData[id] * rtData[id]; + otData[id] = ltData[id] * rtData[id]; } __kernel void oclSignedDiv(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData) { const unsigned int id = get_global_id(0); - otData[id] = ltData[id] / rtData[id]; + fp_t divisor = rtData[id]; + if ( divisor != 0 ) + otData[id] = ltData[id] / divisor; + else + otData[id] = 0.0; } __kernel void oclFormulaMin(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output) { const unsigned int id = get_global_id(0); - unsigned int startFlag = start[id]; - unsigned int endFlag = end[id]; - fp_t fMinVal = input[startFlag]; - for(int i=startFlag;i<=endFlag;i++) - { - fMinVal = fmin( fMinVal, input[i] ); - } - output[id] = fMinVal; + fp_t fVal = oclMin(id,input,start,end); + output[id] = fVal ; } __kernel void oclFormulaMax(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output) { const unsigned int id = get_global_id(0); - unsigned int startFlag = start[id]; - unsigned int endFlag = end[id]; - fp_t fMaxVal = input[startFlag]; - for ( int i = startFlag; i <= endFlag; i++ ) - { - fMaxVal = fmax( fMaxVal, input[i] ); - } - output[id] = fMaxVal; + fp_t fVal = oclMax(id,input,start,end); + output[id] = fVal ; } //Sum __kernel void oclFormulaSum(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output) @@ -94,12 +176,10 @@ __kernel void oclFormulaCount(__global uint *start,__global uint *end,__global f __kernel void oclFormulaAverage(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output) { const unsigned int id = get_global_id(0); - fp_t sum=0.0; - for(int i = start[id];i<=end[id];i++) - sum += input[i]; - output[id] = sum / (end[id]-start[id]+1); -} + fp_t fVal = oclAverage(id,input,start,end); + output[id] = fVal ; +} //Sumproduct __kernel void oclFormulaSumproduct(__global fp_t *firstCol,__global uint* npSumSize,__global fp_t *output,uint nMatixSize) { @@ -147,7 +227,7 @@ __kernel void oclMinDelta(__global fp_t *values, __global fp_t *subtract, uint s // Min fp_t fMinVal = values[start]; - for(int i=start+1;i < end;i++) + for ( int i = start + 1; i < end; i++ ) { if(values[i]<fMinVal) fMinVal = values[i]; @@ -177,14 +257,14 @@ __kernel void oclFormulaMtxInv(__global fp_t * fpMatrixInput, __global fp_t * fp fpP[nOffset*nDimension+nId] = fpP[nMax*nDimension+nId]; fpP[nMax*nDimension+nId] = dMovebuffer; } -__kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMatrixOutput,__global fp_t * fpP,__global fp_t * fpY) +__kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMatrixOutput,__global fp_t * fpP,__global fp_t * fpY,__global uint* npDim) { int nId = get_global_id(0); - int nDimension = get_global_size(0); - + int nDimension = npDim[nId]; + fp_t fsum = 0.0; for ( int yi=0; yi < nDimension; yi++ ) { - fp_t fsum = 0.0; + fsum = 0.0; for ( int yj=0; yj < nDimension; yj++ ) { fsum += fpMatrixInput[yi*nDimension+yj] * fpY[nId+yj*nDimension]; @@ -194,7 +274,7 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat } for ( int xi = nDimension - 1; xi >= 0; xi-- ) { - fp_t fsum = 0.0; + fsum = 0.0; for ( int xj = 0; xj < nDimension; xj++ ) { fsum += fpMatrixInput[xi*nDimension+xj] * fpMatrixOutput[nId+nDimension*xj]; @@ -203,6 +283,101 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat } } +__kernel void oclAverageAdd(__global fp_t *values,__global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fVal = oclAverage(id,values,startArray,endArray); + output[id] = fVal + addend[id]; +} + +__kernel void oclAverageSub(__global fp_t *values,__global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fVal = oclAverage(id,values,startArray,endArray); + output[id] = fVal - subtract[id]; +} + +__kernel void oclAverageMul(__global fp_t *values,__global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fVal = oclAverage(id,values,startArray,endArray); + output[id] = fVal * multiplier[id]; +} +__kernel void oclAverageDiv(__global fp_t *values,__global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fVal = oclAverage(id,values,startArray,endArray); + fp_t divisor = div[id]; + if ( divisor != 0 ) + output[id] = fVal / divisor; + else + output[id] = 0.0; +} + +__kernel void oclMinAdd(__global fp_t *values, __global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fMin = oclMin(id,values,startArray,endArray); + output[id] = fMin + addend[id]; +} + +__kernel void oclMinSub(__global fp_t *values, __global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fMin = oclMin(id,values,startArray,endArray); + output[id] = fMin - subtract[id]; +} +__kernel void oclMinMul(__global fp_t *values, __global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fMin = oclMin(id,values,startArray,endArray); + output[id] = fMin * multiplier[id]; +} +__kernel void oclMinDiv(__global fp_t *values, __global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fMin = oclMin(id,values,startArray,endArray); + fp_t divisor = div[id]; + if ( divisor != 0 ) + output[id] = fMin / divisor; + else + output[id] = 0.0; +} +__kernel void oclMaxAdd(__global fp_t *values, __global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fMax = oclMax(id,values,startArray,endArray); + output[id] = fMax + addend[id]; +} + +__kernel void oclMaxSub(__global fp_t *values, __global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fMax = oclMax(id,values,startArray,endArray); + output[id] = fMax - subtract[id]; +} +__kernel void oclMaxMul(__global fp_t *values, __global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fMax = oclMax(id,values,startArray,endArray); + output[id] = fMax * multiplier[id]; +} +__kernel void oclMaxDiv(__global fp_t *values, __global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + fp_t fMax = oclMax(id,values,startArray,endArray); + fp_t divisor = div[id]; + if ( divisor != 0 ) + output[id] = fMax / divisor; + else + output[id] = 0.0; +} + +__kernel void oclSub( fp_t ltData, __global fp_t *rtData, __global fp_t *outData ) +{ + const unsigned int id = get_global_id(0); + outData[id] = ltData - rtData[id]; +} ); #endif // USE_EXTERNAL_KERNEL diff --git a/sc/source/core/opencl/openclwrapper.cxx b/sc/source/core/opencl/openclwrapper.cxx index 47b906be813b..6db498b67f53 100644 --- a/sc/source/core/opencl/openclwrapper.cxx +++ b/sc/source/core/opencl/openclwrapper.cxx @@ -7,19 +7,19 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#include "openclwrapper.hxx" - +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <cmath> #include "sal/config.h" #include "random.hxx" +#include "openclwrapper.hxx" #include "oclkernels.hxx" #ifdef SAL_WIN32 #include <Windows.h> #endif -//#define USE_KERNEL_FILE - +//#define USE_MAP_BUFFER +using namespace std; GPUEnv OpenclDevice::gpuEnv; int OpenclDevice::isInited =0; @@ -44,9 +44,9 @@ int OpenclDevice::isInited =0; HINSTANCE HOpenclDll = NULL; void * OpenclDll = NULL; -int OpenclDevice::LoadOpencl() +int OpenclDevice::loadOpencl() { - //fprintf(stderr, " LoadOpenclDllxx... \n"); + //fprintf(stderr, " loadOpenclDllxx... \n"); OpenclDll = static_cast<HINSTANCE>( HOpenclDll ); OpenclDll = LoadLibrary( OPENCL_DLL_NAME ); if ( !static_cast<HINSTANCE>( OpenclDll ) ) @@ -59,7 +59,7 @@ int OpenclDevice::LoadOpencl() return OCLSUCCESS; } -void OpenclDevice::FreeOpenclDll() +void OpenclDevice::freeOpenclDll() { fprintf(stderr, " Free opencllo.dll ... \n"); if ( !static_cast<HINSTANCE>( OpenclDll ) ) @@ -67,39 +67,39 @@ void OpenclDevice::FreeOpenclDll() } #endif -int OpenclDevice::InitEnv() +int OpenclDevice::initEnv() { #ifdef SAL_WIN32 while( 1 ) { - if( 1 == LoadOpencl() ) + if( 1 == loadOpencl() ) break; } #endif - InitOpenclRunEnv( 0 ); + initOpenclRunEnv( 0 ); return 1; } -int OpenclDevice::ReleaseOpenclRunEnv() +int OpenclDevice::releaseOpenclRunEnv() { - ReleaseOpenclEnv( &gpuEnv ); + releaseOpenclEnv( &gpuEnv ); #ifdef SAL_WIN32 - FreeOpenclDll(); + freeOpenclDll(); #endif return 1; } /////////////////////////////////////////////////////// /////////////////////////////////////////////////////// -inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName ) +inline int OpenclDevice::addKernelConfig( int kCount, const char *kName ) { if ( kCount < 1 ) - fprintf(stderr,"Error: ( KCount < 1 )" SAL_DETAIL_WHERE "AddKernelConfig\n" ); + fprintf(stderr,"Error: ( KCount < 1 )" SAL_DETAIL_WHERE "addKernelConfig\n" ); strcpy( gpuEnv.mArrykernelNames[kCount-1], kName ); gpuEnv.mnKernelCount++; return 0; } -int OpenclDevice::RegistOpenclKernel() +int OpenclDevice::registOpenclKernel() { if ( !gpuEnv.mnIsUserCreated ) memset( &gpuEnv, 0, sizeof(gpuEnv) ); @@ -107,37 +107,58 @@ int OpenclDevice::RegistOpenclKernel() gpuEnv.mnFileCount = 0; //argc; gpuEnv.mnKernelCount = 0UL; - AddKernelConfig( 1, (const char*) "oclFormulaMin" ); - AddKernelConfig( 2, (const char*) "oclFormulaMax" ); - AddKernelConfig( 3, (const char*) "oclFormulaSum" ); - AddKernelConfig( 4, (const char*) "oclFormulaCount" ); - AddKernelConfig( 5, (const char*) "oclFormulaAverage" ); - AddKernelConfig( 6, (const char*) "oclFormulaSumproduct" ); - AddKernelConfig( 7, (const char*) "oclFormulaMtxInv" ); - - AddKernelConfig( 8, (const char*) "oclSignedAdd" ); - AddKernelConfig( 9, (const char*) "oclSignedSub" ); - AddKernelConfig( 10, (const char*) "oclSignedMul" ); - AddKernelConfig( 11, (const char*) "oclSignedDiv" ); - AddKernelConfig( 12, (const char*) "oclAverageDelta" ); - AddKernelConfig( 13, (const char*) "oclMaxDelta" ); - AddKernelConfig( 14, (const char*) "oclMinDelta" ); - AddKernelConfig( 15, (const char*) "oclSubDelta" ); - AddKernelConfig( 16, (const char*) "oclLUDecomposition" ); + addKernelConfig( 1, (const char*) "oclFormulaMin" ); + addKernelConfig( 2, (const char*) "oclFormulaMax" ); + addKernelConfig( 3, (const char*) "oclFormulaSum" ); + addKernelConfig( 4, (const char*) "oclFormulaCount" ); + addKernelConfig( 5, (const char*) "oclFormulaAverage" ); + addKernelConfig( 6, (const char*) "oclFormulaSumproduct" ); + addKernelConfig( 7, (const char*) "oclFormulaMtxInv" ); + + addKernelConfig( 8, (const char*) "oclSignedAdd" ); + addKernelConfig( 9, (const char*) "oclSignedSub" ); + addKernelConfig( 10, (const char*) "oclSignedMul" ); + addKernelConfig( 11, (const char*) "oclSignedDiv" ); + addKernelConfig( 12, (const char*) "oclAverageDelta" ); + addKernelConfig( 13, (const char*) "oclMaxDelta" ); + addKernelConfig( 14, (const char*) "oclMinDelta" ); + addKernelConfig( 15, (const char*) "oclSubDelta" ); + addKernelConfig( 16, (const char*) "oclLUDecomposition" ); + addKernelConfig( 17, (const char*) "oclAverageDeltaRPN" ); + addKernelConfig( 18, (const char*) "oclMaxDeltaRPN" ); + addKernelConfig( 19, (const char*) "oclMinDeltaRPN" ); + addKernelConfig( 20, (const char*) "oclMoreColArithmeticOperator" ); + addKernelConfig( 21, (const char*) "oclColumnH" ); + addKernelConfig( 22, (const char*) "oclColumnL" ); + addKernelConfig( 23, (const char*) "oclColumnN" ); + addKernelConfig( 24, (const char*) "oclColumnJ" ); + addKernelConfig( 25, (const char*) "oclMaxSub" ); + addKernelConfig( 26, (const char*) "oclAverageSub" ); + addKernelConfig( 27, (const char*) "oclMinSub" ); + addKernelConfig( 28, (const char*) "oclMaxAdd" ); + addKernelConfig( 29, (const char*) "oclAverageAdd" ); + addKernelConfig( 30, (const char*) "oclMinAdd" ); + addKernelConfig( 31, (const char*) "oclMaxMul" ); + addKernelConfig( 32, (const char*) "oclAverageMul" ); + addKernelConfig( 33, (const char*) "oclMinMul" ); + addKernelConfig( 34, (const char*) "oclMaxDiv" ); + addKernelConfig( 35, (const char*) "oclAverageDiv" ); + addKernelConfig( 36, (const char*) "oclMinDiv" ); + addKernelConfig( 37, (const char*) "oclSub" );// for svDouble type return 0; } OpenclDevice::OpenclDevice() { - //InitEnv(); + //initEnv(); } OpenclDevice::~OpenclDevice() { - //ReleaseOpenclRunEnv(); + //releaseOpenclRunEnv(); } -int OpenclDevice::SetKernelEnv( KernelEnv *envInfo ) +int OpenclDevice::setKernelEnv( KernelEnv *envInfo ) { envInfo->mpkContext = gpuEnv.mpContext; envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue; @@ -146,12 +167,12 @@ int OpenclDevice::SetKernelEnv( KernelEnv *envInfo ) return 1; } -int OpenclDevice::CheckKernelName( KernelEnv *envInfo, const char *kernelName ) +int OpenclDevice::checkKernelName( KernelEnv *envInfo, const char *kernelName ) { - //printf("CheckKernelName,total count of kernels...%d\n", gpuEnv.kernelCount); + //printf("checkKernelName,total count of kernels...%d\n", gpuEnv.kernelCount); int kCount; int nFlag = 0; - for ( kCount = 0; kCount < gpuEnv.mnKernelCount; kCount++ ) + for ( kCount=0; kCount < gpuEnv.mnKernelCount; kCount++ ) { if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[kCount]) == 0 ) { @@ -174,7 +195,7 @@ int OpenclDevice::CheckKernelName( KernelEnv *envInfo, const char *kernelName ) return 1; } -int OpenclDevice::ConvertToString( const char *filename, char **source ) +int OpenclDevice::convertToString( const char *filename, char **source ) { int file_size; size_t result; @@ -211,7 +232,7 @@ int OpenclDevice::ConvertToString( const char *filename, char **source ) return 0; } -int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle ) +int OpenclDevice::binaryGenerated( const char * clFileName, FILE ** fhandle ) { unsigned int i = 0; cl_int clStatus; @@ -219,16 +240,28 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle ) char *str = NULL; FILE *fd = NULL; cl_uint numDevices=0; - clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform - CL_DEVICE_TYPE_GPU, // device_type - 0, // num_entries - NULL, // devices ID - &numDevices); + if ( getenv("SC_OPENCLCPU") ) + { + clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform + CL_DEVICE_TYPE_CPU, // device_type for CPU device + 0, // num_entries + NULL, // devices ID + &numDevices); + } + else + { + clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform + CL_DEVICE_TYPE_GPU, // device_type for GPU device + 0, // num_entries + NULL, // devices ID + &numDevices); + } + CHECK_OPENCL( clStatus, "clGetDeviceIDs" ); for ( i = 0; i < numDevices; i++ ) { + char fileName[256] = { 0 }, cl_name[128] = { 0 }; if ( gpuEnv.mpArryDevsID[i] != 0 ) { - char fileName[256] = { 0 }, cl_name[128] = { 0 }; char deviceName[1024]; clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL ); CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); @@ -248,7 +281,7 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle ) } -int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes ) +int OpenclDevice::writeBinaryToFile( const char* fileName, const char* birary, size_t numBytes ) { FILE *output = NULL; output = fopen( fileName, "wb" ); @@ -264,7 +297,7 @@ int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, s } -int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName ) +int OpenclDevice::generatBinFromKernelSource( cl_program program, const char * clFileName ) { unsigned int i = 0; cl_int clStatus; @@ -307,12 +340,6 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] ); if ( binaries[i] == NULL ) { - for ( unsigned int j = 0; j < i ; j++) - { - if (binaries[j]) - free(binaries[j]); - } - free(binaries); return 0; } } @@ -329,9 +356,10 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c /* dump out each binary into its own separate file. */ for ( i = 0; i < numDevices; i++ ) { + char fileName[256] = { 0 }, cl_name[128] = { 0 }; + if ( binarySizes[i] != 0 ) { - char fileName[256] = { 0 }, cl_name[128] = { 0 }; char deviceName[1024]; clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); @@ -342,7 +370,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c cl_name[str - clFileName] = '\0'; sprintf( fileName, "./%s-%s.bin", cl_name, deviceName ); - if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) ) + if ( !writeBinaryToFile( fileName, binaries[i], binarySizes[i] ) ) { printf("opencl-wrapper: write binary[%s] failds\n", fileName); return 0; @@ -381,7 +409,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c return 1; } -int OpenclDevice::InitOpenclAttr( OpenCLEnv * env ) +int OpenclDevice::initOpenclAttr( OpenCLEnv * env ) { if ( gpuEnv.mnIsUserCreated ) return 1; @@ -396,7 +424,7 @@ int OpenclDevice::InitOpenclAttr( OpenCLEnv * env ) return 0; } -int OpenclDevice::CreateKernel( char * kernelname, KernelEnv * env ) +int OpenclDevice::createKernel( char * kernelname, KernelEnv * env ) { int clStatus; @@ -406,13 +434,13 @@ int OpenclDevice::CreateKernel( char * kernelname, KernelEnv * env ) return clStatus != CL_SUCCESS ? 1 : 0; } -int OpenclDevice::ReleaseKernel( KernelEnv * env ) +int OpenclDevice::releaseKernel( KernelEnv * env ) { int clStatus = clReleaseKernel( env->mpkKernel ); return clStatus != CL_SUCCESS ? 1 : 0; } -int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo ) +int OpenclDevice::releaseOpenclEnv( GPUEnv *gpuInfo ) { int i = 0; int clStatus = 0; @@ -447,18 +475,18 @@ int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo ) return 1; } -int OpenclDevice::RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata ) +int OpenclDevice::runKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata ) { - printf("oclwrapper:RunKernel_wrapper...\n"); - if ( RegisterKernelWrapper( kernelName, function ) != 1 ) + printf("oclwrapper:runKernel_wrapper...\n"); + if ( registerKernelWrapper( kernelName, function ) != 1 ) { - fprintf(stderr, "Error:RunKernel_wrapper:RegisterKernelWrapper fail!\n"); + fprintf(stderr, "Error:runKernel_wrapper:registerKernelWrapper fail!\n"); return -1; } - return ( RunKernel( kernelName, usrdata ) ); + return ( runKernel( kernelName, usrdata ) ); } -int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName ) +int OpenclDevice::cachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName ) { int i; for ( i = 0; i < gpuEnvCached->mnFileCount; i++ ) @@ -475,19 +503,20 @@ int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * cl return 0; } -int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) +int OpenclDevice::compileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) { cl_int clStatus = 0; size_t length; - char *binary; + char *buildLog = NULL, *binary; const char *source; size_t source_size[1]; - int binary_status, binaryExisted, idx; - cl_uint numDevices; - FILE *fd; + int b_error, binary_status, binaryExisted, idx; + size_t numDevices; + cl_device_id *mpArryDevsID; + FILE *fd, *fd1; const char* filename = "kernel.cl"; - fprintf(stderr, "CompileKernelFile ... \n"); - if ( CachedOfKernerPrg(gpuInfo, filename) == 1 ) + fprintf(stderr, "compileKernelFile ... \n"); + if ( cachedOfKernerPrg(gpuInfo, filename) == 1 ) { return 1; } @@ -498,33 +527,31 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) source_size[0] = strlen( source ); binaryExisted = 0; - if ( ( binaryExisted = BinaryGenerated( filename, &fd ) ) == 1 ) + if ( ( binaryExisted = binaryGenerated( filename, &fd ) ) == 1 ) { clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, sizeof(numDevices), &numDevices, NULL ); CHECK_OPENCL( clStatus, "clGetContextInfo" ); - cl_device_id *mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices ); + mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices ); if ( mpArryDevsID == NULL ) { return 0; } - int b_error = 0; + b_error = 0; length = 0; b_error |= fseek( fd, 0, SEEK_END ) < 0; b_error |= ( length = ftell(fd) ) <= 0; b_error |= fseek( fd, 0, SEEK_SET ) < 0; if ( b_error ) { - free(mpArryDevsID); return 0; } binary = (char*) malloc( length + 2 ); if ( !binary ) { - free(mpArryDevsID); return 0; } @@ -537,12 +564,7 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) // grab the handles to all of the devices in the context. clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL ); - if (clStatus != CL_SUCCESS) - { - fprintf (stderr, "OpenCL error code is %d at " SAL_DETAIL_WHERE " when clGetContextInfo .\n", clStatus); - free(binary); - return 0; - } + CHECK_OPENCL( clStatus, "clGetContextInfo" ); fprintf(stderr, "Create kernel from binary\n"); gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices, @@ -600,7 +622,7 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) printf("opencl create build log fail\n"); return 0; } - char* buildLog = (char*) malloc( length ); + buildLog = (char*) malloc( length ); if ( buildLog == (char*) NULL ) { return 0; @@ -618,11 +640,10 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) if ( clStatus != CL_SUCCESS ) { printf("opencl program build info fail\n"); - free(buildLog); return 0; } - FILE *fd1 = fopen( "kernel-build.log", "w+" ); + fd1 = fopen( "kernel-build.log", "w+" ); if ( fd1 != NULL ) { fwrite( buildLog, sizeof(char), length, fd1 ); @@ -636,17 +657,17 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) strcpy( gpuEnv.mArryKnelSrcFile[idx], filename ); if ( binaryExisted == 0 ) - GeneratBinFromKernelSource( gpuEnv.mpArryPrograms[idx], filename ); + generatBinFromKernelSource( gpuEnv.mpArryPrograms[idx], filename ); gpuInfo->mnFileCount += 1; return 1; } -int OpenclDevice::GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function) +int OpenclDevice::getKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function) { int i; - //printf("----------------OpenclDevice::GetKernelEnvAndFunc\n"); + //printf("----------------OpenclDevice::getKernelEnvAndFunc\n"); for ( i = 0; i < gpuEnv.mnKernelCount; i++ ) { if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[i]) == 0 ) @@ -662,14 +683,14 @@ int OpenclDevice::GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, c return 0; } -int OpenclDevice::RunKernel( const char *kernelName, void **userdata) +int OpenclDevice::runKernel( const char *kernelName, void **userdata) { KernelEnv kEnv; cl_kernel_function function; int status; memset( &kEnv, 0, sizeof( KernelEnv ) ); - status = GetKernelEnvAndFunc( kernelName, &kEnv, &function ); + status = getKernelEnvAndFunc( kernelName, &kEnv, &function ); strcpy( kEnv.mckKernelName, kernelName ); if ( status == 1 ) { @@ -680,8 +701,9 @@ int OpenclDevice::RunKernel( const char *kernelName, void **userdata) return 0; } -int OpenclDevice::InitOpenclRunEnv( int argc ) +int OpenclDevice::initOpenclRunEnv( int argc ) { + int status = 0; if ( MAX_CLKERNEL_NUM <= 0 ) { return 1; @@ -691,9 +713,9 @@ int OpenclDevice::InitOpenclRunEnv( int argc ) if ( !isInited ) { - RegistOpenclKernel(); + registOpenclKernel(); //initialize devices, context, comand_queue - int status = InitOpenclRunEnv( &gpuEnv ); + status = initOpenclRunEnv( &gpuEnv ); if ( status ) { printf("init_opencl_env failed.\n"); @@ -709,36 +731,38 @@ int OpenclDevice::InitOpenclRunEnv( int argc ) if( gpuEnv.mnKhrFp64Flag ) { printf("----use khr double type in kernel----\n"); - status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double" ); + status = compileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" ); } else if( gpuEnv.mnAmdFp64Flag ) { printf("----use amd double type in kernel----\n"); - status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double" ); + status = compileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" ); } else { printf("----use float type in kernel----\n"); - status = CompileKernelFile( &gpuEnv, "-Dfp_t=float" ); + status = compileKernelFile( &gpuEnv, "-Dfp_t=float -Dfp_t4=float4 -Dfp_t16=float16" ); } if ( status == 0 || gpuEnv.mnKernelCount == 0 ) { - printf("CompileKernelFile failed.\n"); + printf("compileKernelFile failed.\n"); return 1; } - printf("CompileKernelFile successed.\n"); + printf("compileKernelFile successed.\n"); isInited = 1; } return 0; } -int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo ) +int OpenclDevice::initOpenclRunEnv( GPUEnv *gpuInfo ) { size_t length; cl_int clStatus; cl_uint numPlatforms, numDevices; cl_platform_id *platforms; cl_context_properties cps[3]; + char platformName[256]; + unsigned int i; // Have a look at the available platforms. @@ -765,8 +789,7 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo ) return 1; } - char platformName[256]; - for ( size_t i = 0; i < numPlatforms; i++ ) + for ( i = 0; i < numPlatforms; i++ ) { clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof( platformName ), platformName, NULL ); @@ -781,13 +804,22 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo ) //if( !strcmp( platformName, "Advanced Micro Devices, Inc." )) { gpuInfo->mpPlatformID = platforms[i]; - - clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform - CL_DEVICE_TYPE_GPU, // device_type - 0, // num_entries - NULL, // devices - &numDevices); - + if ( getenv("SC_OPENCLCPU") ) + { + clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform + CL_DEVICE_TYPE_CPU, // device_type for CPU device + 0, // num_entries + NULL, // devices + &numDevices); + } + else + { + clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform + CL_DEVICE_TYPE_GPU, // device_type for GPU device + 0, // num_entries + NULL, // devices + &numDevices); + } if ( clStatus != CL_SUCCESS ) continue; @@ -806,8 +838,15 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo ) cps[0] = CL_CONTEXT_PLATFORM; cps[1] = (cl_context_properties) gpuInfo->mpPlatformID; cps[2] = 0; - // Check for GPU. - gpuInfo->mDevType = CL_DEVICE_TYPE_GPU; + // Set device type for OpenCL + if ( getenv("SC_OPENCLCPU") ) + { + gpuInfo->mDevType = CL_DEVICE_TYPE_CPU; + } + else + { + gpuInfo->mDevType = CL_DEVICE_TYPE_GPU; + } gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus ); if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) ) @@ -845,7 +884,6 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo ) } clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL ); - CHECK_OPENCL( clStatus, "GetCommandQueueInfo" ); // Check device extensions for double type size_t aDevExtInfoSize = 0; @@ -875,10 +913,10 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo ) return 0; } -int OpenclDevice::RegisterKernelWrapper( const char *kernelName, cl_kernel_function function ) +int OpenclDevice::registerKernelWrapper( const char *kernelName, cl_kernel_function function ) { int i; - //printf("oclwrapper:RegisterKernelWrapper...%d\n", gpuEnv.mnKernelCount); + //printf("oclwrapper:registerKernelWrapper...%d\n", gpuEnv.mnKernelCount); for ( i = 0; i < gpuEnv.mnKernelCount; i++ ) { if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[i]) == 0 ) @@ -890,13 +928,13 @@ int OpenclDevice::RegisterKernelWrapper( const char *kernelName, cl_kernel_funct return 0; } -void OpenclDevice::SetOpenclState( int state ) +void OpenclDevice::setOpenclState( int state ) { //printf("OpenclDevice::setOpenclState...\n"); isInited = state; } -int OpenclDevice::GetOpenclState() +int OpenclDevice::getOpenclState() { return isInited; } @@ -906,54 +944,46 @@ OclCalc::OclCalc() fprintf(stderr,"OclCalc:: init opencl ...\n"); nFormulaColSize = 0; nFormulaRowSize = 0; + nArithmeticLen = 0; + nFormulaLen = 0; + mpClmemSrcData = NULL; + mpClmemStartPos = NULL; + mpClmemEndPos = NULL; + mpClmemLeftData = NULL; + mpClmemRightData = NULL; + mpClmemMergeLfData = NULL; + mpClmemMergeRtData = NULL; + mpClmemMatixSumSize = NULL; + mpClmemeOp = NULL; } OclCalc::~OclCalc() { - fprintf(stderr,"OclCalc:: opencl end ...\n"); + releaseOclBuffer(); } - -///////////////////////////////////////////////////////////////////////////// -int OclCalc::CreateBuffer64Bits( double *&dpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize ) +int OclCalc::releaseOclBuffer(void) { cl_int clStatus = 0; - SetKernelEnv( &kEnv ); - - mpClmemSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nBufferSize * sizeof(double), NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer" ); - mpClmemStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nBufferSize * sizeof(unsigned int), NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer" ); - mpClmemEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nBufferSize * sizeof(unsigned int), NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer" ); - - dpSrcData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemSrcData, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, - nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus ); - CHECK_OPENCL(clStatus,"clEnqueueMapBuffer"); - clFinish(kEnv.mpkCmdQueue); - npStartPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemStartPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, - nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); - clFinish( kEnv.mpkCmdQueue ); - npEndPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemEndPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, - nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus ); - CHECK_OPENCL( clStatus,"clEnqueueMapBuffer" ); - clFinish( kEnv.mpkCmdQueue ); - //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos); - return 0; + CHECK_OPENCL_RELEASE( clStatus, mpClmemSrcData ); + CHECK_OPENCL_RELEASE( clStatus, mpClmemStartPos ); + CHECK_OPENCL_RELEASE( clStatus, mpClmemEndPos ); + CHECK_OPENCL_RELEASE( clStatus, mpClmemLeftData ); + CHECK_OPENCL_RELEASE( clStatus, mpClmemRightData ); + fprintf(stderr,"OclCalc:: opencl end ...\n"); + return 1; } -int OclCalc::CreateBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize ) +///////////////////////////////////////////////////////////////////////////// + +int OclCalc::createBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize ) { cl_int clStatus = 0; - SetKernelEnv( &kEnv ); + setKernelEnv( &kEnv ); - mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), + mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ), nBufferSize * sizeof(double), NULL, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer" ); - mpClmemRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), + mpClmemRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ), nBufferSize * sizeof(double), NULL, &clStatus ); CHECK_OPENCL( clStatus,"clCreateBuffer" ); dpLeftData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue,mpClmemLeftData,CL_TRUE,CL_MAP_WRITE,0, @@ -967,54 +997,169 @@ int OclCalc::CreateBuffer64Bits( double *&dpLeftData, double *&dpRightData, int //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos); return 0; } -int OclCalc::CreateBuffer64Bits( double *&dpSumProMergeLfData, double *&dpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize ) + +int OclCalc::mapAndCopy64Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize) { cl_int clStatus = 0; - SetKernelEnv( &kEnv ); - mpClmemMergeLfData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nMatixSize * sizeof(double), NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer" ); - mpClmemMergeRtData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nMatixSize * sizeof(double), NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer" ); - mpClmemMatixSumSize = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nMatixSize * sizeof(unsigned int), NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer" ); + double * dpSrcDataMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemSrcData, CL_TRUE,CL_MAP_WRITE, 0, + nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + unsigned int *npStartPosMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemStartPos, CL_TRUE,CL_MAP_WRITE, 0, + nRowsize * sizeof(uint), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + unsigned int *npEndPosMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemEndPos, CL_TRUE, CL_MAP_WRITE, 0, + nRowsize * sizeof(uint), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + for(int i=0;i<nBufferSize;i++) + dpSrcDataMap[i] = dpTempSrcData[i]; + for(int i=0;i<nRowsize;i++) + { + npStartPosMap[i] = unStartPos[i]; + npEndPosMap[i] = unEndPos[i]; + } + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemSrcData, dpSrcDataMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemStartPos, npStartPosMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemEndPos, npEndPosMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + return 1; +} +int OclCalc::mapAndCopy64Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize ) +{ + cl_int clStatus = 0; + double *dpLeftDataMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE, + 0, nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + double *dpRightDataMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemRightData, CL_TRUE, CL_MAP_WRITE, + 0, nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + for ( int i = 0; i < nBufferSize; i++ ) + { + dpLeftDataMap[i] = dpTempLeftData[i]; + dpRightDataMap[i] = dpTempRightData[i]; + } + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, dpLeftDataMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemRightData, dpRightDataMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + return 1; +} - dpSumProMergeLfData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMergeLfData, CL_TRUE, - CL_MAP_WRITE_INVALIDATE_REGION, 0, nMatixSize * sizeof(double), - 0, NULL, NULL, &clStatus ); +int OclCalc::mapAndCopyArithmetic64Bits( const double *dpMoreColArithmetic, int nBufferSize ) +{ + cl_int clStatus = 0; + double *dpLeftDataMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE, + 0, nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus ); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); - clFinish(kEnv.mpkCmdQueue); - dpSumProMergeRtData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMergeRtData, CL_TRUE, - CL_MAP_WRITE_INVALIDATE_REGION, 0, nMatixSize * sizeof(double), - 0, NULL, NULL, &clStatus ); + clFinish( kEnv.mpkCmdQueue ); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); clFinish( kEnv.mpkCmdQueue ); - npSumSize = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMatixSumSize, CL_TRUE, - CL_MAP_WRITE_INVALIDATE_REGION, 0, nBufferSize * sizeof(uint), - 0, NULL, NULL, &clStatus ); + for ( int i = 0; i < nBufferSize; i++ ) + { + dpLeftDataMap[i] = dpMoreColArithmetic[i]; + } + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, dpLeftDataMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + return 1; +} +int OclCalc::mapAndCopyMoreColArithmetic64Bits( const double *dpMoreColArithmetic, int nBufferSize, uint *npeOp, uint neOpSize ) +{ + cl_int clStatus = 0; + double *dpLeftDataMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE, + 0, nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus ); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); clFinish( kEnv.mpkCmdQueue ); - //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos); + uint *dpeOpMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemeOp, CL_TRUE, CL_MAP_WRITE, + 0, neOpSize * sizeof(uint), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + for ( int i = 0; i < nBufferSize; i++ ) + { + dpLeftDataMap[i] = dpMoreColArithmetic[i]; + } + for( uint i = 0; i<neOpSize; i++) + { + dpeOpMap[i] = npeOp[i]; + } + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, dpLeftDataMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemeOp, dpeOpMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + return 1; +} + +int OclCalc::createFormulaBuf64Bits( int nBufferSize, int rowSize ) +{ + cl_int clStatus = 0; + setKernelEnv( &kEnv ); + mpClmemSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + nBufferSize * sizeof(double), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + nFormulaLen = nBufferSize; + mpClmemStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + rowSize * sizeof(unsigned int), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + mpClmemEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + rowSize * sizeof(unsigned int), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + clFinish( kEnv.mpkCmdQueue ); return 0; } -int OclCalc::OclHostArithmeticOperator64Bits( const char* aKernelName, double *fpLeftData, double *fpRightData, double *&rResult,int nRowSize ) + +int OclCalc::createArithmeticOptBuf64Bits( int nBufferSize ) +{ + cl_int clStatus = 0; + nArithmeticLen = nBufferSize; + setKernelEnv( &kEnv ); + mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + nBufferSize * sizeof(double), NULL, &clStatus); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + mpClmemRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + nBufferSize * sizeof(double), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + return 0; +} + +int OclCalc::createMoreColArithmeticBuf64Bits( int nBufferSize, int neOpSize ) +{ + cl_int clStatus = 0; + nArithmeticLen = nBufferSize; + setKernelEnv( &kEnv ); + mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + nBufferSize * sizeof(double), NULL, &clStatus); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + mpClmemeOp = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + neOpSize * sizeof(uint), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + return 0; + +} + +int OclCalc::oclHostArithmeticOperator64Bits( const char* aKernelName, double *&rResult,int nRowSize ) { cl_int clStatus = 0; size_t global_work_size[1]; - CheckKernelName( &kEnv, aKernelName ); + checkKernelName( &kEnv, aKernelName ); kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel" ); clFinish( kEnv.mpkCmdQueue ); - clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, fpLeftData, 0, NULL, NULL ); - CHECK_OPENCL(clStatus,"clEnqueueUnmapMemObject"); - clFinish( kEnv.mpkCmdQueue ); - clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemRightData, fpRightData, 0, NULL, NULL ); - CHECK_OPENCL(clStatus,"clEnqueueUnmapMemObject"); - clFinish( kEnv.mpkCmdQueue ); - cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nRowSize * sizeof(double), NULL, &clStatus); CHECK_OPENCL( clStatus, "clCreateBuffer" ); @@ -1032,9 +1177,11 @@ int OclCalc::OclHostArithmeticOperator64Bits( const char* aKernelName, double *f CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); clFinish( kEnv.mpkCmdQueue ); - rResult = (double *) clEnqueueMapBuffer( kEnv.mpkCmdQueue, clResult, CL_TRUE,CL_MAP_READ, + double *dpOutPut = (double *) clEnqueueMapBuffer( kEnv.mpkCmdQueue, clResult, CL_TRUE,CL_MAP_READ, 0, nRowSize*sizeof(double), 0, NULL, NULL, &clStatus ); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + for ( int i = 0; i < nRowSize; i++ ) + rResult[i] = dpOutPut[i]; clFinish( kEnv.mpkCmdQueue ); clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clResult, rResult, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); @@ -1043,31 +1190,163 @@ int OclCalc::OclHostArithmeticOperator64Bits( const char* aKernelName, double *f CHECK_OPENCL( clStatus, "clFinish" ); clStatus = clReleaseKernel( kEnv.mpkKernel ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); - clStatus = clReleaseMemObject( mpClmemLeftData ); - CHECK_OPENCL( clStatus, "clReleaseMemObject"); - clStatus = clReleaseMemObject( mpClmemRightData ); + clStatus = clReleaseMemObject( clResult ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + return 0; +} + +int OclCalc::oclMoreColHostArithmeticOperator64Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize ) +{ + cl_int clStatus = 0; + size_t global_work_size[1]; + const char *aKernelName = "oclMoreColArithmeticOperator"; + checkKernelName( &kEnv,aKernelName ); + kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateKernel" ); + cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nRowSize * sizeof(double), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemLeftData ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_int), (void *)&nDataSize ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&mpClmemeOp ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(cl_int), (void *)&neOpSize ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&clResult ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + global_work_size[0] = nRowSize; + clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, + NULL, global_work_size, NULL, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); + clFinish( kEnv.mpkCmdQueue ); + double * hostMapResult = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clResult, CL_TRUE, CL_MAP_READ, 0, + nRowSize*sizeof(double), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + for ( int i = 0; i < nRowSize; i++) + rResult[i] = hostMapResult[i]; // from gpu float type to cpu double type + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clResult, hostMapResult, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clStatus = clFinish(kEnv.mpkCmdQueue ); + CHECK_OPENCL( clStatus, "clFinish" ); + clStatus = clReleaseKernel(kEnv.mpkKernel ); + CHECK_OPENCL( clStatus, "clReleaseKernel" ); + clStatus = clReleaseMemObject( clResult ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + return 0; + +} + +int OclCalc::oclHostArithmeticStash64Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize ) +{ + cl_int clStatus = 0; + size_t global_work_size[1]; + setKernelEnv( &kEnv ); + checkKernelName( &kEnv, aKernelName ); + + cl_mem clLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR), + nRowSize * sizeof(double), (void *)dpLeftData, &clStatus); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + cl_mem clRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR), + nRowSize * sizeof(double), (void *)dpRightData, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + + cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, + nRowSize * sizeof(double), NULL, &clStatus); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + + kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateKernel" ); + clFinish( kEnv.mpkCmdQueue ); + + clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&clLeftData ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&clRightData ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&clResult ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + + global_work_size[0] = nRowSize; + clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, + NULL, global_work_size, NULL, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); + clFinish( kEnv.mpkCmdQueue ); + + clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, clResult, CL_TRUE, 0, nRowSize * sizeof(double), (double *)rResult, 0, NULL, NULL); + CHECK_OPENCL( clStatus, "clEnqueueReadBuffer" ); + + clStatus = clFinish( kEnv.mpkCmdQueue ); + CHECK_OPENCL( clStatus, "clFinish" ); + clStatus = clReleaseKernel( kEnv.mpkKernel ); + CHECK_OPENCL( clStatus, "clReleaseKernel" ); clStatus = clReleaseMemObject( clResult ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + clStatus = clReleaseMemObject( clLeftData ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + clStatus = clReleaseMemObject( clRightData ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); return 0; } -int OclCalc::OclHostFormulaStatistics64Bits( const char* aKernelName, double *fpSrcData, uint *npStartPos, uint *npEndPos, double *&output, int size ) +int OclCalc::oclHostFormulaStash64Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size ) { cl_int clStatus = 0; size_t global_work_size[1]; - CheckKernelName( &kEnv, aKernelName ); + setKernelEnv( &kEnv ); + checkKernelName( &kEnv, aKernelName ); + cl_mem clSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR), + nBufferSize * sizeof(double), (void *)dpSrcData, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + cl_mem clStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR), + size * sizeof(unsigned int), (void *)nStartPos, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + cl_mem clEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR), + size * sizeof(unsigned int), (void *)nEndPos, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel" ); - clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemSrcData, fpSrcData, 0, NULL, NULL ); - CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + cl_mem outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, size * sizeof(double), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem),(void *)&clSrcData); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&clStartPos ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&clEndPos ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(cl_mem), (void *)&outputCl ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + global_work_size[0] = size; + clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); clFinish( kEnv.mpkCmdQueue ); - clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemStartPos, npStartPos, 0, NULL, NULL ); - CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); - clFinish(kEnv.mpkCmdQueue); - clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemEndPos, npEndPos, 0, NULL, NULL ); - CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject"); + + clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, outputCl, CL_TRUE, 0, size * sizeof(double), (double *)output, 0, NULL, NULL); + CHECK_OPENCL( clStatus, "clReadBuffer" ); clFinish( kEnv.mpkCmdQueue ); + clStatus = clFinish(kEnv.mpkCmdQueue ); + CHECK_OPENCL( clStatus, "clFinish" ); + clStatus = clReleaseKernel( kEnv.mpkKernel ); + CHECK_OPENCL( clStatus, "clReleaseKernel" ); + clStatus = clReleaseMemObject( outputCl ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + clStatus = clReleaseMemObject( clSrcData ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + clStatus = clReleaseMemObject( clStartPos ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + clStatus = clReleaseMemObject( clEndPos ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + return 0; +} + +int OclCalc::oclHostFormulaStatistics64Bits( const char* aKernelName, double *&output, int size ) +{ + cl_int clStatus = 0; + size_t global_work_size[1]; + checkKernelName( &kEnv, aKernelName ); + kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateKernel" ); cl_mem outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, size * sizeof(double), NULL, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer" ); clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem),(void *)&mpClmemSrcData); @@ -1082,8 +1361,14 @@ int OclCalc::OclHostFormulaStatistics64Bits( const char* aKernelName, double *fp clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); clFinish( kEnv.mpkCmdQueue ); - output = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, - 0, size*sizeof(double), 0, NULL, NULL, &clStatus ); + double *dpOutPut = (double *) clEnqueueMapBuffer( kEnv.mpkCmdQueue, outputCl, CL_TRUE,CL_MAP_READ, + 0, size*sizeof(double), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + for ( int i = 0; i < size; i++ ) + { + output[i] = dpOutPut[i]; + } + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); clFinish( kEnv.mpkCmdQueue ); clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, outputCl, output, 0, NULL, NULL ); @@ -1092,21 +1377,15 @@ int OclCalc::OclHostFormulaStatistics64Bits( const char* aKernelName, double *fp CHECK_OPENCL( clStatus, "clFinish" ); clStatus = clReleaseKernel( kEnv.mpkKernel ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); - clStatus = clReleaseMemObject(mpClmemSrcData ); - CHECK_OPENCL( clStatus, "clReleaseMemObject" ); - clStatus = clReleaseMemObject(mpClmemStartPos ); - CHECK_OPENCL( clStatus, "clReleaseMemObject" ); - clStatus = clReleaseMemObject( mpClmemEndPos ); - CHECK_OPENCL( clStatus, "clReleaseMemObject" ); clStatus = clReleaseMemObject( outputCl ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); return 0; } -int OclCalc::OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize ) +int OclCalc::oclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize ) { const char *cpKernelName = "oclFormulaCount"; - CheckKernelName( &kEnv, cpKernelName ); + checkKernelName( &kEnv, cpKernelName ); cl_int clStatus; size_t global_work_size[1]; kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus ); @@ -1141,12 +1420,6 @@ int OclCalc::OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double CHECK_OPENCL( clStatus, "clFinish" ); clStatus = clReleaseKernel( kEnv.mpkKernel ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); - clStatus = clReleaseMemObject( mpClmemSrcData ); - CHECK_OPENCL( clStatus, "clReleaseMemObject" ); - clStatus = clReleaseMemObject( mpClmemStartPos ); - CHECK_OPENCL( clStatus, "clReleaseMemObject" ); - clStatus = clReleaseMemObject( mpClmemEndPos ); - CHECK_OPENCL( clStatus, "clReleaseMemObject" ); clStatus = clReleaseMemObject( clpOutput ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); return 0; @@ -1157,14 +1430,14 @@ int OclCalc::OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *save the npStart array eg:a4-a8;b10-b14,the npStart will store a4,b10,and the npEnd will store a8,b14 range.So it can if(i +1)%2 to judge * the a cloumn or b cloumn npStart range.so as b bolumn. */ -int OclCalc::OclHostFormulaSumProduct64Bits( double *dpSumProMergeLfData, double *dpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize ) +int OclCalc::oclHostFormulaSumProduct64Bits( double *dpSumProMergeLfData, double *dpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize ) { cl_int clStatus; size_t global_work_size[1]; memset(dpOutput,0,nSize); const char *cpFirstKernelName = "oclSignedMul"; const char *cpSecondKernelName = "oclFormulaSumproduct"; - CheckKernelName( &kEnv, cpFirstKernelName ); + checkKernelName( &kEnv, cpFirstKernelName ); kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kEnv.mckKernelName,&clStatus); CHECK_OPENCL( clStatus, "clCreateKernel" ); clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemMergeLfData, dpSumProMergeLfData, 0, NULL, NULL ); @@ -1197,7 +1470,7 @@ int OclCalc::OclHostFormulaSumProduct64Bits( double *dpSumProMergeLfData, double CHECK_OPENCL( clStatus, "clReleaseMemObject" ); clStatus = clReleaseKernel( kEnv.mpkKernel ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); - CheckKernelName( &kEnv, cpSecondKernelName ); + checkKernelName( &kEnv, cpSecondKernelName ); kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel" ); cl_mem clpOutput = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nSize* sizeof(double), NULL, &clStatus ); @@ -1220,11 +1493,13 @@ int OclCalc::OclHostFormulaSumProduct64Bits( double *dpSumProMergeLfData, double NULL, global_work_size, NULL, 0, NULL, NULL); CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); clFinish( kEnv.mpkCmdQueue ); - dpOutput = (double *)clEnqueueMapBuffer(kEnv.mpkCmdQueue, clpOutput, CL_TRUE, CL_MAP_READ, 0, - nSize*sizeof(double), 0, NULL, NULL, &clStatus ); + double * outputMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clpOutput, CL_TRUE, CL_MAP_READ, + 0, nSize*sizeof(double), 0, NULL, NULL, &clStatus ); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); clFinish( kEnv.mpkCmdQueue ); - clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpOutput, dpOutput, 0, NULL, NULL ); + for ( int i = 0; i < nSize; i++ ) + dpOutput[i] = outputMap[i]; + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpOutput, outputMap, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); clStatus = clFinish( kEnv.mpkCmdQueue ); CHECK_OPENCL( clStatus, "clFinish" ); @@ -1239,46 +1514,67 @@ int OclCalc::OclHostFormulaSumProduct64Bits( double *dpSumProMergeLfData, double return 0; } -int OclCalc::CreateBuffer32Bits( float *&fpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize ) +int OclCalc::createMoreColArithmeticBuf32Bits( int nBufferSize, int neOpSize ) { cl_int clStatus = 0; - SetKernelEnv( &kEnv ); - mpClmemSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nBufferSize * sizeof(float), NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer" ); - mpClmemStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nBufferSize * sizeof(unsigned int), NULL, &clStatus ); + nArithmeticLen = nBufferSize; + setKernelEnv( &kEnv ); + mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + nBufferSize * sizeof(float), NULL, &clStatus); CHECK_OPENCL( clStatus, "clCreateBuffer" ); - mpClmemEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nBufferSize * sizeof(unsigned int), NULL, &clStatus ); + mpClmemeOp = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + neOpSize * sizeof(uint), NULL, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer" ); - fpSrcData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemSrcData, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, - nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); clFinish( kEnv.mpkCmdQueue ); - npStartPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemStartPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, - nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + return 0; +} + +int OclCalc::createArithmeticOptBuf32Bits( int nBufferSize ) +{ + cl_int clStatus = 0; + setKernelEnv( &kEnv ); + nArithmeticLen = nBufferSize; + mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + nBufferSize * sizeof(float), NULL, &clStatus); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + mpClmemRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + nBufferSize * sizeof(float), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); clFinish( kEnv.mpkCmdQueue ); - npEndPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemEndPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, - nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + return 0; +} + +int OclCalc::createFormulaBuf32Bits( int nBufferSize, int rowSize ) +{ + cl_int clStatus = 0; + setKernelEnv( &kEnv ); + nFormulaLen = nBufferSize; + + mpClmemSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + nBufferSize * sizeof(float), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + + mpClmemStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + rowSize * sizeof(unsigned int), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + mpClmemEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + rowSize * sizeof(unsigned int), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); clFinish( kEnv.mpkCmdQueue ); - //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos); return 0; } -int OclCalc::CreateBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize ) +int OclCalc::createBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize ) { cl_int clStatus = 0; - SetKernelEnv( &kEnv ); + setKernelEnv( &kEnv ); mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), nBufferSize * sizeof(float), NULL, &clStatus); CHECK_OPENCL( clStatus, "clCreateBuffer" ); mpClmemRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), nBufferSize * sizeof(unsigned int), NULL, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer" ); - fpLeftData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, + fpLeftData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE,CL_MAP_WRITE_INVALIDATE_REGION, 0, nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus ); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); clFinish( kEnv.mpkCmdQueue ); @@ -1289,48 +1585,120 @@ int OclCalc::CreateBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nB //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos); return 0; } -int OclCalc::CreateBuffer32Bits( float *&fpSumProMergeLfData, float *&fpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize ) + +int OclCalc::mapAndCopy32Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize) { cl_int clStatus = 0; - SetKernelEnv( &kEnv ); - mpClmemMergeLfData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nMatixSize * sizeof(float), NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer" ); - mpClmemMergeRtData = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nMatixSize * sizeof(float), NULL, &clStatus); - CHECK_OPENCL( clStatus, "clCreateBuffer" ); - mpClmemMatixSumSize = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), - nMatixSize * sizeof(unsigned int), NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer" ); - fpSumProMergeLfData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMergeLfData, CL_TRUE, - CL_MAP_WRITE_INVALIDATE_REGION, 0, nMatixSize * sizeof(float), 0, NULL, NULL, &clStatus ); + float *fpSrcData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemSrcData, CL_TRUE, CL_MAP_WRITE, 0, + nBufferSize * sizeof(float) , 0, NULL, NULL, &clStatus ); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); clFinish( kEnv.mpkCmdQueue ); - fpSumProMergeRtData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMergeRtData, CL_TRUE, - CL_MAP_WRITE_INVALIDATE_REGION, 0, nMatixSize * sizeof(float), 0, NULL, NULL, &clStatus ); + unsigned int *npStartPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemStartPos, CL_TRUE, CL_MAP_WRITE, 0, + nRowsize * sizeof(uint), 0, NULL, NULL, &clStatus ); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); clFinish( kEnv.mpkCmdQueue ); - npSumSize = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMatixSumSize, CL_TRUE, - CL_MAP_WRITE_INVALIDATE_REGION, 0, nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus ); + unsigned int *npEndPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemEndPos, CL_TRUE, CL_MAP_WRITE, 0, + nRowsize * sizeof(uint), 0, NULL, NULL, &clStatus ); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); clFinish( kEnv.mpkCmdQueue ); - //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos); - return 0; + for(int i=0;i<nBufferSize;i++) + { + fpSrcData[i] = (float)dpTempSrcData[i]; + } + for(int i=0;i<nRowsize;i++) + { + npStartPos[i] = unStartPos[i]; + npEndPos[i] = unEndPos[i]; + } + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemSrcData, fpSrcData, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemStartPos, npStartPos, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemEndPos, npEndPos, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + return 1; } -int OclCalc::OclHostArithmeticOperator32Bits( const char* aKernelName, float *fpLeftData, float *fpRightData, double *rResult, int nRowSize ) +int OclCalc::mapAndCopy32Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize ) { cl_int clStatus = 0; - size_t global_work_size[1]; - CheckKernelName( &kEnv,aKernelName ); - kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateKernel" ); + float *fpLeftData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE, + 0, nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + float *fpRightData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemRightData, CL_TRUE, CL_MAP_WRITE, + 0, nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + for(int i=0;i<nBufferSize;i++) + { + fpLeftData[i] = (float)dpTempLeftData[i]; + fpRightData[i] = (float)dpTempRightData[i]; + } clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, fpLeftData, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); clFinish( kEnv.mpkCmdQueue ); clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemRightData, fpRightData, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); clFinish( kEnv.mpkCmdQueue ); + return 1; +} +int OclCalc::mapAndCopyArithmetic32Bits( const double *dpMoreColArithmetic, int nBufferSize ) +{ + cl_int clStatus = 0; + float *dpLeftDataMap = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE, + 0, nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + for ( int i = 0; i < nBufferSize; i++ ) + { + dpLeftDataMap[i] = (float)dpMoreColArithmetic[i]; + } + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, dpLeftDataMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + return 1; +} +int OclCalc::mapAndCopyMoreColArithmetic32Bits( const double *dpMoreColArithmetic, int nBufferSize, uint *npeOp, uint neOpSize ) +{ + cl_int clStatus = 0; + float *fpLeftDataMap = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE, + 0, nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + uint *dpeOpMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemeOp, CL_TRUE, CL_MAP_WRITE, + 0, neOpSize * sizeof(uint), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + for ( int i = 0; i < nBufferSize; i++ ) + { + fpLeftDataMap[i] = (float)dpMoreColArithmetic[i]; + } + for( uint i = 0; i<neOpSize; i++ ) + { + dpeOpMap[i] = npeOp[i]; + } + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, fpLeftDataMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemeOp, dpeOpMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + return 1; +} + +int OclCalc::oclHostArithmeticOperator32Bits( const char* aKernelName,double *rResult, int nRowSize ) +{ + cl_int clStatus = 0; + size_t global_work_size[1]; + + checkKernelName( &kEnv,aKernelName ); + kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateKernel" ); + cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nRowSize * sizeof(float), NULL, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer" ); clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemLeftData ); @@ -1356,31 +1724,61 @@ int OclCalc::OclHostArithmeticOperator32Bits( const char* aKernelName, float *fp CHECK_OPENCL( clStatus, "clFinish" ); clStatus = clReleaseKernel(kEnv.mpkKernel ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); - clStatus = clReleaseMemObject(mpClmemLeftData ); - CHECK_OPENCL( clStatus, "clReleaseMemObject" ); - clStatus = clReleaseMemObject( mpClmemRightData ); - CHECK_OPENCL( clStatus, "clReleaseMemObject" ); clStatus = clReleaseMemObject( clResult ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); return 0; } -int OclCalc::OclHostFormulaStatistics32Bits(const char* aKernelName,float *fpSrcData,uint *npStartPos,uint *npEndPos,double *output,int size) +int OclCalc::oclMoreColHostArithmeticOperator32Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize ) { cl_int clStatus = 0; size_t global_work_size[1]; - CheckKernelName( &kEnv, aKernelName ); - kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus ); + const char *aKernelName = "oclMoreColArithmeticOperator"; + checkKernelName( &kEnv,aKernelName ); + kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel" ); - clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemSrcData, fpSrcData, 0, NULL, NULL ); - CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nRowSize * sizeof(float), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemLeftData ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_int), (void *)&nDataSize ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&mpClmemeOp ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(cl_int), (void *)&neOpSize ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&clResult ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + global_work_size[0] = nRowSize; + clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, + NULL, global_work_size, NULL, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); clFinish( kEnv.mpkCmdQueue ); - clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemStartPos, npStartPos, 0, NULL, NULL ); - CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + float * hostMapResult = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clResult, CL_TRUE, CL_MAP_READ, 0, + nRowSize*sizeof(float), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); clFinish( kEnv.mpkCmdQueue ); - clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemEndPos, npEndPos, 0, NULL, NULL ); + for ( int i = 0; i < nRowSize; i++) + rResult[i] = hostMapResult[i]; // from gpu float type to cpu double type + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clResult, hostMapResult, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); - clFinish( kEnv.mpkCmdQueue ); + clStatus = clFinish(kEnv.mpkCmdQueue ); + CHECK_OPENCL( clStatus, "clFinish" ); + clStatus = clReleaseKernel(kEnv.mpkKernel ); + CHECK_OPENCL( clStatus, "clReleaseKernel" ); + clStatus = clReleaseMemObject( clResult ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + return 0; +} + +int OclCalc::oclHostFormulaStatistics32Bits(const char* aKernelName,double *output,int size) +{ + cl_int clStatus = 0; + size_t global_work_size[1]; + checkKernelName( &kEnv, aKernelName ); + kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateKernel" ); + cl_mem outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, size * sizeof(float), NULL, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer" ); clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemSrcData ); @@ -1407,21 +1805,141 @@ int OclCalc::OclHostFormulaStatistics32Bits(const char* aKernelName,float *fpSrc CHECK_OPENCL( clStatus, "clFinish" ); clStatus = clReleaseKernel( kEnv.mpkKernel ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); - clStatus = clReleaseMemObject( mpClmemSrcData ); + clStatus = clReleaseMemObject( outputCl ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); - clStatus = clReleaseMemObject( mpClmemStartPos ); + return 0; +} + +int OclCalc::oclHostArithmeticStash32Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize ) +{ + cl_int clStatus = 0; + size_t global_work_size[1]; + setKernelEnv( &kEnv ); + checkKernelName( &kEnv, aKernelName ); + float *fpLeftData = (float *)malloc( sizeof(float) * nRowSize ); + float *fpRightData = (float *)malloc( sizeof(float) * nRowSize ); + float *fpResult = (float *)malloc( sizeof(float) * nRowSize ); + for(int i=0;i<nRowSize;i++) + { + fpLeftData[i] = (float)dpLeftData[i]; + fpRightData[i] = (float)dpRightData[i]; + } + cl_mem clLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR), + nRowSize * sizeof(float), (void *)fpLeftData, &clStatus); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + cl_mem clRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR), + nRowSize * sizeof(float), (void *)fpRightData, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + + cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, + nRowSize * sizeof(float), NULL, &clStatus); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + + kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateKernel" ); + clFinish( kEnv.mpkCmdQueue ); + + clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&clLeftData ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&clRightData ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&clResult ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + + global_work_size[0] = nRowSize; + clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, + NULL, global_work_size, NULL, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); + clFinish( kEnv.mpkCmdQueue ); + + clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, clResult, CL_TRUE, 0, nRowSize * sizeof(float), (float *)fpResult, 0, NULL, NULL); + CHECK_OPENCL( clStatus, "clEnqueueReadBuffer" ); + for(int i=0;i<nRowSize;i++) + rResult[i] = (double)fpResult[i]; + if(fpResult) + { + free(fpResult); + fpResult = NULL; + } + clStatus = clFinish( kEnv.mpkCmdQueue ); + CHECK_OPENCL( clStatus, "clFinish" ); + clStatus = clReleaseKernel( kEnv.mpkKernel ); + CHECK_OPENCL( clStatus, "clReleaseKernel" ); + clStatus = clReleaseMemObject( clResult ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); - clStatus = clReleaseMemObject( mpClmemEndPos ); + clStatus = clReleaseMemObject( clLeftData ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + clStatus = clReleaseMemObject( clRightData ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + return 0; +} + +int OclCalc::oclHostFormulaStash32Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size ) +{ + cl_int clStatus = 0; + size_t global_work_size[1]; + setKernelEnv( &kEnv ); + checkKernelName( &kEnv, aKernelName ); + float *fpSrcData = (float *)malloc( sizeof(float) * nBufferSize ); + float *fpResult = (float *)malloc( sizeof(float) * size ); + for(int i=0;i<nBufferSize;i++) + fpSrcData[i] = (float)dpSrcData[i]; + cl_mem clSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_COPY_HOST_PTR), + nBufferSize * sizeof(float), (void *)fpSrcData, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + cl_mem clStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_COPY_HOST_PTR), + size * sizeof(unsigned int), (void *)nStartPos, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + cl_mem clEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_COPY_HOST_PTR), + size * sizeof(unsigned int), (void *)nEndPos, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + + kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateKernel" ); + cl_mem outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, size * sizeof(double), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem),(void *)&clSrcData); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&clStartPos ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&clEndPos ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(cl_mem), (void *)&outputCl ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + global_work_size[0] = size; + clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); + clFinish( kEnv.mpkCmdQueue ); + + clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, outputCl, CL_TRUE, 0, size * sizeof(float), (double *)fpResult, 0, NULL, NULL); + CHECK_OPENCL( clStatus, "clReadBuffer" ); + for(int i = 0;i<size;i++) + output[i] = (float)fpResult[i]; + clFinish( kEnv.mpkCmdQueue ); + if(fpResult) + { + free(fpResult); + fpResult = NULL; + } + clStatus = clFinish(kEnv.mpkCmdQueue ); + CHECK_OPENCL( clStatus, "clFinish" ); + clStatus = clReleaseKernel( kEnv.mpkKernel ); + CHECK_OPENCL( clStatus, "clReleaseKernel" ); clStatus = clReleaseMemObject( outputCl ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + clStatus = clReleaseMemObject( clSrcData ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + clStatus = clReleaseMemObject( clStartPos ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + clStatus = clReleaseMemObject( clEndPos ); + CHECK_OPENCL( clStatus, "clReleaseMemObject" ); return 0; } -int OclCalc::OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize ) +int OclCalc::oclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize ) { const char *cpKernelName = "oclFormulaCount"; - CheckKernelName( &kEnv, cpKernelName ); + checkKernelName( &kEnv, cpKernelName ); cl_int clStatus; size_t global_work_size[1]; kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus ); @@ -1470,14 +1988,14 @@ int OclCalc::OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double } //sumproduct -int OclCalc::OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize ) +int OclCalc::oclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize ) { cl_int clStatus; size_t global_work_size[1]; memset(dpOutput,0,nSize); const char *cpFirstKernelName = "oclSignedMul"; const char *cpSecondKernelName = "oclFormulaSumproduct"; - CheckKernelName( &kEnv, cpFirstKernelName ); + checkKernelName( &kEnv, cpFirstKernelName ); kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel" ); clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemMergeLfData, fpSumProMergeLfData, 0, NULL, NULL ); @@ -1508,7 +2026,7 @@ int OclCalc::OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float * CHECK_OPENCL( clStatus, "clReleaseMemObject" ); clStatus = clReleaseKernel( kEnv.mpkKernel ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); - CheckKernelName( &kEnv,cpSecondKernelName ); + checkKernelName( &kEnv,cpSecondKernelName ); kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel" ); cl_mem clpOutput = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nSize* sizeof(float), NULL, &clStatus ); @@ -1556,12 +2074,13 @@ int OclCalc::OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float * static cl_mem allocateDoubleBuffer( KernelEnv &rEnv, const double *_pValues, size_t nElements, cl_int *pStatus ) { // Ugh - horrible redundant copying ... - cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), nElements * sizeof(double), NULL, pStatus); double *pValues = (double *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0, nElements * sizeof(double), 0, NULL, NULL, NULL); clFinish(rEnv.mpkCmdQueue); - memcpy( pValues, _pValues, nElements*sizeof(double) ); + for ( int i = 0; i < (int)nElements; i++ ) + pValues[i] = _pValues[i]; clEnqueueUnmapMemObject( rEnv.mpkCmdQueue, xValues, pValues, 0, NULL, NULL ); clFinish( rEnv.mpkCmdQueue ); return xValues; @@ -1570,7 +2089,7 @@ static cl_mem allocateDoubleBuffer( KernelEnv &rEnv, const double *_pValues, siz static cl_mem allocateFloatBuffer( KernelEnv &rEnv, const double *_pValues, size_t nElements, cl_int *pStatus ) { // Ugh - horrible redundant copying ... - cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), + cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), nElements * sizeof(float), NULL, pStatus); float *pValues = (float *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0, nElements * sizeof(float), 0, NULL, NULL, NULL ); @@ -1583,9 +2102,185 @@ static cl_mem allocateFloatBuffer( KernelEnv &rEnv, const double *_pValues, size return xValues; } -double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double del ) +int OclCalc::oclGroundWaterGroup( uint *eOp, uint eOpNum, const double *pOpArray, const double *pSubtractSingle, size_t nSrcDataSize,size_t nElements, double del ,uint *nStartPos,uint *nEndPos,double *dpResult) +{ + setKernelEnv( &kEnv ); + + char kernelName[256] = ""; + double delta = del; + bool subFlag = false; + strcat(kernelName,"ocl"); + for ( size_t i = 0; i < eOpNum; i++ ) + { + switch ( eOp[i] ) + { + case ocAdd: + strcat(kernelName,"Add"); + break; + case ocSub: + strcat(kernelName,"Sub"); + break; + case ocMul: + strcat(kernelName,"Mul"); + break; + case ocDiv: + strcat(kernelName,"Div"); + break; + case ocMax: + strcat(kernelName,"Max"); + break; + case ocMin: + strcat(kernelName,"Min"); + break; + case ocAverage: + strcat(kernelName,"Average"); + break; + default: + assert( false ); + break; + } + } + checkKernelName( &kEnv, kernelName ); + cl_int clStatus; + size_t global_work_size[1]; + if ( ( eOpNum == 1 ) && ( eOp[0] == ocSub ) ) + subFlag = true; + + kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kernelName, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateKernel" ); + + cl_mem valuesCl = NULL, subtractCl = NULL, outputCl = NULL, startPosCL = NULL, endPosCL = NULL; + + if(!subFlag) + { + startPosCL = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), + nElements * sizeof(unsigned int), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + endPosCL = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), + nElements * sizeof(unsigned int), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + unsigned int *npStartPosMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, startPosCL, CL_TRUE, CL_MAP_WRITE, 0, + nElements * sizeof(uint), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + unsigned int *npEndPosMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, endPosCL, CL_TRUE, CL_MAP_WRITE, 0, + nElements * sizeof(uint), 0, NULL, NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + + for(uint i=0;i<nElements;i++) + { + npStartPosMap[i]=nStartPos[i]; + npEndPosMap[i]=nEndPos[i]; + } + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, startPosCL, npStartPosMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + clFinish( kEnv.mpkCmdQueue ); + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, endPosCL, npEndPosMap, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); + + if ( gpuEnv.mnKhrFp64Flag || gpuEnv.mnAmdFp64Flag ) + { + valuesCl = allocateDoubleBuffer( kEnv, pOpArray, nSrcDataSize, &clStatus ); + subtractCl = allocateDoubleBuffer( kEnv, pSubtractSingle, nElements, &clStatus ); + outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nElements * sizeof(double), NULL, &clStatus ); + } + else + { + valuesCl = allocateFloatBuffer( kEnv, pOpArray, nSrcDataSize, &clStatus ); + subtractCl = allocateFloatBuffer( kEnv, pSubtractSingle, nElements, &clStatus ); + outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE , nElements * sizeof(float), NULL, &clStatus); + } + CHECK_OPENCL( clStatus, "clCreateBuffer" ); + + clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl ); + CHECK_OPENCL( clStatus, "clSetKernelArg"); + clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&subtractCl ); + CHECK_OPENCL( clStatus, "clSetKernelArg"); + clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&startPosCL ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(cl_mem), (void *)&endPosCL ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + + fprintf( stderr, "prior to enqueue range kernel\n" ); + } + else + { + if ( gpuEnv.mnKhrFp64Flag || gpuEnv.mnAmdFp64Flag ) + { + subtractCl = allocateDoubleBuffer( kEnv, pSubtractSingle, nElements, &clStatus ); + outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(double), NULL, &clStatus ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_double), (void *)&delta ); + CHECK_OPENCL( clStatus, "clSetKernelArg"); + } + else + { + float fTmp = (float)delta; + subtractCl = allocateFloatBuffer( kEnv, pSubtractSingle, nElements, &clStatus ); + outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(float), NULL, &clStatus ); + clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_float), (void *)&fTmp ); + CHECK_OPENCL( clStatus, "clSetKernelArg"); + } + clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&subtractCl ); + CHECK_OPENCL( clStatus, "clSetKernelArg"); + clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&outputCl ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + } + global_work_size[0] = nElements; + clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); + clFinish( kEnv.mpkCmdQueue ); + + if ( gpuEnv.mnKhrFp64Flag || gpuEnv.mnAmdFp64Flag ) + { + clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, + outputCl, + CL_TRUE,0, + nElements * sizeof(double), + (void *)dpResult,0,NULL,NULL); + CHECK_OPENCL( clStatus, "clEnqueueReadBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + } + else + { + float *afBuffer = new float[nElements]; + if ( !afBuffer ) + return -1; + clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, + outputCl, + CL_TRUE,0, + nElements * sizeof(float), + (void *)afBuffer,0,NULL,NULL); + CHECK_OPENCL( clStatus, "clEnqueueReadBuffer" ); + clFinish( kEnv.mpkCmdQueue ); + for ( size_t i = 0; i < nElements; i++ ) + { + dpResult[i] = (double)afBuffer[i]; + } + if ( !afBuffer ) + delete [] afBuffer; + } + + clStatus = clFinish( kEnv.mpkCmdQueue ); + CHECK_OPENCL( clStatus, "clFinish" ); + clStatus = clReleaseKernel( kEnv.mpkKernel ); + CHECK_OPENCL( clStatus, "clReleaseKernel" ); + + CHECK_OPENCL_RELEASE( clStatus, valuesCl ); + CHECK_OPENCL_RELEASE( clStatus, subtractCl ); + CHECK_OPENCL_RELEASE( clStatus, outputCl ); + CHECK_OPENCL_RELEASE( clStatus, startPosCL ); + CHECK_OPENCL_RELEASE( clStatus, endPosCL ); + + fprintf( stderr, "completed opencl operation\n" ); + + return 0; +} +double *OclCalc::oclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double del ) { - SetKernelEnv( &kEnv ); + setKernelEnv( &kEnv ); // select a kernel: cut & paste coding is utterly evil. const char *kernelName = NULL; @@ -1613,7 +2308,7 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co default: assert( false ); } - CheckKernelName( &kEnv, kernelName ); + checkKernelName( &kEnv, kernelName ); cl_int clStatus; size_t global_work_size[1]; @@ -1637,13 +2332,13 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co { valuesCl = allocateDoubleBuffer( kEnv, pOpArray, nElements, &clStatus ); subtractCl = allocateDoubleBuffer( kEnv, pSubtractSingle, nElements, &clStatus ); - outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nElements * sizeof(double), NULL, &clStatus ); + outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(double), NULL, &clStatus ); } else { valuesCl = allocateFloatBuffer( kEnv, pOpArray, nElements, &clStatus ); subtractCl = allocateFloatBuffer( kEnv, pSubtractSingle, nElements, &clStatus ); - outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nElements * sizeof(float), NULL, &clStatus); + outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(float), NULL, &clStatus); } CHECK_OPENCL( clStatus, "clCreateBuffer" ); @@ -1668,7 +2363,7 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co if ( gpuEnv.mnKhrFp64Flag || gpuEnv.mnAmdFp64Flag ) { subtractCl = allocateDoubleBuffer( kEnv, pSubtractSingle, nElements, &clStatus ); - outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nElements * sizeof(double), NULL, &clStatus ); + outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(double), NULL, &clStatus ); clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_double), (void *)&delta ); CHECK_OPENCL( clStatus, "clSetKernelArg"); } @@ -1676,7 +2371,7 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co { float fTmp = (float)delta; subtractCl = allocateFloatBuffer( kEnv, pSubtractSingle, nElements, &clStatus ); - outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nElements * sizeof(float), NULL, &clStatus ); + outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(float), NULL, &clStatus ); clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_float), (void *)&fTmp ); CHECK_OPENCL( clStatus, "clSetKernelArg"); } @@ -1695,28 +2390,34 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co return NULL; // leak. if ( gpuEnv.mnKhrFp64Flag || gpuEnv.mnAmdFp64Flag ) { - double *pOutput = (double *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,outputCl,CL_TRUE, - CL_MAP_READ,0,nElements*sizeof(double), 0,NULL,NULL,NULL); - clFinish(kEnv.mpkCmdQueue); - memcpy( pResult, pOutput, nElements * sizeof(double) ); - clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,outputCl,pResult,0,NULL,NULL); - clFinish(kEnv.mpkCmdQueue); + clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, + outputCl, + CL_TRUE,0, + nElements * sizeof(double), + (void *)pResult,0,NULL,NULL); } else { - float *pOutput = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, outputCl, CL_TRUE, - CL_MAP_READ, 0, nElements*sizeof(float), 0, NULL, NULL, NULL ); - clFinish( kEnv.mpkCmdQueue ); + float *afBuffer = new float[nElements]; + if ( !afBuffer ) + return NULL; + clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, + outputCl, + CL_TRUE,0, + nElements * sizeof(float), + (void *)afBuffer,0,NULL,NULL); for ( int i = 0; i < (int)nElements; i++ ) - pResult[i] = (double)pOutput[i]; - clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, outputCl, pOutput, 0, NULL, NULL ); - clFinish( kEnv.mpkCmdQueue ); + pResult[i] = (double)afBuffer[i]; + if ( !afBuffer ) + delete [] afBuffer; } + CHECK_OPENCL( clStatus, "clEnqueueReadBuffer" ); clStatus = clFinish( kEnv.mpkCmdQueue ); CHECK_OPENCL( clStatus, "clFinish" ); clStatus = clReleaseKernel( kEnv.mpkKernel ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); + if ( valuesCl != NULL ) { clStatus = clReleaseMemObject( valuesCl ); @@ -1737,7 +2438,7 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co return pResult; } -int OclCalc::OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst,std::vector<double>&dpResult, uint nDim ) +int OclCalc::oclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst,std::vector<double>&dpResult, uint nDim ) { cl_int clStatus = 0; uint nMatrixSize = nDim * nDim; @@ -1746,7 +2447,10 @@ int OclCalc::OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclM CHECK_OPENCL( clStatus, "clCreateBuffer" ); cl_mem clpYData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ), nMatrixSize * sizeof(double), NULL, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer" ); + cl_mem clpNData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE ), nDim * sizeof(uint), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); double * dpY = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clpYData, CL_TRUE, CL_MAP_WRITE, 0, nMatrixSize * sizeof(double), 0, NULL,NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); memset( dpY, 0, nMatrixSize*sizeof(double) ); memset( dpOclMatrixDst, 0, nMatrixSize*sizeof(double) ); clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpYData, dpY, 0, NULL, NULL ); @@ -1758,14 +2462,20 @@ int OclCalc::OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclM for (uint j=0;j<nDim;j++) { if ( i == j ) - dpP[i*nDim+j]=1.0; + dpP[i*nDim+j] = 1.0; else - dpP[i*nDim+j]=0.0; + dpP[i*nDim+j] = 0.0; } } clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpPData, dpP, 0, NULL, NULL ); + uint * npDim = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clpNData, CL_TRUE, CL_MAP_WRITE, 0, nDim * sizeof(uint), 0, NULL,NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + for ( uint i = 0; i < nDim; i++ ) + npDim[i] = nDim; + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpNData, npDim, 0, NULL, NULL ); + CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); - CheckKernelName( &kEnv,aKernelName ); + checkKernelName( &kEnv,aKernelName ); kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel" ); clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemLeftData ); @@ -1805,6 +2515,9 @@ int OclCalc::OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclM CHECK_OPENCL( clStatus, "clSetKernelArg" ); clStatus = clSetKernelArg( kernel_solve, 3, sizeof(cl_mem), (void *)&clpYData ); CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kernel_solve, 4, sizeof(cl_mem), (void *)&clpNData ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kernel_solve, 1, NULL, global_work_size, NULL, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); clFinish( kEnv.mpkCmdQueue ); @@ -1819,18 +2532,23 @@ int OclCalc::OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclM CHECK_OPENCL( clStatus, "clReleaseKernel" ); clStatus = clReleaseMemObject( mpClmemLeftData ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + mpClmemLeftData = NULL; clStatus = clReleaseMemObject( mpClmemRightData ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + mpClmemRightData = NULL; clStatus = clReleaseKernel( kernel_solve ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); clStatus = clReleaseMemObject( clpPData ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); clStatus = clReleaseMemObject( clpYData ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); + clStatus = clReleaseMemObject( clpNData ); + CHECK_OPENCL( clStatus, "clReleaseKernel" ); + return 0; } -int OclCalc::OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim ) +int OclCalc::oclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim ) { cl_int clStatus = 0; uint nMatrixSize = nDim * nDim; @@ -1840,7 +2558,10 @@ int OclCalc::OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMa CHECK_OPENCL( clStatus, "clCreateBuffer" ); cl_mem clpYData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ), nMatrixSize * sizeof(float), NULL, &clStatus ); CHECK_OPENCL( clStatus, "clCreateBuffer" ); + cl_mem clpNData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE ), nDim * sizeof(uint), NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clCreateBuffer" ); float * fpY = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clpYData, CL_TRUE, CL_MAP_WRITE, 0, nMatrixSize * sizeof(float), 0, NULL,NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); memset( fpY, 0, nMatrixSize*sizeof(float) ); memset( fpOclMatrixDst, 0, nMatrixSize*sizeof(float) ); clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpYData, fpY, 0, NULL, NULL ); @@ -1859,7 +2580,12 @@ int OclCalc::OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMa } clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpPData, fpP, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" ); - CheckKernelName( &kEnv,aKernelName ); + uint * npDim = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clpNData, CL_TRUE, CL_MAP_WRITE, 0, nDim * sizeof(uint), 0, NULL,NULL, &clStatus ); + CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" ); + for ( uint i = 0; i < nDim; i++ ) + npDim[i] = nDim; + clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpNData, npDim, 0, NULL, NULL ); + checkKernelName( &kEnv,aKernelName ); kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel" ); clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemLeftData ); @@ -1902,6 +2628,9 @@ int OclCalc::OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMa CHECK_OPENCL( clStatus, "clSetKernelArg" ); clStatus = clSetKernelArg( kernel_solve, 3, sizeof(cl_mem), (void *)&clpYData ); CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clSetKernelArg( kernel_solve, 4, sizeof(cl_mem), (void *)&clpNData ); + CHECK_OPENCL( clStatus, "clSetKernelArg" ); + clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kernel_solve, 1, NULL, global_work_size, NULL, 0, NULL, NULL ); CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); clFinish( kEnv.mpkCmdQueue ); @@ -1916,14 +2645,18 @@ int OclCalc::OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMa CHECK_OPENCL( clStatus, "clReleaseKernel" ); clStatus = clReleaseMemObject( mpClmemLeftData ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + mpClmemLeftData = NULL; clStatus = clReleaseMemObject( mpClmemRightData ); CHECK_OPENCL( clStatus, "clReleaseMemObject" ); + mpClmemRightData = NULL; clStatus = clReleaseKernel( kernel_solve ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); clStatus = clReleaseMemObject( clpPData ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); clStatus = clReleaseMemObject( clpYData ); CHECK_OPENCL( clStatus, "clReleaseKernel" ); + clStatus = clReleaseMemObject( clpNData ); + CHECK_OPENCL( clStatus, "clReleaseKernel" ); return 0; } diff --git a/sc/source/core/opencl/openclwrapper.hxx b/sc/source/core/opencl/openclwrapper.hxx index 173ae58831d1..cf3b4f1a17a2 100644 --- a/sc/source/core/opencl/openclwrapper.hxx +++ b/sc/source/core/opencl/openclwrapper.hxx @@ -7,8 +7,8 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#ifndef SC_OPENCL_WRAPPER_H -#define SC_OPENCL_WRAPPER_H +#ifndef SC_OPENCLWRAPPER_HXX +#define SC_OPENCLWRAPPER_HXX #include <config_features.h> #include <formula/opcode.hxx> @@ -87,6 +87,19 @@ if( status != CL_SUCCESS ) \ return 0; \ } +#define CHECK_OPENCL_VOID(status,name) \ +if( status != CL_SUCCESS ) \ +{ \ + printf ("OpenCL error code is %d at " SAL_DETAIL_WHERE " when %s .\n", status, name); \ +} + +#define CHECK_OPENCL_RELEASE(status,name) \ +if ( name != NULL ) \ + clReleaseMemObject( name ); \ +if( status != CL_SUCCESS ) \ +{ \ + printf ("OpenCL error code is %d at " SAL_DETAIL_WHERE " when clReleaseMemObject( %s ).\n", status, #name); \ +} #define MAX_KERNEL_STRING_LEN 64 #define MAX_CLFILE_NUM 50 @@ -119,25 +132,48 @@ typedef struct char kernelName[MAX_KERNEL_NAME_LEN + 1]; char *kernelStr; } kernel_node; - +typedef struct _SingleVectorFormula +{ + const double *mdpInputLeftData; + const double *mdpInputRightData; + size_t mnInputLeftDataSize; + size_t mnInputRightDataSize; + uint mnInputLeftStartPosition; + uint mnInputRightStartPosition; + int mnInputLeftOffset; + int mnInputRightOffset; +} SingleVectorFormula; + +typedef struct _DoubleVectorFormula +{ + const double *mdpInputData; + size_t mnInputDataSize; + uint mnInputStartPosition; + uint mnInputEndPosition; + int mnInputStartOffset; + int mnInputEndOffset; +} DoubleVectorFormula; class OpenclCalcBase { public: OpenclCalcBase(){}; virtual ~OpenclCalcBase(){}; - virtual int OclHostArithmeticOperator64Bits( const char* aKernelName, double *fpLeftData, double *fpRightData, double *&rResult, int nRowSize )=0; - virtual int OclHostFormulaStatistics64Bits( const char* aKernelName, double *fpSrcData, uint *npStartPos, uint *npEndPos, double *&output, int outputSize )=0; - virtual int OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize)=0; - virtual int OclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize )=0; - virtual int OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>& dpResult, uint nDim)=0; + virtual int oclHostArithmeticOperator64Bits( const char* aKernelName, double *&rResult, int nRowSize )=0; + virtual int oclMoreColHostArithmeticOperator64Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize )=0; + virtual int oclHostFormulaStatistics64Bits( const char* aKernelName,double *&output, int outputSize )=0; + virtual int oclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize)=0; + virtual int oclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize )=0; + virtual int oclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst,std::vector<double>&dpResult, uint nDim)=0; + virtual int oclMoreColHostArithmeticOperator32Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize )=0; - virtual int OclHostArithmeticOperator32Bits( const char* aKernelName, float *fpLeftData, float *fpRightData, double *rResult, int nRowSize )=0; - virtual int OclHostFormulaStatistics32Bits( const char* aKernelName, float *fpSrcData, uint *npStartPos, uint *npEndPos, double *output, int outputSize )=0; - virtual int OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize)=0; - virtual int OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize )=0; - virtual int OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim )=0; + virtual int oclHostArithmeticOperator32Bits( const char* aKernelName, double *rResult, int nRowSize )=0; + virtual int oclHostFormulaStatistics32Bits( const char* aKernelName,double *output, int outputSize )=0; + virtual int oclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize)=0; + virtual int oclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize )=0; + virtual int oclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim )=0; - virtual double *OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta )=0; + virtual int oclGroundWaterGroup( uint *eOp, uint eOpNum, const double *pOpArray, const double *pSubtractSingle, size_t nSrcDataSize,size_t nElements, double delta,uint *nStartPos,uint *nEndPos ,double *deResult)=0; + virtual double *oclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta )=0; }; @@ -151,40 +187,40 @@ public: static int isInited; OpenclDevice(); ~OpenclDevice(); - static int InitEnv(); - static int RegistOpenclKernel(); - static int ReleaseOpenclRunEnv(); - static int InitOpenclRunEnv( GPUEnv *gpu ); - static int ReleaseOpenclEnv( GPUEnv *gpuInfo ); - static int CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ); - static int InitOpenclRunEnv( int argc ); - static int CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName ); - static int GeneratBinFromKernelSource( cl_program program, const char * clFileName ); - static int WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes ); - static int BinaryGenerated( const char * clFileName, FILE ** fhandle ); - static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption ); - - int InitOpenclAttr( OpenCLEnv * env ); - int ReleaseKernel( KernelEnv * env ); - int SetKernelEnv( KernelEnv *envInfo ); - int CreateKernel( char * kernelname, KernelEnv * env ); - int RunKernel( const char *kernelName, void **userdata ); - int ConvertToString( const char *filename, char **source ); - int CheckKernelName( KernelEnv *envInfo, const char *kernelName ); - int RegisterKernelWrapper( const char *kernelName, cl_kernel_function function ); - int RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata ); - int GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function ); + static int initEnv(); + static int registOpenclKernel(); + static int releaseOpenclRunEnv(); + static int initOpenclRunEnv( GPUEnv *gpu ); + static int releaseOpenclEnv( GPUEnv *gpuInfo ); + static int compileKernelFile( GPUEnv *gpuInfo, const char *buildOption ); + static int initOpenclRunEnv( int argc ); + static int cachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName ); + static int generatBinFromKernelSource( cl_program program, const char * clFileName ); + static int writeBinaryToFile( const char* fileName, const char* birary, size_t numBytes ); + static int binaryGenerated( const char * clFileName, FILE ** fhandle ); + static int compileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption ); + + int initOpenclAttr( OpenCLEnv * env ); + int releaseKernel( KernelEnv * env ); + int setKernelEnv( KernelEnv *envInfo ); + int createKernel( char * kernelname, KernelEnv * env ); + int runKernel( const char *kernelName, void **userdata ); + int convertToString( const char *filename, char **source ); + int checkKernelName( KernelEnv *envInfo, const char *kernelName ); + int registerKernelWrapper( const char *kernelName, cl_kernel_function function ); + int runKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata ); + int getKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function ); #ifdef WIN32 - static int LoadOpencl(); - static int OpenclInite(); - static void FreeOpenclDll(); + static int loadOpencl(); + static int openclInite(); + static void freeOpenclDll(); #endif - int GetOpenclState(); - void SetOpenclState( int state ); - inline static int AddKernelConfig( int kCount, const char *kName ); + int getOpenclState(); + void setOpenclState( int state ); + inline static int addKernelConfig( int kCount, const char *kName ); }; @@ -201,6 +237,10 @@ public: cl_mem mpClmemMergeLfData; cl_mem mpClmemMergeRtData; cl_mem mpClmemMatixSumSize; + cl_mem mpClmemeOp; + unsigned int nArithmeticLen; + unsigned int nFormulaLen; + unsigned int nClmemLen; unsigned int nFormulaColSize; unsigned int nFormulaRowSize; @@ -208,27 +248,49 @@ public: ~OclCalc(); // for 64bits double - int OclHostArithmeticOperator64Bits( const char* aKernelName, double *fpLeftData, double *fpRightData, double *&rResult, int nRowSize ); - int OclHostFormulaStatistics64Bits( const char* aKernelName, double *fpSrcData, uint *npStartPos, uint *npEndPos, double *&output, int outputSize); - int OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize ); - int OclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize); - int OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>&dpResult, uint nDim ); + int oclHostArithmeticOperator64Bits( const char* aKernelName, double *&rResult, int nRowSize ); + int oclMoreColHostArithmeticOperator64Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize ); + int oclHostFormulaStatistics64Bits( const char* aKernelName, double *&output, int outputSize); + int oclHostFormulaStash64Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size); + int oclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize ); + int oclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize); + int oclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>&dpResult, uint nDim ); // for 32bits float - int OclHostArithmeticOperator32Bits( const char* aKernelName, float *fpLeftData, float *fpRightData, double *rResult, int nRowSize ); - int OclHostFormulaStatistics32Bits( const char* aKernelName, float *fpSrcData, uint *npStartPos, uint *npEndPos, double *output, int outputSize); - int OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize ); - int OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize ); - int OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim ); + int oclHostArithmeticOperator32Bits( const char* aKernelName, double *rResult, int nRowSize ); + int oclMoreColHostArithmeticOperator32Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize ); + int oclHostFormulaStatistics32Bits( const char* aKernelName, double *output, int outputSize); + int oclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize ); + int oclHostArithmeticStash64Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize ); + int oclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize ); + int oclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim ); // for groundwater - double *OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta ); + int oclGroundWaterGroup( uint *eOp, uint eOpNum, const double *pOpArray, const double *pSubtractSingle,size_t nSrcDataSize, size_t nElements, double delta ,uint *nStartPos,uint *nEndPos,double *deResult); + double *oclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta ); /////////////////////////////////////////////////////////////// - int CreateBuffer64Bits( double *&dpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize ); - int CreateBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize ); - int CreateBuffer64Bits( double *&dpSumProMergeLfData, double *&dpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize ); - int CreateBuffer32Bits( float *&fpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize ); - int CreateBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize ); - int CreateBuffer32Bits( float *&fpSumProMergeLfData, float *&fpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize ); + int createBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize ); + int mapAndCopy64Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize ); + int mapAndCopy64Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize); + int mapAndCopyArithmetic64Bits( const double *dpMoreArithmetic,int nBufferSize ); + int mapAndCopyMoreColArithmetic64Bits( const double *dpMoreColArithmetic,int nBufferSize ,uint *npeOp,uint neOpSize ); + int createMoreColArithmeticBuf64Bits( int nBufferSize, int neOpSize ); + + int createFormulaBuf64Bits( int nBufferSize, int rowSize ); + int createArithmeticOptBuf64Bits( int nBufferSize ); + + int createBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize ); + int mapAndCopy32Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize ); + int mapAndCopy32Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize); + int mapAndCopyArithmetic32Bits( const double *dpMoreColArithmetic, int nBufferSize ); + int mapAndCopyMoreColArithmetic32Bits( const double *dpMoreColArithmetic,int nBufferSize ,uint *npeOp,uint neOpSize ); + int createMoreColArithmeticBuf32Bits( int nBufferSize, int neOpSize ); + int createFormulaBuf32Bits( int nBufferSize, int rowSize ); + int createArithmeticOptBuf32Bits( int nBufferSize ); + int oclHostFormulaStash32Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size ); + int oclHostArithmeticStash32Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize ); + + int releaseOclBuffer(void); + friend class agency; }; #endif |