summaryrefslogtreecommitdiff
path: root/sc
diff options
context:
space:
mode:
authorHaidong Lian <haidong@multicorewareinc.com>2013-08-30 15:35:17 -0400
committerKohei Yoshida <kohei.yoshida@gmail.com>2013-08-30 15:58:59 -0400
commite791fbfc0435f4a9522288154132df2760ef14a2 (patch)
treec3de2be83a00830ebd8f9e55dd679e9083f9793a /sc
parentccf7b15c0a5776c6431fdcb0c0e2b0f3935ae3dc (diff)
Patch for milestone1-0829-v4.
1. Add the parser based on RPN; 2. For test sample1 named "ground-water-daily.xls", using the compound formula to do calculation; Add the compound kernels: Formulae include "AVERAGE,MAX and MIN".Compound formulae include "AVERAGE -(+,*,/)","MAX -(+,*,/)" and "MIN -(+,*,/)"; 3. For formulae which do not work in GPU, they'll work in CPU; 4. For compound operators(-,+,*,/), they'll be calculated one by one in GPU as the sequence of RPN; 5. Add the start and end position to fit for the sliding window; 6. Modify kernels by using vector for AMD GPU. Conflicts: sc/source/core/opencl/formulagroupcl.cxx sc/source/core/opencl/openclwrapper.cxx Change-Id: I6157008575ce89ddd3e7bf552a87812474af4125
Diffstat (limited to 'sc')
-rw-r--r--sc/source/core/opencl/formulagroupcl.cxx1151
-rw-r--r--sc/source/core/opencl/oclkernels.hxx239
-rw-r--r--sc/source/core/opencl/openclwrapper.cxx1359
-rw-r--r--sc/source/core/opencl/openclwrapper.hxx182
4 files changed, 2188 insertions, 743 deletions
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index 274af4e442c8..a835c461abbb 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -19,6 +19,10 @@
#include "openclwrapper.hxx"
+#define SRCDATASIZE 100
+#define SINGLEARRAYLEN 100
+#define DOUBLEARRAYLEN 100
+#define SVDOUBLELEN 100
namespace sc {
// A single public entry point for a factory function:
@@ -38,30 +42,632 @@ double getTimeDiff(const TimeValue& t1, const TimeValue& t2)
}//dbg-t
TimeValue aTimeBefore, aTimeAfter;
///////////////////////////////////////
+class SourceData
+{
+ const double *mdpSrcPtr;
+ unsigned int mnDataSize;
+ const char *mcpFormulaName;
+ unsigned int mnCol;
+ int eOp;
+public:
+ SourceData( const double *dpData, unsigned int nSize, uint nCol = 1,const char *cpFormulaName = NULL):mdpSrcPtr(dpData),mnDataSize(nSize),mcpFormulaName(cpFormulaName),mnCol(nCol)
+ {
+ }
+ SourceData():mdpSrcPtr(NULL),mnDataSize(0)
+ {
+ }
+ void setSrcPtr( const double *dpTmpDataPtr)
+ {
+ mdpSrcPtr = dpTmpDataPtr;
+ }
+ void setSrcSize( int nSize )
+ {
+ mnDataSize = nSize;
+ }
+ const double * getDouleData()
+ {
+ return mdpSrcPtr;
+ }
+ unsigned int getDataSize()
+ {
+ return mnDataSize;
+ }
+ void print()
+ {
+ for( uint i=0; i<mnDataSize; i++ )
+ printf( " The SourceData is %f and data size is %d\n",mdpSrcPtr[i],mnDataSize );
+ }
+ void printFormula()
+ {
+ printf("--------The formulaname is %s and the eOp is %d---------\n",mcpFormulaName,eOp);
+ }
+ void setFormulaName(const char *cpFormulaName)
+ {
+ this->mcpFormulaName = cpFormulaName;
+ }
+ const char *getFormulaName()
+ {
+ return mcpFormulaName;
+ }
+ void seteOp(int op)
+ {
+ this->eOp = op;
+ }
+ int geteOp()
+ {
+ return eOp;
+ }
+ int getColNum()
+ {
+ return mnCol;
+ }
+
+};
class FormulaGroupInterpreterOpenCL : public FormulaGroupInterpreterSoftware
{
+ SourceData *mSrcDataStack[SRCDATASIZE];
+ unsigned int mnStackPointer,mnDoublePtrCount;
+ uint * mnpOclStartPos;
+ uint * mnpOclEndPos;
+ SingleVectorFormula *mSingleArray[SINGLEARRAYLEN];
+ DoubleVectorFormula *mDoubleArray[DOUBLEARRAYLEN];
+ double mdpSvdouble[SVDOUBLELEN];
+ double *mdpSrcDoublePtr[SVDOUBLELEN];
+ uint mnSingleCount;
+ uint mnDoubleCount;
+ uint mnSvDoubleCount;
+ uint mnOperatorGroup[100];
+ uint mnOperatorCount;
+ char mcHostName[100];
+ uint mnPositonLen;
+ size_t mnRowSize;
public:
FormulaGroupInterpreterOpenCL() :
FormulaGroupInterpreterSoftware()
{
- OclCalc::InitEnv();
+ mnStackPointer = 0;
+ mnpOclEndPos = NULL;
+ mnpOclStartPos = NULL;
+ mnSingleCount = 0;
+ mnDoubleCount = 0;
+ mnSvDoubleCount = 0;
+ mnOperatorCount = 0;
+ mnPositonLen = 0;
+ mnDoublePtrCount = 0;
+ OclCalc::initEnv();
}
virtual ~FormulaGroupInterpreterOpenCL()
{
- OclCalc::ReleaseOpenclRunEnv();
+ OclCalc::releaseOpenclRunEnv();
}
- virtual ScMatrixRef inverseMatrix(const ScMatrix& rMat);
- virtual bool interpret(ScDocument& rDoc, const ScAddress& rTopPos,
- const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode);
+ virtual ScMatrixRef inverseMatrix( const ScMatrix& rMat );
+ virtual bool interpret( ScDocument& rDoc, const ScAddress& rTopPos,
+ const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode );
+ void collectDoublePointers( double *temp )
+ {
+ if( mnDoublePtrCount < SRCDATASIZE )
+ {
+ mdpSrcDoublePtr[mnDoublePtrCount++] = temp;
+ }
+ else
+ {
+ printf( "The mdpSrcDoublePtr is full now.\n" );
+ double *dtmp = NULL;
+ if ( (dtmp = mdpSrcDoublePtr[--mnDoublePtrCount]) != NULL )
+ {
+ free( dtmp );
+ dtmp = NULL;
+ }
+ }
+ }
+
+ void freeDoublePointers()
+ {
+ while( mnDoublePtrCount > 0 )
+ {
+ double *dtmp = NULL;
+ if ( (dtmp = mdpSrcDoublePtr[--mnDoublePtrCount]) != NULL )
+ {
+ free( dtmp );
+ dtmp = NULL;
+ }
+ }
+ }
+
+
+ void srdDataPush( SourceData *temp )
+ {
+ if( mnStackPointer < SRCDATASIZE )
+ {
+ mSrcDataStack[mnStackPointer++] = temp;
+ }
+ else
+ printf( "The stack is full now.\n" );
+ }
+ SourceData *srdDataPop( void )
+ {
+ if( mnStackPointer <= 0 )
+ {
+ printf( "The stack was empty\n" );
+ return NULL;
+ }
+ return mSrcDataStack[--mnStackPointer];
+ }
+ unsigned int getDataSize()
+ {
+ return mnStackPointer;
+ }
+ void printStackInfo()
+ {
+ printf( "/********The stack size is %d*********\\\n",mnStackPointer );
+ for ( int i = mnStackPointer - 1; i >= 0; i-- )
+ mSrcDataStack[i]->print();
+ }
+ bool getPosition(const ScTokenArray& rCode,const ScFormulaCellGroupRef& xGroup,uint nRowSize,uint *&npOclStartPos,uint *&npOclEndPos,uint *nPositonLen);
+ bool chooseFunction(OclCalc &ocl_calc,double *&dpResult);
+ bool isStockHistory();
+ bool isGroundWater();
+};
+bool FormulaGroupInterpreterOpenCL::getPosition(const ScTokenArray& rCode,const ScFormulaCellGroupRef& xGroup,uint nRowSize,uint *&npOclStartPos,uint *&npOclEndPos,uint *nPositonLen)
+{
+ uint nColPosition = 0;
+ ScTokenArray * rCodePos = rCode.Clone();
+ static int nCountPosSize = nRowSize;
+ bool isAllocFormulaOclBuf = true;
+ for ( const formula::FormulaToken* p = rCodePos->First(); p; p = rCodePos->Next() )
+ {
+ switch ( p->GetType() )
+ {
+ case formula::svDoubleVectorRef:
+ {
+ nColPosition++;
+ break;
+ }
+ }
+ }
+ int nPositionSize = nColPosition * nRowSize;
+ npOclStartPos = (unsigned int*) malloc( nPositionSize * sizeof(unsigned int) );
+ npOclEndPos = (unsigned int*) malloc( nPositionSize * sizeof(unsigned int) );
+ if ( nCountPosSize < nPositionSize )
+ {
+ nCountPosSize = nPositionSize;
+ isAllocFormulaOclBuf = false;
+ }
+ for ( sal_Int32 i = 0; i < xGroup->mnLength; ++i )
+ {
+ ScTokenArray * rCodeTemp = rCode.Clone();
+ int j = 0;
+ for ( const formula::FormulaToken* p = rCodeTemp->First(); p; p = rCodeTemp->Next() )
+ {
+ switch (p->GetType())
+ {
+ case formula::svDoubleVectorRef:
+ {
+ const formula::DoubleVectorRefToken* p2 = static_cast<const formula::DoubleVectorRefToken*>(p);
+ size_t nRowStart = p2->IsStartFixed() ? 0 : i;
+ size_t nRowEnd = p2->GetRefRowSize() - 1;
+ if (!p2->IsEndFixed())
+ nRowEnd += i;
+ npOclStartPos[j*nRowSize+i] = nRowStart;//record the start position
+ npOclEndPos[j*nRowSize+i] = nRowEnd;//record the end position
+ j++;
+ }
+ }
+ }
+ }
+ *nPositonLen = nPositionSize;
+ //Now the pos array is 0 1 2 3 4 5 0 1 2 3 4 5;
+ return isAllocFormulaOclBuf;
+}
+
+bool FormulaGroupInterpreterOpenCL::isStockHistory()
+{
+ bool isHistory = false;
+ if( (mnOperatorGroup[0]== 224) && (mnOperatorGroup[1]== 227) && (mnOperatorGroup[2]== 41) && (mnOperatorGroup[3]== 43) && (mnOperatorGroup[4]== 41) )
+ {
+ strcpy(mcHostName,"OclOperationColumnN");
+ isHistory = true;
+ }
+ else if( (mnOperatorGroup[0] == 226) && (mnOperatorGroup[1] == 42) )
+ {
+ strcpy(mcHostName,"OclOperationColumnH");
+ isHistory = true;
+ }
+ else if((mnOperatorGroup[0] == 213) && (mnOperatorGroup[1] == 43) && (mnOperatorGroup[2] == 42) )
+ {
+ strcpy(mcHostName,"OclOperationColumnJ");
+ isHistory = true;
+ }
+ return isHistory;
+}
+
+bool FormulaGroupInterpreterOpenCL::isGroundWater()
+{
+ bool GroundWater=false;
+
+ if((mnOperatorGroup[0] == ocAverage && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocMax && 1 == mnSingleCount )||
+ (mnOperatorGroup[0] == ocMin && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocSub && mnSvDoubleCount==1))
+ {
+ GroundWater = true;
+ }
+ return GroundWater;
+}
+
+bool FormulaGroupInterpreterOpenCL::chooseFunction( OclCalc &ocl_calc, double *&dpResult )
+{
+ const double * dpOclSrcData = NULL;
+ unsigned int nSrcDataSize = 0;
+ const double *dpLeftData = NULL;
+ const double *dpRightData = NULL;
+ if((mnOperatorGroup[0] == ocAverage && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocMax && 1 == mnSingleCount )||
+ (mnOperatorGroup[0] == ocMin && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocSub && mnSvDoubleCount==1))
+ {
+ double delta = 0.0;
+ const double *pArrayToSubtractOneElementFrom;
+ const double *pGroundWaterDataArray;
+ uint nSrcData = 0;
+ if( mnSvDoubleCount!=1 )
+ {
+ pArrayToSubtractOneElementFrom= mSingleArray[0]->mdpInputLeftData;
+ pGroundWaterDataArray= mDoubleArray[0]->mdpInputData;
+ nSrcData = mDoubleArray[0]->mnInputDataSize;
+ }
+ else
+ {
+ pArrayToSubtractOneElementFrom= mSingleArray[0]->mdpInputLeftData;
+ pGroundWaterDataArray=NULL;
+ delta = mdpSvdouble[0];
+ }
+ ocl_calc.oclGroundWaterGroup( mnOperatorGroup,mnOperatorCount,pGroundWaterDataArray,pArrayToSubtractOneElementFrom,nSrcData,mnRowSize,delta,mnpOclStartPos,mnpOclEndPos,dpResult);
+ }
+ else if( isStockHistory() )
+ {
+ return false;
+ }
+ else if(((mnSvDoubleCount==0)&&(mnSingleCount==0)&&(mnDoubleCount==1)) &&
+ ((mnOperatorGroup[0] == ocAverage)||(mnOperatorGroup[0] == ocMax)||(mnOperatorGroup[0] == ocMin)))
+ {
+ if(mnOperatorGroup[0] == ocAverage)
+ strcpy(mcHostName,"oclFormulaAverage");
+ if(mnOperatorGroup[0] == ocMax)
+ strcpy(mcHostName,"oclFormulaMax");
+ if(mnOperatorGroup[0] == ocMin)
+ strcpy(mcHostName,"oclFormulaMin");
+ DoubleVectorFormula * doubleTemp = mDoubleArray[--mnDoubleCount];
+ nSrcDataSize = doubleTemp->mnInputDataSize;
+ dpOclSrcData = doubleTemp->mdpInputData;
+ if ( ocl_calc.getOpenclState())
+ {
+ if ( ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
+ {
+ ocl_calc.createFormulaBuf64Bits( nSrcDataSize, mnRowSize );
+ ocl_calc.mapAndCopy64Bits( dpOclSrcData,mnpOclStartPos,mnpOclEndPos,nSrcDataSize,mnRowSize );
+ ocl_calc.oclHostFormulaStatistics64Bits( mcHostName, dpResult, mnRowSize );
+ }
+ else
+ {
+ ocl_calc.createFormulaBuf32Bits( nSrcDataSize, mnPositonLen );
+ ocl_calc.mapAndCopy32Bits( dpOclSrcData, mnpOclStartPos, mnpOclEndPos, nSrcDataSize, mnRowSize);
+ ocl_calc.oclHostFormulaStatistics32Bits( mcHostName, dpResult, mnRowSize );
+ }
+ }
+ }
+ else if((mnSvDoubleCount==0)&&(mnSingleCount==1)&&(mnDoubleCount==0))
+ {
+ dpLeftData = mSingleArray[0]->mdpInputLeftData;
+ dpRightData = mSingleArray[0]->mdpInputRightData;
+ if(mnOperatorGroup[0] == ocAdd)
+ strcpy(mcHostName,"oclSignedAdd");
+ if(mnOperatorGroup[0] == ocSub)
+ strcpy(mcHostName,"oclSignedSub");
+ if(mnOperatorGroup[0] == ocMul)
+ strcpy(mcHostName,"oclSignedMul");
+ if(mnOperatorGroup[0] == ocDiv)
+ strcpy(mcHostName,"oclSignedDiv");
+ if ( ocl_calc.getOpenclState())
+ {
+ if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
+ {
+ ocl_calc.createArithmeticOptBuf64Bits( mnRowSize );
+ ocl_calc.mapAndCopy64Bits(dpLeftData,dpRightData,mnRowSize);
+ ocl_calc.oclHostArithmeticOperator64Bits( mcHostName,dpResult,mnRowSize );
+ }
+ else
+ {
+ ocl_calc.createArithmeticOptBuf32Bits( mnRowSize );
+ ocl_calc.mapAndCopy32Bits(dpLeftData,dpRightData,mnRowSize);
+ ocl_calc.oclHostArithmeticOperator32Bits( mcHostName,dpResult,mnRowSize );
+ }
+ }
+ }
+ else if( (mnSingleCount>1) && (mnSvDoubleCount==0) && (mnDoubleCount==0) )
+ {
+ const double* dpArray[100] = {};
+ int j=0;
+ for( uint i = 0; i < mnSingleCount; i++ )
+ {
+ dpArray[j++] = mSingleArray[i]->mdpInputLeftData;
+ if( NULL != mSingleArray[i]->mdpInputRightData )
+ dpArray[j++] = mSingleArray[i]->mdpInputRightData;
+ }
+ double *dpMoreColArithmetic = (double *)malloc( sizeof(double) * j * mnRowSize );
+ if( NULL == dpMoreColArithmetic )
+ {
+ printf("Memory alloc error!\n");
+ return false;
+ }
+ for( uint i = 0; i < j*mnRowSize; i++ )
+ {
+ dpMoreColArithmetic[i] = dpArray[i/mnRowSize][i%mnRowSize];
+ }
+ if ( ocl_calc.getOpenclState())
+ {
+ if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
+ {
+ ocl_calc.createMoreColArithmeticBuf64Bits( j * mnRowSize, mnOperatorCount );
+ ocl_calc.mapAndCopyMoreColArithmetic64Bits( dpMoreColArithmetic, mnRowSize * j, mnOperatorGroup, mnOperatorCount );
+ ocl_calc.oclMoreColHostArithmeticOperator64Bits( mnRowSize, mnOperatorCount, dpResult,mnRowSize );
+ }
+ else
+ {
+ ocl_calc.createMoreColArithmeticBuf32Bits( j* mnRowSize, mnOperatorCount );
+ ocl_calc.mapAndCopyMoreColArithmetic32Bits(dpMoreColArithmetic, mnRowSize * j, mnOperatorGroup, mnOperatorCount);
+ ocl_calc.oclMoreColHostArithmeticOperator32Bits( mnRowSize, mnOperatorCount, dpResult, mnRowSize );
+ }
+ }
+ }
+ else
+ {
+ return false;
+ }
+ return true;
+}
+
+class agency
+{
+public:
+ double *calculate(int nOclOp,int rowSize,OclCalc &ocl_calc,uint *npOclStartPos,uint *npOclEndPos,FormulaGroupInterpreterOpenCL *formulaInterprt);
};
-ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& rMat)
+double * agency::calculate( int nOclOp,int rowSize,OclCalc &ocl_calc,uint *npOclStartPos,uint *npOclEndPos,FormulaGroupInterpreterOpenCL *formulaInterprt)
+{
+ const double *dpLeftData = NULL;
+ const double *dpRightData = NULL;
+ const double *dpOclSrcData=NULL;
+ if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
+ {
+ switch( nOclOp )
+ {
+ case ocAdd:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset(rResult,0,rowSize);
+ ocl_calc.oclHostArithmeticStash64Bits( "oclSignedAdd",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
+ break;
+ }
+ case ocSub:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = ( double * )malloc( sizeof(double) * nDataSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostArithmeticStash64Bits( "oclSignedSub",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData(rResult,nDataSize) );
+ break;
+ }
+ case ocMul:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostArithmeticStash64Bits( "oclSignedMul",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
+ break;
+ }
+ case ocDiv:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = ( double * )malloc( sizeof(double) * nDataSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostArithmeticStash64Bits( "oclSignedDiv",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
+ break;
+ }
+ case ocMax:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * rowSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostFormulaStash64Bits( "oclFormulaMax",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
+ break;
+ }
+ case ocMin:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * rowSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostFormulaStash64Bits( "oclFormulaMin",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
+ break;
+ }
+ case ocAverage:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * rowSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostFormulaStash64Bits( "oclFormulaAverage",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
+ break;
+ }
+ default:
+ fprintf( stderr,"No OpenCL function for this calculation.\n" );
+ break;
+ }
+ }
+ else
+ {
+ switch( nOclOp )
+ {
+ case ocAdd:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset(rResult,0,rowSize);
+ ocl_calc.oclHostArithmeticStash32Bits( "oclSignedAdd", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData(rResult, nDataSize) );
+ break;
+ }
+ case ocSub:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset( rResult, 0, rowSize );
+ ocl_calc.oclHostArithmeticStash32Bits( "oclSignedSub", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
+ break;
+ }
+ case ocMul:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc(sizeof(double) * nDataSize );
+ memset( rResult, 0, rowSize );
+ ocl_calc.oclHostArithmeticStash32Bits( "oclSignedMul", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData( rResult, nDataSize ) );
+ break;
+ }
+ case ocDiv:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset( rResult, 0, rowSize );
+ ocl_calc.oclHostArithmeticStash32Bits( "oclSignedDiv", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData(rResult, nDataSize) );
+ break;
+ }
+ case ocMax:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc(sizeof(double) * nDataSize );
+ memset(rResult,0,rowSize);
+ ocl_calc.oclHostFormulaStash32Bits( "oclFormulaMax", dpOclSrcData, npOclStartPos, npOclEndPos, rResult,nDataSize, rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult, rowSize ) );
+ break;
+ }
+ case ocMin:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset( rResult, 0, rowSize );
+ ocl_calc.oclHostFormulaStash32Bits( "oclFormulaMin", dpOclSrcData, npOclStartPos, npOclEndPos, rResult, nDataSize, rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult, rowSize) );
+ break;
+ }
+ case ocAverage:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset( rResult, 0, rowSize);
+ ocl_calc.oclHostFormulaStash32Bits( "oclFormulaAverage", dpOclSrcData, npOclStartPos, npOclEndPos, rResult, nDataSize, rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult, rowSize) );
+ break;
+ }
+ default:
+ fprintf(stderr,"No OpenCL function for this calculation.\n");
+ break;
+ }
+ }
+ return NULL;
+}
+
+ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix( const ScMatrix& rMat )
{
SCSIZE nC, nR;
- rMat.GetDimensions(nC, nR);
- if (nC != nR || nC == 0)
+ rMat.GetDimensions( nC, nR );
+ if ( nC != nR || nC == 0 )
// Input matrix must be square. Return an empty matrix on failure and
// the caller will calculate it via CPU.
return ScMatrixRef();
@@ -70,29 +676,30 @@ ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& rMat)
// the last, chained together in a single array.
std::vector<double> aDoubles;
rMat.GetDoubleArray(aDoubles);
+
float * fpOclMatrixSrc = NULL;
float * fpOclMatrixDst = NULL;
double * dpOclMatrixSrc = NULL;
double * dpOclMatrixDst = NULL;
uint nMatrixSize = nC * nR;
static OclCalc aOclCalc;
- if ( aOclCalc.GetOpenclState() )
+ if ( aOclCalc.getOpenclState() )
{
if ( aOclCalc.gpuEnv.mnKhrFp64Flag == 1 || aOclCalc.gpuEnv.mnAmdFp64Flag == 1 )
{
- aOclCalc.CreateBuffer64Bits( dpOclMatrixSrc, dpOclMatrixDst, nMatrixSize );
+ aOclCalc.createBuffer64Bits( dpOclMatrixSrc, dpOclMatrixDst, nMatrixSize );
for ( uint i = 0; i < nC; i++ )
for ( uint j = 0; j < nR; j++ )
dpOclMatrixSrc[i*nC+j] = aDoubles[j*nR+i];
- aOclCalc.OclHostMatrixInverse64Bits( "oclFormulaMtxInv", dpOclMatrixSrc, dpOclMatrixDst,aDoubles, nR );
+ aOclCalc.oclHostMatrixInverse64Bits( "oclFormulaMtxInv", dpOclMatrixSrc, dpOclMatrixDst,aDoubles, nR );
}
else
{
- aOclCalc.CreateBuffer32Bits( fpOclMatrixSrc, fpOclMatrixDst, nMatrixSize );
+ aOclCalc.createBuffer32Bits( fpOclMatrixSrc, fpOclMatrixDst, nMatrixSize );
for ( uint i = 0; i < nC; i++ )
for ( uint j = 0; j < nR; j++ )
fpOclMatrixSrc[i*nC+j] = (float) aDoubles[j*nR+i];
- aOclCalc.OclHostMatrixInverse32Bits( "oclFormulaMtxInv", fpOclMatrixSrc, fpOclMatrixDst, aDoubles, nR );
+ aOclCalc.oclHostMatrixInverse32Bits( "oclFormulaMtxInv", fpOclMatrixSrc, fpOclMatrixDst, aDoubles, nR );
}
}
@@ -111,374 +718,242 @@ ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& rMat)
#else
// Another way is to put the values one column at a time.
const double* p = &aDoubles[0];
- for (SCSIZE i = 0; i < nC; ++i)
+ for( SCSIZE i = 0; i < nC; ++i )
{
- xInv->PutDouble(p, nR, i, 0);
+ xInv->PutDouble( p, nR, i, 0 );
p += nR;
}
#endif
return xInv;
}
-
-bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress& rTopPos,
- const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode)
+bool FormulaGroupInterpreterOpenCL::interpret( ScDocument& rDoc, const ScAddress& rTopPos,
+ const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode )
{
- generateRPNCode(rDoc, rTopPos, rCode);
-
- size_t rowSize = xGroup->mnLength;
- fprintf(stderr,"rowSize at begin is ...%ld.\n",(long)rowSize);
+ generateRPNCode( rDoc, rTopPos, rCode );
+ mnRowSize = xGroup->mnLength;
+ fprintf( stderr,"mnRowSize at begin is ...%ld.\n",(long)mnRowSize );
// The row quantity can be gotten from p2->GetArrayLength()
- uint nCount1 = 0, nCount2 = 0, nCount3 = 0;
int nOclOp = 0;
- double *rResult = NULL; // Point to the output data from GPU
- rResult = (double *)malloc(sizeof(double) * rowSize*2);// For 2 columns(B,C)
- if(NULL==rResult)
- {
- printf("malloc err\n");
- return false;
- }
- memset(rResult,0,rowSize);
- float * fpOclSrcData = NULL; // Point to the input data from CPU
- double * dpOclSrcData = NULL;
- uint * npOclStartPos = NULL; // The first position for calculation,for example,the A1 in (=MAX(A1:A100))
- uint * npOclEndPos = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100))
- float * fpLeftData = NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData)
- float * fpRightData = NULL; // Right input for binary operator(+,-,*,/),for example,(=leftData/rightData)
- // The rightData can't be zero for "/"
- double * dpLeftData = NULL;
- double * dpRightData = NULL;
-
- float * fpSaveData=NULL; //It is a temp pointer point the preparing memory;
- float * fpSumProMergeLfData = NULL; //It merge the more col to one col is the left operator
- float * fpSumProMergeRtData = NULL; //It merge the more col to one col is the right operator
- double * dpSaveData=NULL;
- double * dpSumProMergeLfData = NULL;
- double * dpSumProMergeRtData = NULL;
- uint * npSumSize=NULL; //It is a array to save the matix sizt(col *row)
- int nSumproductSize=0; //It is the merge array size
- bool aIsAlloc=false; //It is a flag to judge the fpSumProMergeLfData existed
- unsigned int nCountMatix=0; //It is a count to save the calculate times
+ const double * dpOclSrcData = NULL;
+ const double * dpBinaryData = NULL;
static OclCalc ocl_calc;
- bool isSumProduct=false;
- if(ocl_calc.GetOpenclState())
+ unsigned int nSrcDataSize = 0;
+
+ const double *dpResult = NULL;
+ double *pResult = (double *)malloc(sizeof(double) * mnRowSize);
+ double *dpSvDouble = NULL;
+ bool isSample = false;
+
+ mnSingleCount = 0;
+ mnDoubleCount = 0;
+ mnSvDoubleCount = 0;
+ mnOperatorCount = 0;
+ mnPositonLen = 0;
+ if ( ocl_calc.getOpenclState() )
{
- // Don't know how large the size will be applied previously, so create them as the rowSize or 65536
- // Don't know which formulae will be used previously, so create buffers for different formulae used probably
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+ getPosition(rCode,xGroup,mnRowSize,mnpOclStartPos,mnpOclEndPos,&mnPositonLen);
+ const formula::FormulaToken* p = rCode.FirstRPN();
+
+ bool isSingle = false;
+ int nCountNum=0;
+ do
{
- ocl_calc.CreateBuffer64Bits(dpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
- ocl_calc.CreateBuffer64Bits(dpLeftData,dpRightData,rowSize);
- }
- else
+ if ( ocPush != p->GetOpCode())
+ {
+ nOclOp = p->GetOpCode();
+ mnOperatorGroup[mnOperatorCount++] = nOclOp;
+ }
+ else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
+ {
+ mnSingleCount++;
+ }
+ if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
+ {
+ mnSvDoubleCount++;
+ }
+ } while ( NULL != ( p = rCode.NextRPN() ) );
+ if( isGroundWater() )
{
- ocl_calc.CreateBuffer32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
- ocl_calc.CreateBuffer32Bits(fpLeftData,fpRightData,rowSize);
+ isSample = true;
}
- //printf("pptrr is %d,%d,%d\n",fpOclSrcData,npOclStartPos,npOclEndPos);
- }
-///////////////////////////////////////////////////////////////////////////////////////////
-
- // Until we implement group calculation for real, decompose the group into
- // individual formula token arrays for individual calculation.
- ScAddress aTmpPos = rTopPos;
- for (sal_Int32 i = 0; i < xGroup->mnLength; ++i)
- {
- aTmpPos.SetRow(xGroup->mnStart + i);
- ScTokenArray aCode2;
- for (const formula::FormulaToken* p = rCode.First(); p; p = rCode.Next())
+ mnOperatorCount = 0;
+ mnSingleCount = 0;
+ mnSvDoubleCount = 0;
+ p = rCode.FirstRPN();
+ if(isSample)
{
- switch (p->GetType())
+ do
{
- case formula::svSingleVectorRef:
+ if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
{
- const formula::SingleVectorRefToken* p2 = static_cast<const formula::SingleVectorRefToken*>(p);
- const double* pArray = p2->GetArray();
- aCode2.AddDouble(static_cast<size_t>(i) < p2->GetArrayLength() ? pArray[i] : 0.0);
+ mdpSvdouble[mnSvDoubleCount++] = p->GetDouble();
}
- break;
- case formula::svDoubleVectorRef:
+ else if( ocPush == p->GetOpCode() && formula::svDoubleVectorRef == p->GetType())
{
- const formula::DoubleVectorRefToken* p2 = static_cast<const formula::DoubleVectorRefToken*>(p);
- const std::vector<const double*>& rArrays = p2->GetArrays();
- size_t nColSize = rArrays.size();
- size_t nRowStart = p2->IsStartFixed() ? 0 : i;
- size_t nRowEnd = p2->GetRefRowSize() - 1;
- if (!p2->IsEndFixed())
- nRowEnd += i;
- size_t nRowSize = nRowEnd - nRowStart + 1;
- //store the a matix`s rowsize and colsize,use it to calculate the matix`s size
- ocl_calc.nFormulaRowSize = nRowSize;
- ocl_calc.nFormulaColSize = nColSize;
- ScMatrixRef pMat(new ScMatrix(nColSize, nRowSize, 0.0));
- if(ocl_calc.GetOpenclState())
+ const formula::DoubleVectorRefToken* pDvr = static_cast< const formula::DoubleVectorRefToken* >( p );
+ const std::vector< const double* >& rArrays = pDvr->GetArrays();
+ uint rArraysSize = rArrays.size();
+ int nMoreColSize = 0;
+ DoubleVectorFormula *SvDoubleTemp = new DoubleVectorFormula();
+ if( rArraysSize > 1 )
{
- npOclStartPos[i] = nRowStart; // record the start position
- npOclEndPos[i] = nRowEnd; // record the end position
- }
- int nTempOpcode;
- const formula::FormulaToken* pTemp = p;
- pTemp=aCode2.Next();
- nTempOpcode=pTemp->GetOpCode();
- while(1)
- {
- nTempOpcode=pTemp->GetOpCode();
- if(nTempOpcode!=ocOpen && nTempOpcode!=ocPush)
- break;
- pTemp=aCode2.Next();
- }
- if((!aIsAlloc) && (ocl_calc.GetOpenclState())&& (nTempOpcode == ocSumProduct))
- {
- //nColSize * rowSize is the data size , but except the the head of data will use less the nRowSize
- //the other all use nRowSize times . and it must aligen so add nRowSize-1.
- nSumproductSize = nRowSize+nColSize * rowSize*nRowSize-1;
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
- ocl_calc.CreateBuffer64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,nSumproductSize,rowSize);
- else
- ocl_calc.CreateBuffer32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,nSumproductSize,rowSize);
- aIsAlloc = true;
- isSumProduct=true;
- }
- if(isSumProduct)
- {
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
- {
- if(nCountMatix%2==0)
- dpSaveData = dpSumProMergeLfData;
- else
- dpSaveData = dpSumProMergeRtData;
- }
- else
+ double *dpMoreColData = NULL;
+ for ( uint loop=0; loop < rArraysSize; loop++ )
{
- if(nCountMatix%2==0)
- fpSaveData = fpSumProMergeLfData;
- else
- fpSaveData = fpSumProMergeRtData;
- }
- }
- for (size_t nCol = 0; nCol < nColSize; ++nCol)
- {
- const double* pArray = rArrays[nCol];
- if( NULL==pArray )
- {
- fprintf(stderr,"Error: pArray is NULL!\n");
- free(rResult);
- return false;
- }
- if(ocl_calc.GetOpenclState())
- {
- for( size_t u=nRowStart; u<=nRowEnd; u++ )
- {
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
- {
- dpOclSrcData[u] = pArray[u];
- //fprintf(stderr,"dpOclSrcData[%d] is %f.\n",u,dpOclSrcData[u]);
- if(isSumProduct)
- dpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = pArray[u];
- }
- else
- {
- // Many video cards can't support double type in kernel, so need transfer the double to float
- fpOclSrcData[u] = (float)pArray[u];
- //fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]);
- if(isSumProduct)
- fpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = (float)pArray[u];
- }
- }
- }
-
- for (size_t nRow = 0; nRow < nRowSize; ++nRow)
- {
- if (nRowStart + nRow < p2->GetArrayLength())
+ dpOclSrcData = rArrays[loop];
+ nSrcDataSize = pDvr->GetArrayLength();
+ nMoreColSize += nSrcDataSize;
+ dpMoreColData = (double *) realloc(dpMoreColData,nMoreColSize * sizeof(double));
+ for ( uint j = nMoreColSize - nSrcDataSize, i = 0; i < nSrcDataSize; i++, j++ )
{
- double fVal = pArray[nRowStart+nRow];
- pMat->PutDouble(fVal, nCol, nRow);
+ dpMoreColData[j] = dpOclSrcData[i];
}
}
+ dpOclSrcData = dpMoreColData;
+ nSrcDataSize = nMoreColSize;
}
-
- ScMatrixToken aTok(pMat);
- aCode2.AddToken(aTok);
- if(isSumProduct)
+ else
{
- npSumSize[nCountMatix/2] =nRowSize*nColSize;
- nCountMatix++;
+ dpOclSrcData = rArrays[0];
+ nSrcDataSize = pDvr->GetArrayLength();
+ SvDoubleTemp->mdpInputData = dpOclSrcData;
+ SvDoubleTemp->mnInputDataSize = nSrcDataSize;
+ SvDoubleTemp->mnInputStartPosition = mnpOclStartPos[nCountNum*mnRowSize];
+ SvDoubleTemp->mnInputEndPosition = mnpOclEndPos[nCountNum*mnRowSize];
+ SvDoubleTemp->mnInputStartOffset = mnpOclStartPos[nCountNum*mnRowSize+1]-mnpOclStartPos[nCountNum*mnRowSize];
+ SvDoubleTemp->mnInputEndOffset = mnpOclEndPos[nCountNum*mnRowSize+1]-mnpOclEndPos[nCountNum*mnRowSize];
+ mDoubleArray[mnDoubleCount++] = SvDoubleTemp;
+ nCountNum++;
}
}
- break;
- default:
- aCode2.AddToken(*p);
- }
- }
-
- ScFormulaCell* pDest = rDoc.GetFormulaCell(aTmpPos);
- if (!pDest)
- {
- free(rResult);
- return false;
+ else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
+ {
+ const formula::SingleVectorRefToken* pSvr = static_cast<const formula::SingleVectorRefToken*>( p );
+ dpBinaryData = pSvr->GetArray();
+ uint nArrayLen = pSvr->GetArrayLength();
+ SingleVectorFormula *SignleTemp = new SingleVectorFormula() ;
+ if(isSingle)
+ {
+ SignleTemp = mSingleArray[--mnSingleCount];
+ SignleTemp->mdpInputRightData = dpBinaryData;
+ SignleTemp->mnInputRightDataSize = nArrayLen;
+ SignleTemp->mnInputRightStartPosition = 0;
+ SignleTemp->mnInputRightOffset = 0;
+ isSingle = false;
+ }
+ else
+ {
+ SignleTemp = new SingleVectorFormula();
+ SignleTemp->mdpInputLeftData = dpBinaryData;
+ SignleTemp->mnInputLeftDataSize = nArrayLen;
+ SignleTemp->mdpInputRightData = NULL;
+ SignleTemp->mnInputRightDataSize = 0;
+ SignleTemp->mnInputLeftStartPosition = 0;
+ SignleTemp->mnInputLeftOffset = 0;
+ isSingle = true;
+ }
+ mSingleArray[mnSingleCount++] = SignleTemp;
+ }
+ else
+ {
+ nOclOp = p->GetOpCode();
+ mnOperatorGroup[mnOperatorCount++] = nOclOp;
+ }
+ } while ( NULL != ( p = rCode.NextRPN() ) );
+ if ( !chooseFunction( ocl_calc, pResult ) )
+ return false;
+ else
+ dpResult = pResult;
}
- if(ocl_calc.GetOpenclState())
+ else
{
- const formula::FormulaToken *pCur = aCode2.First();
- aCode2.Reset();
- while( ( pCur = aCode2.Next() ) != NULL )
+ agency aChooseAction;
+
+ do
{
- OpCode eOp = pCur->GetOpCode();
- if(eOp==0)
+ if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
+ {
+ dpSvDouble = (double *) malloc( sizeof(double ) * mnRowSize );
+ double dTempValue = p->GetDouble();
+ for ( uint i = 0; i < mnRowSize; i++ )
+ dpSvDouble[i] = dTempValue;
+ srdDataPush( new SourceData( dpSvDouble, mnRowSize ) );
+ collectDoublePointers( dpSvDouble );
+ }
+ else if( ocPush == p->GetOpCode() && formula::svDoubleVectorRef == p->GetType())
{
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+ const formula::DoubleVectorRefToken* pDvr = static_cast< const formula::DoubleVectorRefToken* >( p );
+ const std::vector< const double* >& rArrays = pDvr->GetArrays();
+ unsigned int rArraysSize = rArrays.size();
+ int nMoreColSize = 0;
+ if(rArraysSize > 1)
{
- if(nCount3%2==0)
- dpLeftData[nCount1++] = pCur->GetDouble();
- else
- dpRightData[nCount2++] = pCur->GetDouble();
- nCount3++;
+ double *dpMoreColData = NULL;
+ for( uint loop=0; loop < rArraysSize; loop++ )
+ {
+ dpOclSrcData = rArrays[loop];
+ nSrcDataSize = pDvr->GetArrayLength();
+ nMoreColSize += nSrcDataSize;
+ dpMoreColData = (double *) realloc(dpMoreColData,nMoreColSize * sizeof(double));
+ for(uint j=nMoreColSize-nSrcDataSize,i=0;i<nSrcDataSize;i++,j++)
+ {
+ dpMoreColData[j] = dpOclSrcData[i];
+ }
+ }
+ dpOclSrcData = dpMoreColData;
+ nSrcDataSize = nMoreColSize;
+ collectDoublePointers( dpMoreColData );
}
else
{
- if(nCount3%2==0)
- fpLeftData[nCount1++] = (float)pCur->GetDouble();
- else
- fpRightData[nCount2++] = (float)pCur->GetDouble();
- nCount3++;
+ dpOclSrcData = rArrays[0];
+ nSrcDataSize = pDvr->GetArrayLength();
}
+ srdDataPush( new SourceData( dpOclSrcData,nSrcDataSize,rArraysSize ) );
}
- else if( eOp!=ocOpen && eOp!=ocClose &&eOp != ocSep)
- nOclOp = eOp;
-
-// if(count1>0){//dbg
-// fprintf(stderr,"leftData is %f.\n",fpLeftData[count1-1]);
-// count1--;
-// }
-// if(count2>0){//dbg
-// fprintf(stderr,"rightData is %f.\n",fpRightData[count2-1]);
-// count2--;
-// }
- }
- }
-
- if(!getenv("SC_GPU")||!ocl_calc.GetOpenclState())
- {
- //fprintf(stderr,"ccCPU flow...\n\n");
- generateRPNCode(rDoc, aTmpPos, aCode2);
- ScInterpreter aInterpreter(pDest, &rDoc, aTmpPos, aCode2);
- aInterpreter.Interpret();
- pDest->SetResultToken(aInterpreter.GetResultToken().get());
- pDest->ResetDirty();
- pDest->SetChanged(true);
+ else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
+ {
+ const formula::SingleVectorRefToken* pSvr = static_cast<const formula::SingleVectorRefToken*>( p );
+ dpBinaryData = pSvr->GetArray();
+ nSrcDataSize = pSvr->GetArrayLength();
+ srdDataPush( new SourceData( dpBinaryData, nSrcDataSize ) );
+ }
+ else
+ {
+ nOclOp = p->GetOpCode();
+ aChooseAction.calculate(nOclOp,mnRowSize,ocl_calc,mnpOclStartPos,mnpOclEndPos,this);
+ mnSingleCount = 0;
+ mnDoubleCount = 0;
+ mnSvDoubleCount = 0;
+ mnOperatorCount = 0;
+ mnPositonLen = 0;
+ }
+ } while ( NULL != ( p = rCode.NextRPN() ) );
+ SourceData * sResult = srdDataPop();
+ dpResult = sResult->getDouleData();
}
- } // for loop end (xGroup->mnLength)
-
- // For GPU calculation
- if(getenv("SC_GPU")&&ocl_calc.GetOpenclState())
- {
- fprintf(stderr,"ggGPU flow...\n\n");
- printf(" oclOp is... %d\n",nOclOp);
- osl_getSystemTime(&aTimeBefore); //timer
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+ rDoc.SetFormulaResults( rTopPos, dpResult, mnRowSize );
+ freeDoublePointers();
+ if ( pResult )
{
- fprintf(stderr,"ggGPU double precision flow...\n\n");
- //double precision
- switch(nOclOp)
- {
- case ocAdd:
- ocl_calc.OclHostArithmeticOperator64Bits("oclSignedAdd",dpLeftData,dpRightData,rResult,nCount1);
- break;
- case ocSub:
- ocl_calc.OclHostArithmeticOperator64Bits("oclSignedSub",dpLeftData,dpRightData,rResult,nCount1);
- break;
- case ocMul:
- ocl_calc.OclHostArithmeticOperator64Bits("oclSignedMul",dpLeftData,dpRightData,rResult,nCount1);
- break;
- case ocDiv:
- ocl_calc.OclHostArithmeticOperator64Bits("oclSignedDiv",dpLeftData,dpRightData,rResult,nCount1);
- break;
- case ocMax:
- ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMax",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocMin:
- ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMin",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocAverage:
- ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaAverage",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocSum:
- ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaSum",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocCount:
- ocl_calc.OclHostFormulaCount64Bits(npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocSumProduct:
- ocl_calc.OclHostFormulaSumProduct64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,rResult,rowSize);
- break;
- default:
- fprintf(stderr,"No OpenCL function for this calculation.\n");
- break;
- }
+ free( pResult );
+ pResult = NULL;
}
- else
+ if ( mnpOclStartPos )
{
- fprintf(stderr,"ggGPU float precision flow...\n\n");
- //float precision
- switch(nOclOp)
- {
- case ocAdd:
- ocl_calc.OclHostArithmeticOperator32Bits("oclSignedAdd",fpLeftData,fpRightData,rResult,nCount1);
- break;
- case ocSub:
- ocl_calc.OclHostArithmeticOperator32Bits("oclSignedSub",fpLeftData,fpRightData,rResult,nCount1);
- break;
- case ocMul:
- ocl_calc.OclHostArithmeticOperator32Bits("oclSignedMul",fpLeftData,fpRightData,rResult,nCount1);
- break;
- case ocDiv:
- ocl_calc.OclHostArithmeticOperator32Bits("oclSignedDiv",fpLeftData,fpRightData,rResult,nCount1);
- break;
- case ocMax:
- ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMax",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocMin:
- ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMin",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocAverage:
- ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaAverage",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocSum:
- ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaSum",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocCount:
- ocl_calc.OclHostFormulaCount32Bits(npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocSumProduct:
- ocl_calc.OclHostFormulaSumProduct32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,rResult,rowSize);
- break;
- default:
- fprintf(stderr,"No OpenCL function for this calculation.\n");
- break;
- }
+ free( mnpOclStartPos );
+ mnpOclStartPos = NULL;
}
-
- /////////////////////////////////////////////////////
- osl_getSystemTime(&aTimeAfter);
- double diff = getTimeDiff(aTimeAfter, aTimeBefore);
- //if (diff >= 1.0)
+ if ( mnpOclEndPos )
{
- fprintf(stderr,"OpenCL,diff...%f.\n",diff);
+ free( mnpOclEndPos );
+ mnpOclEndPos = NULL;
}
-/////////////////////////////////////////////////////
-
-//rResult[i];
-// for(sal_Int32 i = 0; i < rowSize; ++i){//dbg output results
-// fprintf(stderr,"After GPU,rRsults[%d] is ...%f\n",i,rResult[i]);
-// }
-
- // Insert the double data, in rResult[i] back into the document
- rDoc.SetFormulaResults(rTopPos, rResult, xGroup->mnLength);
- }
-
- free(rResult);
-
- return true;
+ return true;
+ } // getOpenclState() End
+ else
+ return false;
}
/// Special case of formula compiler for groundwatering
@@ -489,11 +964,11 @@ public:
FormulaGroupInterpreterSoftware()
{
fprintf(stderr,"\n\n ***** Groundwater Backend *****\n\n\n");
- OclCalc::InitEnv();
+ OclCalc::initEnv();
}
virtual ~FormulaGroupInterpreterGroundwater()
{
- OclCalc::ReleaseOpenclRunEnv();
+ OclCalc::releaseOpenclRunEnv();
}
virtual ScMatrixRef inverseMatrix(const ScMatrix& /* rMat */) { return ScMatrixRef(); }
@@ -569,7 +1044,7 @@ bool FormulaGroupInterpreterGroundwater::interpretCL(ScDocument& rDoc, const ScA
fprintf (stderr, "Calculate !");
- double *pResult = ocl_calc.OclSimpleDeltaOperation( eOp, pGroundWaterDataArray,
+ double *pResult = ocl_calc.oclSimpleDeltaOperation( eOp, pGroundWaterDataArray,
pArrayToSubtractOneElementFrom,
(size_t) xGroup->mnLength, delta );
RETURN_IF_FAIL(pResult != NULL, "buffer alloc / calculaton failed");
diff --git a/sc/source/core/opencl/oclkernels.hxx b/sc/source/core/opencl/oclkernels.hxx
index bcd7db093a84..53917b3bf2a9 100644
--- a/sc/source/core/opencl/oclkernels.hxx
+++ b/sc/source/core/opencl/oclkernels.hxx
@@ -7,8 +7,8 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
-#ifndef _OCL_KERNEL_H_
-#define _OCL_KERNEL_H_
+#ifndef SC_OCLKERNELS_HXX
+#define SC_OCLKERNELS_HXX
#ifndef USE_EXTERNAL_KERNEL
#define KERNEL( ... )# __VA_ARGS__
@@ -24,6 +24,97 @@ const char *kernel_src = KERNEL(
\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
\n#else\n
\n#endif\n
+inline fp_t oclAverage( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
+{
+ uint start = startArray[id];
+ uint end = endArray[id];
+ fp_t fSum = 0.0;
+ fp_t zero[16] = {0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f};
+ fp_t16 vSum=vload16(0,zero);
+ fp_t16 ptr;
+ __global fp_t *p = values;
+ p+= start;
+
+ for(int i = 0; i < (end - start + 1)/16; ++i)
+ {
+ ptr=vload16(0,p);
+ vSum += ptr;
+ p+=16;
+ }
+ int lastData = (end-start+1)%16;
+ for(int i = 0; i <lastData; i++)
+ {
+ fSum += *p;
+ p+=1;
+ }
+ vSum.s01234567 = vSum.s01234567+vSum.s89abcdef;
+ vSum.s0123 = vSum.s0123+vSum.s4567;
+ vSum.s01 = vSum.s01+vSum.s23;
+ vSum.s0 = vSum.s0+vSum.s1;
+ fSum = vSum.s0+fSum;
+ fp_t fVal = fSum/(end-start+1);
+ return fVal;
+}
+inline fp_t oclMax( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
+{
+ uint start = startArray[id];
+ uint end = endArray[id];
+ fp_t fMax = values[start];
+ fp_t zero[16] = {fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax};
+ fp_t16 vMax=vload16(0,zero);
+ //Max
+ fp_t16 ptr;
+ __global fp_t *p = values;
+ p+= start;
+ for(int i = 0; i < (end - start + 1)/16; ++i)
+ {
+ ptr=vload16(0,p);
+ vMax = fmax(vMax,ptr);
+ p+=16;
+ }
+ int lastData = (end-start+1)%16;
+ for(int i = 0; i <lastData; i++)
+ {
+ fMax = fmax(fMax,*p);
+ p+=1;
+ }
+ vMax.s01234567 = fmax(vMax.s01234567, vMax.s89abcdef);
+ vMax.s0123 = fmax(vMax.s0123, vMax.s4567);
+ vMax.s01 = fmax(vMax.s01, vMax.s23);
+ vMax.s0 = fmax(vMax.s0, vMax.s1);
+ fMax = fmax(vMax.s0, fMax);
+ return fMax;
+}
+inline fp_t oclMin( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
+{
+ uint start = startArray[id];
+ uint end = endArray[id];
+ fp_t fMin = values[start];
+ fp_t zero[16] = {fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin};
+ fp_t16 vMin=vload16(0,zero);
+ //Min
+ fp_t16 ptr;
+ __global fp_t *p = values;
+ p+= start;
+ for(int i = 0; i < (end - start + 1)/16; ++i)
+ {
+ ptr=vload16(0,p);
+ vMin = fmin(vMin,ptr);
+ p+=16;
+ }
+ int lastData = (end-start+1)%16;
+ for(int i = 0; i <lastData; i++)
+ {
+ fMin = fmin(fMin,*p);
+ p+=1;
+ }
+ vMin.s01234567 = fmin(vMin.s01234567, vMin.s89abcdef);
+ vMin.s0123 = fmin(vMin.s0123, vMin.s4567);
+ vMin.s01 = fmin(vMin.s01, vMin.s23);
+ vMin.s0 = fmin(vMin.s0, vMin.s1);
+ fMin = fmin(vMin.s0, fMin);
+ return fMin;
+}
__kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{
@@ -31,7 +122,6 @@ __kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global
otData[id] = ltData[id] + rtData[id];
}
-
__kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{
const unsigned int id = get_global_id(0);
@@ -41,39 +131,31 @@ __kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global
__kernel void oclSignedMul(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{
int id = get_global_id(0);
- otData[id] =ltData[id] * rtData[id];
+ otData[id] = ltData[id] * rtData[id];
}
__kernel void oclSignedDiv(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{
const unsigned int id = get_global_id(0);
- otData[id] = ltData[id] / rtData[id];
+ fp_t divisor = rtData[id];
+ if ( divisor != 0 )
+ otData[id] = ltData[id] / divisor;
+ else
+ otData[id] = 0.0;
}
__kernel void oclFormulaMin(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
{
const unsigned int id = get_global_id(0);
- unsigned int startFlag = start[id];
- unsigned int endFlag = end[id];
- fp_t fMinVal = input[startFlag];
- for(int i=startFlag;i<=endFlag;i++)
- {
- fMinVal = fmin( fMinVal, input[i] );
- }
- output[id] = fMinVal;
+ fp_t fVal = oclMin(id,input,start,end);
+ output[id] = fVal ;
}
__kernel void oclFormulaMax(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
{
const unsigned int id = get_global_id(0);
- unsigned int startFlag = start[id];
- unsigned int endFlag = end[id];
- fp_t fMaxVal = input[startFlag];
- for ( int i = startFlag; i <= endFlag; i++ )
- {
- fMaxVal = fmax( fMaxVal, input[i] );
- }
- output[id] = fMaxVal;
+ fp_t fVal = oclMax(id,input,start,end);
+ output[id] = fVal ;
}
//Sum
__kernel void oclFormulaSum(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
@@ -94,12 +176,10 @@ __kernel void oclFormulaCount(__global uint *start,__global uint *end,__global f
__kernel void oclFormulaAverage(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
{
const unsigned int id = get_global_id(0);
- fp_t sum=0.0;
- for(int i = start[id];i<=end[id];i++)
- sum += input[i];
- output[id] = sum / (end[id]-start[id]+1);
-}
+ fp_t fVal = oclAverage(id,input,start,end);
+ output[id] = fVal ;
+}
//Sumproduct
__kernel void oclFormulaSumproduct(__global fp_t *firstCol,__global uint* npSumSize,__global fp_t *output,uint nMatixSize)
{
@@ -147,7 +227,7 @@ __kernel void oclMinDelta(__global fp_t *values, __global fp_t *subtract, uint s
// Min
fp_t fMinVal = values[start];
- for(int i=start+1;i < end;i++)
+ for ( int i = start + 1; i < end; i++ )
{
if(values[i]<fMinVal)
fMinVal = values[i];
@@ -177,14 +257,14 @@ __kernel void oclFormulaMtxInv(__global fp_t * fpMatrixInput, __global fp_t * fp
fpP[nOffset*nDimension+nId] = fpP[nMax*nDimension+nId];
fpP[nMax*nDimension+nId] = dMovebuffer;
}
-__kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMatrixOutput,__global fp_t * fpP,__global fp_t * fpY)
+__kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMatrixOutput,__global fp_t * fpP,__global fp_t * fpY,__global uint* npDim)
{
int nId = get_global_id(0);
- int nDimension = get_global_size(0);
-
+ int nDimension = npDim[nId];
+ fp_t fsum = 0.0;
for ( int yi=0; yi < nDimension; yi++ )
{
- fp_t fsum = 0.0;
+ fsum = 0.0;
for ( int yj=0; yj < nDimension; yj++ )
{
fsum += fpMatrixInput[yi*nDimension+yj] * fpY[nId+yj*nDimension];
@@ -194,7 +274,7 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat
}
for ( int xi = nDimension - 1; xi >= 0; xi-- )
{
- fp_t fsum = 0.0;
+ fsum = 0.0;
for ( int xj = 0; xj < nDimension; xj++ )
{
fsum += fpMatrixInput[xi*nDimension+xj] * fpMatrixOutput[nId+nDimension*xj];
@@ -203,6 +283,101 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat
}
}
+__kernel void oclAverageAdd(__global fp_t *values,__global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fVal = oclAverage(id,values,startArray,endArray);
+ output[id] = fVal + addend[id];
+}
+
+__kernel void oclAverageSub(__global fp_t *values,__global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fVal = oclAverage(id,values,startArray,endArray);
+ output[id] = fVal - subtract[id];
+}
+
+__kernel void oclAverageMul(__global fp_t *values,__global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fVal = oclAverage(id,values,startArray,endArray);
+ output[id] = fVal * multiplier[id];
+}
+__kernel void oclAverageDiv(__global fp_t *values,__global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fVal = oclAverage(id,values,startArray,endArray);
+ fp_t divisor = div[id];
+ if ( divisor != 0 )
+ output[id] = fVal / divisor;
+ else
+ output[id] = 0.0;
+}
+
+__kernel void oclMinAdd(__global fp_t *values, __global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMin = oclMin(id,values,startArray,endArray);
+ output[id] = fMin + addend[id];
+}
+
+__kernel void oclMinSub(__global fp_t *values, __global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMin = oclMin(id,values,startArray,endArray);
+ output[id] = fMin - subtract[id];
+}
+__kernel void oclMinMul(__global fp_t *values, __global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMin = oclMin(id,values,startArray,endArray);
+ output[id] = fMin * multiplier[id];
+}
+__kernel void oclMinDiv(__global fp_t *values, __global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMin = oclMin(id,values,startArray,endArray);
+ fp_t divisor = div[id];
+ if ( divisor != 0 )
+ output[id] = fMin / divisor;
+ else
+ output[id] = 0.0;
+}
+__kernel void oclMaxAdd(__global fp_t *values, __global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMax = oclMax(id,values,startArray,endArray);
+ output[id] = fMax + addend[id];
+}
+
+__kernel void oclMaxSub(__global fp_t *values, __global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMax = oclMax(id,values,startArray,endArray);
+ output[id] = fMax - subtract[id];
+}
+__kernel void oclMaxMul(__global fp_t *values, __global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMax = oclMax(id,values,startArray,endArray);
+ output[id] = fMax * multiplier[id];
+}
+__kernel void oclMaxDiv(__global fp_t *values, __global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMax = oclMax(id,values,startArray,endArray);
+ fp_t divisor = div[id];
+ if ( divisor != 0 )
+ output[id] = fMax / divisor;
+ else
+ output[id] = 0.0;
+}
+
+__kernel void oclSub( fp_t ltData, __global fp_t *rtData, __global fp_t *outData )
+{
+ const unsigned int id = get_global_id(0);
+ outData[id] = ltData - rtData[id];
+}
);
#endif // USE_EXTERNAL_KERNEL
diff --git a/sc/source/core/opencl/openclwrapper.cxx b/sc/source/core/opencl/openclwrapper.cxx
index 47b906be813b..6db498b67f53 100644
--- a/sc/source/core/opencl/openclwrapper.cxx
+++ b/sc/source/core/opencl/openclwrapper.cxx
@@ -7,19 +7,19 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
-#include "openclwrapper.hxx"
-
+#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cmath>
#include "sal/config.h"
#include "random.hxx"
+#include "openclwrapper.hxx"
#include "oclkernels.hxx"
#ifdef SAL_WIN32
#include <Windows.h>
#endif
-//#define USE_KERNEL_FILE
-
+//#define USE_MAP_BUFFER
+using namespace std;
GPUEnv OpenclDevice::gpuEnv;
int OpenclDevice::isInited =0;
@@ -44,9 +44,9 @@ int OpenclDevice::isInited =0;
HINSTANCE HOpenclDll = NULL;
void * OpenclDll = NULL;
-int OpenclDevice::LoadOpencl()
+int OpenclDevice::loadOpencl()
{
- //fprintf(stderr, " LoadOpenclDllxx... \n");
+ //fprintf(stderr, " loadOpenclDllxx... \n");
OpenclDll = static_cast<HINSTANCE>( HOpenclDll );
OpenclDll = LoadLibrary( OPENCL_DLL_NAME );
if ( !static_cast<HINSTANCE>( OpenclDll ) )
@@ -59,7 +59,7 @@ int OpenclDevice::LoadOpencl()
return OCLSUCCESS;
}
-void OpenclDevice::FreeOpenclDll()
+void OpenclDevice::freeOpenclDll()
{
fprintf(stderr, " Free opencllo.dll ... \n");
if ( !static_cast<HINSTANCE>( OpenclDll ) )
@@ -67,39 +67,39 @@ void OpenclDevice::FreeOpenclDll()
}
#endif
-int OpenclDevice::InitEnv()
+int OpenclDevice::initEnv()
{
#ifdef SAL_WIN32
while( 1 )
{
- if( 1 == LoadOpencl() )
+ if( 1 == loadOpencl() )
break;
}
#endif
- InitOpenclRunEnv( 0 );
+ initOpenclRunEnv( 0 );
return 1;
}
-int OpenclDevice::ReleaseOpenclRunEnv()
+int OpenclDevice::releaseOpenclRunEnv()
{
- ReleaseOpenclEnv( &gpuEnv );
+ releaseOpenclEnv( &gpuEnv );
#ifdef SAL_WIN32
- FreeOpenclDll();
+ freeOpenclDll();
#endif
return 1;
}
///////////////////////////////////////////////////////
///////////////////////////////////////////////////////
-inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
+inline int OpenclDevice::addKernelConfig( int kCount, const char *kName )
{
if ( kCount < 1 )
- fprintf(stderr,"Error: ( KCount < 1 )" SAL_DETAIL_WHERE "AddKernelConfig\n" );
+ fprintf(stderr,"Error: ( KCount < 1 )" SAL_DETAIL_WHERE "addKernelConfig\n" );
strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
gpuEnv.mnKernelCount++;
return 0;
}
-int OpenclDevice::RegistOpenclKernel()
+int OpenclDevice::registOpenclKernel()
{
if ( !gpuEnv.mnIsUserCreated )
memset( &gpuEnv, 0, sizeof(gpuEnv) );
@@ -107,37 +107,58 @@ int OpenclDevice::RegistOpenclKernel()
gpuEnv.mnFileCount = 0; //argc;
gpuEnv.mnKernelCount = 0UL;
- AddKernelConfig( 1, (const char*) "oclFormulaMin" );
- AddKernelConfig( 2, (const char*) "oclFormulaMax" );
- AddKernelConfig( 3, (const char*) "oclFormulaSum" );
- AddKernelConfig( 4, (const char*) "oclFormulaCount" );
- AddKernelConfig( 5, (const char*) "oclFormulaAverage" );
- AddKernelConfig( 6, (const char*) "oclFormulaSumproduct" );
- AddKernelConfig( 7, (const char*) "oclFormulaMtxInv" );
-
- AddKernelConfig( 8, (const char*) "oclSignedAdd" );
- AddKernelConfig( 9, (const char*) "oclSignedSub" );
- AddKernelConfig( 10, (const char*) "oclSignedMul" );
- AddKernelConfig( 11, (const char*) "oclSignedDiv" );
- AddKernelConfig( 12, (const char*) "oclAverageDelta" );
- AddKernelConfig( 13, (const char*) "oclMaxDelta" );
- AddKernelConfig( 14, (const char*) "oclMinDelta" );
- AddKernelConfig( 15, (const char*) "oclSubDelta" );
- AddKernelConfig( 16, (const char*) "oclLUDecomposition" );
+ addKernelConfig( 1, (const char*) "oclFormulaMin" );
+ addKernelConfig( 2, (const char*) "oclFormulaMax" );
+ addKernelConfig( 3, (const char*) "oclFormulaSum" );
+ addKernelConfig( 4, (const char*) "oclFormulaCount" );
+ addKernelConfig( 5, (const char*) "oclFormulaAverage" );
+ addKernelConfig( 6, (const char*) "oclFormulaSumproduct" );
+ addKernelConfig( 7, (const char*) "oclFormulaMtxInv" );
+
+ addKernelConfig( 8, (const char*) "oclSignedAdd" );
+ addKernelConfig( 9, (const char*) "oclSignedSub" );
+ addKernelConfig( 10, (const char*) "oclSignedMul" );
+ addKernelConfig( 11, (const char*) "oclSignedDiv" );
+ addKernelConfig( 12, (const char*) "oclAverageDelta" );
+ addKernelConfig( 13, (const char*) "oclMaxDelta" );
+ addKernelConfig( 14, (const char*) "oclMinDelta" );
+ addKernelConfig( 15, (const char*) "oclSubDelta" );
+ addKernelConfig( 16, (const char*) "oclLUDecomposition" );
+ addKernelConfig( 17, (const char*) "oclAverageDeltaRPN" );
+ addKernelConfig( 18, (const char*) "oclMaxDeltaRPN" );
+ addKernelConfig( 19, (const char*) "oclMinDeltaRPN" );
+ addKernelConfig( 20, (const char*) "oclMoreColArithmeticOperator" );
+ addKernelConfig( 21, (const char*) "oclColumnH" );
+ addKernelConfig( 22, (const char*) "oclColumnL" );
+ addKernelConfig( 23, (const char*) "oclColumnN" );
+ addKernelConfig( 24, (const char*) "oclColumnJ" );
+ addKernelConfig( 25, (const char*) "oclMaxSub" );
+ addKernelConfig( 26, (const char*) "oclAverageSub" );
+ addKernelConfig( 27, (const char*) "oclMinSub" );
+ addKernelConfig( 28, (const char*) "oclMaxAdd" );
+ addKernelConfig( 29, (const char*) "oclAverageAdd" );
+ addKernelConfig( 30, (const char*) "oclMinAdd" );
+ addKernelConfig( 31, (const char*) "oclMaxMul" );
+ addKernelConfig( 32, (const char*) "oclAverageMul" );
+ addKernelConfig( 33, (const char*) "oclMinMul" );
+ addKernelConfig( 34, (const char*) "oclMaxDiv" );
+ addKernelConfig( 35, (const char*) "oclAverageDiv" );
+ addKernelConfig( 36, (const char*) "oclMinDiv" );
+ addKernelConfig( 37, (const char*) "oclSub" );// for svDouble type
return 0;
}
OpenclDevice::OpenclDevice()
{
- //InitEnv();
+ //initEnv();
}
OpenclDevice::~OpenclDevice()
{
- //ReleaseOpenclRunEnv();
+ //releaseOpenclRunEnv();
}
-int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
+int OpenclDevice::setKernelEnv( KernelEnv *envInfo )
{
envInfo->mpkContext = gpuEnv.mpContext;
envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
@@ -146,12 +167,12 @@ int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
return 1;
}
-int OpenclDevice::CheckKernelName( KernelEnv *envInfo, const char *kernelName )
+int OpenclDevice::checkKernelName( KernelEnv *envInfo, const char *kernelName )
{
- //printf("CheckKernelName,total count of kernels...%d\n", gpuEnv.kernelCount);
+ //printf("checkKernelName,total count of kernels...%d\n", gpuEnv.kernelCount);
int kCount;
int nFlag = 0;
- for ( kCount = 0; kCount < gpuEnv.mnKernelCount; kCount++ )
+ for ( kCount=0; kCount < gpuEnv.mnKernelCount; kCount++ )
{
if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[kCount]) == 0 )
{
@@ -174,7 +195,7 @@ int OpenclDevice::CheckKernelName( KernelEnv *envInfo, const char *kernelName )
return 1;
}
-int OpenclDevice::ConvertToString( const char *filename, char **source )
+int OpenclDevice::convertToString( const char *filename, char **source )
{
int file_size;
size_t result;
@@ -211,7 +232,7 @@ int OpenclDevice::ConvertToString( const char *filename, char **source )
return 0;
}
-int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
+int OpenclDevice::binaryGenerated( const char * clFileName, FILE ** fhandle )
{
unsigned int i = 0;
cl_int clStatus;
@@ -219,16 +240,28 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
char *str = NULL;
FILE *fd = NULL;
cl_uint numDevices=0;
- clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
- CL_DEVICE_TYPE_GPU, // device_type
- 0, // num_entries
- NULL, // devices ID
- &numDevices);
+ if ( getenv("SC_OPENCLCPU") )
+ {
+ clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
+ CL_DEVICE_TYPE_CPU, // device_type for CPU device
+ 0, // num_entries
+ NULL, // devices ID
+ &numDevices);
+ }
+ else
+ {
+ clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
+ CL_DEVICE_TYPE_GPU, // device_type for GPU device
+ 0, // num_entries
+ NULL, // devices ID
+ &numDevices);
+ }
+ CHECK_OPENCL( clStatus, "clGetDeviceIDs" );
for ( i = 0; i < numDevices; i++ )
{
+ char fileName[256] = { 0 }, cl_name[128] = { 0 };
if ( gpuEnv.mpArryDevsID[i] != 0 )
{
- char fileName[256] = { 0 }, cl_name[128] = { 0 };
char deviceName[1024];
clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL );
CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
@@ -248,7 +281,7 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
}
-int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
+int OpenclDevice::writeBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
{
FILE *output = NULL;
output = fopen( fileName, "wb" );
@@ -264,7 +297,7 @@ int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, s
}
-int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
+int OpenclDevice::generatBinFromKernelSource( cl_program program, const char * clFileName )
{
unsigned int i = 0;
cl_int clStatus;
@@ -307,12 +340,6 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
if ( binaries[i] == NULL )
{
- for ( unsigned int j = 0; j < i ; j++)
- {
- if (binaries[j])
- free(binaries[j]);
- }
- free(binaries);
return 0;
}
}
@@ -329,9 +356,10 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
/* dump out each binary into its own separate file. */
for ( i = 0; i < numDevices; i++ )
{
+ char fileName[256] = { 0 }, cl_name[128] = { 0 };
+
if ( binarySizes[i] != 0 )
{
- char fileName[256] = { 0 }, cl_name[128] = { 0 };
char deviceName[1024];
clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
sizeof(deviceName), deviceName, NULL);
@@ -342,7 +370,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
cl_name[str - clFileName] = '\0';
sprintf( fileName, "./%s-%s.bin", cl_name, deviceName );
- if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
+ if ( !writeBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
{
printf("opencl-wrapper: write binary[%s] failds\n", fileName);
return 0;
@@ -381,7 +409,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
return 1;
}
-int OpenclDevice::InitOpenclAttr( OpenCLEnv * env )
+int OpenclDevice::initOpenclAttr( OpenCLEnv * env )
{
if ( gpuEnv.mnIsUserCreated )
return 1;
@@ -396,7 +424,7 @@ int OpenclDevice::InitOpenclAttr( OpenCLEnv * env )
return 0;
}
-int OpenclDevice::CreateKernel( char * kernelname, KernelEnv * env )
+int OpenclDevice::createKernel( char * kernelname, KernelEnv * env )
{
int clStatus;
@@ -406,13 +434,13 @@ int OpenclDevice::CreateKernel( char * kernelname, KernelEnv * env )
return clStatus != CL_SUCCESS ? 1 : 0;
}
-int OpenclDevice::ReleaseKernel( KernelEnv * env )
+int OpenclDevice::releaseKernel( KernelEnv * env )
{
int clStatus = clReleaseKernel( env->mpkKernel );
return clStatus != CL_SUCCESS ? 1 : 0;
}
-int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
+int OpenclDevice::releaseOpenclEnv( GPUEnv *gpuInfo )
{
int i = 0;
int clStatus = 0;
@@ -447,18 +475,18 @@ int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
return 1;
}
-int OpenclDevice::RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata )
+int OpenclDevice::runKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata )
{
- printf("oclwrapper:RunKernel_wrapper...\n");
- if ( RegisterKernelWrapper( kernelName, function ) != 1 )
+ printf("oclwrapper:runKernel_wrapper...\n");
+ if ( registerKernelWrapper( kernelName, function ) != 1 )
{
- fprintf(stderr, "Error:RunKernel_wrapper:RegisterKernelWrapper fail!\n");
+ fprintf(stderr, "Error:runKernel_wrapper:registerKernelWrapper fail!\n");
return -1;
}
- return ( RunKernel( kernelName, usrdata ) );
+ return ( runKernel( kernelName, usrdata ) );
}
-int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
+int OpenclDevice::cachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
{
int i;
for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
@@ -475,19 +503,20 @@ int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * cl
return 0;
}
-int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
+int OpenclDevice::compileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
{
cl_int clStatus = 0;
size_t length;
- char *binary;
+ char *buildLog = NULL, *binary;
const char *source;
size_t source_size[1];
- int binary_status, binaryExisted, idx;
- cl_uint numDevices;
- FILE *fd;
+ int b_error, binary_status, binaryExisted, idx;
+ size_t numDevices;
+ cl_device_id *mpArryDevsID;
+ FILE *fd, *fd1;
const char* filename = "kernel.cl";
- fprintf(stderr, "CompileKernelFile ... \n");
- if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
+ fprintf(stderr, "compileKernelFile ... \n");
+ if ( cachedOfKernerPrg(gpuInfo, filename) == 1 )
{
return 1;
}
@@ -498,33 +527,31 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
source_size[0] = strlen( source );
binaryExisted = 0;
- if ( ( binaryExisted = BinaryGenerated( filename, &fd ) ) == 1 )
+ if ( ( binaryExisted = binaryGenerated( filename, &fd ) ) == 1 )
{
clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
sizeof(numDevices), &numDevices, NULL );
CHECK_OPENCL( clStatus, "clGetContextInfo" );
- cl_device_id *mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
+ mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
if ( mpArryDevsID == NULL )
{
return 0;
}
- int b_error = 0;
+ b_error = 0;
length = 0;
b_error |= fseek( fd, 0, SEEK_END ) < 0;
b_error |= ( length = ftell(fd) ) <= 0;
b_error |= fseek( fd, 0, SEEK_SET ) < 0;
if ( b_error )
{
- free(mpArryDevsID);
return 0;
}
binary = (char*) malloc( length + 2 );
if ( !binary )
{
- free(mpArryDevsID);
return 0;
}
@@ -537,12 +564,7 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
// grab the handles to all of the devices in the context.
clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES,
sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL );
- if (clStatus != CL_SUCCESS)
- {
- fprintf (stderr, "OpenCL error code is %d at " SAL_DETAIL_WHERE " when clGetContextInfo .\n", clStatus);
- free(binary);
- return 0;
- }
+ CHECK_OPENCL( clStatus, "clGetContextInfo" );
fprintf(stderr, "Create kernel from binary\n");
gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
@@ -600,7 +622,7 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
printf("opencl create build log fail\n");
return 0;
}
- char* buildLog = (char*) malloc( length );
+ buildLog = (char*) malloc( length );
if ( buildLog == (char*) NULL )
{
return 0;
@@ -618,11 +640,10 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
if ( clStatus != CL_SUCCESS )
{
printf("opencl program build info fail\n");
- free(buildLog);
return 0;
}
- FILE *fd1 = fopen( "kernel-build.log", "w+" );
+ fd1 = fopen( "kernel-build.log", "w+" );
if ( fd1 != NULL )
{
fwrite( buildLog, sizeof(char), length, fd1 );
@@ -636,17 +657,17 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
strcpy( gpuEnv.mArryKnelSrcFile[idx], filename );
if ( binaryExisted == 0 )
- GeneratBinFromKernelSource( gpuEnv.mpArryPrograms[idx], filename );
+ generatBinFromKernelSource( gpuEnv.mpArryPrograms[idx], filename );
gpuInfo->mnFileCount += 1;
return 1;
}
-int OpenclDevice::GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function)
+int OpenclDevice::getKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function)
{
int i;
- //printf("----------------OpenclDevice::GetKernelEnvAndFunc\n");
+ //printf("----------------OpenclDevice::getKernelEnvAndFunc\n");
for ( i = 0; i < gpuEnv.mnKernelCount; i++ )
{
if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[i]) == 0 )
@@ -662,14 +683,14 @@ int OpenclDevice::GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, c
return 0;
}
-int OpenclDevice::RunKernel( const char *kernelName, void **userdata)
+int OpenclDevice::runKernel( const char *kernelName, void **userdata)
{
KernelEnv kEnv;
cl_kernel_function function;
int status;
memset( &kEnv, 0, sizeof( KernelEnv ) );
- status = GetKernelEnvAndFunc( kernelName, &kEnv, &function );
+ status = getKernelEnvAndFunc( kernelName, &kEnv, &function );
strcpy( kEnv.mckKernelName, kernelName );
if ( status == 1 )
{
@@ -680,8 +701,9 @@ int OpenclDevice::RunKernel( const char *kernelName, void **userdata)
return 0;
}
-int OpenclDevice::InitOpenclRunEnv( int argc )
+int OpenclDevice::initOpenclRunEnv( int argc )
{
+ int status = 0;
if ( MAX_CLKERNEL_NUM <= 0 )
{
return 1;
@@ -691,9 +713,9 @@ int OpenclDevice::InitOpenclRunEnv( int argc )
if ( !isInited )
{
- RegistOpenclKernel();
+ registOpenclKernel();
//initialize devices, context, comand_queue
- int status = InitOpenclRunEnv( &gpuEnv );
+ status = initOpenclRunEnv( &gpuEnv );
if ( status )
{
printf("init_opencl_env failed.\n");
@@ -709,36 +731,38 @@ int OpenclDevice::InitOpenclRunEnv( int argc )
if( gpuEnv.mnKhrFp64Flag )
{
printf("----use khr double type in kernel----\n");
- status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double" );
+ status = compileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
}
else if( gpuEnv.mnAmdFp64Flag )
{
printf("----use amd double type in kernel----\n");
- status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double" );
+ status = compileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
}
else
{
printf("----use float type in kernel----\n");
- status = CompileKernelFile( &gpuEnv, "-Dfp_t=float" );
+ status = compileKernelFile( &gpuEnv, "-Dfp_t=float -Dfp_t4=float4 -Dfp_t16=float16" );
}
if ( status == 0 || gpuEnv.mnKernelCount == 0 )
{
- printf("CompileKernelFile failed.\n");
+ printf("compileKernelFile failed.\n");
return 1;
}
- printf("CompileKernelFile successed.\n");
+ printf("compileKernelFile successed.\n");
isInited = 1;
}
return 0;
}
-int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
+int OpenclDevice::initOpenclRunEnv( GPUEnv *gpuInfo )
{
size_t length;
cl_int clStatus;
cl_uint numPlatforms, numDevices;
cl_platform_id *platforms;
cl_context_properties cps[3];
+ char platformName[256];
+ unsigned int i;
// Have a look at the available platforms.
@@ -765,8 +789,7 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
return 1;
}
- char platformName[256];
- for ( size_t i = 0; i < numPlatforms; i++ )
+ for ( i = 0; i < numPlatforms; i++ )
{
clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
sizeof( platformName ), platformName, NULL );
@@ -781,13 +804,22 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
//if( !strcmp( platformName, "Advanced Micro Devices, Inc." ))
{
gpuInfo->mpPlatformID = platforms[i];
-
- clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
- CL_DEVICE_TYPE_GPU, // device_type
- 0, // num_entries
- NULL, // devices
- &numDevices);
-
+ if ( getenv("SC_OPENCLCPU") )
+ {
+ clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
+ CL_DEVICE_TYPE_CPU, // device_type for CPU device
+ 0, // num_entries
+ NULL, // devices
+ &numDevices);
+ }
+ else
+ {
+ clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
+ CL_DEVICE_TYPE_GPU, // device_type for GPU device
+ 0, // num_entries
+ NULL, // devices
+ &numDevices);
+ }
if ( clStatus != CL_SUCCESS )
continue;
@@ -806,8 +838,15 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
cps[0] = CL_CONTEXT_PLATFORM;
cps[1] = (cl_context_properties) gpuInfo->mpPlatformID;
cps[2] = 0;
- // Check for GPU.
- gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
+ // Set device type for OpenCL
+ if ( getenv("SC_OPENCLCPU") )
+ {
+ gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
+ }
+ else
+ {
+ gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
+ }
gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
@@ -845,7 +884,6 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
}
clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL );
- CHECK_OPENCL( clStatus, "GetCommandQueueInfo" );
// Check device extensions for double type
size_t aDevExtInfoSize = 0;
@@ -875,10 +913,10 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
return 0;
}
-int OpenclDevice::RegisterKernelWrapper( const char *kernelName, cl_kernel_function function )
+int OpenclDevice::registerKernelWrapper( const char *kernelName, cl_kernel_function function )
{
int i;
- //printf("oclwrapper:RegisterKernelWrapper...%d\n", gpuEnv.mnKernelCount);
+ //printf("oclwrapper:registerKernelWrapper...%d\n", gpuEnv.mnKernelCount);
for ( i = 0; i < gpuEnv.mnKernelCount; i++ )
{
if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[i]) == 0 )
@@ -890,13 +928,13 @@ int OpenclDevice::RegisterKernelWrapper( const char *kernelName, cl_kernel_funct
return 0;
}
-void OpenclDevice::SetOpenclState( int state )
+void OpenclDevice::setOpenclState( int state )
{
//printf("OpenclDevice::setOpenclState...\n");
isInited = state;
}
-int OpenclDevice::GetOpenclState()
+int OpenclDevice::getOpenclState()
{
return isInited;
}
@@ -906,54 +944,46 @@ OclCalc::OclCalc()
fprintf(stderr,"OclCalc:: init opencl ...\n");
nFormulaColSize = 0;
nFormulaRowSize = 0;
+ nArithmeticLen = 0;
+ nFormulaLen = 0;
+ mpClmemSrcData = NULL;
+ mpClmemStartPos = NULL;
+ mpClmemEndPos = NULL;
+ mpClmemLeftData = NULL;
+ mpClmemRightData = NULL;
+ mpClmemMergeLfData = NULL;
+ mpClmemMergeRtData = NULL;
+ mpClmemMatixSumSize = NULL;
+ mpClmemeOp = NULL;
}
OclCalc::~OclCalc()
{
- fprintf(stderr,"OclCalc:: opencl end ...\n");
+ releaseOclBuffer();
}
-
-/////////////////////////////////////////////////////////////////////////////
-int OclCalc::CreateBuffer64Bits( double *&dpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize )
+int OclCalc::releaseOclBuffer(void)
{
cl_int clStatus = 0;
- SetKernelEnv( &kEnv );
-
- mpClmemSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nBufferSize * sizeof(double), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
- mpClmemStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nBufferSize * sizeof(unsigned int), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
- mpClmemEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nBufferSize * sizeof(unsigned int), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
-
- dpSrcData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemSrcData, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
- nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus );
- CHECK_OPENCL(clStatus,"clEnqueueMapBuffer");
- clFinish(kEnv.mpkCmdQueue);
- npStartPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemStartPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
- nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
- clFinish( kEnv.mpkCmdQueue );
- npEndPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemEndPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
- nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus );
- CHECK_OPENCL( clStatus,"clEnqueueMapBuffer" );
- clFinish( kEnv.mpkCmdQueue );
- //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos);
- return 0;
+ CHECK_OPENCL_RELEASE( clStatus, mpClmemSrcData );
+ CHECK_OPENCL_RELEASE( clStatus, mpClmemStartPos );
+ CHECK_OPENCL_RELEASE( clStatus, mpClmemEndPos );
+ CHECK_OPENCL_RELEASE( clStatus, mpClmemLeftData );
+ CHECK_OPENCL_RELEASE( clStatus, mpClmemRightData );
+ fprintf(stderr,"OclCalc:: opencl end ...\n");
+ return 1;
}
-int OclCalc::CreateBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize )
+/////////////////////////////////////////////////////////////////////////////
+
+int OclCalc::createBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize )
{
cl_int clStatus = 0;
- SetKernelEnv( &kEnv );
+ setKernelEnv( &kEnv );
- mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+ mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ),
nBufferSize * sizeof(double), NULL, &clStatus );
CHECK_OPENCL( clStatus, "clCreateBuffer" );
- mpClmemRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+ mpClmemRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ),
nBufferSize * sizeof(double), NULL, &clStatus );
CHECK_OPENCL( clStatus,"clCreateBuffer" );
dpLeftData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue,mpClmemLeftData,CL_TRUE,CL_MAP_WRITE,0,
@@ -967,54 +997,169 @@ int OclCalc::CreateBuffer64Bits( double *&dpLeftData, double *&dpRightData, int
//printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos);
return 0;
}
-int OclCalc::CreateBuffer64Bits( double *&dpSumProMergeLfData, double *&dpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize )
+
+int OclCalc::mapAndCopy64Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize)
{
cl_int clStatus = 0;
- SetKernelEnv( &kEnv );
- mpClmemMergeLfData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nMatixSize * sizeof(double), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
- mpClmemMergeRtData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nMatixSize * sizeof(double), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
- mpClmemMatixSumSize = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nMatixSize * sizeof(unsigned int), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ double * dpSrcDataMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemSrcData, CL_TRUE,CL_MAP_WRITE, 0,
+ nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ unsigned int *npStartPosMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemStartPos, CL_TRUE,CL_MAP_WRITE, 0,
+ nRowsize * sizeof(uint), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ unsigned int *npEndPosMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemEndPos, CL_TRUE, CL_MAP_WRITE, 0,
+ nRowsize * sizeof(uint), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ for(int i=0;i<nBufferSize;i++)
+ dpSrcDataMap[i] = dpTempSrcData[i];
+ for(int i=0;i<nRowsize;i++)
+ {
+ npStartPosMap[i] = unStartPos[i];
+ npEndPosMap[i] = unEndPos[i];
+ }
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemSrcData, dpSrcDataMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemStartPos, npStartPosMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemEndPos, npEndPosMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ return 1;
+}
+int OclCalc::mapAndCopy64Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize )
+{
+ cl_int clStatus = 0;
+ double *dpLeftDataMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE,
+ 0, nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ double *dpRightDataMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemRightData, CL_TRUE, CL_MAP_WRITE,
+ 0, nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ for ( int i = 0; i < nBufferSize; i++ )
+ {
+ dpLeftDataMap[i] = dpTempLeftData[i];
+ dpRightDataMap[i] = dpTempRightData[i];
+ }
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, dpLeftDataMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemRightData, dpRightDataMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ return 1;
+}
- dpSumProMergeLfData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMergeLfData, CL_TRUE,
- CL_MAP_WRITE_INVALIDATE_REGION, 0, nMatixSize * sizeof(double),
- 0, NULL, NULL, &clStatus );
+int OclCalc::mapAndCopyArithmetic64Bits( const double *dpMoreColArithmetic, int nBufferSize )
+{
+ cl_int clStatus = 0;
+ double *dpLeftDataMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE,
+ 0, nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus );
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
- clFinish(kEnv.mpkCmdQueue);
- dpSumProMergeRtData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMergeRtData, CL_TRUE,
- CL_MAP_WRITE_INVALIDATE_REGION, 0, nMatixSize * sizeof(double),
- 0, NULL, NULL, &clStatus );
+ clFinish( kEnv.mpkCmdQueue );
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
clFinish( kEnv.mpkCmdQueue );
- npSumSize = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMatixSumSize, CL_TRUE,
- CL_MAP_WRITE_INVALIDATE_REGION, 0, nBufferSize * sizeof(uint),
- 0, NULL, NULL, &clStatus );
+ for ( int i = 0; i < nBufferSize; i++ )
+ {
+ dpLeftDataMap[i] = dpMoreColArithmetic[i];
+ }
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, dpLeftDataMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ return 1;
+}
+int OclCalc::mapAndCopyMoreColArithmetic64Bits( const double *dpMoreColArithmetic, int nBufferSize, uint *npeOp, uint neOpSize )
+{
+ cl_int clStatus = 0;
+ double *dpLeftDataMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE,
+ 0, nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus );
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
clFinish( kEnv.mpkCmdQueue );
- //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos);
+ uint *dpeOpMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemeOp, CL_TRUE, CL_MAP_WRITE,
+ 0, neOpSize * sizeof(uint), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ for ( int i = 0; i < nBufferSize; i++ )
+ {
+ dpLeftDataMap[i] = dpMoreColArithmetic[i];
+ }
+ for( uint i = 0; i<neOpSize; i++)
+ {
+ dpeOpMap[i] = npeOp[i];
+ }
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, dpLeftDataMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemeOp, dpeOpMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ return 1;
+}
+
+int OclCalc::createFormulaBuf64Bits( int nBufferSize, int rowSize )
+{
+ cl_int clStatus = 0;
+ setKernelEnv( &kEnv );
+ mpClmemSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ nBufferSize * sizeof(double), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ nFormulaLen = nBufferSize;
+ mpClmemStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ rowSize * sizeof(unsigned int), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ mpClmemEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ rowSize * sizeof(unsigned int), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
return 0;
}
-int OclCalc::OclHostArithmeticOperator64Bits( const char* aKernelName, double *fpLeftData, double *fpRightData, double *&rResult,int nRowSize )
+
+int OclCalc::createArithmeticOptBuf64Bits( int nBufferSize )
+{
+ cl_int clStatus = 0;
+ nArithmeticLen = nBufferSize;
+ setKernelEnv( &kEnv );
+ mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ nBufferSize * sizeof(double), NULL, &clStatus);
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ mpClmemRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ nBufferSize * sizeof(double), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ return 0;
+}
+
+int OclCalc::createMoreColArithmeticBuf64Bits( int nBufferSize, int neOpSize )
+{
+ cl_int clStatus = 0;
+ nArithmeticLen = nBufferSize;
+ setKernelEnv( &kEnv );
+ mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ nBufferSize * sizeof(double), NULL, &clStatus);
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ mpClmemeOp = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ neOpSize * sizeof(uint), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ return 0;
+
+}
+
+int OclCalc::oclHostArithmeticOperator64Bits( const char* aKernelName, double *&rResult,int nRowSize )
{
cl_int clStatus = 0;
size_t global_work_size[1];
- CheckKernelName( &kEnv, aKernelName );
+ checkKernelName( &kEnv, aKernelName );
kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus );
CHECK_OPENCL( clStatus, "clCreateKernel" );
clFinish( kEnv.mpkCmdQueue );
- clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, fpLeftData, 0, NULL, NULL );
- CHECK_OPENCL(clStatus,"clEnqueueUnmapMemObject");
- clFinish( kEnv.mpkCmdQueue );
- clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemRightData, fpRightData, 0, NULL, NULL );
- CHECK_OPENCL(clStatus,"clEnqueueUnmapMemObject");
- clFinish( kEnv.mpkCmdQueue );
-
cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE,
nRowSize * sizeof(double), NULL, &clStatus);
CHECK_OPENCL( clStatus, "clCreateBuffer" );
@@ -1032,9 +1177,11 @@ int OclCalc::OclHostArithmeticOperator64Bits( const char* aKernelName, double *f
CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
clFinish( kEnv.mpkCmdQueue );
- rResult = (double *) clEnqueueMapBuffer( kEnv.mpkCmdQueue, clResult, CL_TRUE,CL_MAP_READ,
+ double *dpOutPut = (double *) clEnqueueMapBuffer( kEnv.mpkCmdQueue, clResult, CL_TRUE,CL_MAP_READ,
0, nRowSize*sizeof(double), 0, NULL, NULL, &clStatus );
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ for ( int i = 0; i < nRowSize; i++ )
+ rResult[i] = dpOutPut[i];
clFinish( kEnv.mpkCmdQueue );
clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clResult, rResult, 0, NULL, NULL );
CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
@@ -1043,31 +1190,163 @@ int OclCalc::OclHostArithmeticOperator64Bits( const char* aKernelName, double *f
CHECK_OPENCL( clStatus, "clFinish" );
clStatus = clReleaseKernel( kEnv.mpkKernel );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
- clStatus = clReleaseMemObject( mpClmemLeftData );
- CHECK_OPENCL( clStatus, "clReleaseMemObject");
- clStatus = clReleaseMemObject( mpClmemRightData );
+ clStatus = clReleaseMemObject( clResult );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ return 0;
+}
+
+int OclCalc::oclMoreColHostArithmeticOperator64Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize )
+{
+ cl_int clStatus = 0;
+ size_t global_work_size[1];
+ const char *aKernelName = "oclMoreColArithmeticOperator";
+ checkKernelName( &kEnv,aKernelName );
+ kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateKernel" );
+ cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nRowSize * sizeof(double), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemLeftData );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_int), (void *)&nDataSize );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&mpClmemeOp );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(cl_int), (void *)&neOpSize );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&clResult );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ global_work_size[0] = nRowSize;
+ clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
+ NULL, global_work_size, NULL, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
+ clFinish( kEnv.mpkCmdQueue );
+ double * hostMapResult = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clResult, CL_TRUE, CL_MAP_READ, 0,
+ nRowSize*sizeof(double), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ for ( int i = 0; i < nRowSize; i++)
+ rResult[i] = hostMapResult[i]; // from gpu float type to cpu double type
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clResult, hostMapResult, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clStatus = clFinish(kEnv.mpkCmdQueue );
+ CHECK_OPENCL( clStatus, "clFinish" );
+ clStatus = clReleaseKernel(kEnv.mpkKernel );
+ CHECK_OPENCL( clStatus, "clReleaseKernel" );
+ clStatus = clReleaseMemObject( clResult );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ return 0;
+
+}
+
+int OclCalc::oclHostArithmeticStash64Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize )
+{
+ cl_int clStatus = 0;
+ size_t global_work_size[1];
+ setKernelEnv( &kEnv );
+ checkKernelName( &kEnv, aKernelName );
+
+ cl_mem clLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR),
+ nRowSize * sizeof(double), (void *)dpLeftData, &clStatus);
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ cl_mem clRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR),
+ nRowSize * sizeof(double), (void *)dpRightData, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+
+ cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE,
+ nRowSize * sizeof(double), NULL, &clStatus);
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+
+ kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateKernel" );
+ clFinish( kEnv.mpkCmdQueue );
+
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&clLeftData );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&clRightData );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&clResult );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+
+ global_work_size[0] = nRowSize;
+ clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
+ NULL, global_work_size, NULL, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
+ clFinish( kEnv.mpkCmdQueue );
+
+ clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, clResult, CL_TRUE, 0, nRowSize * sizeof(double), (double *)rResult, 0, NULL, NULL);
+ CHECK_OPENCL( clStatus, "clEnqueueReadBuffer" );
+
+ clStatus = clFinish( kEnv.mpkCmdQueue );
+ CHECK_OPENCL( clStatus, "clFinish" );
+ clStatus = clReleaseKernel( kEnv.mpkKernel );
+ CHECK_OPENCL( clStatus, "clReleaseKernel" );
clStatus = clReleaseMemObject( clResult );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ clStatus = clReleaseMemObject( clLeftData );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ clStatus = clReleaseMemObject( clRightData );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
return 0;
}
-int OclCalc::OclHostFormulaStatistics64Bits( const char* aKernelName, double *fpSrcData, uint *npStartPos, uint *npEndPos, double *&output, int size )
+int OclCalc::oclHostFormulaStash64Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size )
{
cl_int clStatus = 0;
size_t global_work_size[1];
- CheckKernelName( &kEnv, aKernelName );
+ setKernelEnv( &kEnv );
+ checkKernelName( &kEnv, aKernelName );
+ cl_mem clSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR),
+ nBufferSize * sizeof(double), (void *)dpSrcData, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ cl_mem clStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR),
+ size * sizeof(unsigned int), (void *)nStartPos, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ cl_mem clEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR),
+ size * sizeof(unsigned int), (void *)nEndPos, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+
kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus );
CHECK_OPENCL( clStatus, "clCreateKernel" );
- clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemSrcData, fpSrcData, 0, NULL, NULL );
- CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ cl_mem outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, size * sizeof(double), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem),(void *)&clSrcData);
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&clStartPos );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&clEndPos );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(cl_mem), (void *)&outputCl );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ global_work_size[0] = size;
+ clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
clFinish( kEnv.mpkCmdQueue );
- clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemStartPos, npStartPos, 0, NULL, NULL );
- CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
- clFinish(kEnv.mpkCmdQueue);
- clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemEndPos, npEndPos, 0, NULL, NULL );
- CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject");
+
+ clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, outputCl, CL_TRUE, 0, size * sizeof(double), (double *)output, 0, NULL, NULL);
+ CHECK_OPENCL( clStatus, "clReadBuffer" );
clFinish( kEnv.mpkCmdQueue );
+ clStatus = clFinish(kEnv.mpkCmdQueue );
+ CHECK_OPENCL( clStatus, "clFinish" );
+ clStatus = clReleaseKernel( kEnv.mpkKernel );
+ CHECK_OPENCL( clStatus, "clReleaseKernel" );
+ clStatus = clReleaseMemObject( outputCl );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ clStatus = clReleaseMemObject( clSrcData );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ clStatus = clReleaseMemObject( clStartPos );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ clStatus = clReleaseMemObject( clEndPos );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ return 0;
+}
+
+int OclCalc::oclHostFormulaStatistics64Bits( const char* aKernelName, double *&output, int size )
+{
+ cl_int clStatus = 0;
+ size_t global_work_size[1];
+ checkKernelName( &kEnv, aKernelName );
+ kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateKernel" );
cl_mem outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, size * sizeof(double), NULL, &clStatus );
CHECK_OPENCL( clStatus, "clCreateBuffer" );
clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem),(void *)&mpClmemSrcData);
@@ -1082,8 +1361,14 @@ int OclCalc::OclHostFormulaStatistics64Bits( const char* aKernelName, double *fp
clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL );
CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
clFinish( kEnv.mpkCmdQueue );
- output = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ,
- 0, size*sizeof(double), 0, NULL, NULL, &clStatus );
+ double *dpOutPut = (double *) clEnqueueMapBuffer( kEnv.mpkCmdQueue, outputCl, CL_TRUE,CL_MAP_READ,
+ 0, size*sizeof(double), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ for ( int i = 0; i < size; i++ )
+ {
+ output[i] = dpOutPut[i];
+ }
+
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
clFinish( kEnv.mpkCmdQueue );
clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, outputCl, output, 0, NULL, NULL );
@@ -1092,21 +1377,15 @@ int OclCalc::OclHostFormulaStatistics64Bits( const char* aKernelName, double *fp
CHECK_OPENCL( clStatus, "clFinish" );
clStatus = clReleaseKernel( kEnv.mpkKernel );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
- clStatus = clReleaseMemObject(mpClmemSrcData );
- CHECK_OPENCL( clStatus, "clReleaseMemObject" );
- clStatus = clReleaseMemObject(mpClmemStartPos );
- CHECK_OPENCL( clStatus, "clReleaseMemObject" );
- clStatus = clReleaseMemObject( mpClmemEndPos );
- CHECK_OPENCL( clStatus, "clReleaseMemObject" );
clStatus = clReleaseMemObject( outputCl );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
return 0;
}
-int OclCalc::OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize )
+int OclCalc::oclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize )
{
const char *cpKernelName = "oclFormulaCount";
- CheckKernelName( &kEnv, cpKernelName );
+ checkKernelName( &kEnv, cpKernelName );
cl_int clStatus;
size_t global_work_size[1];
kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus );
@@ -1141,12 +1420,6 @@ int OclCalc::OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double
CHECK_OPENCL( clStatus, "clFinish" );
clStatus = clReleaseKernel( kEnv.mpkKernel );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
- clStatus = clReleaseMemObject( mpClmemSrcData );
- CHECK_OPENCL( clStatus, "clReleaseMemObject" );
- clStatus = clReleaseMemObject( mpClmemStartPos );
- CHECK_OPENCL( clStatus, "clReleaseMemObject" );
- clStatus = clReleaseMemObject( mpClmemEndPos );
- CHECK_OPENCL( clStatus, "clReleaseMemObject" );
clStatus = clReleaseMemObject( clpOutput );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
return 0;
@@ -1157,14 +1430,14 @@ int OclCalc::OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double
*save the npStart array eg:a4-a8;b10-b14,the npStart will store a4,b10,and the npEnd will store a8,b14 range.So it can if(i +1)%2 to judge
* the a cloumn or b cloumn npStart range.so as b bolumn.
*/
-int OclCalc::OclHostFormulaSumProduct64Bits( double *dpSumProMergeLfData, double *dpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize )
+int OclCalc::oclHostFormulaSumProduct64Bits( double *dpSumProMergeLfData, double *dpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize )
{
cl_int clStatus;
size_t global_work_size[1];
memset(dpOutput,0,nSize);
const char *cpFirstKernelName = "oclSignedMul";
const char *cpSecondKernelName = "oclFormulaSumproduct";
- CheckKernelName( &kEnv, cpFirstKernelName );
+ checkKernelName( &kEnv, cpFirstKernelName );
kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kEnv.mckKernelName,&clStatus);
CHECK_OPENCL( clStatus, "clCreateKernel" );
clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemMergeLfData, dpSumProMergeLfData, 0, NULL, NULL );
@@ -1197,7 +1470,7 @@ int OclCalc::OclHostFormulaSumProduct64Bits( double *dpSumProMergeLfData, double
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
clStatus = clReleaseKernel( kEnv.mpkKernel );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
- CheckKernelName( &kEnv, cpSecondKernelName );
+ checkKernelName( &kEnv, cpSecondKernelName );
kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus );
CHECK_OPENCL( clStatus, "clCreateKernel" );
cl_mem clpOutput = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nSize* sizeof(double), NULL, &clStatus );
@@ -1220,11 +1493,13 @@ int OclCalc::OclHostFormulaSumProduct64Bits( double *dpSumProMergeLfData, double
NULL, global_work_size, NULL, 0, NULL, NULL);
CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
clFinish( kEnv.mpkCmdQueue );
- dpOutput = (double *)clEnqueueMapBuffer(kEnv.mpkCmdQueue, clpOutput, CL_TRUE, CL_MAP_READ, 0,
- nSize*sizeof(double), 0, NULL, NULL, &clStatus );
+ double * outputMap = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clpOutput, CL_TRUE, CL_MAP_READ,
+ 0, nSize*sizeof(double), 0, NULL, NULL, &clStatus );
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
clFinish( kEnv.mpkCmdQueue );
- clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpOutput, dpOutput, 0, NULL, NULL );
+ for ( int i = 0; i < nSize; i++ )
+ dpOutput[i] = outputMap[i];
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpOutput, outputMap, 0, NULL, NULL );
CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
clStatus = clFinish( kEnv.mpkCmdQueue );
CHECK_OPENCL( clStatus, "clFinish" );
@@ -1239,46 +1514,67 @@ int OclCalc::OclHostFormulaSumProduct64Bits( double *dpSumProMergeLfData, double
return 0;
}
-int OclCalc::CreateBuffer32Bits( float *&fpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize )
+int OclCalc::createMoreColArithmeticBuf32Bits( int nBufferSize, int neOpSize )
{
cl_int clStatus = 0;
- SetKernelEnv( &kEnv );
- mpClmemSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nBufferSize * sizeof(float), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
- mpClmemStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nBufferSize * sizeof(unsigned int), NULL, &clStatus );
+ nArithmeticLen = nBufferSize;
+ setKernelEnv( &kEnv );
+ mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ nBufferSize * sizeof(float), NULL, &clStatus);
CHECK_OPENCL( clStatus, "clCreateBuffer" );
- mpClmemEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nBufferSize * sizeof(unsigned int), NULL, &clStatus );
+ mpClmemeOp = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ neOpSize * sizeof(uint), NULL, &clStatus );
CHECK_OPENCL( clStatus, "clCreateBuffer" );
- fpSrcData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemSrcData, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
- nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
clFinish( kEnv.mpkCmdQueue );
- npStartPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemStartPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
- nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ return 0;
+}
+
+int OclCalc::createArithmeticOptBuf32Bits( int nBufferSize )
+{
+ cl_int clStatus = 0;
+ setKernelEnv( &kEnv );
+ nArithmeticLen = nBufferSize;
+ mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ nBufferSize * sizeof(float), NULL, &clStatus);
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ mpClmemRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ nBufferSize * sizeof(float), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
clFinish( kEnv.mpkCmdQueue );
- npEndPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemEndPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
- nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ return 0;
+}
+
+int OclCalc::createFormulaBuf32Bits( int nBufferSize, int rowSize )
+{
+ cl_int clStatus = 0;
+ setKernelEnv( &kEnv );
+ nFormulaLen = nBufferSize;
+
+ mpClmemSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ nBufferSize * sizeof(float), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+
+ mpClmemStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ rowSize * sizeof(unsigned int), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ mpClmemEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ rowSize * sizeof(unsigned int), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
clFinish( kEnv.mpkCmdQueue );
- //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos);
return 0;
}
-int OclCalc::CreateBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize )
+int OclCalc::createBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize )
{
cl_int clStatus = 0;
- SetKernelEnv( &kEnv );
+ setKernelEnv( &kEnv );
mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
nBufferSize * sizeof(float), NULL, &clStatus);
CHECK_OPENCL( clStatus, "clCreateBuffer" );
mpClmemRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
nBufferSize * sizeof(unsigned int), NULL, &clStatus );
CHECK_OPENCL( clStatus, "clCreateBuffer" );
- fpLeftData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
+ fpLeftData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE,CL_MAP_WRITE_INVALIDATE_REGION,
0, nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus );
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
clFinish( kEnv.mpkCmdQueue );
@@ -1289,48 +1585,120 @@ int OclCalc::CreateBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nB
//printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos);
return 0;
}
-int OclCalc::CreateBuffer32Bits( float *&fpSumProMergeLfData, float *&fpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize )
+
+int OclCalc::mapAndCopy32Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize)
{
cl_int clStatus = 0;
- SetKernelEnv( &kEnv );
- mpClmemMergeLfData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nMatixSize * sizeof(float), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
- mpClmemMergeRtData = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nMatixSize * sizeof(float), NULL, &clStatus);
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
- mpClmemMatixSumSize = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nMatixSize * sizeof(unsigned int), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
- fpSumProMergeLfData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMergeLfData, CL_TRUE,
- CL_MAP_WRITE_INVALIDATE_REGION, 0, nMatixSize * sizeof(float), 0, NULL, NULL, &clStatus );
+ float *fpSrcData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemSrcData, CL_TRUE, CL_MAP_WRITE, 0,
+ nBufferSize * sizeof(float) , 0, NULL, NULL, &clStatus );
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
clFinish( kEnv.mpkCmdQueue );
- fpSumProMergeRtData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMergeRtData, CL_TRUE,
- CL_MAP_WRITE_INVALIDATE_REGION, 0, nMatixSize * sizeof(float), 0, NULL, NULL, &clStatus );
+ unsigned int *npStartPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemStartPos, CL_TRUE, CL_MAP_WRITE, 0,
+ nRowsize * sizeof(uint), 0, NULL, NULL, &clStatus );
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
clFinish( kEnv.mpkCmdQueue );
- npSumSize = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemMatixSumSize, CL_TRUE,
- CL_MAP_WRITE_INVALIDATE_REGION, 0, nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus );
+ unsigned int *npEndPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemEndPos, CL_TRUE, CL_MAP_WRITE, 0,
+ nRowsize * sizeof(uint), 0, NULL, NULL, &clStatus );
CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
clFinish( kEnv.mpkCmdQueue );
- //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos);
- return 0;
+ for(int i=0;i<nBufferSize;i++)
+ {
+ fpSrcData[i] = (float)dpTempSrcData[i];
+ }
+ for(int i=0;i<nRowsize;i++)
+ {
+ npStartPos[i] = unStartPos[i];
+ npEndPos[i] = unEndPos[i];
+ }
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemSrcData, fpSrcData, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemStartPos, npStartPos, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemEndPos, npEndPos, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ return 1;
}
-int OclCalc::OclHostArithmeticOperator32Bits( const char* aKernelName, float *fpLeftData, float *fpRightData, double *rResult, int nRowSize )
+int OclCalc::mapAndCopy32Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize )
{
cl_int clStatus = 0;
- size_t global_work_size[1];
- CheckKernelName( &kEnv,aKernelName );
- kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateKernel" );
+ float *fpLeftData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE,
+ 0, nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ float *fpRightData = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemRightData, CL_TRUE, CL_MAP_WRITE,
+ 0, nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ for(int i=0;i<nBufferSize;i++)
+ {
+ fpLeftData[i] = (float)dpTempLeftData[i];
+ fpRightData[i] = (float)dpTempRightData[i];
+ }
clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, fpLeftData, 0, NULL, NULL );
CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
clFinish( kEnv.mpkCmdQueue );
clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemRightData, fpRightData, 0, NULL, NULL );
CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
clFinish( kEnv.mpkCmdQueue );
+ return 1;
+}
+int OclCalc::mapAndCopyArithmetic32Bits( const double *dpMoreColArithmetic, int nBufferSize )
+{
+ cl_int clStatus = 0;
+ float *dpLeftDataMap = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE,
+ 0, nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ for ( int i = 0; i < nBufferSize; i++ )
+ {
+ dpLeftDataMap[i] = (float)dpMoreColArithmetic[i];
+ }
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, dpLeftDataMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ return 1;
+}
+int OclCalc::mapAndCopyMoreColArithmetic32Bits( const double *dpMoreColArithmetic, int nBufferSize, uint *npeOp, uint neOpSize )
+{
+ cl_int clStatus = 0;
+ float *fpLeftDataMap = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemLeftData, CL_TRUE, CL_MAP_WRITE,
+ 0, nBufferSize * sizeof(float), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ uint *dpeOpMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemeOp, CL_TRUE, CL_MAP_WRITE,
+ 0, neOpSize * sizeof(uint), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ for ( int i = 0; i < nBufferSize; i++ )
+ {
+ fpLeftDataMap[i] = (float)dpMoreColArithmetic[i];
+ }
+ for( uint i = 0; i<neOpSize; i++ )
+ {
+ dpeOpMap[i] = npeOp[i];
+ }
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemLeftData, fpLeftDataMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemeOp, dpeOpMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ return 1;
+}
+
+int OclCalc::oclHostArithmeticOperator32Bits( const char* aKernelName,double *rResult, int nRowSize )
+{
+ cl_int clStatus = 0;
+ size_t global_work_size[1];
+
+ checkKernelName( &kEnv,aKernelName );
+ kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateKernel" );
+
cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nRowSize * sizeof(float), NULL, &clStatus );
CHECK_OPENCL( clStatus, "clCreateBuffer" );
clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemLeftData );
@@ -1356,31 +1724,61 @@ int OclCalc::OclHostArithmeticOperator32Bits( const char* aKernelName, float *fp
CHECK_OPENCL( clStatus, "clFinish" );
clStatus = clReleaseKernel(kEnv.mpkKernel );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
- clStatus = clReleaseMemObject(mpClmemLeftData );
- CHECK_OPENCL( clStatus, "clReleaseMemObject" );
- clStatus = clReleaseMemObject( mpClmemRightData );
- CHECK_OPENCL( clStatus, "clReleaseMemObject" );
clStatus = clReleaseMemObject( clResult );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
return 0;
}
-int OclCalc::OclHostFormulaStatistics32Bits(const char* aKernelName,float *fpSrcData,uint *npStartPos,uint *npEndPos,double *output,int size)
+int OclCalc::oclMoreColHostArithmeticOperator32Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize )
{
cl_int clStatus = 0;
size_t global_work_size[1];
- CheckKernelName( &kEnv, aKernelName );
- kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus );
+ const char *aKernelName = "oclMoreColArithmeticOperator";
+ checkKernelName( &kEnv,aKernelName );
+ kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus );
CHECK_OPENCL( clStatus, "clCreateKernel" );
- clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemSrcData, fpSrcData, 0, NULL, NULL );
- CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nRowSize * sizeof(float), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemLeftData );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_int), (void *)&nDataSize );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&mpClmemeOp );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(cl_int), (void *)&neOpSize );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&clResult );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ global_work_size[0] = nRowSize;
+ clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
+ NULL, global_work_size, NULL, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
clFinish( kEnv.mpkCmdQueue );
- clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemStartPos, npStartPos, 0, NULL, NULL );
- CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ float * hostMapResult = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clResult, CL_TRUE, CL_MAP_READ, 0,
+ nRowSize*sizeof(float), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
clFinish( kEnv.mpkCmdQueue );
- clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemEndPos, npEndPos, 0, NULL, NULL );
+ for ( int i = 0; i < nRowSize; i++)
+ rResult[i] = hostMapResult[i]; // from gpu float type to cpu double type
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clResult, hostMapResult, 0, NULL, NULL );
CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
- clFinish( kEnv.mpkCmdQueue );
+ clStatus = clFinish(kEnv.mpkCmdQueue );
+ CHECK_OPENCL( clStatus, "clFinish" );
+ clStatus = clReleaseKernel(kEnv.mpkKernel );
+ CHECK_OPENCL( clStatus, "clReleaseKernel" );
+ clStatus = clReleaseMemObject( clResult );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ return 0;
+}
+
+int OclCalc::oclHostFormulaStatistics32Bits(const char* aKernelName,double *output,int size)
+{
+ cl_int clStatus = 0;
+ size_t global_work_size[1];
+ checkKernelName( &kEnv, aKernelName );
+ kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateKernel" );
+
cl_mem outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, size * sizeof(float), NULL, &clStatus );
CHECK_OPENCL( clStatus, "clCreateBuffer" );
clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemSrcData );
@@ -1407,21 +1805,141 @@ int OclCalc::OclHostFormulaStatistics32Bits(const char* aKernelName,float *fpSrc
CHECK_OPENCL( clStatus, "clFinish" );
clStatus = clReleaseKernel( kEnv.mpkKernel );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
- clStatus = clReleaseMemObject( mpClmemSrcData );
+ clStatus = clReleaseMemObject( outputCl );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
- clStatus = clReleaseMemObject( mpClmemStartPos );
+ return 0;
+}
+
+int OclCalc::oclHostArithmeticStash32Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize )
+{
+ cl_int clStatus = 0;
+ size_t global_work_size[1];
+ setKernelEnv( &kEnv );
+ checkKernelName( &kEnv, aKernelName );
+ float *fpLeftData = (float *)malloc( sizeof(float) * nRowSize );
+ float *fpRightData = (float *)malloc( sizeof(float) * nRowSize );
+ float *fpResult = (float *)malloc( sizeof(float) * nRowSize );
+ for(int i=0;i<nRowSize;i++)
+ {
+ fpLeftData[i] = (float)dpLeftData[i];
+ fpRightData[i] = (float)dpRightData[i];
+ }
+ cl_mem clLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR),
+ nRowSize * sizeof(float), (void *)fpLeftData, &clStatus);
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ cl_mem clRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR),
+ nRowSize * sizeof(float), (void *)fpRightData, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+
+ cl_mem clResult = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE,
+ nRowSize * sizeof(float), NULL, &clStatus);
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+
+ kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateKernel" );
+ clFinish( kEnv.mpkCmdQueue );
+
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&clLeftData );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&clRightData );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&clResult );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+
+ global_work_size[0] = nRowSize;
+ clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
+ NULL, global_work_size, NULL, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
+ clFinish( kEnv.mpkCmdQueue );
+
+ clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, clResult, CL_TRUE, 0, nRowSize * sizeof(float), (float *)fpResult, 0, NULL, NULL);
+ CHECK_OPENCL( clStatus, "clEnqueueReadBuffer" );
+ for(int i=0;i<nRowSize;i++)
+ rResult[i] = (double)fpResult[i];
+ if(fpResult)
+ {
+ free(fpResult);
+ fpResult = NULL;
+ }
+ clStatus = clFinish( kEnv.mpkCmdQueue );
+ CHECK_OPENCL( clStatus, "clFinish" );
+ clStatus = clReleaseKernel( kEnv.mpkKernel );
+ CHECK_OPENCL( clStatus, "clReleaseKernel" );
+ clStatus = clReleaseMemObject( clResult );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
- clStatus = clReleaseMemObject( mpClmemEndPos );
+ clStatus = clReleaseMemObject( clLeftData );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ clStatus = clReleaseMemObject( clRightData );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ return 0;
+}
+
+int OclCalc::oclHostFormulaStash32Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size )
+{
+ cl_int clStatus = 0;
+ size_t global_work_size[1];
+ setKernelEnv( &kEnv );
+ checkKernelName( &kEnv, aKernelName );
+ float *fpSrcData = (float *)malloc( sizeof(float) * nBufferSize );
+ float *fpResult = (float *)malloc( sizeof(float) * size );
+ for(int i=0;i<nBufferSize;i++)
+ fpSrcData[i] = (float)dpSrcData[i];
+ cl_mem clSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_COPY_HOST_PTR),
+ nBufferSize * sizeof(float), (void *)fpSrcData, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ cl_mem clStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_COPY_HOST_PTR),
+ size * sizeof(unsigned int), (void *)nStartPos, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ cl_mem clEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_COPY_HOST_PTR),
+ size * sizeof(unsigned int), (void *)nEndPos, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+
+ kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateKernel" );
+ cl_mem outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, size * sizeof(double), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem),(void *)&clSrcData);
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&clStartPos );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&clEndPos );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(cl_mem), (void *)&outputCl );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ global_work_size[0] = size;
+ clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
+ clFinish( kEnv.mpkCmdQueue );
+
+ clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue, outputCl, CL_TRUE, 0, size * sizeof(float), (double *)fpResult, 0, NULL, NULL);
+ CHECK_OPENCL( clStatus, "clReadBuffer" );
+ for(int i = 0;i<size;i++)
+ output[i] = (float)fpResult[i];
+ clFinish( kEnv.mpkCmdQueue );
+ if(fpResult)
+ {
+ free(fpResult);
+ fpResult = NULL;
+ }
+ clStatus = clFinish(kEnv.mpkCmdQueue );
+ CHECK_OPENCL( clStatus, "clFinish" );
+ clStatus = clReleaseKernel( kEnv.mpkKernel );
+ CHECK_OPENCL( clStatus, "clReleaseKernel" );
clStatus = clReleaseMemObject( outputCl );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ clStatus = clReleaseMemObject( clSrcData );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ clStatus = clReleaseMemObject( clStartPos );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ clStatus = clReleaseMemObject( clEndPos );
+ CHECK_OPENCL( clStatus, "clReleaseMemObject" );
return 0;
}
-int OclCalc::OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize )
+int OclCalc::oclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize )
{
const char *cpKernelName = "oclFormulaCount";
- CheckKernelName( &kEnv, cpKernelName );
+ checkKernelName( &kEnv, cpKernelName );
cl_int clStatus;
size_t global_work_size[1];
kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus );
@@ -1470,14 +1988,14 @@ int OclCalc::OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double
}
//sumproduct
-int OclCalc::OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize )
+int OclCalc::oclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize )
{
cl_int clStatus;
size_t global_work_size[1];
memset(dpOutput,0,nSize);
const char *cpFirstKernelName = "oclSignedMul";
const char *cpSecondKernelName = "oclFormulaSumproduct";
- CheckKernelName( &kEnv, cpFirstKernelName );
+ checkKernelName( &kEnv, cpFirstKernelName );
kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus );
CHECK_OPENCL( clStatus, "clCreateKernel" );
clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, mpClmemMergeLfData, fpSumProMergeLfData, 0, NULL, NULL );
@@ -1508,7 +2026,7 @@ int OclCalc::OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
clStatus = clReleaseKernel( kEnv.mpkKernel );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
- CheckKernelName( &kEnv,cpSecondKernelName );
+ checkKernelName( &kEnv,cpSecondKernelName );
kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kEnv.mckKernelName, &clStatus );
CHECK_OPENCL( clStatus, "clCreateKernel" );
cl_mem clpOutput = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nSize* sizeof(float), NULL, &clStatus );
@@ -1556,12 +2074,13 @@ int OclCalc::OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *
static cl_mem allocateDoubleBuffer( KernelEnv &rEnv, const double *_pValues, size_t nElements, cl_int *pStatus )
{
// Ugh - horrible redundant copying ...
- cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
nElements * sizeof(double), NULL, pStatus);
double *pValues = (double *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
nElements * sizeof(double), 0, NULL, NULL, NULL);
clFinish(rEnv.mpkCmdQueue);
- memcpy( pValues, _pValues, nElements*sizeof(double) );
+ for ( int i = 0; i < (int)nElements; i++ )
+ pValues[i] = _pValues[i];
clEnqueueUnmapMemObject( rEnv.mpkCmdQueue, xValues, pValues, 0, NULL, NULL );
clFinish( rEnv.mpkCmdQueue );
return xValues;
@@ -1570,7 +2089,7 @@ static cl_mem allocateDoubleBuffer( KernelEnv &rEnv, const double *_pValues, siz
static cl_mem allocateFloatBuffer( KernelEnv &rEnv, const double *_pValues, size_t nElements, cl_int *pStatus )
{
// Ugh - horrible redundant copying ...
- cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+ cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
nElements * sizeof(float), NULL, pStatus);
float *pValues = (float *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
nElements * sizeof(float), 0, NULL, NULL, NULL );
@@ -1583,9 +2102,185 @@ static cl_mem allocateFloatBuffer( KernelEnv &rEnv, const double *_pValues, size
return xValues;
}
-double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double del )
+int OclCalc::oclGroundWaterGroup( uint *eOp, uint eOpNum, const double *pOpArray, const double *pSubtractSingle, size_t nSrcDataSize,size_t nElements, double del ,uint *nStartPos,uint *nEndPos,double *dpResult)
+{
+ setKernelEnv( &kEnv );
+
+ char kernelName[256] = "";
+ double delta = del;
+ bool subFlag = false;
+ strcat(kernelName,"ocl");
+ for ( size_t i = 0; i < eOpNum; i++ )
+ {
+ switch ( eOp[i] )
+ {
+ case ocAdd:
+ strcat(kernelName,"Add");
+ break;
+ case ocSub:
+ strcat(kernelName,"Sub");
+ break;
+ case ocMul:
+ strcat(kernelName,"Mul");
+ break;
+ case ocDiv:
+ strcat(kernelName,"Div");
+ break;
+ case ocMax:
+ strcat(kernelName,"Max");
+ break;
+ case ocMin:
+ strcat(kernelName,"Min");
+ break;
+ case ocAverage:
+ strcat(kernelName,"Average");
+ break;
+ default:
+ assert( false );
+ break;
+ }
+ }
+ checkKernelName( &kEnv, kernelName );
+ cl_int clStatus;
+ size_t global_work_size[1];
+ if ( ( eOpNum == 1 ) && ( eOp[0] == ocSub ) )
+ subFlag = true;
+
+ kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, kernelName, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateKernel" );
+
+ cl_mem valuesCl = NULL, subtractCl = NULL, outputCl = NULL, startPosCL = NULL, endPosCL = NULL;
+
+ if(!subFlag)
+ {
+ startPosCL = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+ nElements * sizeof(unsigned int), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ endPosCL = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+ nElements * sizeof(unsigned int), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ unsigned int *npStartPosMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, startPosCL, CL_TRUE, CL_MAP_WRITE, 0,
+ nElements * sizeof(uint), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ unsigned int *npEndPosMap = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, endPosCL, CL_TRUE, CL_MAP_WRITE, 0,
+ nElements * sizeof(uint), 0, NULL, NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+
+ for(uint i=0;i<nElements;i++)
+ {
+ npStartPosMap[i]=nStartPos[i];
+ npEndPosMap[i]=nEndPos[i];
+ }
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, startPosCL, npStartPosMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+ clFinish( kEnv.mpkCmdQueue );
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, endPosCL, npEndPosMap, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
+
+ if ( gpuEnv.mnKhrFp64Flag || gpuEnv.mnAmdFp64Flag )
+ {
+ valuesCl = allocateDoubleBuffer( kEnv, pOpArray, nSrcDataSize, &clStatus );
+ subtractCl = allocateDoubleBuffer( kEnv, pSubtractSingle, nElements, &clStatus );
+ outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nElements * sizeof(double), NULL, &clStatus );
+ }
+ else
+ {
+ valuesCl = allocateFloatBuffer( kEnv, pOpArray, nSrcDataSize, &clStatus );
+ subtractCl = allocateFloatBuffer( kEnv, pSubtractSingle, nElements, &clStatus );
+ outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE , nElements * sizeof(float), NULL, &clStatus);
+ }
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
+
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl );
+ CHECK_OPENCL( clStatus, "clSetKernelArg");
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&subtractCl );
+ CHECK_OPENCL( clStatus, "clSetKernelArg");
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&startPosCL );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(cl_mem), (void *)&endPosCL );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+
+ fprintf( stderr, "prior to enqueue range kernel\n" );
+ }
+ else
+ {
+ if ( gpuEnv.mnKhrFp64Flag || gpuEnv.mnAmdFp64Flag )
+ {
+ subtractCl = allocateDoubleBuffer( kEnv, pSubtractSingle, nElements, &clStatus );
+ outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(double), NULL, &clStatus );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_double), (void *)&delta );
+ CHECK_OPENCL( clStatus, "clSetKernelArg");
+ }
+ else
+ {
+ float fTmp = (float)delta;
+ subtractCl = allocateFloatBuffer( kEnv, pSubtractSingle, nElements, &clStatus );
+ outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(float), NULL, &clStatus );
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_float), (void *)&fTmp );
+ CHECK_OPENCL( clStatus, "clSetKernelArg");
+ }
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&subtractCl );
+ CHECK_OPENCL( clStatus, "clSetKernelArg");
+ clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(cl_mem), (void *)&outputCl );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ }
+ global_work_size[0] = nElements;
+ clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kEnv.mpkKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL );
+ CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
+ clFinish( kEnv.mpkCmdQueue );
+
+ if ( gpuEnv.mnKhrFp64Flag || gpuEnv.mnAmdFp64Flag )
+ {
+ clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue,
+ outputCl,
+ CL_TRUE,0,
+ nElements * sizeof(double),
+ (void *)dpResult,0,NULL,NULL);
+ CHECK_OPENCL( clStatus, "clEnqueueReadBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ }
+ else
+ {
+ float *afBuffer = new float[nElements];
+ if ( !afBuffer )
+ return -1;
+ clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue,
+ outputCl,
+ CL_TRUE,0,
+ nElements * sizeof(float),
+ (void *)afBuffer,0,NULL,NULL);
+ CHECK_OPENCL( clStatus, "clEnqueueReadBuffer" );
+ clFinish( kEnv.mpkCmdQueue );
+ for ( size_t i = 0; i < nElements; i++ )
+ {
+ dpResult[i] = (double)afBuffer[i];
+ }
+ if ( !afBuffer )
+ delete [] afBuffer;
+ }
+
+ clStatus = clFinish( kEnv.mpkCmdQueue );
+ CHECK_OPENCL( clStatus, "clFinish" );
+ clStatus = clReleaseKernel( kEnv.mpkKernel );
+ CHECK_OPENCL( clStatus, "clReleaseKernel" );
+
+ CHECK_OPENCL_RELEASE( clStatus, valuesCl );
+ CHECK_OPENCL_RELEASE( clStatus, subtractCl );
+ CHECK_OPENCL_RELEASE( clStatus, outputCl );
+ CHECK_OPENCL_RELEASE( clStatus, startPosCL );
+ CHECK_OPENCL_RELEASE( clStatus, endPosCL );
+
+ fprintf( stderr, "completed opencl operation\n" );
+
+ return 0;
+}
+double *OclCalc::oclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double del )
{
- SetKernelEnv( &kEnv );
+ setKernelEnv( &kEnv );
// select a kernel: cut & paste coding is utterly evil.
const char *kernelName = NULL;
@@ -1613,7 +2308,7 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co
default:
assert( false );
}
- CheckKernelName( &kEnv, kernelName );
+ checkKernelName( &kEnv, kernelName );
cl_int clStatus;
size_t global_work_size[1];
@@ -1637,13 +2332,13 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co
{
valuesCl = allocateDoubleBuffer( kEnv, pOpArray, nElements, &clStatus );
subtractCl = allocateDoubleBuffer( kEnv, pSubtractSingle, nElements, &clStatus );
- outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nElements * sizeof(double), NULL, &clStatus );
+ outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(double), NULL, &clStatus );
}
else
{
valuesCl = allocateFloatBuffer( kEnv, pOpArray, nElements, &clStatus );
subtractCl = allocateFloatBuffer( kEnv, pSubtractSingle, nElements, &clStatus );
- outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nElements * sizeof(float), NULL, &clStatus);
+ outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(float), NULL, &clStatus);
}
CHECK_OPENCL( clStatus, "clCreateBuffer" );
@@ -1668,7 +2363,7 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co
if ( gpuEnv.mnKhrFp64Flag || gpuEnv.mnAmdFp64Flag )
{
subtractCl = allocateDoubleBuffer( kEnv, pSubtractSingle, nElements, &clStatus );
- outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nElements * sizeof(double), NULL, &clStatus );
+ outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(double), NULL, &clStatus );
clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_double), (void *)&delta );
CHECK_OPENCL( clStatus, "clSetKernelArg");
}
@@ -1676,7 +2371,7 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co
{
float fTmp = (float)delta;
subtractCl = allocateFloatBuffer( kEnv, pSubtractSingle, nElements, &clStatus );
- outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE, nElements * sizeof(float), NULL, &clStatus );
+ outputCl = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nElements * sizeof(float), NULL, &clStatus );
clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_float), (void *)&fTmp );
CHECK_OPENCL( clStatus, "clSetKernelArg");
}
@@ -1695,28 +2390,34 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co
return NULL; // leak.
if ( gpuEnv.mnKhrFp64Flag || gpuEnv.mnAmdFp64Flag )
{
- double *pOutput = (double *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,outputCl,CL_TRUE,
- CL_MAP_READ,0,nElements*sizeof(double), 0,NULL,NULL,NULL);
- clFinish(kEnv.mpkCmdQueue);
- memcpy( pResult, pOutput, nElements * sizeof(double) );
- clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,outputCl,pResult,0,NULL,NULL);
- clFinish(kEnv.mpkCmdQueue);
+ clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue,
+ outputCl,
+ CL_TRUE,0,
+ nElements * sizeof(double),
+ (void *)pResult,0,NULL,NULL);
}
else
{
- float *pOutput = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, outputCl, CL_TRUE,
- CL_MAP_READ, 0, nElements*sizeof(float), 0, NULL, NULL, NULL );
- clFinish( kEnv.mpkCmdQueue );
+ float *afBuffer = new float[nElements];
+ if ( !afBuffer )
+ return NULL;
+ clStatus = clEnqueueReadBuffer(kEnv.mpkCmdQueue,
+ outputCl,
+ CL_TRUE,0,
+ nElements * sizeof(float),
+ (void *)afBuffer,0,NULL,NULL);
for ( int i = 0; i < (int)nElements; i++ )
- pResult[i] = (double)pOutput[i];
- clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, outputCl, pOutput, 0, NULL, NULL );
- clFinish( kEnv.mpkCmdQueue );
+ pResult[i] = (double)afBuffer[i];
+ if ( !afBuffer )
+ delete [] afBuffer;
}
+ CHECK_OPENCL( clStatus, "clEnqueueReadBuffer" );
clStatus = clFinish( kEnv.mpkCmdQueue );
CHECK_OPENCL( clStatus, "clFinish" );
clStatus = clReleaseKernel( kEnv.mpkKernel );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
+
if ( valuesCl != NULL )
{
clStatus = clReleaseMemObject( valuesCl );
@@ -1737,7 +2438,7 @@ double *OclCalc::OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, co
return pResult;
}
-int OclCalc::OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst,std::vector<double>&dpResult, uint nDim )
+int OclCalc::oclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst,std::vector<double>&dpResult, uint nDim )
{
cl_int clStatus = 0;
uint nMatrixSize = nDim * nDim;
@@ -1746,7 +2447,10 @@ int OclCalc::OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclM
CHECK_OPENCL( clStatus, "clCreateBuffer" );
cl_mem clpYData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ), nMatrixSize * sizeof(double), NULL, &clStatus );
CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ cl_mem clpNData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE ), nDim * sizeof(uint), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
double * dpY = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clpYData, CL_TRUE, CL_MAP_WRITE, 0, nMatrixSize * sizeof(double), 0, NULL,NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
memset( dpY, 0, nMatrixSize*sizeof(double) );
memset( dpOclMatrixDst, 0, nMatrixSize*sizeof(double) );
clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpYData, dpY, 0, NULL, NULL );
@@ -1758,14 +2462,20 @@ int OclCalc::OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclM
for (uint j=0;j<nDim;j++)
{
if ( i == j )
- dpP[i*nDim+j]=1.0;
+ dpP[i*nDim+j] = 1.0;
else
- dpP[i*nDim+j]=0.0;
+ dpP[i*nDim+j] = 0.0;
}
}
clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpPData, dpP, 0, NULL, NULL );
+ uint * npDim = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clpNData, CL_TRUE, CL_MAP_WRITE, 0, nDim * sizeof(uint), 0, NULL,NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ for ( uint i = 0; i < nDim; i++ )
+ npDim[i] = nDim;
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpNData, npDim, 0, NULL, NULL );
+
CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
- CheckKernelName( &kEnv,aKernelName );
+ checkKernelName( &kEnv,aKernelName );
kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus );
CHECK_OPENCL( clStatus, "clCreateKernel" );
clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemLeftData );
@@ -1805,6 +2515,9 @@ int OclCalc::OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclM
CHECK_OPENCL( clStatus, "clSetKernelArg" );
clStatus = clSetKernelArg( kernel_solve, 3, sizeof(cl_mem), (void *)&clpYData );
CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kernel_solve, 4, sizeof(cl_mem), (void *)&clpNData );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+
clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kernel_solve, 1, NULL, global_work_size, NULL, 0, NULL, NULL );
CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
clFinish( kEnv.mpkCmdQueue );
@@ -1819,18 +2532,23 @@ int OclCalc::OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclM
CHECK_OPENCL( clStatus, "clReleaseKernel" );
clStatus = clReleaseMemObject( mpClmemLeftData );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ mpClmemLeftData = NULL;
clStatus = clReleaseMemObject( mpClmemRightData );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ mpClmemRightData = NULL;
clStatus = clReleaseKernel( kernel_solve );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
clStatus = clReleaseMemObject( clpPData );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
clStatus = clReleaseMemObject( clpYData );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
+ clStatus = clReleaseMemObject( clpNData );
+ CHECK_OPENCL( clStatus, "clReleaseKernel" );
+
return 0;
}
-int OclCalc::OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim )
+int OclCalc::oclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim )
{
cl_int clStatus = 0;
uint nMatrixSize = nDim * nDim;
@@ -1840,7 +2558,10 @@ int OclCalc::OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMa
CHECK_OPENCL( clStatus, "clCreateBuffer" );
cl_mem clpYData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ), nMatrixSize * sizeof(float), NULL, &clStatus );
CHECK_OPENCL( clStatus, "clCreateBuffer" );
+ cl_mem clpNData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE ), nDim * sizeof(uint), NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clCreateBuffer" );
float * fpY = (float *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clpYData, CL_TRUE, CL_MAP_WRITE, 0, nMatrixSize * sizeof(float), 0, NULL,NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
memset( fpY, 0, nMatrixSize*sizeof(float) );
memset( fpOclMatrixDst, 0, nMatrixSize*sizeof(float) );
clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpYData, fpY, 0, NULL, NULL );
@@ -1859,7 +2580,12 @@ int OclCalc::OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMa
}
clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpPData, fpP, 0, NULL, NULL );
CHECK_OPENCL( clStatus, "clEnqueueUnmapMemObject" );
- CheckKernelName( &kEnv,aKernelName );
+ uint * npDim = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, clpNData, CL_TRUE, CL_MAP_WRITE, 0, nDim * sizeof(uint), 0, NULL,NULL, &clStatus );
+ CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+ for ( uint i = 0; i < nDim; i++ )
+ npDim[i] = nDim;
+ clStatus = clEnqueueUnmapMemObject( kEnv.mpkCmdQueue, clpNData, npDim, 0, NULL, NULL );
+ checkKernelName( &kEnv,aKernelName );
kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, aKernelName, &clStatus );
CHECK_OPENCL( clStatus, "clCreateKernel" );
clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&mpClmemLeftData );
@@ -1902,6 +2628,9 @@ int OclCalc::OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMa
CHECK_OPENCL( clStatus, "clSetKernelArg" );
clStatus = clSetKernelArg( kernel_solve, 3, sizeof(cl_mem), (void *)&clpYData );
CHECK_OPENCL( clStatus, "clSetKernelArg" );
+ clStatus = clSetKernelArg( kernel_solve, 4, sizeof(cl_mem), (void *)&clpNData );
+ CHECK_OPENCL( clStatus, "clSetKernelArg" );
+
clStatus = clEnqueueNDRangeKernel( kEnv.mpkCmdQueue, kernel_solve, 1, NULL, global_work_size, NULL, 0, NULL, NULL );
CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
clFinish( kEnv.mpkCmdQueue );
@@ -1916,14 +2645,18 @@ int OclCalc::OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMa
CHECK_OPENCL( clStatus, "clReleaseKernel" );
clStatus = clReleaseMemObject( mpClmemLeftData );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ mpClmemLeftData = NULL;
clStatus = clReleaseMemObject( mpClmemRightData );
CHECK_OPENCL( clStatus, "clReleaseMemObject" );
+ mpClmemRightData = NULL;
clStatus = clReleaseKernel( kernel_solve );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
clStatus = clReleaseMemObject( clpPData );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
clStatus = clReleaseMemObject( clpYData );
CHECK_OPENCL( clStatus, "clReleaseKernel" );
+ clStatus = clReleaseMemObject( clpNData );
+ CHECK_OPENCL( clStatus, "clReleaseKernel" );
return 0;
}
diff --git a/sc/source/core/opencl/openclwrapper.hxx b/sc/source/core/opencl/openclwrapper.hxx
index 173ae58831d1..cf3b4f1a17a2 100644
--- a/sc/source/core/opencl/openclwrapper.hxx
+++ b/sc/source/core/opencl/openclwrapper.hxx
@@ -7,8 +7,8 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
-#ifndef SC_OPENCL_WRAPPER_H
-#define SC_OPENCL_WRAPPER_H
+#ifndef SC_OPENCLWRAPPER_HXX
+#define SC_OPENCLWRAPPER_HXX
#include <config_features.h>
#include <formula/opcode.hxx>
@@ -87,6 +87,19 @@ if( status != CL_SUCCESS ) \
return 0; \
}
+#define CHECK_OPENCL_VOID(status,name) \
+if( status != CL_SUCCESS ) \
+{ \
+ printf ("OpenCL error code is %d at " SAL_DETAIL_WHERE " when %s .\n", status, name); \
+}
+
+#define CHECK_OPENCL_RELEASE(status,name) \
+if ( name != NULL ) \
+ clReleaseMemObject( name ); \
+if( status != CL_SUCCESS ) \
+{ \
+ printf ("OpenCL error code is %d at " SAL_DETAIL_WHERE " when clReleaseMemObject( %s ).\n", status, #name); \
+}
#define MAX_KERNEL_STRING_LEN 64
#define MAX_CLFILE_NUM 50
@@ -119,25 +132,48 @@ typedef struct
char kernelName[MAX_KERNEL_NAME_LEN + 1];
char *kernelStr;
} kernel_node;
-
+typedef struct _SingleVectorFormula
+{
+ const double *mdpInputLeftData;
+ const double *mdpInputRightData;
+ size_t mnInputLeftDataSize;
+ size_t mnInputRightDataSize;
+ uint mnInputLeftStartPosition;
+ uint mnInputRightStartPosition;
+ int mnInputLeftOffset;
+ int mnInputRightOffset;
+} SingleVectorFormula;
+
+typedef struct _DoubleVectorFormula
+{
+ const double *mdpInputData;
+ size_t mnInputDataSize;
+ uint mnInputStartPosition;
+ uint mnInputEndPosition;
+ int mnInputStartOffset;
+ int mnInputEndOffset;
+} DoubleVectorFormula;
class OpenclCalcBase
{
public:
OpenclCalcBase(){};
virtual ~OpenclCalcBase(){};
- virtual int OclHostArithmeticOperator64Bits( const char* aKernelName, double *fpLeftData, double *fpRightData, double *&rResult, int nRowSize )=0;
- virtual int OclHostFormulaStatistics64Bits( const char* aKernelName, double *fpSrcData, uint *npStartPos, uint *npEndPos, double *&output, int outputSize )=0;
- virtual int OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize)=0;
- virtual int OclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize )=0;
- virtual int OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>& dpResult, uint nDim)=0;
+ virtual int oclHostArithmeticOperator64Bits( const char* aKernelName, double *&rResult, int nRowSize )=0;
+ virtual int oclMoreColHostArithmeticOperator64Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize )=0;
+ virtual int oclHostFormulaStatistics64Bits( const char* aKernelName,double *&output, int outputSize )=0;
+ virtual int oclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize)=0;
+ virtual int oclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize )=0;
+ virtual int oclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst,std::vector<double>&dpResult, uint nDim)=0;
+ virtual int oclMoreColHostArithmeticOperator32Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize )=0;
- virtual int OclHostArithmeticOperator32Bits( const char* aKernelName, float *fpLeftData, float *fpRightData, double *rResult, int nRowSize )=0;
- virtual int OclHostFormulaStatistics32Bits( const char* aKernelName, float *fpSrcData, uint *npStartPos, uint *npEndPos, double *output, int outputSize )=0;
- virtual int OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize)=0;
- virtual int OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize )=0;
- virtual int OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim )=0;
+ virtual int oclHostArithmeticOperator32Bits( const char* aKernelName, double *rResult, int nRowSize )=0;
+ virtual int oclHostFormulaStatistics32Bits( const char* aKernelName,double *output, int outputSize )=0;
+ virtual int oclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize)=0;
+ virtual int oclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize )=0;
+ virtual int oclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim )=0;
- virtual double *OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta )=0;
+ virtual int oclGroundWaterGroup( uint *eOp, uint eOpNum, const double *pOpArray, const double *pSubtractSingle, size_t nSrcDataSize,size_t nElements, double delta,uint *nStartPos,uint *nEndPos ,double *deResult)=0;
+ virtual double *oclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta )=0;
};
@@ -151,40 +187,40 @@ public:
static int isInited;
OpenclDevice();
~OpenclDevice();
- static int InitEnv();
- static int RegistOpenclKernel();
- static int ReleaseOpenclRunEnv();
- static int InitOpenclRunEnv( GPUEnv *gpu );
- static int ReleaseOpenclEnv( GPUEnv *gpuInfo );
- static int CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption );
- static int InitOpenclRunEnv( int argc );
- static int CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName );
- static int GeneratBinFromKernelSource( cl_program program, const char * clFileName );
- static int WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes );
- static int BinaryGenerated( const char * clFileName, FILE ** fhandle );
- static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption );
-
- int InitOpenclAttr( OpenCLEnv * env );
- int ReleaseKernel( KernelEnv * env );
- int SetKernelEnv( KernelEnv *envInfo );
- int CreateKernel( char * kernelname, KernelEnv * env );
- int RunKernel( const char *kernelName, void **userdata );
- int ConvertToString( const char *filename, char **source );
- int CheckKernelName( KernelEnv *envInfo, const char *kernelName );
- int RegisterKernelWrapper( const char *kernelName, cl_kernel_function function );
- int RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata );
- int GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function );
+ static int initEnv();
+ static int registOpenclKernel();
+ static int releaseOpenclRunEnv();
+ static int initOpenclRunEnv( GPUEnv *gpu );
+ static int releaseOpenclEnv( GPUEnv *gpuInfo );
+ static int compileKernelFile( GPUEnv *gpuInfo, const char *buildOption );
+ static int initOpenclRunEnv( int argc );
+ static int cachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName );
+ static int generatBinFromKernelSource( cl_program program, const char * clFileName );
+ static int writeBinaryToFile( const char* fileName, const char* birary, size_t numBytes );
+ static int binaryGenerated( const char * clFileName, FILE ** fhandle );
+ static int compileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption );
+
+ int initOpenclAttr( OpenCLEnv * env );
+ int releaseKernel( KernelEnv * env );
+ int setKernelEnv( KernelEnv *envInfo );
+ int createKernel( char * kernelname, KernelEnv * env );
+ int runKernel( const char *kernelName, void **userdata );
+ int convertToString( const char *filename, char **source );
+ int checkKernelName( KernelEnv *envInfo, const char *kernelName );
+ int registerKernelWrapper( const char *kernelName, cl_kernel_function function );
+ int runKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata );
+ int getKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function );
#ifdef WIN32
- static int LoadOpencl();
- static int OpenclInite();
- static void FreeOpenclDll();
+ static int loadOpencl();
+ static int openclInite();
+ static void freeOpenclDll();
#endif
- int GetOpenclState();
- void SetOpenclState( int state );
- inline static int AddKernelConfig( int kCount, const char *kName );
+ int getOpenclState();
+ void setOpenclState( int state );
+ inline static int addKernelConfig( int kCount, const char *kName );
};
@@ -201,6 +237,10 @@ public:
cl_mem mpClmemMergeLfData;
cl_mem mpClmemMergeRtData;
cl_mem mpClmemMatixSumSize;
+ cl_mem mpClmemeOp;
+ unsigned int nArithmeticLen;
+ unsigned int nFormulaLen;
+ unsigned int nClmemLen;
unsigned int nFormulaColSize;
unsigned int nFormulaRowSize;
@@ -208,27 +248,49 @@ public:
~OclCalc();
// for 64bits double
- int OclHostArithmeticOperator64Bits( const char* aKernelName, double *fpLeftData, double *fpRightData, double *&rResult, int nRowSize );
- int OclHostFormulaStatistics64Bits( const char* aKernelName, double *fpSrcData, uint *npStartPos, uint *npEndPos, double *&output, int outputSize);
- int OclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize );
- int OclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize);
- int OclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>&dpResult, uint nDim );
+ int oclHostArithmeticOperator64Bits( const char* aKernelName, double *&rResult, int nRowSize );
+ int oclMoreColHostArithmeticOperator64Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize );
+ int oclHostFormulaStatistics64Bits( const char* aKernelName, double *&output, int outputSize);
+ int oclHostFormulaStash64Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size);
+ int oclHostFormulaCount64Bits( uint *npStartPos, uint *npEndPos, double *&dpOutput, int nSize );
+ int oclHostFormulaSumProduct64Bits( double *fpSumProMergeLfData, double *fpSumProMergeRrData, uint *npSumSize, double *&dpOutput, int nSize);
+ int oclHostMatrixInverse64Bits( const char* aKernelName, double *dpOclMatrixSrc, double *dpOclMatrixDst, std::vector<double>&dpResult, uint nDim );
// for 32bits float
- int OclHostArithmeticOperator32Bits( const char* aKernelName, float *fpLeftData, float *fpRightData, double *rResult, int nRowSize );
- int OclHostFormulaStatistics32Bits( const char* aKernelName, float *fpSrcData, uint *npStartPos, uint *npEndPos, double *output, int outputSize);
- int OclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize );
- int OclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize );
- int OclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim );
+ int oclHostArithmeticOperator32Bits( const char* aKernelName, double *rResult, int nRowSize );
+ int oclMoreColHostArithmeticOperator32Bits( int nDataSize,int neOpSize,double *rResult, int nRowSize );
+ int oclHostFormulaStatistics32Bits( const char* aKernelName, double *output, int outputSize);
+ int oclHostFormulaCount32Bits( uint *npStartPos, uint *npEndPos, double *dpOutput, int nSize );
+ int oclHostArithmeticStash64Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize );
+ int oclHostFormulaSumProduct32Bits( float *fpSumProMergeLfData, float *fpSumProMergeRrData, uint *npSumSize, double *dpOutput, int nSize );
+ int oclHostMatrixInverse32Bits( const char* aKernelName, float *fpOclMatrixSrc, float *fpOclMatrixDst, std::vector<double>& dpResult, uint nDim );
// for groundwater
- double *OclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta );
+ int oclGroundWaterGroup( uint *eOp, uint eOpNum, const double *pOpArray, const double *pSubtractSingle,size_t nSrcDataSize, size_t nElements, double delta ,uint *nStartPos,uint *nEndPos,double *deResult);
+ double *oclSimpleDeltaOperation( OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements, double delta );
///////////////////////////////////////////////////////////////
- int CreateBuffer64Bits( double *&dpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize );
- int CreateBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize );
- int CreateBuffer64Bits( double *&dpSumProMergeLfData, double *&dpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize );
- int CreateBuffer32Bits( float *&fpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize );
- int CreateBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize );
- int CreateBuffer32Bits( float *&fpSumProMergeLfData, float *&fpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize );
+ int createBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize );
+ int mapAndCopy64Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize );
+ int mapAndCopy64Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize);
+ int mapAndCopyArithmetic64Bits( const double *dpMoreArithmetic,int nBufferSize );
+ int mapAndCopyMoreColArithmetic64Bits( const double *dpMoreColArithmetic,int nBufferSize ,uint *npeOp,uint neOpSize );
+ int createMoreColArithmeticBuf64Bits( int nBufferSize, int neOpSize );
+
+ int createFormulaBuf64Bits( int nBufferSize, int rowSize );
+ int createArithmeticOptBuf64Bits( int nBufferSize );
+
+ int createBuffer32Bits( float *&fpLeftData, float *&fpRightData, int nBufferSize );
+ int mapAndCopy32Bits(const double *dpTempLeftData,const double *dpTempRightData,int nBufferSize );
+ int mapAndCopy32Bits(const double *dpTempSrcData,unsigned int *unStartPos,unsigned int *unEndPos,int nBufferSize ,int nRowsize);
+ int mapAndCopyArithmetic32Bits( const double *dpMoreColArithmetic, int nBufferSize );
+ int mapAndCopyMoreColArithmetic32Bits( const double *dpMoreColArithmetic,int nBufferSize ,uint *npeOp,uint neOpSize );
+ int createMoreColArithmeticBuf32Bits( int nBufferSize, int neOpSize );
+ int createFormulaBuf32Bits( int nBufferSize, int rowSize );
+ int createArithmeticOptBuf32Bits( int nBufferSize );
+ int oclHostFormulaStash32Bits( const char* aKernelName, const double* dpSrcData, uint *nStartPos, uint *nEndPos, double *output, int nBufferSize, int size );
+ int oclHostArithmeticStash32Bits( const char* aKernelName, const double *dpLeftData, const double *dpRightData, double *rResult,int nRowSize );
+
+ int releaseOclBuffer(void);
+ friend class agency;
};
#endif