diff options
author | Tomaž Vajngerl <tomaz.vajngerl@collabora.com> | 2015-11-13 12:41:16 +0100 |
---|---|---|
committer | Tomaž Vajngerl <tomaz.vajngerl@collabora.co.uk> | 2015-11-13 14:10:10 +0100 |
commit | 5493402fb37a1def960c93f7c31aff36a5ab5f9e (patch) | |
tree | 73d1b7b23a09b244b00415563f33e5d77118a3f2 /sc | |
parent | 154bcd887d3772addc8196944044fa57738d3cf2 (diff) |
arraysumfunctor: fast sum a double array, use for SUM() in Calc
This adds an array sum functor which sums a double array as fast
as possible. There are 2 implementations: an SSE2 one and a simple
unrolled one. The SSE2 implementation is used if SSE2 is detected
at runtime.
Additional info:
The SSE2 implementation first processes single elements until the
current position is aligned to a 16-byte boundary (which should take
at most 1 element). The rest of the array is then processed by summing
8 values per pass (using 4 variables that are 128 bits wide), since one
SSE2 operation can process 2 double values in one call.
Change-Id: I24494b08cae049aa3eabcb086867f1bdd4128374
Diffstat (limited to 'sc')
-rw-r--r-- | sc/source/core/inc/arraysumfunctor.hxx | 141 | ||||
-rw-r--r-- | sc/source/core/tool/interpr6.cxx | 15 |
2 files changed, 145 insertions, 11 deletions
diff --git a/sc/source/core/inc/arraysumfunctor.hxx b/sc/source/core/inc/arraysumfunctor.hxx
new file mode 100644
index 000000000000..776c5143732e
--- /dev/null
+++ b/sc/source/core/inc/arraysumfunctor.hxx
@@ -0,0 +1,194 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#ifndef INCLUDED_SC_SOURCE_CORE_INC_ARRAYSUMFUNCTOR_HXX
+#define INCLUDED_SC_SOURCE_CORE_INC_ARRAYSUMFUNCTOR_HXX
+
+#include <cstddef>
+#include <cstdint>
+
+#include <tools/cpuid.hxx>
+
+// SSE2 intrinsics are only usable on x86: GCC and Clang define __SSE2__ when
+// they generate SSE2 code, MSVC accepts the intrinsics on every x86 target.
+#if defined(__SSE2__) || defined(_M_X64) || defined(_M_IX86)
+#define SC_ARRAYSUM_HAVE_SSE2
+#include <emmintrin.h>
+#endif
+
+namespace sc
+{
+
+/** Returns true if the address in pointer is a multiple of N bytes. */
+template<typename T, unsigned int N>
+inline bool isAligned(const T* pointer)
+{
+    return 0 == (uintptr_t(pointer) % N);
+}
+
+/**
+ * Sums a contiguous array of doubles.
+ *
+ * Uses the SSE2 implementation when it is compiled in and
+ * tools::cpuid::hasSSE2() reports support at runtime, otherwise a 4-way
+ * unrolled scalar loop. The array is not copied, so it has to stay valid
+ * for as long as the functor is called.
+ */
+struct ArraySumFunctor
+{
+private:
+    const double* mpArray;
+    size_t mnSize;
+
+public:
+    /**
+     * @param pArray first element of the array to sum
+     * @param nSize  number of elements in pArray
+     */
+    ArraySumFunctor(const double* pArray, size_t nSize)
+        : mpArray(pArray)
+        , mnSize(nSize)
+    {
+    }
+
+    /** @return sum of all elements of the array, 0.0 if it is empty */
+    double operator() () const
+    {
+        double fSum = 0.0;
+        size_t i = 0;
+        const double* pCurrent = mpArray;
+
+#ifdef SC_ARRAYSUM_HAVE_SSE2
+        static bool hasSSE2 = tools::cpuid::hasSSE2();
+
+        if (hasSSE2)
+        {
+            // _mm_load_pd needs a 16-byte aligned address, so sum leading
+            // elements one by one until pCurrent is aligned. Stopping at
+            // mnSize keeps the loop inside the array when it is empty or
+            // when stepping by whole doubles never reaches alignment.
+            while (i < mnSize && !isAligned<double, 16>(pCurrent))
+            {
+                fSum += *pCurrent++;
+                i++;
+            }
+            fSum += executeSSE2(i, pCurrent);
+        }
+        else
+#endif
+            fSum += executeUnrolled(i, pCurrent);
+
+        // sum rest of the array
+        for (; i < mnSize; ++i)
+            fSum += mpArray[i];
+
+        return fSum;
+    }
+
+private:
+#ifdef SC_ARRAYSUM_HAVE_SSE2
+    /**
+     * Sums whole blocks of 8 doubles starting at index i using SSE2.
+     *
+     * @param i        in: index of the element pCurrent points to (at most
+     *                 mnSize); out: index of the first element not summed
+     * @param pCurrent pointer to element i, 16-byte aligned whenever at
+     *                 least one block of 8 elements remains
+     * @return sum of the elements that were consumed
+     */
+    inline double executeSSE2(size_t& i, const double* pCurrent) const
+    {
+        double fSum = 0.0;
+        const size_t nRealSize = mnSize - i;
+        // i may start above 0, so bound the loop by an absolute end index
+        // instead of by the number of elements left to process.
+        const size_t nUnrolledEnd = i + (nRealSize - (nRealSize % 8));
+
+        if (i < nUnrolledEnd)
+        {
+            // 4 independent accumulators of 2 doubles each
+            __m128d sum1 = _mm_setzero_pd();
+            __m128d sum2 = _mm_setzero_pd();
+            __m128d sum3 = _mm_setzero_pd();
+            __m128d sum4 = _mm_setzero_pd();
+
+            for (; i < nUnrolledEnd; i += 8)
+            {
+                __m128d load1 = _mm_load_pd(pCurrent);
+                sum1 = _mm_add_pd(sum1, load1);
+                pCurrent += 2;
+
+                __m128d load2 = _mm_load_pd(pCurrent);
+                sum2 = _mm_add_pd(sum2, load2);
+                pCurrent += 2;
+
+                __m128d load3 = _mm_load_pd(pCurrent);
+                sum3 = _mm_add_pd(sum3, load3);
+                pCurrent += 2;
+
+                __m128d load4 = _mm_load_pd(pCurrent);
+                sum4 = _mm_add_pd(sum4, load4);
+                pCurrent += 2;
+            }
+            sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4));
+
+            // add up the low and the high half of the combined accumulator
+            double temp;
+
+            _mm_storel_pd(&temp, sum1);
+            fSum += temp;
+
+            _mm_storeh_pd(&temp, sum1);
+            fSum += temp;
+        }
+        return fSum;
+    }
+#endif
+
+    /**
+     * Sums whole blocks of 4 doubles starting at index i.
+     *
+     * @param i        in: index of the element pCurrent points to (at most
+     *                 mnSize); out: index of the first element not summed
+     * @param pCurrent pointer to element i
+     * @return sum of the elements that were consumed
+     */
+    inline double executeUnrolled(size_t& i, const double* pCurrent) const
+    {
+        const size_t nRealSize = mnSize - i;
+        const size_t nUnrolledEnd = i + (nRealSize - (nRealSize % 4));
+
+        if (i < nUnrolledEnd)
+        {
+            double sum0 = 0.0;
+            double sum1 = 0.0;
+            double sum2 = 0.0;
+            double sum3 = 0.0;
+
+            for (; i < nUnrolledEnd; i += 4)
+            {
+                sum0 += *pCurrent++;
+                sum1 += *pCurrent++;
+                sum2 += *pCurrent++;
+                sum3 += *pCurrent++;
+            }
+            return sum0 + sum1 + sum2 + sum3;
+        }
+        return 0.0;
+    }
+};
+
+} // end namespace sc
+
+#undef SC_ARRAYSUM_HAVE_SSE2
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/interpr6.cxx b/sc/source/core/tool/interpr6.cxx
index a4a75f2d6e7d..5bf453300c81 100644
--- a/sc/source/core/tool/interpr6.cxx
+++ b/sc/source/core/tool/interpr6.cxx
@@ -26,6 +26,8 @@
 #include "mtvcellfunc.hxx"
 #include "scmatrix.hxx"
 
+#include "arraysumfunctor.hxx"
+
 #include <formula/token.hxx>
 
 using namespace formula;
@@ -235,18 +237,9 @@ public:
                 if (nDataSize == 0)
                     return;
 
-                size_t nUnrolled = (nDataSize & 0x3) >> 2;
+                sc::ArraySumFunctor functor(p, nDataSize);
 
-                // Try to encourage the compiler/CPU to do something sensible for the next.
-                for (i = 0; i < nUnrolled; i+=4)
-                {
-                    mfRest += p[i];
-                    mfRest += p[i+1];
-                    mfRest += p[i+2];
-                    mfRest += p[i+3];
-                }
-                for (; i < nDataSize; ++i)
-                    mfRest += p[i];
+                mfRest += functor();
                 break;
             }
 