diff options
author | dante <dante19031999@gmail.com> | 2021-05-16 16:04:20 +0200 |
---|---|---|
committer | Mike Kaganski <mike.kaganski@collabora.com> | 2021-08-26 08:48:35 +0200 |
commit | 5b9cf5881ef53fac5f1d8376f687dbadf9d3cf2b (patch) | |
tree | ff16b0dcf62236c48fed6ca87e7d584959ed1841 /sc | |
parent | fb1233d77500413ba237c335a84773d8a6f8e381 (diff) |
tdf#142307 - Upgrade SSE2 sum to AVX512 sum with Neumaier 1
This part focuses on allowing it on replacing arrayfunctor
By thefault it will try AVX512F (1,17%)
If not available will use AVX (94,77%)
Use of AVX2 (82,28%) has been avoided even if the code could been more compact
Source of hardware statistics: https://store.steampowered.com/hwsurvey
Change-Id: Iae737a565379e82c5f84f3fdee6321ac74f59d40
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/115675
Tested-by: Jenkins
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
Diffstat (limited to 'sc')
-rw-r--r-- | sc/Library_sc.mk | 8 | ||||
-rw-r--r-- | sc/inc/arraysumfunctor.hxx | 137 | ||||
-rw-r--r-- | sc/inc/kahan.hxx | 9 | ||||
-rw-r--r-- | sc/qa/unit/functions_statistical.cxx | 22 | ||||
-rw-r--r-- | sc/source/core/inc/arraysumfunctor.hxx | 121 | ||||
-rw-r--r-- | sc/source/core/tool/arraysumAVX.cxx | 114 | ||||
-rw-r--r-- | sc/source/core/tool/arraysumAVX512.cxx | 140 | ||||
-rw-r--r-- | sc/source/core/tool/arraysumSSE2.cxx | 140 | ||||
-rw-r--r-- | sc/source/core/tool/interpr6.cxx | 4 |
9 files changed, 513 insertions, 182 deletions
diff --git a/sc/Library_sc.mk b/sc/Library_sc.mk index f1b62e1c263d..a85650e935e6 100644 --- a/sc/Library_sc.mk +++ b/sc/Library_sc.mk @@ -99,6 +99,14 @@ $(eval $(call gb_Library_use_libraries,sc,\ )) $(eval $(call gb_Library_add_exception_objects,sc,\ + sc/source/core/tool/arraysumAVX512, $(CXXFLAGS_INTRINSICS_AVX512F) \ +)) + +$(eval $(call gb_Library_add_exception_objects,sc,\ + sc/source/core/tool/arraysumAVX, $(CXXFLAGS_INTRINSICS_AVX) \ +)) + +$(eval $(call gb_Library_add_exception_objects,sc,\ sc/source/core/tool/arraysumSSE2, $(CXXFLAGS_INTRINSICS_SSE2) \ )) diff --git a/sc/inc/arraysumfunctor.hxx b/sc/inc/arraysumfunctor.hxx new file mode 100644 index 000000000000..ee6d4c0fca5c --- /dev/null +++ b/sc/inc/arraysumfunctor.hxx @@ -0,0 +1,137 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + */ + +#pragma once + +#include <cmath> +#include "kahan.hxx" +#include "scdllapi.h" +#include <tools/cpuid.hxx> +#include <formula/errorcodes.hxx> + +namespace sc::op +{ +/* Checkout available optimization options */ +SC_DLLPUBLIC extern const bool hasAVX512F; +const bool hasAVX = cpuid::hasAVX(); +const bool hasSSE2 = cpuid::hasSSE2(); + +/** + * Performs one step of the Neumanier sum between doubles + * Overwrites the summand and error + * @parma sum + * @param err + * @param value + */ +inline void sumNeumanierNormal(double& sum, double& err, const double& value) +{ + double t = sum + value; + if (std::abs(sum) >= std::abs(value)) + err += (sum - t) + value; + else + err += (value - t) + sum; + sum = t; +} + +/** + * If no boosts available, Unrolled KahanSum. + * Most likely to use on android. + */ +static inline KahanSum executeUnrolled(size_t& i, size_t nSize, const double* pCurrent) +{ + size_t nRealSize = nSize - i; + size_t nUnrolledSize = nRealSize - (nRealSize % 4); + + if (nUnrolledSize > 0) + { + KahanSum sum0 = 0.0; + KahanSum sum1 = 0.0; + KahanSum sum2 = 0.0; + KahanSum sum3 = 0.0; + + for (; i + 3 < nUnrolledSize; i += 4) + { + sum0 += *pCurrent++; + sum1 += *pCurrent++; + sum2 += *pCurrent++; + sum3 += *pCurrent++; + } + // We are using pairwise summation alongside Kahan + return (sum0 + sum1) + (sum2 + sum3); + } + return 0.0; +} + +/* Available methos */ +SC_DLLPUBLIC KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent); +SC_DLLPUBLIC KahanSum executeAVX(size_t& i, size_t nSize, const double* pCurrent); +SC_DLLPUBLIC KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent); + +/** + * This function task is to choose the fastest method available to perform the sum. + * @param i + * @param nSize + * @param pCurrent + */ +static inline KahanSum executeFast(size_t& i, size_t nSize, const double* pCurrent) +{ + if (hasAVX512F) + return executeAVX512F(i, nSize, pCurrent); + if (hasAVX) + return executeAVX(i, nSize, pCurrent); + if (hasSSE2) + return executeSSE2(i, nSize, pCurrent); + return executeUnrolled(i, nSize, pCurrent); +} + +/** + * Performs the sum of an array. + * Note that align 16 will speed up the process. + * @param pArray + * @param nSize + */ +inline KahanSum sumArray(const double* pArray, size_t nSize) +{ + size_t i = 0; + const double* pCurrent = pArray; + KahanSum fSum = executeFast(i, nSize, pCurrent); + + // sum rest of the array + for (; i < nSize; ++i) + fSum += pArray[i]; + + // If the sum is a NaN, some of the terms were empty cells, probably. + // Re-calculate, carefully + double fVal = fSum.get(); + if (!std::isfinite(fVal)) + { + FormulaError nErr = GetDoubleErrorValue(fVal); + if (nErr == FormulaError::NoValue) + { + fSum = 0; + for (i = 0; i < nSize; i++) + { + if (!std::isfinite(pArray[i])) + { + nErr = GetDoubleErrorValue(pArray[i]); + if (nErr != FormulaError::NoValue) + fSum += pArray[i]; // Let errors encoded as NaNs propagate ??? + } + else + fSum += pArray[i]; + } + } + } + return fSum; +} + +} // end namespace sc::op + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/inc/kahan.hxx b/sc/inc/kahan.hxx index 49c7922b3c79..3404fb6d14a6 100644 --- a/sc/inc/kahan.hxx +++ b/sc/inc/kahan.hxx @@ -28,6 +28,12 @@ public: { } + constexpr KahanSum(double x_0, double err_0) + : m_fSum(x_0) + , m_fError(err_0) + { + } + constexpr KahanSum(const KahanSum& fSum) = default; public: @@ -182,6 +188,9 @@ public: return fSum.m_fSum != m_fSum || fSum.m_fError != m_fError; } + constexpr double* getSum() { return &m_fSum; } + constexpr double* getError() { return &m_fError; } + public: /** * Returns the final sum. diff --git a/sc/qa/unit/functions_statistical.cxx b/sc/qa/unit/functions_statistical.cxx index 4d97d4cc1689..e82d762d88ee 100644 --- a/sc/qa/unit/functions_statistical.cxx +++ b/sc/qa/unit/functions_statistical.cxx @@ -1,4 +1,5 @@ #include "functions_test.hxx" +#include <arraysumfunctor.hxx> class StatisticalFunctionsTest : public FunctionsTest { @@ -6,9 +7,11 @@ public: StatisticalFunctionsTest(); void testStatisticalFormulasFODS(); + void testIntrinsicSums(); CPPUNIT_TEST_SUITE(StatisticalFunctionsTest); CPPUNIT_TEST(testStatisticalFormulasFODS); + CPPUNIT_TEST(testIntrinsicSums); CPPUNIT_TEST_SUITE_END(); }; @@ -26,6 +29,25 @@ StatisticalFunctionsTest::StatisticalFunctionsTest(): { } +void StatisticalFunctionsTest::testIntrinsicSums() +{ + // Checkout SSE2, AVX and AVX512 opperations + // Needs exactly 9 terms + double summands[9] = { 0, 1, 2, 3, 4, 10, 20, 2, -1 }; + double* pCurrent = summands; + size_t i = 0; + if (sc::op::hasAVX512F) + CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeAVX512F(i, 9, pCurrent).get()); + i = 0; + if (sc::op::hasAVX) + CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeAVX(i, 9, pCurrent).get()); + i = 0; + if (sc::op::hasSSE2) + CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeSSE2(i, 9, pCurrent).get()); + i = 0; + CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeUnrolled(i, 9, pCurrent).get()); +} + CPPUNIT_TEST_SUITE_REGISTRATION(StatisticalFunctionsTest); CPPUNIT_PLUGIN_IMPLEMENT(); diff --git a/sc/source/core/inc/arraysumfunctor.hxx b/sc/source/core/inc/arraysumfunctor.hxx deleted file mode 100644 index ae8d38db1338..000000000000 --- a/sc/source/core/inc/arraysumfunctor.hxx +++ /dev/null @@ -1,121 +0,0 @@ -/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ -/* - * This file is part of the LibreOffice project. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - * - */ - -#pragma once - -#include <cstdint> -#include <cmath> - -#include <sal/mathconf.h> -#include <sal/types.h> -#include <tools/simd.hxx> -#include <tools/cpuid.hxx> -#include <kahan.hxx> - -namespace sc -{ -struct ArraySumFunctor -{ -private: - const double* mpArray; - size_t mnSize; - -public: - ArraySumFunctor(const double* pArray, size_t nSize) - : mpArray(pArray) - , mnSize(nSize) - { - } - - KahanSum operator()() - { - const static bool hasSSE2 = cpuid::hasSSE2(); - - KahanSum fSum = 0.0; - size_t i = 0; - const double* pCurrent = mpArray; - - if (hasSSE2) - { - while (i < mnSize && !simd::isAligned<double, 16>(pCurrent)) - { - fSum += *pCurrent++; - i++; - } - if (i < mnSize) - { - fSum += executeSSE2(i, pCurrent); - } - } - else - fSum = executeUnrolled(i, pCurrent); - - // sum rest of the array - - for (; i < mnSize; ++i) - fSum += mpArray[i]; - - // If the sum is a NaN, some of the terms were empty cells, probably. - // Re-calculate, carefully - double fVal = fSum.get(); - if (!std::isfinite(fVal)) - { - sal_uInt32 nErr = reinterpret_cast<sal_math_Double*>(&fVal)->nan_parts.fraction_lo; - if (nErr & 0xffff0000) - { - fSum = 0; - for (i = 0; i < mnSize; i++) - { - if (!std::isfinite(mpArray[i])) - { - nErr = reinterpret_cast<const sal_math_Double*>(&mpArray[i]) - ->nan_parts.fraction_lo; - if (!(nErr & 0xffff0000)) - fSum += mpArray[i]; // Let errors encoded as NaNs propagate ??? - } - else - fSum += mpArray[i]; - } - } - } - return fSum; - } - -private: - double executeSSE2(size_t& i, const double* pCurrent) const; - KahanSum executeUnrolled(size_t& i, const double* pCurrent) const - { - size_t nRealSize = mnSize - i; - size_t nUnrolledSize = nRealSize - (nRealSize % 4); - - if (nUnrolledSize > 0) - { - KahanSum sum0 = 0.0; - KahanSum sum1 = 0.0; - KahanSum sum2 = 0.0; - KahanSum sum3 = 0.0; - - for (; i < nUnrolledSize; i += 4) - { - sum0 += *pCurrent++; - sum1 += *pCurrent++; - sum2 += *pCurrent++; - sum3 += *pCurrent++; - } - // We are using pairwise summation alongside Kahan - return (sum0 + sum1) + (sum2 + sum3); - } - return 0.0; - } -}; - -} // end namespace sc - -/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/core/tool/arraysumAVX.cxx b/sc/source/core/tool/arraysumAVX.cxx new file mode 100644 index 000000000000..af12a08a8cf5 --- /dev/null +++ b/sc/source/core/tool/arraysumAVX.cxx @@ -0,0 +1,114 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + */ + +#include <arraysumfunctor.hxx> +#include <sal/log.hxx> +#include <tools/simd.hxx> +#include <tools/simdsupport.hxx> + +namespace sc::op +{ +#ifdef LO_AVX_AVAILABLE // Old processors + +const __m256d ANNULATE_SIGN_BIT = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF)); + +/** Kahan sum with AVX. + */ +static inline void sumAVX(__m256d& sum, __m256d& err, const __m256d& value) +{ + // Temporal parameter + __m256d t = _mm256_add_pd(sum, value); + // Absolute value of the total sum + __m256d asum = _mm256_and_pd(sum, ANNULATE_SIGN_BIT); + // Absolute value of the value to add + __m256d avalue = _mm256_and_pd(value, ANNULATE_SIGN_BIT); + // Comaprate the absolute values sum >= value + __m256d mask = _mm256_cmp_pd(asum, avalue, _CMP_GE_OQ); + // The following code has this form ( a - t + b) + // Case 1: a = sum b = value + // Case 2: a = value b = sum + __m256d a = _mm256_add_pd(_mm256_and_pd(mask, sum), _mm256_andnot_pd(mask, value)); + __m256d b = _mm256_add_pd(_mm256_and_pd(mask, value), _mm256_andnot_pd(mask, sum)); + err = _mm256_add_pd(err, _mm256_add_pd(_mm256_sub_pd(a, t), b)); + // Store result + sum = t; +} + +#endif + +/** Execute Kahan sum with AVX. + */ +KahanSum executeAVX(size_t& i, size_t nSize, const double* pCurrent) +{ +#ifdef LO_AVX_AVAILABLE + // Make sure we don't fall out of bounds. + // This works by sums of 8 terms. + // So the 8'th term is i+7 + // If we iterate untill nSize won't fall out of bounds + if (nSize > i + 7) + { + // Setup sums and errors as 0 + __m256d sum1 = _mm256_setzero_pd(); + __m256d err1 = _mm256_setzero_pd(); + __m256d sum2 = _mm256_setzero_pd(); + __m256d err2 = _mm256_setzero_pd(); + + for (; i + 7 < nSize; i += 8) + { + // Kahan sum 1 + __m256d load1 = _mm256_loadu_pd(pCurrent); + sumAVX(sum1, err1, load1); + pCurrent += 4; + + // Kahan sum 2 + __m256d load2 = _mm256_loadu_pd(pCurrent); + sumAVX(sum2, err2, load2); + pCurrent += 4; + } + + // Now we combine pairwise summation with Kahan summation + + // sum 1 + sum 2 -> sum 1 + sumAVX(sum1, err1, sum2); + sumAVX(sum1, err1, err2); + + // Store results + double sums[4]; + double errs[4]; + _mm256_storeu_pd(&sums[0], sum1); + _mm256_storeu_pd(&errs[0], err1); + + // First Kahan & pairwise summation + // 0+1 1+2 -> 0, 2 + sumNeumanierNormal(sums[0], errs[0], sums[1]); + sumNeumanierNormal(sums[2], errs[2], sums[3]); + sumNeumanierNormal(sums[0], errs[0], errs[1]); + sumNeumanierNormal(sums[2], errs[2], errs[3]); + + // 0+2 -> 0 + sumNeumanierNormal(sums[0], errs[0], sums[2]); + sumNeumanierNormal(sums[0], errs[0], errs[2]); + + // Store result + return KahanSum(sums[0], errs[0]); + } + return 0.0; +#else + SAL_WARN("sc", "Failed to use AVX"); + (void)i; + (void)nSize; + (void)pCurrent; + return 0.0; +#endif +} + +} // end namespace sc::op + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/core/tool/arraysumAVX512.cxx b/sc/source/core/tool/arraysumAVX512.cxx new file mode 100644 index 000000000000..55764849edfb --- /dev/null +++ b/sc/source/core/tool/arraysumAVX512.cxx @@ -0,0 +1,140 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + */ + +#include <arraysumfunctor.hxx> +#include <sal/log.hxx> +#include <tools/simd.hxx> +#include <tools/simdsupport.hxx> + +/* TODO Remove this once GCC updated and AVX512 can work. */ +#ifdef __GNUC__ +#if __GNUC__ < 9 +#ifdef LO_AVX512F_AVAILABLE +#define HAS_LO_AVX512F_AVAILABLE +#undef LO_AVX512F_AVAILABLE +#endif +#endif +#endif + +#ifdef LO_AVX512F_AVAILABLE +const bool sc::op::hasAVX512F = cpuid::hasAVX512F(); +#else +const bool sc::op::hasAVX512F = false; +#endif + +namespace sc::op +{ +#ifdef LO_AVX512F_AVAILABLE // New processors + +/** Kahan sum with AVX512. + */ +static inline void sumAVX512(__m512d& sum, __m512d& err, const __m512d& value) +{ + // Temporal parameter + __m512d t = _mm512_add_pd(sum, value); + // Absolute value of the total sum + __m512d asum = _mm512_abs_pd(sum); + // Absolute value of the value to add + __m512d avalue = _mm512_abs_pd(value); + // Comaprate the absolute values sum >= value + __mmask8 mask = _mm512_cmp_pd_mask(avalue, asum, _CMP_GE_OQ); + // The following code has this form ( a - t + b) + // Case 1: a = sum b = value + // Case 2: a = value b = sum + __m512d a = _mm512_mask_blend_pd(mask, sum, value); + __m512d b = _mm512_mask_blend_pd(mask, value, sum); + err = _mm512_add_pd(err, _mm512_add_pd(_mm512_sub_pd(a, t), b)); + // Store result + sum = t; +} + +#endif + +/** Execute Kahan sum with AVX512. + */ +KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent) +{ +#ifdef LO_AVX512F_AVAILABLE // New processors + // Make sure we don't fall out of bounds. + // This works by sums of 8 terms. + // So the 8'th term is i+7 + // If we iterate untill nSize won't fall out of bounds + if (nSize > i + 7) + { + // Setup sums and errors as 0 + __m512d sum = _mm512_setzero_pd(); + __m512d err = _mm512_setzero_pd(); + + // Sum the stuff + for (; i + 7 < nSize; i += 8) + { + // Kahan sum + __m512d load = _mm512_loadu_pd(pCurrent); + sumAVX512(sum, err, load); + pCurrent += 8; + } + + // Store result + static_assert(sizeof(double) == 8); + double sums[8]; + double errs[8]; + _mm512_storeu_pd(static_cast<void*>(&sums[0]), sum); + _mm512_storeu_pd(static_cast<void*>(&errs[0]), err); + + // First Kahan & pairwise summation + // 0+1 1+2 3+4 4+5 6+7 -> 0, 2, 4, 6 + sumNeumanierNormal(sums[0], errs[0], sums[1]); + sumNeumanierNormal(sums[2], errs[2], sums[3]); + sumNeumanierNormal(sums[4], errs[4], sums[5]); + sumNeumanierNormal(sums[6], errs[6], sums[7]); + sumNeumanierNormal(sums[0], errs[0], errs[1]); + sumNeumanierNormal(sums[2], errs[2], errs[3]); + sumNeumanierNormal(sums[4], errs[4], errs[5]); + sumNeumanierNormal(sums[6], errs[6], errs[7]); + + // Second Kahan & pairwise summation + // 0+2 4+6 -> 0, 4 + sumNeumanierNormal(sums[0], errs[0], sums[2]); + sumNeumanierNormal(sums[4], errs[4], sums[6]); + sumNeumanierNormal(sums[0], errs[0], errs[2]); + sumNeumanierNormal(sums[4], errs[4], errs[6]); + + // Third Kahan & pairwise summation + // 0+4 -> 0 + sumNeumanierNormal(sums[0], errs[0], sums[4]); + sumNeumanierNormal(sums[0], errs[0], errs[4]); + + // Return final result + return KahanSum(sums[0], errs[0]); + } + else + return 0.0; +#else + SAL_WARN("sc", "Failed to use AVX 512"); + (void)i; + (void)nSize; + (void)pCurrent; + return 0.0; +#endif +} + +} // end namespace sc::op + +/* TODO Remove this once GCC updated and AVX512 can work. */ +#ifdef __GNUC__ +#if __GNUC__ < 9 +#ifdef HAS_LO_AVX512F_AVAILABLE +#define LO_AVX512F_AVAILABLE +#undef HAS_LO_AVX512F_AVAILABLE +#endif +#endif +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx index e69f672b6014..4f6a4b47a11d 100644 --- a/sc/source/core/tool/arraysumSSE2.cxx +++ b/sc/source/core/tool/arraysumSSE2.cxx @@ -9,97 +9,121 @@ */ #include <arraysumfunctor.hxx> +#include <sal/log.hxx> +#include <tools/simd.hxx> #include <tools/simdsupport.hxx> -namespace sc +//AVX512VL + AVX512F + KNCNI + +namespace sc::op { -double ArraySumFunctor::executeSSE2(size_t& i, const double* pCurrent) const +#ifdef LO_SSE2_AVAILABLE // Old processors + +const __m128d ANNULATE_SIGN_BIT = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF)); + +/** Kahan sum with SSE4.2. + */ +static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value) { -#if defined(LO_SSE2_AVAILABLE) - double fSum = 0.0; - size_t nRealSize = mnSize - i; - size_t nUnrolledSize = nRealSize - (nRealSize % 8); + // Temporal parameter + __m128d t = _mm_add_pd(sum, value); + // Absolute value of the total sum + __m128d asum = _mm_and_pd(sum, ANNULATE_SIGN_BIT); + // Absolute value of the value to add + __m128d avalue = _mm_and_pd(value, ANNULATE_SIGN_BIT); + // Comaprate the absolute values sum >= value + __m128d mask = _mm_cmpge_pd(asum, avalue); + // The following code has this form ( a - t + b) + // Case 1: a = sum b = value + // Case 2: a = value b = sum + __m128d a = _mm_add_pd(_mm_and_pd(mask, sum), _mm_andnot_pd(mask, value)); + __m128d b = _mm_add_pd(_mm_and_pd(mask, value), _mm_andnot_pd(mask, sum)); + err = _mm_add_pd(err, _mm_add_pd(_mm_sub_pd(a, t), b)); + // Store result + sum = t; +} + +#endif - if (nUnrolledSize > 0) +/** Execute Kahan sum with SSE2. + */ +KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent) +{ +#ifdef LO_SSE2_AVAILABLE + // Make sure we don't fall out of bounds. + // This works by sums of 8 terms. + // So the 8'th term is i+7 + // If we iterate untill nSize won't fall out of bounds + if (nSize > i + 7) { + // Setup sums and errors as 0 __m128d sum1 = _mm_setzero_pd(); - __m128d sum2 = _mm_setzero_pd(); - __m128d sum3 = _mm_setzero_pd(); - __m128d sum4 = _mm_setzero_pd(); - __m128d err1 = _mm_setzero_pd(); + __m128d sum2 = _mm_setzero_pd(); __m128d err2 = _mm_setzero_pd(); + __m128d sum3 = _mm_setzero_pd(); __m128d err3 = _mm_setzero_pd(); + __m128d sum4 = _mm_setzero_pd(); __m128d err4 = _mm_setzero_pd(); - __m128d y, t; - - for (; i < nUnrolledSize; i += 8) + for (; i + 7 < nSize; i += 8) { // Kahan sum 1 - __m128d load1 = _mm_load_pd(pCurrent); - y = _mm_sub_pd(load1, err1); - t = _mm_add_pd(sum1, y); - err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y); - sum1 = t; + __m128d load1 = _mm_loadu_pd(pCurrent); + sumSSE2(sum1, err1, load1); pCurrent += 2; // Kahan sum 2 - __m128d load2 = _mm_load_pd(pCurrent); - y = _mm_sub_pd(load2, err2); - t = _mm_add_pd(sum2, y); - err2 = _mm_sub_pd(_mm_sub_pd(t, sum2), y); - sum2 = t; + __m128d load2 = _mm_loadu_pd(pCurrent); + sumSSE2(sum2, err2, load2); pCurrent += 2; // Kahan sum 3 - __m128d load3 = _mm_load_pd(pCurrent); - y = _mm_sub_pd(load3, err3); - t = _mm_add_pd(sum3, y); - err3 = _mm_sub_pd(_mm_sub_pd(t, sum3), y); - sum3 = t; + __m128d load3 = _mm_loadu_pd(pCurrent); + sumSSE2(sum3, err3, load3); pCurrent += 2; // Kahan sum 4 - __m128d load4 = _mm_load_pd(pCurrent); - y = _mm_sub_pd(load4, err4); - t = _mm_add_pd(sum4, y); - err4 = _mm_sub_pd(_mm_sub_pd(t, sum4), y); - sum4 = t; + __m128d load4 = _mm_loadu_pd(pCurrent); + sumSSE2(sum4, err4, load4); pCurrent += 2; } // Now we combine pairwise summation with Kahan summation - // sum 1 + sum 2 - y = _mm_sub_pd(sum2, err1); - t = _mm_add_pd(sum1, y); - err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y); - sum1 = t; - - // sum 3 + sum 4 - y = _mm_sub_pd(sum4, err3); - t = _mm_add_pd(sum3, y); - sum3 = t; - - // sum 1 + sum 3 - y = _mm_sub_pd(sum3, err1); - t = _mm_add_pd(sum1, y); - sum1 = t; - - double temp; - - _mm_storel_pd(&temp, sum1); - fSum += temp; - - _mm_storeh_pd(&temp, sum1); - fSum += temp; + // 1+2 3+4 -> 1, 3 + sumSSE2(sum1, err1, sum2); + sumSSE2(sum1, err1, err2); + sumSSE2(sum3, err3, sum4); + sumSSE2(sum3, err3, err4); + + // 1+3 -> 1 + sumSSE2(sum1, err1, sum3); + sumSSE2(sum1, err1, err3); + + // Store results + double sums[2]; + double errs[2]; + _mm_storeu_pd(&sums[0], sum1); + _mm_storeu_pd(&errs[0], err1); + + // First Kahan & pairwise summation + // 0+1 -> 0 + sumNeumanierNormal(sums[0], errs[0], sums[1]); + sumNeumanierNormal(sums[0], errs[0], errs[1]); + + // Store result + return KahanSum(sums[0], errs[0]); } - return fSum; + return 0.0; #else + SAL_WARN("sc", "Failed to use SSE2"); (void)i; + (void)nSize; (void)pCurrent; return 0.0; #endif } } + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/core/tool/interpr6.cxx b/sc/source/core/tool/interpr6.cxx index a291b444ecdc..dbeab67fe35c 100644 --- a/sc/source/core/tool/interpr6.cxx +++ b/sc/source/core/tool/interpr6.cxx @@ -223,9 +223,7 @@ public: return; const double *p = &sc::numeric_block::at(*rNode.data, nOffset); - sc::ArraySumFunctor functor(p, nDataSize); - - maSum += functor(); + maSum += sc::op::sumArray(p, nDataSize); break; } |