diff options
author | dante <dante19031999@gmail.com> | 2021-05-16 16:04:20 +0200 |
---|---|---|
committer | Mike Kaganski <mike.kaganski@collabora.com> | 2021-08-26 08:48:35 +0200 |
commit | 5b9cf5881ef53fac5f1d8376f687dbadf9d3cf2b (patch) | |
tree | ff16b0dcf62236c48fed6ca87e7d584959ed1841 | |
parent | fb1233d77500413ba237c335a84773d8a6f8e381 (diff) |
tdf#142307 - Upgrade SSE2 sum to AVX512 sum with Neumaier 1
This part focuses on allowing it on replacing arrayfunctor
By thefault it will try AVX512F (1,17%)
If not available will use AVX (94,77%)
Use of AVX2 (82,28%) has been avoided even if the code could been more compact
Source of hardware statistics: https://store.steampowered.com/hwsurvey
Change-Id: Iae737a565379e82c5f84f3fdee6321ac74f59d40
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/115675
Tested-by: Jenkins
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
-rw-r--r-- | configure.ac | 9 | ||||
-rw-r--r-- | include/tools/cpuid.hxx | 17 | ||||
-rw-r--r-- | include/tools/simdsupport.hxx | 13 | ||||
-rw-r--r-- | sc/Library_sc.mk | 8 | ||||
-rw-r--r-- | sc/inc/arraysumfunctor.hxx | 137 | ||||
-rw-r--r-- | sc/inc/kahan.hxx | 9 | ||||
-rw-r--r-- | sc/qa/unit/functions_statistical.cxx | 22 | ||||
-rw-r--r-- | sc/source/core/inc/arraysumfunctor.hxx | 121 | ||||
-rw-r--r-- | sc/source/core/tool/arraysumAVX.cxx | 114 | ||||
-rw-r--r-- | sc/source/core/tool/arraysumAVX512.cxx | 140 | ||||
-rw-r--r-- | sc/source/core/tool/arraysumSSE2.cxx | 140 | ||||
-rw-r--r-- | sc/source/core/tool/interpr6.cxx | 4 | ||||
-rw-r--r-- | tools/source/misc/cpuid.cxx | 5 |
13 files changed, 556 insertions, 183 deletions
diff --git a/configure.ac b/configure.ac index 9e5708cd49d4..bd929450971c 100644 --- a/configure.ac +++ b/configure.ac @@ -7631,6 +7631,7 @@ CXXFLAGS_INTRINSICS_SSE42= CXXFLAGS_INTRINSICS_AVX= CXXFLAGS_INTRINSICS_AVX2= CXXFLAGS_INTRINSICS_AVX512= +CXXFLAGS_INTRINSICS_AVX512F= CXXFLAGS_INTRINSICS_F16C= CXXFLAGS_INTRINSICS_FMA= @@ -7643,6 +7644,7 @@ if test "$GCC" = "yes" -o "$COM_IS_CLANG" = TRUE; then flag_avx=-mavx flag_avx2=-mavx2 flag_avx512="-mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd" + flag_avx512f=-mavx512f flag_f16c=-mf16c flag_fma=-mfma else @@ -7658,6 +7660,7 @@ else flag_avx=-arch:AVX flag_avx2=-arch:AVX2 flag_avx512=-arch:AVX512 + flag_avx512f=-arch:AVX512 # These are part of -arch:AVX2 flag_f16c=-arch:AVX2 flag_fma=-arch:AVX2 @@ -7807,6 +7810,7 @@ CXXFLAGS=$save_CXXFLAGS AC_MSG_RESULT([${can_compile_avx512}]) if test "${can_compile_avx512}" = "yes" ; then CXXFLAGS_INTRINSICS_AVX512="$flag_avx512" + CXXFLAGS_INTRINSICS_AVX512F="$flag_avx512f" fi AC_MSG_CHECKING([whether $CXX can compile F16C intrinsics]) @@ -7859,6 +7863,7 @@ AC_SUBST([CXXFLAGS_INTRINSICS_SSE42]) AC_SUBST([CXXFLAGS_INTRINSICS_AVX]) AC_SUBST([CXXFLAGS_INTRINSICS_AVX2]) AC_SUBST([CXXFLAGS_INTRINSICS_AVX512]) +AC_SUBST([CXXFLAGS_INTRINSICS_AVX512F]) AC_SUBST([CXXFLAGS_INTRINSICS_F16C]) AC_SUBST([CXXFLAGS_INTRINSICS_FMA]) @@ -12013,6 +12018,7 @@ LO_CLANG_CXXFLAGS_INTRINSICS_SSE42= LO_CLANG_CXXFLAGS_INTRINSICS_AVX= LO_CLANG_CXXFLAGS_INTRINSICS_AVX2= LO_CLANG_CXXFLAGS_INTRINSICS_AVX512= +LO_CLANG_CXXFLAGS_INTRINSICS_AVX512F= LO_CLANG_CXXFLAGS_INTRINSICS_F16C= LO_CLANG_CXXFLAGS_INTRINSICS_FMA= @@ -12066,6 +12072,7 @@ if test "$ENABLE_SKIA" = TRUE -a "$COM_IS_CLANG" != TRUE -a ! \( "$_os" = "WINNT flag_avx=-mavx flag_avx2=-mavx2 flag_avx512="-mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd" + flag_avx512f=-mavx512f flag_f16c=-mf16c flag_fma=-mfma @@ -12213,6 +12220,7 @@ if test "$ENABLE_SKIA" = TRUE -a "$COM_IS_CLANG" != TRUE -a ! \( "$_os" = "WINNT AC_MSG_RESULT([${can_compile_avx512}]) if test "${can_compile_avx512}" = "yes" ; then LO_CLANG_CXXFLAGS_INTRINSICS_AVX512="$flag_avx512" + LO_CLANG_CXXFLAGS_INTRINSICS_AVX512F="$flag_avx512f" fi AC_MSG_CHECKING([whether $CXX can compile F16C intrinsics]) @@ -12319,6 +12327,7 @@ AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_SSE42) AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX) AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX2) AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX512) +AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX512F) AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_F16C) AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_FMA) AC_SUBST(CLANG_USE_LD) diff --git a/include/tools/cpuid.hxx b/include/tools/cpuid.hxx index 90c7d37b485c..98dca497a7de 100644 --- a/include/tools/cpuid.hxx +++ b/include/tools/cpuid.hxx @@ -26,7 +26,8 @@ enum class InstructionSetFlags SSE41 = 0x08, SSE42 = 0x10, AVX = 0x20, - AVX2 = 0x40 + AVX2 = 0x40, + AVX512F = 0x100 }; } // end cpuid @@ -63,6 +64,13 @@ inline bool hasSSSE3() return isCpuInstructionSetSupported(InstructionSetFlags::SSSE3); } +/** Check if AVX is supported by the CPU + */ +inline bool hasAVX() +{ + return isCpuInstructionSetSupported(InstructionSetFlags::AVX); +} + /** Check if AVX2 is supported by the CPU */ inline bool hasAVX2() @@ -70,6 +78,13 @@ inline bool hasAVX2() return isCpuInstructionSetSupported(InstructionSetFlags::AVX2); } +/** Check if AVX512F is supported by the CPU + */ +inline bool hasAVX512F() +{ + return isCpuInstructionSetSupported(InstructionSetFlags::AVX512F); +} + /** Check if Hyper Threading is supported */ inline bool hasHyperThreading() diff --git a/include/tools/simdsupport.hxx b/include/tools/simdsupport.hxx index 5d10d53d48ad..bc9227223da0 100644 --- a/include/tools/simdsupport.hxx +++ b/include/tools/simdsupport.hxx @@ -20,6 +20,7 @@ #undef LO_SSSE3_AVAILABLE #undef LO_AVX_AVAILABLE #undef LO_AVX2_AVAILABLE +#undef LO_AVX512F_AVAILABLE #if defined(_MSC_VER) // VISUAL STUDIO COMPILER @@ -46,6 +47,12 @@ #include <immintrin.h> #endif // defined(__AVX2__) +// compiled with /arch:AVX512F +#if defined(__AVX512F__) +#define LO_AVX512F_AVAILABLE +#include <immintrin.h> +#endif // defined(__AVX512F__) + #else // compiler Clang and GCC #if defined(__SSE2__) || defined(__x86_64__) // SSE2 is required for X64 @@ -68,6 +75,12 @@ #include <immintrin.h> #endif // defined(__AVX2__) +#if defined(__AVX512F__) +#define LO_AVX512F_AVAILABLE +#include <immintrin.h> +#else +#endif // defined(__AVX512F__) + #endif // end compiler Clang and GCC /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/Library_sc.mk b/sc/Library_sc.mk index f1b62e1c263d..a85650e935e6 100644 --- a/sc/Library_sc.mk +++ b/sc/Library_sc.mk @@ -99,6 +99,14 @@ $(eval $(call gb_Library_use_libraries,sc,\ )) $(eval $(call gb_Library_add_exception_objects,sc,\ + sc/source/core/tool/arraysumAVX512, $(CXXFLAGS_INTRINSICS_AVX512F) \ +)) + +$(eval $(call gb_Library_add_exception_objects,sc,\ + sc/source/core/tool/arraysumAVX, $(CXXFLAGS_INTRINSICS_AVX) \ +)) + +$(eval $(call gb_Library_add_exception_objects,sc,\ sc/source/core/tool/arraysumSSE2, $(CXXFLAGS_INTRINSICS_SSE2) \ )) diff --git a/sc/inc/arraysumfunctor.hxx b/sc/inc/arraysumfunctor.hxx new file mode 100644 index 000000000000..ee6d4c0fca5c --- /dev/null +++ b/sc/inc/arraysumfunctor.hxx @@ -0,0 +1,137 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + */ + +#pragma once + +#include <cmath> +#include "kahan.hxx" +#include "scdllapi.h" +#include <tools/cpuid.hxx> +#include <formula/errorcodes.hxx> + +namespace sc::op +{ +/* Checkout available optimization options */ +SC_DLLPUBLIC extern const bool hasAVX512F; +const bool hasAVX = cpuid::hasAVX(); +const bool hasSSE2 = cpuid::hasSSE2(); + +/** + * Performs one step of the Neumanier sum between doubles + * Overwrites the summand and error + * @parma sum + * @param err + * @param value + */ +inline void sumNeumanierNormal(double& sum, double& err, const double& value) +{ + double t = sum + value; + if (std::abs(sum) >= std::abs(value)) + err += (sum - t) + value; + else + err += (value - t) + sum; + sum = t; +} + +/** + * If no boosts available, Unrolled KahanSum. + * Most likely to use on android. + */ +static inline KahanSum executeUnrolled(size_t& i, size_t nSize, const double* pCurrent) +{ + size_t nRealSize = nSize - i; + size_t nUnrolledSize = nRealSize - (nRealSize % 4); + + if (nUnrolledSize > 0) + { + KahanSum sum0 = 0.0; + KahanSum sum1 = 0.0; + KahanSum sum2 = 0.0; + KahanSum sum3 = 0.0; + + for (; i + 3 < nUnrolledSize; i += 4) + { + sum0 += *pCurrent++; + sum1 += *pCurrent++; + sum2 += *pCurrent++; + sum3 += *pCurrent++; + } + // We are using pairwise summation alongside Kahan + return (sum0 + sum1) + (sum2 + sum3); + } + return 0.0; +} + +/* Available methos */ +SC_DLLPUBLIC KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent); +SC_DLLPUBLIC KahanSum executeAVX(size_t& i, size_t nSize, const double* pCurrent); +SC_DLLPUBLIC KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent); + +/** + * This function task is to choose the fastest method available to perform the sum. + * @param i + * @param nSize + * @param pCurrent + */ +static inline KahanSum executeFast(size_t& i, size_t nSize, const double* pCurrent) +{ + if (hasAVX512F) + return executeAVX512F(i, nSize, pCurrent); + if (hasAVX) + return executeAVX(i, nSize, pCurrent); + if (hasSSE2) + return executeSSE2(i, nSize, pCurrent); + return executeUnrolled(i, nSize, pCurrent); +} + +/** + * Performs the sum of an array. + * Note that align 16 will speed up the process. + * @param pArray + * @param nSize + */ +inline KahanSum sumArray(const double* pArray, size_t nSize) +{ + size_t i = 0; + const double* pCurrent = pArray; + KahanSum fSum = executeFast(i, nSize, pCurrent); + + // sum rest of the array + for (; i < nSize; ++i) + fSum += pArray[i]; + + // If the sum is a NaN, some of the terms were empty cells, probably. + // Re-calculate, carefully + double fVal = fSum.get(); + if (!std::isfinite(fVal)) + { + FormulaError nErr = GetDoubleErrorValue(fVal); + if (nErr == FormulaError::NoValue) + { + fSum = 0; + for (i = 0; i < nSize; i++) + { + if (!std::isfinite(pArray[i])) + { + nErr = GetDoubleErrorValue(pArray[i]); + if (nErr != FormulaError::NoValue) + fSum += pArray[i]; // Let errors encoded as NaNs propagate ??? + } + else + fSum += pArray[i]; + } + } + } + return fSum; +} + +} // end namespace sc::op + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/inc/kahan.hxx b/sc/inc/kahan.hxx index 49c7922b3c79..3404fb6d14a6 100644 --- a/sc/inc/kahan.hxx +++ b/sc/inc/kahan.hxx @@ -28,6 +28,12 @@ public: { } + constexpr KahanSum(double x_0, double err_0) + : m_fSum(x_0) + , m_fError(err_0) + { + } + constexpr KahanSum(const KahanSum& fSum) = default; public: @@ -182,6 +188,9 @@ public: return fSum.m_fSum != m_fSum || fSum.m_fError != m_fError; } + constexpr double* getSum() { return &m_fSum; } + constexpr double* getError() { return &m_fError; } + public: /** * Returns the final sum. diff --git a/sc/qa/unit/functions_statistical.cxx b/sc/qa/unit/functions_statistical.cxx index 4d97d4cc1689..e82d762d88ee 100644 --- a/sc/qa/unit/functions_statistical.cxx +++ b/sc/qa/unit/functions_statistical.cxx @@ -1,4 +1,5 @@ #include "functions_test.hxx" +#include <arraysumfunctor.hxx> class StatisticalFunctionsTest : public FunctionsTest { @@ -6,9 +7,11 @@ public: StatisticalFunctionsTest(); void testStatisticalFormulasFODS(); + void testIntrinsicSums(); CPPUNIT_TEST_SUITE(StatisticalFunctionsTest); CPPUNIT_TEST(testStatisticalFormulasFODS); + CPPUNIT_TEST(testIntrinsicSums); CPPUNIT_TEST_SUITE_END(); }; @@ -26,6 +29,25 @@ StatisticalFunctionsTest::StatisticalFunctionsTest(): { } +void StatisticalFunctionsTest::testIntrinsicSums() +{ + // Checkout SSE2, AVX and AVX512 opperations + // Needs exactly 9 terms + double summands[9] = { 0, 1, 2, 3, 4, 10, 20, 2, -1 }; + double* pCurrent = summands; + size_t i = 0; + if (sc::op::hasAVX512F) + CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeAVX512F(i, 9, pCurrent).get()); + i = 0; + if (sc::op::hasAVX) + CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeAVX(i, 9, pCurrent).get()); + i = 0; + if (sc::op::hasSSE2) + CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeSSE2(i, 9, pCurrent).get()); + i = 0; + CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeUnrolled(i, 9, pCurrent).get()); +} + CPPUNIT_TEST_SUITE_REGISTRATION(StatisticalFunctionsTest); CPPUNIT_PLUGIN_IMPLEMENT(); diff --git a/sc/source/core/inc/arraysumfunctor.hxx b/sc/source/core/inc/arraysumfunctor.hxx deleted file mode 100644 index ae8d38db1338..000000000000 --- a/sc/source/core/inc/arraysumfunctor.hxx +++ /dev/null @@ -1,121 +0,0 @@ -/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ -/* - * This file is part of the LibreOffice project. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - * - */ - -#pragma once - -#include <cstdint> -#include <cmath> - -#include <sal/mathconf.h> -#include <sal/types.h> -#include <tools/simd.hxx> -#include <tools/cpuid.hxx> -#include <kahan.hxx> - -namespace sc -{ -struct ArraySumFunctor -{ -private: - const double* mpArray; - size_t mnSize; - -public: - ArraySumFunctor(const double* pArray, size_t nSize) - : mpArray(pArray) - , mnSize(nSize) - { - } - - KahanSum operator()() - { - const static bool hasSSE2 = cpuid::hasSSE2(); - - KahanSum fSum = 0.0; - size_t i = 0; - const double* pCurrent = mpArray; - - if (hasSSE2) - { - while (i < mnSize && !simd::isAligned<double, 16>(pCurrent)) - { - fSum += *pCurrent++; - i++; - } - if (i < mnSize) - { - fSum += executeSSE2(i, pCurrent); - } - } - else - fSum = executeUnrolled(i, pCurrent); - - // sum rest of the array - - for (; i < mnSize; ++i) - fSum += mpArray[i]; - - // If the sum is a NaN, some of the terms were empty cells, probably. - // Re-calculate, carefully - double fVal = fSum.get(); - if (!std::isfinite(fVal)) - { - sal_uInt32 nErr = reinterpret_cast<sal_math_Double*>(&fVal)->nan_parts.fraction_lo; - if (nErr & 0xffff0000) - { - fSum = 0; - for (i = 0; i < mnSize; i++) - { - if (!std::isfinite(mpArray[i])) - { - nErr = reinterpret_cast<const sal_math_Double*>(&mpArray[i]) - ->nan_parts.fraction_lo; - if (!(nErr & 0xffff0000)) - fSum += mpArray[i]; // Let errors encoded as NaNs propagate ??? - } - else - fSum += mpArray[i]; - } - } - } - return fSum; - } - -private: - double executeSSE2(size_t& i, const double* pCurrent) const; - KahanSum executeUnrolled(size_t& i, const double* pCurrent) const - { - size_t nRealSize = mnSize - i; - size_t nUnrolledSize = nRealSize - (nRealSize % 4); - - if (nUnrolledSize > 0) - { - KahanSum sum0 = 0.0; - KahanSum sum1 = 0.0; - KahanSum sum2 = 0.0; - KahanSum sum3 = 0.0; - - for (; i < nUnrolledSize; i += 4) - { - sum0 += *pCurrent++; - sum1 += *pCurrent++; - sum2 += *pCurrent++; - sum3 += *pCurrent++; - } - // We are using pairwise summation alongside Kahan - return (sum0 + sum1) + (sum2 + sum3); - } - return 0.0; - } -}; - -} // end namespace sc - -/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/core/tool/arraysumAVX.cxx b/sc/source/core/tool/arraysumAVX.cxx new file mode 100644 index 000000000000..af12a08a8cf5 --- /dev/null +++ b/sc/source/core/tool/arraysumAVX.cxx @@ -0,0 +1,114 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + */ + +#include <arraysumfunctor.hxx> +#include <sal/log.hxx> +#include <tools/simd.hxx> +#include <tools/simdsupport.hxx> + +namespace sc::op +{ +#ifdef LO_AVX_AVAILABLE // Old processors + +const __m256d ANNULATE_SIGN_BIT = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF)); + +/** Kahan sum with AVX. + */ +static inline void sumAVX(__m256d& sum, __m256d& err, const __m256d& value) +{ + // Temporal parameter + __m256d t = _mm256_add_pd(sum, value); + // Absolute value of the total sum + __m256d asum = _mm256_and_pd(sum, ANNULATE_SIGN_BIT); + // Absolute value of the value to add + __m256d avalue = _mm256_and_pd(value, ANNULATE_SIGN_BIT); + // Comaprate the absolute values sum >= value + __m256d mask = _mm256_cmp_pd(asum, avalue, _CMP_GE_OQ); + // The following code has this form ( a - t + b) + // Case 1: a = sum b = value + // Case 2: a = value b = sum + __m256d a = _mm256_add_pd(_mm256_and_pd(mask, sum), _mm256_andnot_pd(mask, value)); + __m256d b = _mm256_add_pd(_mm256_and_pd(mask, value), _mm256_andnot_pd(mask, sum)); + err = _mm256_add_pd(err, _mm256_add_pd(_mm256_sub_pd(a, t), b)); + // Store result + sum = t; +} + +#endif + +/** Execute Kahan sum with AVX. + */ +KahanSum executeAVX(size_t& i, size_t nSize, const double* pCurrent) +{ +#ifdef LO_AVX_AVAILABLE + // Make sure we don't fall out of bounds. + // This works by sums of 8 terms. + // So the 8'th term is i+7 + // If we iterate untill nSize won't fall out of bounds + if (nSize > i + 7) + { + // Setup sums and errors as 0 + __m256d sum1 = _mm256_setzero_pd(); + __m256d err1 = _mm256_setzero_pd(); + __m256d sum2 = _mm256_setzero_pd(); + __m256d err2 = _mm256_setzero_pd(); + + for (; i + 7 < nSize; i += 8) + { + // Kahan sum 1 + __m256d load1 = _mm256_loadu_pd(pCurrent); + sumAVX(sum1, err1, load1); + pCurrent += 4; + + // Kahan sum 2 + __m256d load2 = _mm256_loadu_pd(pCurrent); + sumAVX(sum2, err2, load2); + pCurrent += 4; + } + + // Now we combine pairwise summation with Kahan summation + + // sum 1 + sum 2 -> sum 1 + sumAVX(sum1, err1, sum2); + sumAVX(sum1, err1, err2); + + // Store results + double sums[4]; + double errs[4]; + _mm256_storeu_pd(&sums[0], sum1); + _mm256_storeu_pd(&errs[0], err1); + + // First Kahan & pairwise summation + // 0+1 1+2 -> 0, 2 + sumNeumanierNormal(sums[0], errs[0], sums[1]); + sumNeumanierNormal(sums[2], errs[2], sums[3]); + sumNeumanierNormal(sums[0], errs[0], errs[1]); + sumNeumanierNormal(sums[2], errs[2], errs[3]); + + // 0+2 -> 0 + sumNeumanierNormal(sums[0], errs[0], sums[2]); + sumNeumanierNormal(sums[0], errs[0], errs[2]); + + // Store result + return KahanSum(sums[0], errs[0]); + } + return 0.0; +#else + SAL_WARN("sc", "Failed to use AVX"); + (void)i; + (void)nSize; + (void)pCurrent; + return 0.0; +#endif +} + +} // end namespace sc::op + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/core/tool/arraysumAVX512.cxx b/sc/source/core/tool/arraysumAVX512.cxx new file mode 100644 index 000000000000..55764849edfb --- /dev/null +++ b/sc/source/core/tool/arraysumAVX512.cxx @@ -0,0 +1,140 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + */ + +#include <arraysumfunctor.hxx> +#include <sal/log.hxx> +#include <tools/simd.hxx> +#include <tools/simdsupport.hxx> + +/* TODO Remove this once GCC updated and AVX512 can work. */ +#ifdef __GNUC__ +#if __GNUC__ < 9 +#ifdef LO_AVX512F_AVAILABLE +#define HAS_LO_AVX512F_AVAILABLE +#undef LO_AVX512F_AVAILABLE +#endif +#endif +#endif + +#ifdef LO_AVX512F_AVAILABLE +const bool sc::op::hasAVX512F = cpuid::hasAVX512F(); +#else +const bool sc::op::hasAVX512F = false; +#endif + +namespace sc::op +{ +#ifdef LO_AVX512F_AVAILABLE // New processors + +/** Kahan sum with AVX512. + */ +static inline void sumAVX512(__m512d& sum, __m512d& err, const __m512d& value) +{ + // Temporal parameter + __m512d t = _mm512_add_pd(sum, value); + // Absolute value of the total sum + __m512d asum = _mm512_abs_pd(sum); + // Absolute value of the value to add + __m512d avalue = _mm512_abs_pd(value); + // Comaprate the absolute values sum >= value + __mmask8 mask = _mm512_cmp_pd_mask(avalue, asum, _CMP_GE_OQ); + // The following code has this form ( a - t + b) + // Case 1: a = sum b = value + // Case 2: a = value b = sum + __m512d a = _mm512_mask_blend_pd(mask, sum, value); + __m512d b = _mm512_mask_blend_pd(mask, value, sum); + err = _mm512_add_pd(err, _mm512_add_pd(_mm512_sub_pd(a, t), b)); + // Store result + sum = t; +} + +#endif + +/** Execute Kahan sum with AVX512. + */ +KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent) +{ +#ifdef LO_AVX512F_AVAILABLE // New processors + // Make sure we don't fall out of bounds. + // This works by sums of 8 terms. + // So the 8'th term is i+7 + // If we iterate untill nSize won't fall out of bounds + if (nSize > i + 7) + { + // Setup sums and errors as 0 + __m512d sum = _mm512_setzero_pd(); + __m512d err = _mm512_setzero_pd(); + + // Sum the stuff + for (; i + 7 < nSize; i += 8) + { + // Kahan sum + __m512d load = _mm512_loadu_pd(pCurrent); + sumAVX512(sum, err, load); + pCurrent += 8; + } + + // Store result + static_assert(sizeof(double) == 8); + double sums[8]; + double errs[8]; + _mm512_storeu_pd(static_cast<void*>(&sums[0]), sum); + _mm512_storeu_pd(static_cast<void*>(&errs[0]), err); + + // First Kahan & pairwise summation + // 0+1 1+2 3+4 4+5 6+7 -> 0, 2, 4, 6 + sumNeumanierNormal(sums[0], errs[0], sums[1]); + sumNeumanierNormal(sums[2], errs[2], sums[3]); + sumNeumanierNormal(sums[4], errs[4], sums[5]); + sumNeumanierNormal(sums[6], errs[6], sums[7]); + sumNeumanierNormal(sums[0], errs[0], errs[1]); + sumNeumanierNormal(sums[2], errs[2], errs[3]); + sumNeumanierNormal(sums[4], errs[4], errs[5]); + sumNeumanierNormal(sums[6], errs[6], errs[7]); + + // Second Kahan & pairwise summation + // 0+2 4+6 -> 0, 4 + sumNeumanierNormal(sums[0], errs[0], sums[2]); + sumNeumanierNormal(sums[4], errs[4], sums[6]); + sumNeumanierNormal(sums[0], errs[0], errs[2]); + sumNeumanierNormal(sums[4], errs[4], errs[6]); + + // Third Kahan & pairwise summation + // 0+4 -> 0 + sumNeumanierNormal(sums[0], errs[0], sums[4]); + sumNeumanierNormal(sums[0], errs[0], errs[4]); + + // Return final result + return KahanSum(sums[0], errs[0]); + } + else + return 0.0; +#else + SAL_WARN("sc", "Failed to use AVX 512"); + (void)i; + (void)nSize; + (void)pCurrent; + return 0.0; +#endif +} + +} // end namespace sc::op + +/* TODO Remove this once GCC updated and AVX512 can work. */ +#ifdef __GNUC__ +#if __GNUC__ < 9 +#ifdef HAS_LO_AVX512F_AVAILABLE +#define LO_AVX512F_AVAILABLE +#undef HAS_LO_AVX512F_AVAILABLE +#endif +#endif +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx index e69f672b6014..4f6a4b47a11d 100644 --- a/sc/source/core/tool/arraysumSSE2.cxx +++ b/sc/source/core/tool/arraysumSSE2.cxx @@ -9,97 +9,121 @@ */ #include <arraysumfunctor.hxx> +#include <sal/log.hxx> +#include <tools/simd.hxx> #include <tools/simdsupport.hxx> -namespace sc +//AVX512VL + AVX512F + KNCNI + +namespace sc::op { -double ArraySumFunctor::executeSSE2(size_t& i, const double* pCurrent) const +#ifdef LO_SSE2_AVAILABLE // Old processors + +const __m128d ANNULATE_SIGN_BIT = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF)); + +/** Kahan sum with SSE4.2. + */ +static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value) { -#if defined(LO_SSE2_AVAILABLE) - double fSum = 0.0; - size_t nRealSize = mnSize - i; - size_t nUnrolledSize = nRealSize - (nRealSize % 8); + // Temporal parameter + __m128d t = _mm_add_pd(sum, value); + // Absolute value of the total sum + __m128d asum = _mm_and_pd(sum, ANNULATE_SIGN_BIT); + // Absolute value of the value to add + __m128d avalue = _mm_and_pd(value, ANNULATE_SIGN_BIT); + // Comaprate the absolute values sum >= value + __m128d mask = _mm_cmpge_pd(asum, avalue); + // The following code has this form ( a - t + b) + // Case 1: a = sum b = value + // Case 2: a = value b = sum + __m128d a = _mm_add_pd(_mm_and_pd(mask, sum), _mm_andnot_pd(mask, value)); + __m128d b = _mm_add_pd(_mm_and_pd(mask, value), _mm_andnot_pd(mask, sum)); + err = _mm_add_pd(err, _mm_add_pd(_mm_sub_pd(a, t), b)); + // Store result + sum = t; +} + +#endif - if (nUnrolledSize > 0) +/** Execute Kahan sum with SSE2. + */ +KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent) +{ +#ifdef LO_SSE2_AVAILABLE + // Make sure we don't fall out of bounds. + // This works by sums of 8 terms. + // So the 8'th term is i+7 + // If we iterate untill nSize won't fall out of bounds + if (nSize > i + 7) { + // Setup sums and errors as 0 __m128d sum1 = _mm_setzero_pd(); - __m128d sum2 = _mm_setzero_pd(); - __m128d sum3 = _mm_setzero_pd(); - __m128d sum4 = _mm_setzero_pd(); - __m128d err1 = _mm_setzero_pd(); + __m128d sum2 = _mm_setzero_pd(); __m128d err2 = _mm_setzero_pd(); + __m128d sum3 = _mm_setzero_pd(); __m128d err3 = _mm_setzero_pd(); + __m128d sum4 = _mm_setzero_pd(); __m128d err4 = _mm_setzero_pd(); - __m128d y, t; - - for (; i < nUnrolledSize; i += 8) + for (; i + 7 < nSize; i += 8) { // Kahan sum 1 - __m128d load1 = _mm_load_pd(pCurrent); - y = _mm_sub_pd(load1, err1); - t = _mm_add_pd(sum1, y); - err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y); - sum1 = t; + __m128d load1 = _mm_loadu_pd(pCurrent); + sumSSE2(sum1, err1, load1); pCurrent += 2; // Kahan sum 2 - __m128d load2 = _mm_load_pd(pCurrent); - y = _mm_sub_pd(load2, err2); - t = _mm_add_pd(sum2, y); - err2 = _mm_sub_pd(_mm_sub_pd(t, sum2), y); - sum2 = t; + __m128d load2 = _mm_loadu_pd(pCurrent); + sumSSE2(sum2, err2, load2); pCurrent += 2; // Kahan sum 3 - __m128d load3 = _mm_load_pd(pCurrent); - y = _mm_sub_pd(load3, err3); - t = _mm_add_pd(sum3, y); - err3 = _mm_sub_pd(_mm_sub_pd(t, sum3), y); - sum3 = t; + __m128d load3 = _mm_loadu_pd(pCurrent); + sumSSE2(sum3, err3, load3); pCurrent += 2; // Kahan sum 4 - __m128d load4 = _mm_load_pd(pCurrent); - y = _mm_sub_pd(load4, err4); - t = _mm_add_pd(sum4, y); - err4 = _mm_sub_pd(_mm_sub_pd(t, sum4), y); - sum4 = t; + __m128d load4 = _mm_loadu_pd(pCurrent); + sumSSE2(sum4, err4, load4); pCurrent += 2; } // Now we combine pairwise summation with Kahan summation - // sum 1 + sum 2 - y = _mm_sub_pd(sum2, err1); - t = _mm_add_pd(sum1, y); - err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y); - sum1 = t; - - // sum 3 + sum 4 - y = _mm_sub_pd(sum4, err3); - t = _mm_add_pd(sum3, y); - sum3 = t; - - // sum 1 + sum 3 - y = _mm_sub_pd(sum3, err1); - t = _mm_add_pd(sum1, y); - sum1 = t; - - double temp; - - _mm_storel_pd(&temp, sum1); - fSum += temp; - - _mm_storeh_pd(&temp, sum1); - fSum += temp; + // 1+2 3+4 -> 1, 3 + sumSSE2(sum1, err1, sum2); + sumSSE2(sum1, err1, err2); + sumSSE2(sum3, err3, sum4); + sumSSE2(sum3, err3, err4); + + // 1+3 -> 1 + sumSSE2(sum1, err1, sum3); + sumSSE2(sum1, err1, err3); + + // Store results + double sums[2]; + double errs[2]; + _mm_storeu_pd(&sums[0], sum1); + _mm_storeu_pd(&errs[0], err1); + + // First Kahan & pairwise summation + // 0+1 -> 0 + sumNeumanierNormal(sums[0], errs[0], sums[1]); + sumNeumanierNormal(sums[0], errs[0], errs[1]); + + // Store result + return KahanSum(sums[0], errs[0]); } - return fSum; + return 0.0; #else + SAL_WARN("sc", "Failed to use SSE2"); (void)i; + (void)nSize; (void)pCurrent; return 0.0; #endif } } + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/core/tool/interpr6.cxx b/sc/source/core/tool/interpr6.cxx index a291b444ecdc..dbeab67fe35c 100644 --- a/sc/source/core/tool/interpr6.cxx +++ b/sc/source/core/tool/interpr6.cxx @@ -223,9 +223,7 @@ public: return; const double *p = &sc::numeric_block::at(*rNode.data, nOffset); - sc::ArraySumFunctor functor(p, nDataSize); - - maSum += functor(); + maSum += sc::op::sumArray(p, nDataSize); break; } diff --git a/tools/source/misc/cpuid.cxx b/tools/source/misc/cpuid.cxx index 0395d0e0f001..855b87e6da63 100644 --- a/tools/source/misc/cpuid.cxx +++ b/tools/source/misc/cpuid.cxx @@ -56,6 +56,7 @@ bool checkAVXSupportInOS() #define XSAVE_bit (1 << 27) #define AVX_bit (1 << 28) #define AVX2_bit (1 << 5) +#define AVX512F_bit (1 << 16) InstructionSetFlags getCpuInstructionSetFlags() { @@ -98,6 +99,8 @@ InstructionSetFlags getCpuInstructionSetFlags() if ((aExtendedInfo[1] & AVX2_bit) != 0) eInstructions |= InstructionSetFlags::AVX2; + if ((aExtendedInfo[1] & AVX512F_bit) != 0) + eInstructions |= InstructionSetFlags::AVX512F; } } } @@ -127,6 +130,8 @@ OUString instructionSetSupportedString() aString += "AVX "; if (isCpuInstructionSetSupported(InstructionSetFlags::AVX2)) aString += "AVX2 "; + if (isCpuInstructionSetSupported(InstructionSetFlags::AVX512F)) + aString += "AVX512F "; return aString; } |