diff options
-rw-r--r-- | include/tools/cpuid.hxx | 2 | ||||
-rw-r--r-- | include/tools/simdsupport.hxx | 10 | ||||
-rw-r--r-- | sc/Library_sc.mk | 13 | ||||
-rw-r--r-- | sc/inc/arraysumfunctor.hxx | 30 | ||||
-rw-r--r-- | sc/inc/arraysumfunctorinternal.hxx | 36 | ||||
-rw-r--r-- | sc/inc/kahan.hxx | 8 | ||||
-rw-r--r-- | sc/qa/unit/functions_statistical.cxx | 22 | ||||
-rw-r--r-- | sc/source/core/tool/arraysum.hxx | 18 | ||||
-rw-r--r-- | sc/source/core/tool/arraysumAVX.cxx | 121 | ||||
-rw-r--r-- | sc/source/core/tool/arraysumAVX512.cxx | 120 | ||||
-rw-r--r-- | sc/source/core/tool/arraysumSSE2.cxx | 20 |
11 files changed, 37 insertions, 363 deletions
diff --git a/include/tools/cpuid.hxx b/include/tools/cpuid.hxx index 1be897e84a4e..4f309ff11e96 100644 --- a/include/tools/cpuid.hxx +++ b/include/tools/cpuid.hxx @@ -23,6 +23,8 @@ or inline functions, otherwise their possibly emitted copies compiled with the CPU-specific instructions might be chosen by the linker as the copy to keep. +Also see the note at the top of simdsupport.hxx . + */ namespace cpuid { diff --git a/include/tools/simdsupport.hxx b/include/tools/simdsupport.hxx index bc9227223da0..738b34e072db 100644 --- a/include/tools/simdsupport.hxx +++ b/include/tools/simdsupport.hxx @@ -8,6 +8,16 @@ * */ +// IMPORTANT: Having CPU-specific routines turned out to be a maintenance +// problem, because of various problems such as compilers moving CPU-specific +// code out of #ifdef code into static initialization or our code using C++ +// features that caused the compiler to emit code that used CPU-specific +// instructions (even cpuid.hxx isn't safe, see the comment there). +// The only safe usage is using CPU-specific code that's always available, +// such as SSE2-specific code for x86_64. Do not use for anything else +// unless you really know what you are doing (and you check git history +// to learn from past problems). + // Determine the compiler support for SIMD compiler intrinsics. // This changes from one compiled unit to the other, depending if // the support has been detected and if the compiled unit contains diff --git a/sc/Library_sc.mk b/sc/Library_sc.mk index 936ca33901ee..3c8dcb3e5085 100644 --- a/sc/Library_sc.mk +++ b/sc/Library_sc.mk @@ -206,6 +206,7 @@ $(eval $(call gb_Library_add_exception_objects,sc,\ sc/source/core/tool/address \ sc/source/core/tool/adiasync \ sc/source/core/tool/appoptio \ + sc/source/core/tool/arraysumSSE2 \ sc/source/core/tool/autoform \ sc/source/core/tool/calcconfig \ sc/source/core/tool/callform \ @@ -680,18 +681,6 @@ $(eval $(call gb_Library_add_exception_objects,sc,\ sc/source/ui/xmlsource/xmlsourcedlg \ )) -$(eval $(call gb_Library_add_exception_objects,sc,\ - sc/source/core/tool/arraysumAVX512, $(CXXFLAGS_INTRINSICS_AVX512F) \ -)) - -$(eval $(call gb_Library_add_exception_objects,sc,\ - sc/source/core/tool/arraysumAVX, $(CXXFLAGS_INTRINSICS_AVX) \ -)) - -$(eval $(call gb_Library_add_exception_objects,sc,\ - sc/source/core/tool/arraysumSSE2, $(CXXFLAGS_INTRINSICS_SSE2) \ -)) - ifeq ($(ENABLE_FORMULA_LOGGER),TRUE) $(eval $(call gb_Library_add_exception_objects,sc,\ sc/source/core/tool/formulalogger \ diff --git a/sc/inc/arraysumfunctor.hxx b/sc/inc/arraysumfunctor.hxx index b727f5893a8c..eecfa59c3f65 100644 --- a/sc/inc/arraysumfunctor.hxx +++ b/sc/inc/arraysumfunctor.hxx @@ -12,16 +12,25 @@ #include <cmath> #include "kahan.hxx" -#include "arraysumfunctorinternal.hxx" -#include <tools/cpuid.hxx> +#include "arraysumfunctor.hxx" #include <formula/errorcodes.hxx> namespace sc::op { -/* Checkout available optimization options */ -const bool hasAVX512F = hasAVX512FCode() && cpuid::hasAVX512F(); -const bool hasAVX = hasAVXCode() && cpuid::hasAVX(); -const bool hasSSE2 = hasSSE2Code() && cpuid::hasSSE2(); +// Checkout available optimization options. +// Note that it turned out to be problematic to support CPU-specific code +// that's not guaranteed to be available on that specific platform (see +// git history). SSE2 is guaranteed on x86_64 and it is our baseline requirement +// for x86 on Windows, so SSE2 use is hardcoded on those platforms. +// Whenever we raise baseline to e.g. AVX, this may get +// replaced with AVX code (get it from git history). +// Do it similarly with other platforms. +#if defined(X86_64) || (defined(X86) && defined(_WIN32)) +#define SC_USE_SSE2 1 +KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent); +#else +#define SC_USE_SSE2 0 +#endif /** * If no boosts available, Unrolled KahanSum. @@ -60,12 +69,9 @@ static inline KahanSum executeUnrolled(size_t& i, size_t nSize, const double* pC */ static inline KahanSum executeFast(size_t& i, size_t nSize, const double* pCurrent) { - if (hasAVX512F) - return executeAVX512F(i, nSize, pCurrent); - if (hasAVX) - return executeAVX(i, nSize, pCurrent); - if (hasSSE2) - return executeSSE2(i, nSize, pCurrent); +#if SC_USE_SSE2 + return executeSSE2(i, nSize, pCurrent); +#endif return executeUnrolled(i, nSize, pCurrent); } diff --git a/sc/inc/arraysumfunctorinternal.hxx b/sc/inc/arraysumfunctorinternal.hxx deleted file mode 100644 index e939dbd3037d..000000000000 --- a/sc/inc/arraysumfunctorinternal.hxx +++ /dev/null @@ -1,36 +0,0 @@ -/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */ -/* - * This file is part of the LibreOffice project. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - */ - -#pragma once - -#include "scdllapi.h" - -namespace sc::op -{ -// Plain old data structure, to be used by code compiled with CPU intrinsics without generating any -// code for it (so that code requiring intrinsics doesn't get accidentally selected as the one copy -// when merging duplicates). -struct KahanSumSimple -{ - double m_fSum; - double m_fError; -}; - -/* Available methods */ -SC_DLLPUBLIC KahanSumSimple executeAVX512F(size_t& i, size_t nSize, const double* pCurrent); -SC_DLLPUBLIC KahanSumSimple executeAVX(size_t& i, size_t nSize, const double* pCurrent); -SC_DLLPUBLIC KahanSumSimple executeSSE2(size_t& i, size_t nSize, const double* pCurrent); - -SC_DLLPUBLIC bool hasAVX512FCode(); -SC_DLLPUBLIC bool hasAVXCode(); -SC_DLLPUBLIC bool hasSSE2Code(); - -} // namespace - -/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git a/sc/inc/kahan.hxx b/sc/inc/kahan.hxx index 5418c8d23b35..6c84f6eeef2e 100644 --- a/sc/inc/kahan.hxx +++ b/sc/inc/kahan.hxx @@ -12,8 +12,6 @@ #include <rtl/math.hxx> #include <cmath> -#include "arraysumfunctorinternal.hxx" - /** * This class provides LO with Kahan summation algorithm * About this algorithm: https://en.wikipedia.org/wiki/Kahan_summation_algorithm @@ -40,12 +38,6 @@ public: { } - constexpr KahanSum(const sc::op::KahanSumSimple& sum) - : m_fSum(sum.m_fSum) - , m_fError(sum.m_fError) - { - } - constexpr KahanSum(const KahanSum& fSum) = default; public: diff --git a/sc/qa/unit/functions_statistical.cxx b/sc/qa/unit/functions_statistical.cxx index a034964f4923..4d97d4cc1689 100644 --- a/sc/qa/unit/functions_statistical.cxx +++ b/sc/qa/unit/functions_statistical.cxx @@ -1,5 +1,4 @@ #include "functions_test.hxx" -#include <arraysumfunctor.hxx> class StatisticalFunctionsTest : public FunctionsTest { @@ -7,11 +6,9 @@ public: StatisticalFunctionsTest(); void testStatisticalFormulasFODS(); - void testIntrinsicSums(); CPPUNIT_TEST_SUITE(StatisticalFunctionsTest); CPPUNIT_TEST(testStatisticalFormulasFODS); - CPPUNIT_TEST(testIntrinsicSums); CPPUNIT_TEST_SUITE_END(); }; @@ -29,25 +26,6 @@ StatisticalFunctionsTest::StatisticalFunctionsTest(): { } -void StatisticalFunctionsTest::testIntrinsicSums() -{ - // Checkout SSE2, AVX and AVX512 operations - // Needs exactly 9 terms - double summands[9] = { 0, 1, 2, 3, 4, 10, 20, 2, -1 }; - double* pCurrent = summands; - size_t i = 0; - if (sc::op::hasAVX512F) - CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeAVX512F(i, 9, pCurrent)).get()); - i = 0; - if (sc::op::hasAVX) - CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeAVX(i, 9, pCurrent)).get()); - i = 0; - if (sc::op::hasSSE2) - CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeSSE2(i, 9, pCurrent)).get()); - i = 0; - CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeUnrolled(i, 9, pCurrent).get()); -} - CPPUNIT_TEST_SUITE_REGISTRATION(StatisticalFunctionsTest); CPPUNIT_PLUGIN_IMPLEMENT(); diff --git a/sc/source/core/tool/arraysum.hxx b/sc/source/core/tool/arraysum.hxx index 5d227bd85a48..ce8a7f30f4dc 100644 --- a/sc/source/core/tool/arraysum.hxx +++ b/sc/source/core/tool/arraysum.hxx @@ -14,19 +14,6 @@ namespace sc::op { -// Code must not be shared between different CPU instrinsics flags (e.g. in debug mode the compiler would not -// inline them, and merge the copies, keeping only the one with the most demanding CPU set that's not available otherwise). -// Put everything in a different namespace and additionally try to force inlining. -namespace LO_ARRAYSUM_SPACE -{ -#if defined _MSC_VER -#define INLINE __forceinline static -#elif defined __GNUC__ -#define INLINE __attribute__((always_inline)) static inline -#else -#define static inline -#endif - /** * Performs one step of the Neumanier sum between doubles * Overwrites the summand and error @@ -34,7 +21,7 @@ namespace LO_ARRAYSUM_SPACE * @param err * @param value */ -INLINE void sumNeumanierNormal(double& sum, double& err, const double& value) +inline void sumNeumanierNormal(double& sum, double& err, const double& value) { double t = sum + value; if (fabs(sum) >= fabs(value)) @@ -44,9 +31,6 @@ INLINE void sumNeumanierNormal(double& sum, double& err, const double& value) sum = t; } -#undef INLINE - -} // end namespace LO_ARRAYSUM_SPACE } // end namespace sc::op /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/core/tool/arraysumAVX.cxx b/sc/source/core/tool/arraysumAVX.cxx deleted file mode 100644 index 4d9ee02285e7..000000000000 --- a/sc/source/core/tool/arraysumAVX.cxx +++ /dev/null @@ -1,121 +0,0 @@ -/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ -/* - * This file is part of the LibreOffice project. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - * - */ - -#define LO_ARRAYSUM_SPACE AVX -#include "arraysum.hxx" - -#include <arraysumfunctorinternal.hxx> - -#include <tools/simd.hxx> -#include <tools/simdsupport.hxx> - -#include <stdlib.h> - -namespace sc::op -{ -#ifdef LO_AVX_AVAILABLE - -bool hasAVXCode() { return true; } - -using namespace AVX; - -/** Kahan sum with AVX. - */ -static inline void sumAVX(__m256d& sum, __m256d& err, const __m256d& value) -{ - static const __m256d ANNULATE_SIGN_BIT - = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF)); - // Temporal parameter - __m256d t = _mm256_add_pd(sum, value); - // Absolute value of the total sum - __m256d asum = _mm256_and_pd(sum, ANNULATE_SIGN_BIT); - // Absolute value of the value to add - __m256d avalue = _mm256_and_pd(value, ANNULATE_SIGN_BIT); - // Compare the absolute values sum >= value - __m256d mask = _mm256_cmp_pd(asum, avalue, _CMP_GE_OQ); - // The following code has this form ( a - t + b) - // Case 1: a = sum b = value - // Case 2: a = value b = sum - __m256d a = _mm256_add_pd(_mm256_and_pd(mask, sum), _mm256_andnot_pd(mask, value)); - __m256d b = _mm256_add_pd(_mm256_and_pd(mask, value), _mm256_andnot_pd(mask, sum)); - err = _mm256_add_pd(err, _mm256_add_pd(_mm256_sub_pd(a, t), b)); - // Store result - sum = t; -} - -/** Execute Kahan sum with AVX. - */ -KahanSumSimple executeAVX(size_t& i, size_t nSize, const double* pCurrent) -{ - // Make sure we don't fall out of bounds. - // This works by sums of 8 terms. - // So the 8'th term is i+7 - // If we iterate until nSize won't fall out of bounds - if (nSize > i + 7) - { - // Setup sums and errors as 0 - __m256d sum1 = _mm256_setzero_pd(); - __m256d err1 = _mm256_setzero_pd(); - __m256d sum2 = _mm256_setzero_pd(); - __m256d err2 = _mm256_setzero_pd(); - - for (; i + 7 < nSize; i += 8) - { - // Kahan sum 1 - __m256d load1 = _mm256_loadu_pd(pCurrent); - sumAVX(sum1, err1, load1); - pCurrent += 4; - - // Kahan sum 2 - __m256d load2 = _mm256_loadu_pd(pCurrent); - sumAVX(sum2, err2, load2); - pCurrent += 4; - } - - // Now we combine pairwise summation with Kahan summation - - // sum 1 + sum 2 -> sum 1 - sumAVX(sum1, err1, sum2); - sumAVX(sum1, err1, err2); - - // Store results - double sums[4]; - double errs[4]; - _mm256_storeu_pd(&sums[0], sum1); - _mm256_storeu_pd(&errs[0], err1); - - // First Kahan & pairwise summation - // 0+1 1+2 -> 0, 2 - sumNeumanierNormal(sums[0], errs[0], sums[1]); - sumNeumanierNormal(sums[2], errs[2], sums[3]); - sumNeumanierNormal(sums[0], errs[0], errs[1]); - sumNeumanierNormal(sums[2], errs[2], errs[3]); - - // 0+2 -> 0 - sumNeumanierNormal(sums[0], errs[0], sums[2]); - sumNeumanierNormal(sums[0], errs[0], errs[2]); - - // Store result - return { sums[0], errs[0] }; - } - return { 0.0, 0.0 }; -} - -#else // LO_AVX_AVAILABLE - -bool hasAVXCode() { return false; } - -KahanSumSimple executeAVX(size_t&, size_t, const double*) { abort(); } - -#endif - -} // end namespace sc::op - -/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/core/tool/arraysumAVX512.cxx b/sc/source/core/tool/arraysumAVX512.cxx deleted file mode 100644 index 6a3235a58e2e..000000000000 --- a/sc/source/core/tool/arraysumAVX512.cxx +++ /dev/null @@ -1,120 +0,0 @@ -/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ -/* - * This file is part of the LibreOffice project. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - * - */ - -#define LO_ARRAYSUM_SPACE AVX512 -#include "arraysum.hxx" - -#include <arraysumfunctorinternal.hxx> - -#include <tools/simd.hxx> -#include <tools/simdsupport.hxx> - -#include <stdlib.h> - -namespace sc::op -{ -#ifdef LO_AVX512F_AVAILABLE - -bool hasAVX512FCode() { return true; } - -using namespace AVX512; - -/** Kahan sum with AVX512. - */ -static inline void sumAVX512(__m512d& sum, __m512d& err, const __m512d& value) -{ - // Temporal parameter - __m512d t = _mm512_add_pd(sum, value); - // Absolute value of the total sum - __m512d asum = _mm512_abs_pd(sum); - // Absolute value of the value to add - __m512d avalue = _mm512_abs_pd(value); - // Compare the absolute values sum >= value - __mmask8 mask = _mm512_cmp_pd_mask(avalue, asum, _CMP_GE_OQ); - // The following code has this form ( a - t + b) - // Case 1: a = sum b = value - // Case 2: a = value b = sum - __m512d a = _mm512_mask_blend_pd(mask, sum, value); - __m512d b = _mm512_mask_blend_pd(mask, value, sum); - err = _mm512_add_pd(err, _mm512_add_pd(_mm512_sub_pd(a, t), b)); - // Store result - sum = t; -} - -/** Execute Kahan sum with AVX512. - */ -KahanSumSimple executeAVX512F(size_t& i, size_t nSize, const double* pCurrent) -{ - // Make sure we don't fall out of bounds. - // This works by sums of 8 terms. - // So the 8'th term is i+7 - // If we iterate until nSize won't fall out of bounds - if (nSize > i + 7) - { - // Setup sums and errors as 0 - __m512d sum = _mm512_setzero_pd(); - __m512d err = _mm512_setzero_pd(); - - // Sum the stuff - for (; i + 7 < nSize; i += 8) - { - // Kahan sum - __m512d load = _mm512_loadu_pd(pCurrent); - sumAVX512(sum, err, load); - pCurrent += 8; - } - - // Store result - static_assert(sizeof(double) == 8); - double sums[8]; - double errs[8]; - _mm512_storeu_pd(&sums[0], sum); - _mm512_storeu_pd(&errs[0], err); - - // First Kahan & pairwise summation - // 0+1 1+2 3+4 4+5 6+7 -> 0, 2, 4, 6 - sumNeumanierNormal(sums[0], errs[0], sums[1]); - sumNeumanierNormal(sums[2], errs[2], sums[3]); - sumNeumanierNormal(sums[4], errs[4], sums[5]); - sumNeumanierNormal(sums[6], errs[6], sums[7]); - sumNeumanierNormal(sums[0], errs[0], errs[1]); - sumNeumanierNormal(sums[2], errs[2], errs[3]); - sumNeumanierNormal(sums[4], errs[4], errs[5]); - sumNeumanierNormal(sums[6], errs[6], errs[7]); - - // Second Kahan & pairwise summation - // 0+2 4+6 -> 0, 4 - sumNeumanierNormal(sums[0], errs[0], sums[2]); - sumNeumanierNormal(sums[4], errs[4], sums[6]); - sumNeumanierNormal(sums[0], errs[0], errs[2]); - sumNeumanierNormal(sums[4], errs[4], errs[6]); - - // Third Kahan & pairwise summation - // 0+4 -> 0 - sumNeumanierNormal(sums[0], errs[0], sums[4]); - sumNeumanierNormal(sums[0], errs[0], errs[4]); - - // Return final result - return { sums[0], errs[0] }; - } - return { 0.0, 0.0 }; -} - -#else // LO_AVX512F_AVAILABLE - -bool hasAVX512FCode() { return false; } - -KahanSumSimple executeAVX512F(size_t&, size_t, const double*) { abort(); } - -#endif - -} // end namespace sc::op - -/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx index 1a5cc2f00dfe..8542e2e2335d 100644 --- a/sc/source/core/tool/arraysumSSE2.cxx +++ b/sc/source/core/tool/arraysumSSE2.cxx @@ -8,24 +8,19 @@ * */ -#define LO_ARRAYSUM_SPACE SSE2 #include "arraysum.hxx" -#include <arraysumfunctorinternal.hxx> +#include <arraysumfunctor.hxx> #include <tools/simd.hxx> #include <tools/simdsupport.hxx> #include <stdlib.h> +#if SC_USE_SSE2 + namespace sc::op { -#ifdef LO_SSE2_AVAILABLE - -bool hasSSE2Code() { return true; } - -using namespace SSE2; - /** Kahan sum with SSE2. */ static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value) @@ -51,7 +46,7 @@ static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value) /** Execute Kahan sum with SSE2. */ -KahanSumSimple executeSSE2(size_t& i, size_t nSize, const double* pCurrent) +KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent) { // Make sure we don't fall out of bounds. // This works by sums of 8 terms. @@ -121,13 +116,8 @@ KahanSumSimple executeSSE2(size_t& i, size_t nSize, const double* pCurrent) return { 0.0, 0.0 }; } -#else // LO_SSE2_AVAILABLE - -bool hasSSE2Code() { return false; } - -KahanSumSimple executeSSE2(size_t&, size_t, const double*) { abort(); } +} // namespace #endif -} /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |