summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordante <dante19031999@gmail.com>2021-05-16 16:04:20 +0200
committerMike Kaganski <mike.kaganski@collabora.com>2021-08-26 08:48:35 +0200
commit5b9cf5881ef53fac5f1d8376f687dbadf9d3cf2b (patch)
treeff16b0dcf62236c48fed6ca87e7d584959ed1841
parentfb1233d77500413ba237c335a84773d8a6f8e381 (diff)
tdf#142307 - Upgrade SSE2 sum to AVX512 sum with Neumaier 1
This part focuses on allowing it on replacing arrayfunctor By thefault it will try AVX512F (1,17%) If not available will use AVX (94,77%) Use of AVX2 (82,28%) has been avoided even if the code could been more compact Source of hardware statistics: https://store.steampowered.com/hwsurvey Change-Id: Iae737a565379e82c5f84f3fdee6321ac74f59d40 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/115675 Tested-by: Jenkins Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
-rw-r--r--configure.ac9
-rw-r--r--include/tools/cpuid.hxx17
-rw-r--r--include/tools/simdsupport.hxx13
-rw-r--r--sc/Library_sc.mk8
-rw-r--r--sc/inc/arraysumfunctor.hxx137
-rw-r--r--sc/inc/kahan.hxx9
-rw-r--r--sc/qa/unit/functions_statistical.cxx22
-rw-r--r--sc/source/core/inc/arraysumfunctor.hxx121
-rw-r--r--sc/source/core/tool/arraysumAVX.cxx114
-rw-r--r--sc/source/core/tool/arraysumAVX512.cxx140
-rw-r--r--sc/source/core/tool/arraysumSSE2.cxx140
-rw-r--r--sc/source/core/tool/interpr6.cxx4
-rw-r--r--tools/source/misc/cpuid.cxx5
13 files changed, 556 insertions, 183 deletions
diff --git a/configure.ac b/configure.ac
index 9e5708cd49d4..bd929450971c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -7631,6 +7631,7 @@ CXXFLAGS_INTRINSICS_SSE42=
CXXFLAGS_INTRINSICS_AVX=
CXXFLAGS_INTRINSICS_AVX2=
CXXFLAGS_INTRINSICS_AVX512=
+CXXFLAGS_INTRINSICS_AVX512F=
CXXFLAGS_INTRINSICS_F16C=
CXXFLAGS_INTRINSICS_FMA=
@@ -7643,6 +7644,7 @@ if test "$GCC" = "yes" -o "$COM_IS_CLANG" = TRUE; then
flag_avx=-mavx
flag_avx2=-mavx2
flag_avx512="-mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd"
+ flag_avx512f=-mavx512f
flag_f16c=-mf16c
flag_fma=-mfma
else
@@ -7658,6 +7660,7 @@ else
flag_avx=-arch:AVX
flag_avx2=-arch:AVX2
flag_avx512=-arch:AVX512
+ flag_avx512f=-arch:AVX512
# These are part of -arch:AVX2
flag_f16c=-arch:AVX2
flag_fma=-arch:AVX2
@@ -7807,6 +7810,7 @@ CXXFLAGS=$save_CXXFLAGS
AC_MSG_RESULT([${can_compile_avx512}])
if test "${can_compile_avx512}" = "yes" ; then
CXXFLAGS_INTRINSICS_AVX512="$flag_avx512"
+ CXXFLAGS_INTRINSICS_AVX512F="$flag_avx512f"
fi
AC_MSG_CHECKING([whether $CXX can compile F16C intrinsics])
@@ -7859,6 +7863,7 @@ AC_SUBST([CXXFLAGS_INTRINSICS_SSE42])
AC_SUBST([CXXFLAGS_INTRINSICS_AVX])
AC_SUBST([CXXFLAGS_INTRINSICS_AVX2])
AC_SUBST([CXXFLAGS_INTRINSICS_AVX512])
+AC_SUBST([CXXFLAGS_INTRINSICS_AVX512F])
AC_SUBST([CXXFLAGS_INTRINSICS_F16C])
AC_SUBST([CXXFLAGS_INTRINSICS_FMA])
@@ -12013,6 +12018,7 @@ LO_CLANG_CXXFLAGS_INTRINSICS_SSE42=
LO_CLANG_CXXFLAGS_INTRINSICS_AVX=
LO_CLANG_CXXFLAGS_INTRINSICS_AVX2=
LO_CLANG_CXXFLAGS_INTRINSICS_AVX512=
+LO_CLANG_CXXFLAGS_INTRINSICS_AVX512F=
LO_CLANG_CXXFLAGS_INTRINSICS_F16C=
LO_CLANG_CXXFLAGS_INTRINSICS_FMA=
@@ -12066,6 +12072,7 @@ if test "$ENABLE_SKIA" = TRUE -a "$COM_IS_CLANG" != TRUE -a ! \( "$_os" = "WINNT
flag_avx=-mavx
flag_avx2=-mavx2
flag_avx512="-mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd"
+ flag_avx512f=-mavx512f
flag_f16c=-mf16c
flag_fma=-mfma
@@ -12213,6 +12220,7 @@ if test "$ENABLE_SKIA" = TRUE -a "$COM_IS_CLANG" != TRUE -a ! \( "$_os" = "WINNT
AC_MSG_RESULT([${can_compile_avx512}])
if test "${can_compile_avx512}" = "yes" ; then
LO_CLANG_CXXFLAGS_INTRINSICS_AVX512="$flag_avx512"
+ LO_CLANG_CXXFLAGS_INTRINSICS_AVX512F="$flag_avx512f"
fi
AC_MSG_CHECKING([whether $CXX can compile F16C intrinsics])
@@ -12319,6 +12327,7 @@ AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_SSE42)
AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX)
AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX2)
AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX512)
+AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX512F)
AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_F16C)
AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_FMA)
AC_SUBST(CLANG_USE_LD)
diff --git a/include/tools/cpuid.hxx b/include/tools/cpuid.hxx
index 90c7d37b485c..98dca497a7de 100644
--- a/include/tools/cpuid.hxx
+++ b/include/tools/cpuid.hxx
@@ -26,7 +26,8 @@ enum class InstructionSetFlags
SSE41 = 0x08,
SSE42 = 0x10,
AVX = 0x20,
- AVX2 = 0x40
+ AVX2 = 0x40,
+ AVX512F = 0x100
};
} // end cpuid
@@ -63,6 +64,13 @@ inline bool hasSSSE3()
return isCpuInstructionSetSupported(InstructionSetFlags::SSSE3);
}
+/** Check if AVX is supported by the CPU
+ */
+inline bool hasAVX()
+{
+ return isCpuInstructionSetSupported(InstructionSetFlags::AVX);
+}
+
/** Check if AVX2 is supported by the CPU
*/
inline bool hasAVX2()
@@ -70,6 +78,13 @@ inline bool hasAVX2()
return isCpuInstructionSetSupported(InstructionSetFlags::AVX2);
}
+/** Check if AVX512F is supported by the CPU
+ */
+inline bool hasAVX512F()
+{
+ return isCpuInstructionSetSupported(InstructionSetFlags::AVX512F);
+}
+
/** Check if Hyper Threading is supported
*/
inline bool hasHyperThreading()
diff --git a/include/tools/simdsupport.hxx b/include/tools/simdsupport.hxx
index 5d10d53d48ad..bc9227223da0 100644
--- a/include/tools/simdsupport.hxx
+++ b/include/tools/simdsupport.hxx
@@ -20,6 +20,7 @@
#undef LO_SSSE3_AVAILABLE
#undef LO_AVX_AVAILABLE
#undef LO_AVX2_AVAILABLE
+#undef LO_AVX512F_AVAILABLE
#if defined(_MSC_VER) // VISUAL STUDIO COMPILER
@@ -46,6 +47,12 @@
#include <immintrin.h>
#endif // defined(__AVX2__)
+// compiled with /arch:AVX512F
+#if defined(__AVX512F__)
+#define LO_AVX512F_AVAILABLE
+#include <immintrin.h>
+#endif // defined(__AVX512F__)
+
#else // compiler Clang and GCC
#if defined(__SSE2__) || defined(__x86_64__) // SSE2 is required for X64
@@ -68,6 +75,12 @@
#include <immintrin.h>
#endif // defined(__AVX2__)
+#if defined(__AVX512F__)
+#define LO_AVX512F_AVAILABLE
+#include <immintrin.h>
+#else
+#endif // defined(__AVX512F__)
+
#endif // end compiler Clang and GCC
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/Library_sc.mk b/sc/Library_sc.mk
index f1b62e1c263d..a85650e935e6 100644
--- a/sc/Library_sc.mk
+++ b/sc/Library_sc.mk
@@ -99,6 +99,14 @@ $(eval $(call gb_Library_use_libraries,sc,\
))
$(eval $(call gb_Library_add_exception_objects,sc,\
+ sc/source/core/tool/arraysumAVX512, $(CXXFLAGS_INTRINSICS_AVX512F) \
+))
+
+$(eval $(call gb_Library_add_exception_objects,sc,\
+ sc/source/core/tool/arraysumAVX, $(CXXFLAGS_INTRINSICS_AVX) \
+))
+
+$(eval $(call gb_Library_add_exception_objects,sc,\
sc/source/core/tool/arraysumSSE2, $(CXXFLAGS_INTRINSICS_SSE2) \
))
diff --git a/sc/inc/arraysumfunctor.hxx b/sc/inc/arraysumfunctor.hxx
new file mode 100644
index 000000000000..ee6d4c0fca5c
--- /dev/null
+++ b/sc/inc/arraysumfunctor.hxx
@@ -0,0 +1,137 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#pragma once
+
+#include <cmath>
+#include "kahan.hxx"
+#include "scdllapi.h"
+#include <tools/cpuid.hxx>
+#include <formula/errorcodes.hxx>
+
+namespace sc::op
+{
+/* Checkout available optimization options */
+SC_DLLPUBLIC extern const bool hasAVX512F;
+const bool hasAVX = cpuid::hasAVX();
+const bool hasSSE2 = cpuid::hasSSE2();
+
+/**
+ * Performs one step of the Neumanier sum between doubles
+ * Overwrites the summand and error
+ * @parma sum
+ * @param err
+ * @param value
+ */
+inline void sumNeumanierNormal(double& sum, double& err, const double& value)
+{
+ double t = sum + value;
+ if (std::abs(sum) >= std::abs(value))
+ err += (sum - t) + value;
+ else
+ err += (value - t) + sum;
+ sum = t;
+}
+
+/**
+ * If no boosts available, Unrolled KahanSum.
+ * Most likely to use on android.
+ */
+static inline KahanSum executeUnrolled(size_t& i, size_t nSize, const double* pCurrent)
+{
+ size_t nRealSize = nSize - i;
+ size_t nUnrolledSize = nRealSize - (nRealSize % 4);
+
+ if (nUnrolledSize > 0)
+ {
+ KahanSum sum0 = 0.0;
+ KahanSum sum1 = 0.0;
+ KahanSum sum2 = 0.0;
+ KahanSum sum3 = 0.0;
+
+ for (; i + 3 < nUnrolledSize; i += 4)
+ {
+ sum0 += *pCurrent++;
+ sum1 += *pCurrent++;
+ sum2 += *pCurrent++;
+ sum3 += *pCurrent++;
+ }
+ // We are using pairwise summation alongside Kahan
+ return (sum0 + sum1) + (sum2 + sum3);
+ }
+ return 0.0;
+}
+
+/* Available methos */
+SC_DLLPUBLIC KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent);
+SC_DLLPUBLIC KahanSum executeAVX(size_t& i, size_t nSize, const double* pCurrent);
+SC_DLLPUBLIC KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent);
+
+/**
+ * This function task is to choose the fastest method available to perform the sum.
+ * @param i
+ * @param nSize
+ * @param pCurrent
+ */
+static inline KahanSum executeFast(size_t& i, size_t nSize, const double* pCurrent)
+{
+ if (hasAVX512F)
+ return executeAVX512F(i, nSize, pCurrent);
+ if (hasAVX)
+ return executeAVX(i, nSize, pCurrent);
+ if (hasSSE2)
+ return executeSSE2(i, nSize, pCurrent);
+ return executeUnrolled(i, nSize, pCurrent);
+}
+
+/**
+ * Performs the sum of an array.
+ * Note that align 16 will speed up the process.
+ * @param pArray
+ * @param nSize
+ */
+inline KahanSum sumArray(const double* pArray, size_t nSize)
+{
+ size_t i = 0;
+ const double* pCurrent = pArray;
+ KahanSum fSum = executeFast(i, nSize, pCurrent);
+
+ // sum rest of the array
+ for (; i < nSize; ++i)
+ fSum += pArray[i];
+
+ // If the sum is a NaN, some of the terms were empty cells, probably.
+ // Re-calculate, carefully
+ double fVal = fSum.get();
+ if (!std::isfinite(fVal))
+ {
+ FormulaError nErr = GetDoubleErrorValue(fVal);
+ if (nErr == FormulaError::NoValue)
+ {
+ fSum = 0;
+ for (i = 0; i < nSize; i++)
+ {
+ if (!std::isfinite(pArray[i]))
+ {
+ nErr = GetDoubleErrorValue(pArray[i]);
+ if (nErr != FormulaError::NoValue)
+ fSum += pArray[i]; // Let errors encoded as NaNs propagate ???
+ }
+ else
+ fSum += pArray[i];
+ }
+ }
+ }
+ return fSum;
+}
+
+} // end namespace sc::op
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/inc/kahan.hxx b/sc/inc/kahan.hxx
index 49c7922b3c79..3404fb6d14a6 100644
--- a/sc/inc/kahan.hxx
+++ b/sc/inc/kahan.hxx
@@ -28,6 +28,12 @@ public:
{
}
+ constexpr KahanSum(double x_0, double err_0)
+ : m_fSum(x_0)
+ , m_fError(err_0)
+ {
+ }
+
constexpr KahanSum(const KahanSum& fSum) = default;
public:
@@ -182,6 +188,9 @@ public:
return fSum.m_fSum != m_fSum || fSum.m_fError != m_fError;
}
+ constexpr double* getSum() { return &m_fSum; }
+ constexpr double* getError() { return &m_fError; }
+
public:
/**
* Returns the final sum.
diff --git a/sc/qa/unit/functions_statistical.cxx b/sc/qa/unit/functions_statistical.cxx
index 4d97d4cc1689..e82d762d88ee 100644
--- a/sc/qa/unit/functions_statistical.cxx
+++ b/sc/qa/unit/functions_statistical.cxx
@@ -1,4 +1,5 @@
#include "functions_test.hxx"
+#include <arraysumfunctor.hxx>
class StatisticalFunctionsTest : public FunctionsTest
{
@@ -6,9 +7,11 @@ public:
StatisticalFunctionsTest();
void testStatisticalFormulasFODS();
+ void testIntrinsicSums();
CPPUNIT_TEST_SUITE(StatisticalFunctionsTest);
CPPUNIT_TEST(testStatisticalFormulasFODS);
+ CPPUNIT_TEST(testIntrinsicSums);
CPPUNIT_TEST_SUITE_END();
};
@@ -26,6 +29,25 @@ StatisticalFunctionsTest::StatisticalFunctionsTest():
{
}
+void StatisticalFunctionsTest::testIntrinsicSums()
+{
+ // Checkout SSE2, AVX and AVX512 opperations
+ // Needs exactly 9 terms
+ double summands[9] = { 0, 1, 2, 3, 4, 10, 20, 2, -1 };
+ double* pCurrent = summands;
+ size_t i = 0;
+ if (sc::op::hasAVX512F)
+ CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeAVX512F(i, 9, pCurrent).get());
+ i = 0;
+ if (sc::op::hasAVX)
+ CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeAVX(i, 9, pCurrent).get());
+ i = 0;
+ if (sc::op::hasSSE2)
+ CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeSSE2(i, 9, pCurrent).get());
+ i = 0;
+ CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeUnrolled(i, 9, pCurrent).get());
+}
+
CPPUNIT_TEST_SUITE_REGISTRATION(StatisticalFunctionsTest);
CPPUNIT_PLUGIN_IMPLEMENT();
diff --git a/sc/source/core/inc/arraysumfunctor.hxx b/sc/source/core/inc/arraysumfunctor.hxx
deleted file mode 100644
index ae8d38db1338..000000000000
--- a/sc/source/core/inc/arraysumfunctor.hxx
+++ /dev/null
@@ -1,121 +0,0 @@
-/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
-/*
- * This file is part of the LibreOffice project.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- *
- */
-
-#pragma once
-
-#include <cstdint>
-#include <cmath>
-
-#include <sal/mathconf.h>
-#include <sal/types.h>
-#include <tools/simd.hxx>
-#include <tools/cpuid.hxx>
-#include <kahan.hxx>
-
-namespace sc
-{
-struct ArraySumFunctor
-{
-private:
- const double* mpArray;
- size_t mnSize;
-
-public:
- ArraySumFunctor(const double* pArray, size_t nSize)
- : mpArray(pArray)
- , mnSize(nSize)
- {
- }
-
- KahanSum operator()()
- {
- const static bool hasSSE2 = cpuid::hasSSE2();
-
- KahanSum fSum = 0.0;
- size_t i = 0;
- const double* pCurrent = mpArray;
-
- if (hasSSE2)
- {
- while (i < mnSize && !simd::isAligned<double, 16>(pCurrent))
- {
- fSum += *pCurrent++;
- i++;
- }
- if (i < mnSize)
- {
- fSum += executeSSE2(i, pCurrent);
- }
- }
- else
- fSum = executeUnrolled(i, pCurrent);
-
- // sum rest of the array
-
- for (; i < mnSize; ++i)
- fSum += mpArray[i];
-
- // If the sum is a NaN, some of the terms were empty cells, probably.
- // Re-calculate, carefully
- double fVal = fSum.get();
- if (!std::isfinite(fVal))
- {
- sal_uInt32 nErr = reinterpret_cast<sal_math_Double*>(&fVal)->nan_parts.fraction_lo;
- if (nErr & 0xffff0000)
- {
- fSum = 0;
- for (i = 0; i < mnSize; i++)
- {
- if (!std::isfinite(mpArray[i]))
- {
- nErr = reinterpret_cast<const sal_math_Double*>(&mpArray[i])
- ->nan_parts.fraction_lo;
- if (!(nErr & 0xffff0000))
- fSum += mpArray[i]; // Let errors encoded as NaNs propagate ???
- }
- else
- fSum += mpArray[i];
- }
- }
- }
- return fSum;
- }
-
-private:
- double executeSSE2(size_t& i, const double* pCurrent) const;
- KahanSum executeUnrolled(size_t& i, const double* pCurrent) const
- {
- size_t nRealSize = mnSize - i;
- size_t nUnrolledSize = nRealSize - (nRealSize % 4);
-
- if (nUnrolledSize > 0)
- {
- KahanSum sum0 = 0.0;
- KahanSum sum1 = 0.0;
- KahanSum sum2 = 0.0;
- KahanSum sum3 = 0.0;
-
- for (; i < nUnrolledSize; i += 4)
- {
- sum0 += *pCurrent++;
- sum1 += *pCurrent++;
- sum2 += *pCurrent++;
- sum3 += *pCurrent++;
- }
- // We are using pairwise summation alongside Kahan
- return (sum0 + sum1) + (sum2 + sum3);
- }
- return 0.0;
- }
-};
-
-} // end namespace sc
-
-/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/arraysumAVX.cxx b/sc/source/core/tool/arraysumAVX.cxx
new file mode 100644
index 000000000000..af12a08a8cf5
--- /dev/null
+++ b/sc/source/core/tool/arraysumAVX.cxx
@@ -0,0 +1,114 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#include <arraysumfunctor.hxx>
+#include <sal/log.hxx>
+#include <tools/simd.hxx>
+#include <tools/simdsupport.hxx>
+
+namespace sc::op
+{
+#ifdef LO_AVX_AVAILABLE // Old processors
+
+const __m256d ANNULATE_SIGN_BIT = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
+
+/** Kahan sum with AVX.
+ */
+static inline void sumAVX(__m256d& sum, __m256d& err, const __m256d& value)
+{
+ // Temporal parameter
+ __m256d t = _mm256_add_pd(sum, value);
+ // Absolute value of the total sum
+ __m256d asum = _mm256_and_pd(sum, ANNULATE_SIGN_BIT);
+ // Absolute value of the value to add
+ __m256d avalue = _mm256_and_pd(value, ANNULATE_SIGN_BIT);
+ // Comaprate the absolute values sum >= value
+ __m256d mask = _mm256_cmp_pd(asum, avalue, _CMP_GE_OQ);
+ // The following code has this form ( a - t + b)
+ // Case 1: a = sum b = value
+ // Case 2: a = value b = sum
+ __m256d a = _mm256_add_pd(_mm256_and_pd(mask, sum), _mm256_andnot_pd(mask, value));
+ __m256d b = _mm256_add_pd(_mm256_and_pd(mask, value), _mm256_andnot_pd(mask, sum));
+ err = _mm256_add_pd(err, _mm256_add_pd(_mm256_sub_pd(a, t), b));
+ // Store result
+ sum = t;
+}
+
+#endif
+
+/** Execute Kahan sum with AVX.
+ */
+KahanSum executeAVX(size_t& i, size_t nSize, const double* pCurrent)
+{
+#ifdef LO_AVX_AVAILABLE
+ // Make sure we don't fall out of bounds.
+ // This works by sums of 8 terms.
+ // So the 8'th term is i+7
+ // If we iterate untill nSize won't fall out of bounds
+ if (nSize > i + 7)
+ {
+ // Setup sums and errors as 0
+ __m256d sum1 = _mm256_setzero_pd();
+ __m256d err1 = _mm256_setzero_pd();
+ __m256d sum2 = _mm256_setzero_pd();
+ __m256d err2 = _mm256_setzero_pd();
+
+ for (; i + 7 < nSize; i += 8)
+ {
+ // Kahan sum 1
+ __m256d load1 = _mm256_loadu_pd(pCurrent);
+ sumAVX(sum1, err1, load1);
+ pCurrent += 4;
+
+ // Kahan sum 2
+ __m256d load2 = _mm256_loadu_pd(pCurrent);
+ sumAVX(sum2, err2, load2);
+ pCurrent += 4;
+ }
+
+ // Now we combine pairwise summation with Kahan summation
+
+ // sum 1 + sum 2 -> sum 1
+ sumAVX(sum1, err1, sum2);
+ sumAVX(sum1, err1, err2);
+
+ // Store results
+ double sums[4];
+ double errs[4];
+ _mm256_storeu_pd(&sums[0], sum1);
+ _mm256_storeu_pd(&errs[0], err1);
+
+ // First Kahan & pairwise summation
+ // 0+1 1+2 -> 0, 2
+ sumNeumanierNormal(sums[0], errs[0], sums[1]);
+ sumNeumanierNormal(sums[2], errs[2], sums[3]);
+ sumNeumanierNormal(sums[0], errs[0], errs[1]);
+ sumNeumanierNormal(sums[2], errs[2], errs[3]);
+
+ // 0+2 -> 0
+ sumNeumanierNormal(sums[0], errs[0], sums[2]);
+ sumNeumanierNormal(sums[0], errs[0], errs[2]);
+
+ // Store result
+ return KahanSum(sums[0], errs[0]);
+ }
+ return 0.0;
+#else
+ SAL_WARN("sc", "Failed to use AVX");
+ (void)i;
+ (void)nSize;
+ (void)pCurrent;
+ return 0.0;
+#endif
+}
+
+} // end namespace sc::op
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/arraysumAVX512.cxx b/sc/source/core/tool/arraysumAVX512.cxx
new file mode 100644
index 000000000000..55764849edfb
--- /dev/null
+++ b/sc/source/core/tool/arraysumAVX512.cxx
@@ -0,0 +1,140 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#include <arraysumfunctor.hxx>
+#include <sal/log.hxx>
+#include <tools/simd.hxx>
+#include <tools/simdsupport.hxx>
+
+/* TODO Remove this once GCC updated and AVX512 can work. */
+#ifdef __GNUC__
+#if __GNUC__ < 9
+#ifdef LO_AVX512F_AVAILABLE
+#define HAS_LO_AVX512F_AVAILABLE
+#undef LO_AVX512F_AVAILABLE
+#endif
+#endif
+#endif
+
+#ifdef LO_AVX512F_AVAILABLE
+const bool sc::op::hasAVX512F = cpuid::hasAVX512F();
+#else
+const bool sc::op::hasAVX512F = false;
+#endif
+
+namespace sc::op
+{
+#ifdef LO_AVX512F_AVAILABLE // New processors
+
+/** Kahan sum with AVX512.
+ */
+static inline void sumAVX512(__m512d& sum, __m512d& err, const __m512d& value)
+{
+ // Temporal parameter
+ __m512d t = _mm512_add_pd(sum, value);
+ // Absolute value of the total sum
+ __m512d asum = _mm512_abs_pd(sum);
+ // Absolute value of the value to add
+ __m512d avalue = _mm512_abs_pd(value);
+ // Comaprate the absolute values sum >= value
+ __mmask8 mask = _mm512_cmp_pd_mask(avalue, asum, _CMP_GE_OQ);
+ // The following code has this form ( a - t + b)
+ // Case 1: a = sum b = value
+ // Case 2: a = value b = sum
+ __m512d a = _mm512_mask_blend_pd(mask, sum, value);
+ __m512d b = _mm512_mask_blend_pd(mask, value, sum);
+ err = _mm512_add_pd(err, _mm512_add_pd(_mm512_sub_pd(a, t), b));
+ // Store result
+ sum = t;
+}
+
+#endif
+
+/** Execute Kahan sum with AVX512.
+ */
+KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent)
+{
+#ifdef LO_AVX512F_AVAILABLE // New processors
+ // Make sure we don't fall out of bounds.
+ // This works by sums of 8 terms.
+ // So the 8'th term is i+7
+ // If we iterate untill nSize won't fall out of bounds
+ if (nSize > i + 7)
+ {
+ // Setup sums and errors as 0
+ __m512d sum = _mm512_setzero_pd();
+ __m512d err = _mm512_setzero_pd();
+
+ // Sum the stuff
+ for (; i + 7 < nSize; i += 8)
+ {
+ // Kahan sum
+ __m512d load = _mm512_loadu_pd(pCurrent);
+ sumAVX512(sum, err, load);
+ pCurrent += 8;
+ }
+
+ // Store result
+ static_assert(sizeof(double) == 8);
+ double sums[8];
+ double errs[8];
+ _mm512_storeu_pd(static_cast<void*>(&sums[0]), sum);
+ _mm512_storeu_pd(static_cast<void*>(&errs[0]), err);
+
+ // First Kahan & pairwise summation
+ // 0+1 1+2 3+4 4+5 6+7 -> 0, 2, 4, 6
+ sumNeumanierNormal(sums[0], errs[0], sums[1]);
+ sumNeumanierNormal(sums[2], errs[2], sums[3]);
+ sumNeumanierNormal(sums[4], errs[4], sums[5]);
+ sumNeumanierNormal(sums[6], errs[6], sums[7]);
+ sumNeumanierNormal(sums[0], errs[0], errs[1]);
+ sumNeumanierNormal(sums[2], errs[2], errs[3]);
+ sumNeumanierNormal(sums[4], errs[4], errs[5]);
+ sumNeumanierNormal(sums[6], errs[6], errs[7]);
+
+ // Second Kahan & pairwise summation
+ // 0+2 4+6 -> 0, 4
+ sumNeumanierNormal(sums[0], errs[0], sums[2]);
+ sumNeumanierNormal(sums[4], errs[4], sums[6]);
+ sumNeumanierNormal(sums[0], errs[0], errs[2]);
+ sumNeumanierNormal(sums[4], errs[4], errs[6]);
+
+ // Third Kahan & pairwise summation
+ // 0+4 -> 0
+ sumNeumanierNormal(sums[0], errs[0], sums[4]);
+ sumNeumanierNormal(sums[0], errs[0], errs[4]);
+
+ // Return final result
+ return KahanSum(sums[0], errs[0]);
+ }
+ else
+ return 0.0;
+#else
+ SAL_WARN("sc", "Failed to use AVX 512");
+ (void)i;
+ (void)nSize;
+ (void)pCurrent;
+ return 0.0;
+#endif
+}
+
+} // end namespace sc::op
+
+/* TODO Remove this once GCC updated and AVX512 can work. */
+#ifdef __GNUC__
+#if __GNUC__ < 9
+#ifdef HAS_LO_AVX512F_AVAILABLE
+#define LO_AVX512F_AVAILABLE
+#undef HAS_LO_AVX512F_AVAILABLE
+#endif
+#endif
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx
index e69f672b6014..4f6a4b47a11d 100644
--- a/sc/source/core/tool/arraysumSSE2.cxx
+++ b/sc/source/core/tool/arraysumSSE2.cxx
@@ -9,97 +9,121 @@
*/
#include <arraysumfunctor.hxx>
+#include <sal/log.hxx>
+#include <tools/simd.hxx>
#include <tools/simdsupport.hxx>
-namespace sc
+//AVX512VL + AVX512F + KNCNI
+
+namespace sc::op
{
-double ArraySumFunctor::executeSSE2(size_t& i, const double* pCurrent) const
+#ifdef LO_SSE2_AVAILABLE // Old processors
+
+const __m128d ANNULATE_SIGN_BIT = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
+
+/** Kahan sum with SSE4.2.
+ */
+static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value)
{
-#if defined(LO_SSE2_AVAILABLE)
- double fSum = 0.0;
- size_t nRealSize = mnSize - i;
- size_t nUnrolledSize = nRealSize - (nRealSize % 8);
+ // Temporal parameter
+ __m128d t = _mm_add_pd(sum, value);
+ // Absolute value of the total sum
+ __m128d asum = _mm_and_pd(sum, ANNULATE_SIGN_BIT);
+ // Absolute value of the value to add
+ __m128d avalue = _mm_and_pd(value, ANNULATE_SIGN_BIT);
+ // Comaprate the absolute values sum >= value
+ __m128d mask = _mm_cmpge_pd(asum, avalue);
+ // The following code has this form ( a - t + b)
+ // Case 1: a = sum b = value
+ // Case 2: a = value b = sum
+ __m128d a = _mm_add_pd(_mm_and_pd(mask, sum), _mm_andnot_pd(mask, value));
+ __m128d b = _mm_add_pd(_mm_and_pd(mask, value), _mm_andnot_pd(mask, sum));
+ err = _mm_add_pd(err, _mm_add_pd(_mm_sub_pd(a, t), b));
+ // Store result
+ sum = t;
+}
+
+#endif
- if (nUnrolledSize > 0)
+/** Execute Kahan sum with SSE2.
+ */
+KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
+{
+#ifdef LO_SSE2_AVAILABLE
+ // Make sure we don't fall out of bounds.
+ // This works by sums of 8 terms.
+ // So the 8'th term is i+7
+ // If we iterate untill nSize won't fall out of bounds
+ if (nSize > i + 7)
{
+ // Setup sums and errors as 0
__m128d sum1 = _mm_setzero_pd();
- __m128d sum2 = _mm_setzero_pd();
- __m128d sum3 = _mm_setzero_pd();
- __m128d sum4 = _mm_setzero_pd();
-
__m128d err1 = _mm_setzero_pd();
+ __m128d sum2 = _mm_setzero_pd();
__m128d err2 = _mm_setzero_pd();
+ __m128d sum3 = _mm_setzero_pd();
__m128d err3 = _mm_setzero_pd();
+ __m128d sum4 = _mm_setzero_pd();
__m128d err4 = _mm_setzero_pd();
- __m128d y, t;
-
- for (; i < nUnrolledSize; i += 8)
+ for (; i + 7 < nSize; i += 8)
{
// Kahan sum 1
- __m128d load1 = _mm_load_pd(pCurrent);
- y = _mm_sub_pd(load1, err1);
- t = _mm_add_pd(sum1, y);
- err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y);
- sum1 = t;
+ __m128d load1 = _mm_loadu_pd(pCurrent);
+ sumSSE2(sum1, err1, load1);
pCurrent += 2;
// Kahan sum 2
- __m128d load2 = _mm_load_pd(pCurrent);
- y = _mm_sub_pd(load2, err2);
- t = _mm_add_pd(sum2, y);
- err2 = _mm_sub_pd(_mm_sub_pd(t, sum2), y);
- sum2 = t;
+ __m128d load2 = _mm_loadu_pd(pCurrent);
+ sumSSE2(sum2, err2, load2);
pCurrent += 2;
// Kahan sum 3
- __m128d load3 = _mm_load_pd(pCurrent);
- y = _mm_sub_pd(load3, err3);
- t = _mm_add_pd(sum3, y);
- err3 = _mm_sub_pd(_mm_sub_pd(t, sum3), y);
- sum3 = t;
+ __m128d load3 = _mm_loadu_pd(pCurrent);
+ sumSSE2(sum3, err3, load3);
pCurrent += 2;
// Kahan sum 4
- __m128d load4 = _mm_load_pd(pCurrent);
- y = _mm_sub_pd(load4, err4);
- t = _mm_add_pd(sum4, y);
- err4 = _mm_sub_pd(_mm_sub_pd(t, sum4), y);
- sum4 = t;
+ __m128d load4 = _mm_loadu_pd(pCurrent);
+ sumSSE2(sum4, err4, load4);
pCurrent += 2;
}
// Now we combine pairwise summation with Kahan summation
- // sum 1 + sum 2
- y = _mm_sub_pd(sum2, err1);
- t = _mm_add_pd(sum1, y);
- err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y);
- sum1 = t;
-
- // sum 3 + sum 4
- y = _mm_sub_pd(sum4, err3);
- t = _mm_add_pd(sum3, y);
- sum3 = t;
-
- // sum 1 + sum 3
- y = _mm_sub_pd(sum3, err1);
- t = _mm_add_pd(sum1, y);
- sum1 = t;
-
- double temp;
-
- _mm_storel_pd(&temp, sum1);
- fSum += temp;
-
- _mm_storeh_pd(&temp, sum1);
- fSum += temp;
+ // 1+2 3+4 -> 1, 3
+ sumSSE2(sum1, err1, sum2);
+ sumSSE2(sum1, err1, err2);
+ sumSSE2(sum3, err3, sum4);
+ sumSSE2(sum3, err3, err4);
+
+ // 1+3 -> 1
+ sumSSE2(sum1, err1, sum3);
+ sumSSE2(sum1, err1, err3);
+
+ // Store results
+ double sums[2];
+ double errs[2];
+ _mm_storeu_pd(&sums[0], sum1);
+ _mm_storeu_pd(&errs[0], err1);
+
+ // First Kahan & pairwise summation
+ // 0+1 -> 0
+ sumNeumanierNormal(sums[0], errs[0], sums[1]);
+ sumNeumanierNormal(sums[0], errs[0], errs[1]);
+
+ // Store result
+ return KahanSum(sums[0], errs[0]);
}
- return fSum;
+ return 0.0;
#else
+ SAL_WARN("sc", "Failed to use SSE2");
(void)i;
+ (void)nSize;
(void)pCurrent;
return 0.0;
#endif
}
}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/interpr6.cxx b/sc/source/core/tool/interpr6.cxx
index a291b444ecdc..dbeab67fe35c 100644
--- a/sc/source/core/tool/interpr6.cxx
+++ b/sc/source/core/tool/interpr6.cxx
@@ -223,9 +223,7 @@ public:
return;
const double *p = &sc::numeric_block::at(*rNode.data, nOffset);
- sc::ArraySumFunctor functor(p, nDataSize);
-
- maSum += functor();
+ maSum += sc::op::sumArray(p, nDataSize);
break;
}
diff --git a/tools/source/misc/cpuid.cxx b/tools/source/misc/cpuid.cxx
index 0395d0e0f001..855b87e6da63 100644
--- a/tools/source/misc/cpuid.cxx
+++ b/tools/source/misc/cpuid.cxx
@@ -56,6 +56,7 @@ bool checkAVXSupportInOS()
#define XSAVE_bit (1 << 27)
#define AVX_bit (1 << 28)
#define AVX2_bit (1 << 5)
+#define AVX512F_bit (1 << 16)
InstructionSetFlags getCpuInstructionSetFlags()
{
@@ -98,6 +99,8 @@ InstructionSetFlags getCpuInstructionSetFlags()
if ((aExtendedInfo[1] & AVX2_bit) != 0)
eInstructions |= InstructionSetFlags::AVX2;
+ if ((aExtendedInfo[1] & AVX512F_bit) != 0)
+ eInstructions |= InstructionSetFlags::AVX512F;
}
}
}
@@ -127,6 +130,8 @@ OUString instructionSetSupportedString()
aString += "AVX ";
if (isCpuInstructionSetSupported(InstructionSetFlags::AVX2))
aString += "AVX2 ";
+ if (isCpuInstructionSetSupported(InstructionSetFlags::AVX512F))
+ aString += "AVX512F ";
return aString;
}