tdf#142307 - Upgrade SSE2 sum to AVX512 sum with Neumaier 1

This part focuses on allowing it on replacing arrayfunctor By thefault it will try AVX512F (1,17%) If not available will use AVX (94,77%) Use of AVX2 (82,28%) has been avoided even if the code could been more compact Source of hardware statistics: https://store.steampowered.com/hwsurvey Change-Id: Iae737a565379e82c5f84f3fdee6321ac74f59d40 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/115675 Tested-by: Jenkins Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
author: dante <dante19031999@gmail.com> 2021-05-16 16:04:20 +0200
committer: Mike Kaganski <mike.kaganski@collabora.com> 2021-08-26 08:48:35 +0200
commit: 5b9cf5881ef53fac5f1d8376f687dbadf9d3cf2b (patch)
tree: ff16b0dcf62236c48fed6ca87e7d584959ed1841
parent: fb1233d77500413ba237c335a84773d8a6f8e381 (diff)
13 files changed, 556 insertions, 183 deletions
diff --git a/configure.ac b/configure.ac
index 9e5708cd49d4..bd929450971c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -7631,6 +7631,7 @@ CXXFLAGS_INTRINSICS_SSE42=
 CXXFLAGS_INTRINSICS_AVX=
 CXXFLAGS_INTRINSICS_AVX2=
 CXXFLAGS_INTRINSICS_AVX512=
+CXXFLAGS_INTRINSICS_AVX512F=
 CXXFLAGS_INTRINSICS_F16C=
 CXXFLAGS_INTRINSICS_FMA=
 
@@ -7643,6 +7644,7 @@ if test "$GCC" = "yes" -o "$COM_IS_CLANG" = TRUE; then
     flag_avx=-mavx
     flag_avx2=-mavx2
     flag_avx512="-mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd"
+    flag_avx512f=-mavx512f
     flag_f16c=-mf16c
     flag_fma=-mfma
 else
@@ -7658,6 +7660,7 @@ else
     flag_avx=-arch:AVX
     flag_avx2=-arch:AVX2
     flag_avx512=-arch:AVX512
+    flag_avx512f=-arch:AVX512
     # These are part of -arch:AVX2
     flag_f16c=-arch:AVX2
     flag_fma=-arch:AVX2
@@ -7807,6 +7810,7 @@ CXXFLAGS=$save_CXXFLAGS
 AC_MSG_RESULT([${can_compile_avx512}])
 if test "${can_compile_avx512}" = "yes" ; then
     CXXFLAGS_INTRINSICS_AVX512="$flag_avx512"
+    CXXFLAGS_INTRINSICS_AVX512F="$flag_avx512f"
 fi
 
 AC_MSG_CHECKING([whether $CXX can compile F16C intrinsics])
@@ -7859,6 +7863,7 @@ AC_SUBST([CXXFLAGS_INTRINSICS_SSE42])
 AC_SUBST([CXXFLAGS_INTRINSICS_AVX])
 AC_SUBST([CXXFLAGS_INTRINSICS_AVX2])
 AC_SUBST([CXXFLAGS_INTRINSICS_AVX512])
+AC_SUBST([CXXFLAGS_INTRINSICS_AVX512F])
 AC_SUBST([CXXFLAGS_INTRINSICS_F16C])
 AC_SUBST([CXXFLAGS_INTRINSICS_FMA])
 
@@ -12013,6 +12018,7 @@ LO_CLANG_CXXFLAGS_INTRINSICS_SSE42=
 LO_CLANG_CXXFLAGS_INTRINSICS_AVX=
 LO_CLANG_CXXFLAGS_INTRINSICS_AVX2=
 LO_CLANG_CXXFLAGS_INTRINSICS_AVX512=
+LO_CLANG_CXXFLAGS_INTRINSICS_AVX512F=
 LO_CLANG_CXXFLAGS_INTRINSICS_F16C=
 LO_CLANG_CXXFLAGS_INTRINSICS_FMA=
 
@@ -12066,6 +12072,7 @@ if test "$ENABLE_SKIA" = TRUE -a "$COM_IS_CLANG" != TRUE -a ! \( "$_os" = "WINNT
         flag_avx=-mavx
         flag_avx2=-mavx2
         flag_avx512="-mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd"
+        flag_avx512f=-mavx512f
         flag_f16c=-mf16c
         flag_fma=-mfma
 
@@ -12213,6 +12220,7 @@ if test "$ENABLE_SKIA" = TRUE -a "$COM_IS_CLANG" != TRUE -a ! \( "$_os" = "WINNT
         AC_MSG_RESULT([${can_compile_avx512}])
         if test "${can_compile_avx512}" = "yes" ; then
             LO_CLANG_CXXFLAGS_INTRINSICS_AVX512="$flag_avx512"
+            LO_CLANG_CXXFLAGS_INTRINSICS_AVX512F="$flag_avx512f"
         fi
 
         AC_MSG_CHECKING([whether $CXX can compile F16C intrinsics])
@@ -12319,6 +12327,7 @@ AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_SSE42)
 AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX)
 AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX2)
 AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX512)
+AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_AVX512F)
 AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_F16C)
 AC_SUBST(LO_CLANG_CXXFLAGS_INTRINSICS_FMA)
 AC_SUBST(CLANG_USE_LD)
diff --git a/include/tools/cpuid.hxx b/include/tools/cpuid.hxx
index 90c7d37b485c..98dca497a7de 100644
--- a/include/tools/cpuid.hxx
+++ b/include/tools/cpuid.hxx
@@ -26,7 +26,8 @@ enum class InstructionSetFlags
     SSE41 = 0x08,
     SSE42 = 0x10,
     AVX   = 0x20,
-    AVX2  = 0x40
+    AVX2  = 0x40,
+    AVX512F = 0x100
 };
 
 } // end cpuid
@@ -63,6 +64,13 @@ inline bool hasSSSE3()
     return isCpuInstructionSetSupported(InstructionSetFlags::SSSE3);
 }
 
+/** Check if AVX is supported by the CPU
+ */
+inline bool hasAVX()
+{
+    return isCpuInstructionSetSupported(InstructionSetFlags::AVX);
+}
+
 /** Check if AVX2 is supported by the CPU
  */
 inline bool hasAVX2()
@@ -70,6 +78,13 @@ inline bool hasAVX2()
     return isCpuInstructionSetSupported(InstructionSetFlags::AVX2);
 }
 
+/** Check if AVX512F is supported by the CPU
+ */
+inline bool hasAVX512F()
+{
+    return isCpuInstructionSetSupported(InstructionSetFlags::AVX512F);
+}
+
 /** Check if Hyper Threading is supported
  */
 inline bool hasHyperThreading()
diff --git a/include/tools/simdsupport.hxx b/include/tools/simdsupport.hxx
index 5d10d53d48ad..bc9227223da0 100644
--- a/include/tools/simdsupport.hxx
+++ b/include/tools/simdsupport.hxx
@@ -20,6 +20,7 @@
 #undef LO_SSSE3_AVAILABLE
 #undef LO_AVX_AVAILABLE
 #undef LO_AVX2_AVAILABLE
+#undef LO_AVX512F_AVAILABLE
 
 #if defined(_MSC_VER) // VISUAL STUDIO COMPILER
 
@@ -46,6 +47,12 @@
 #include <immintrin.h>
 #endif // defined(__AVX2__)
 
+// compiled with /arch:AVX512F
+#if defined(__AVX512F__)
+#define LO_AVX512F_AVAILABLE
+#include <immintrin.h>
+#endif // defined(__AVX512F__)
+
 #else // compiler Clang and GCC
 
 #if defined(__SSE2__) || defined(__x86_64__) // SSE2 is required for X64
@@ -68,6 +75,12 @@
 #include <immintrin.h>
 #endif // defined(__AVX2__)
 
+#if defined(__AVX512F__)
+#define LO_AVX512F_AVAILABLE
+#include <immintrin.h>
+#else
+#endif // defined(__AVX512F__)
+
 #endif // end compiler Clang and GCC
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/Library_sc.mk b/sc/Library_sc.mk
index f1b62e1c263d..a85650e935e6 100644
--- a/sc/Library_sc.mk
+++ b/sc/Library_sc.mk
@@ -99,6 +99,14 @@ $(eval $(call gb_Library_use_libraries,sc,\
 ))
 
 $(eval $(call gb_Library_add_exception_objects,sc,\
+    sc/source/core/tool/arraysumAVX512, $(CXXFLAGS_INTRINSICS_AVX512F) \
+))
+
+$(eval $(call gb_Library_add_exception_objects,sc,\
+    sc/source/core/tool/arraysumAVX, $(CXXFLAGS_INTRINSICS_AVX) \
+))
+
+$(eval $(call gb_Library_add_exception_objects,sc,\
     sc/source/core/tool/arraysumSSE2, $(CXXFLAGS_INTRINSICS_SSE2) \
 ))
 
diff --git a/sc/inc/arraysumfunctor.hxx b/sc/inc/arraysumfunctor.hxx
new file mode 100644
index 000000000000..ee6d4c0fca5c
--- /dev/null
+++ b/sc/inc/arraysumfunctor.hxx
@@ -0,0 +1,137 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#pragma once
+
+#include <cmath>
+#include "kahan.hxx"
+#include "scdllapi.h"
+#include <tools/cpuid.hxx>
+#include <formula/errorcodes.hxx>
+
+namespace sc::op
+{
+/* Checkout available optimization options */
+SC_DLLPUBLIC extern const bool hasAVX512F;
+const bool hasAVX = cpuid::hasAVX();
+const bool hasSSE2 = cpuid::hasSSE2();
+
+/**
+  * Performs one step of the Neumanier sum between doubles
+  * Overwrites the summand and error
+  * @parma sum
+  * @param err
+  * @param value
+  */
+inline void sumNeumanierNormal(double& sum, double& err, const double& value)
+{
+    double t = sum + value;
+    if (std::abs(sum) >= std::abs(value))
+        err += (sum - t) + value;
+    else
+        err += (value - t) + sum;
+    sum = t;
+}
+
+/**
+  * If no boosts available, Unrolled KahanSum.
+  * Most likely to use on android.
+  */
+static inline KahanSum executeUnrolled(size_t& i, size_t nSize, const double* pCurrent)
+{
+    size_t nRealSize = nSize - i;
+    size_t nUnrolledSize = nRealSize - (nRealSize % 4);
+
+    if (nUnrolledSize > 0)
+    {
+        KahanSum sum0 = 0.0;
+        KahanSum sum1 = 0.0;
+        KahanSum sum2 = 0.0;
+        KahanSum sum3 = 0.0;
+
+        for (; i + 3 < nUnrolledSize; i += 4)
+        {
+            sum0 += *pCurrent++;
+            sum1 += *pCurrent++;
+            sum2 += *pCurrent++;
+            sum3 += *pCurrent++;
+        }
+        // We are using pairwise summation alongside Kahan
+        return (sum0 + sum1) + (sum2 + sum3);
+    }
+    return 0.0;
+}
+
+/* Available methos */
+SC_DLLPUBLIC KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent);
+SC_DLLPUBLIC KahanSum executeAVX(size_t& i, size_t nSize, const double* pCurrent);
+SC_DLLPUBLIC KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent);
+
+/**
+  * This function task is to choose the fastest method available to perform the sum.
+  * @param i
+  * @param nSize
+  * @param pCurrent
+  */
+static inline KahanSum executeFast(size_t& i, size_t nSize, const double* pCurrent)
+{
+    if (hasAVX512F)
+        return executeAVX512F(i, nSize, pCurrent);
+    if (hasAVX)
+        return executeAVX(i, nSize, pCurrent);
+    if (hasSSE2)
+        return executeSSE2(i, nSize, pCurrent);
+    return executeUnrolled(i, nSize, pCurrent);
+}
+
+/**
+  * Performs the sum of an array.
+  * Note that align 16 will speed up the process.
+  * @param pArray
+  * @param nSize
+  */
+inline KahanSum sumArray(const double* pArray, size_t nSize)
+{
+    size_t i = 0;
+    const double* pCurrent = pArray;
+    KahanSum fSum = executeFast(i, nSize, pCurrent);
+
+    // sum rest of the array
+    for (; i < nSize; ++i)
+        fSum += pArray[i];
+
+    // If the sum is a NaN, some of the terms were empty cells, probably.
+    // Re-calculate, carefully
+    double fVal = fSum.get();
+    if (!std::isfinite(fVal))
+    {
+        FormulaError nErr = GetDoubleErrorValue(fVal);
+        if (nErr == FormulaError::NoValue)
+        {
+            fSum = 0;
+            for (i = 0; i < nSize; i++)
+            {
+                if (!std::isfinite(pArray[i]))
+                {
+                    nErr = GetDoubleErrorValue(pArray[i]);
+                    if (nErr != FormulaError::NoValue)
+                        fSum += pArray[i]; // Let errors encoded as NaNs propagate ???
+                }
+                else
+                    fSum += pArray[i];
+            }
+        }
+    }
+    return fSum;
+}
+
+} // end namespace sc::op
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/inc/kahan.hxx b/sc/inc/kahan.hxx
index 49c7922b3c79..3404fb6d14a6 100644
--- a/sc/inc/kahan.hxx
+++ b/sc/inc/kahan.hxx
@@ -28,6 +28,12 @@ public:
     {
     }
 
+    constexpr KahanSum(double x_0, double err_0)
+        : m_fSum(x_0)
+        , m_fError(err_0)
+    {
+    }
+
     constexpr KahanSum(const KahanSum& fSum) = default;
 
 public:
@@ -182,6 +188,9 @@ public:
         return fSum.m_fSum != m_fSum || fSum.m_fError != m_fError;
     }
 
+    constexpr double* getSum() { return &m_fSum; }
+    constexpr double* getError() { return &m_fError; }
+
 public:
     /**
       * Returns the final sum.
diff --git a/sc/qa/unit/functions_statistical.cxx b/sc/qa/unit/functions_statistical.cxx
index 4d97d4cc1689..e82d762d88ee 100644
--- a/sc/qa/unit/functions_statistical.cxx
+++ b/sc/qa/unit/functions_statistical.cxx
@@ -1,4 +1,5 @@
 #include "functions_test.hxx"
+#include <arraysumfunctor.hxx>
 
 class StatisticalFunctionsTest : public FunctionsTest
 {
@@ -6,9 +7,11 @@ public:
     StatisticalFunctionsTest();
 
     void testStatisticalFormulasFODS();
+    void testIntrinsicSums();
 
     CPPUNIT_TEST_SUITE(StatisticalFunctionsTest);
     CPPUNIT_TEST(testStatisticalFormulasFODS);
+    CPPUNIT_TEST(testIntrinsicSums);
     CPPUNIT_TEST_SUITE_END();
 
 };
@@ -26,6 +29,25 @@ StatisticalFunctionsTest::StatisticalFunctionsTest():
 {
 }
 
+void StatisticalFunctionsTest::testIntrinsicSums()
+{
+    // Checkout SSE2, AVX and AVX512 opperations
+        // Needs exactly 9 terms
+    double summands[9] = { 0, 1, 2, 3, 4, 10, 20, 2, -1 };
+    double* pCurrent = summands;
+    size_t i = 0;
+    if (sc::op::hasAVX512F)
+        CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeAVX512F(i, 9, pCurrent).get());
+    i = 0;
+    if (sc::op::hasAVX)
+        CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeAVX(i, 9, pCurrent).get());
+    i = 0;
+    if (sc::op::hasSSE2)
+        CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeSSE2(i, 9, pCurrent).get());
+    i = 0;
+    CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeUnrolled(i, 9, pCurrent).get());
+}
+
 CPPUNIT_TEST_SUITE_REGISTRATION(StatisticalFunctionsTest);
 
 CPPUNIT_PLUGIN_IMPLEMENT();
diff --git a/sc/source/core/inc/arraysumfunctor.hxx b/sc/source/core/inc/arraysumfunctor.hxx
deleted file mode 100644
index ae8d38db1338..000000000000
--- a/sc/source/core/inc/arraysumfunctor.hxx
+++ /dev/null
@@ -1,121 +0,0 @@
-/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
-/*
- * This file is part of the LibreOffice project.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- *
- */
-
-#pragma once
-
-#include <cstdint>
-#include <cmath>
-
-#include <sal/mathconf.h>
-#include <sal/types.h>
-#include <tools/simd.hxx>
-#include <tools/cpuid.hxx>
-#include <kahan.hxx>
-
-namespace sc
-{
-struct ArraySumFunctor
-{
-private:
-    const double* mpArray;
-    size_t mnSize;
-
-public:
-    ArraySumFunctor(const double* pArray, size_t nSize)
-        : mpArray(pArray)
-        , mnSize(nSize)
-    {
-    }
-
-    KahanSum operator()()
-    {
-        const static bool hasSSE2 = cpuid::hasSSE2();
-
-        KahanSum fSum = 0.0;
-        size_t i = 0;
-        const double* pCurrent = mpArray;
-
-        if (hasSSE2)
-        {
-            while (i < mnSize && !simd::isAligned<double, 16>(pCurrent))
-            {
-                fSum += *pCurrent++;
-                i++;
-            }
-            if (i < mnSize)
-            {
-                fSum += executeSSE2(i, pCurrent);
-            }
-        }
-        else
-            fSum = executeUnrolled(i, pCurrent);
-
-        // sum rest of the array
-
-        for (; i < mnSize; ++i)
-            fSum += mpArray[i];
-
-        // If the sum is a NaN, some of the terms were empty cells, probably.
-        // Re-calculate, carefully
-        double fVal = fSum.get();
-        if (!std::isfinite(fVal))
-        {
-            sal_uInt32 nErr = reinterpret_cast<sal_math_Double*>(&fVal)->nan_parts.fraction_lo;
-            if (nErr & 0xffff0000)
-            {
-                fSum = 0;
-                for (i = 0; i < mnSize; i++)
-                {
-                    if (!std::isfinite(mpArray[i]))
-                    {
-                        nErr = reinterpret_cast<const sal_math_Double*>(&mpArray[i])
-                                   ->nan_parts.fraction_lo;
-                        if (!(nErr & 0xffff0000))
-                            fSum += mpArray[i]; // Let errors encoded as NaNs propagate ???
-                    }
-                    else
-                        fSum += mpArray[i];
-                }
-            }
-        }
-        return fSum;
-    }
-
-private:
-    double executeSSE2(size_t& i, const double* pCurrent) const;
-    KahanSum executeUnrolled(size_t& i, const double* pCurrent) const
-    {
-        size_t nRealSize = mnSize - i;
-        size_t nUnrolledSize = nRealSize - (nRealSize % 4);
-
-        if (nUnrolledSize > 0)
-        {
-            KahanSum sum0 = 0.0;
-            KahanSum sum1 = 0.0;
-            KahanSum sum2 = 0.0;
-            KahanSum sum3 = 0.0;
-
-            for (; i < nUnrolledSize; i += 4)
-            {
-                sum0 += *pCurrent++;
-                sum1 += *pCurrent++;
-                sum2 += *pCurrent++;
-                sum3 += *pCurrent++;
-            }
-            // We are using pairwise summation alongside Kahan
-            return (sum0 + sum1) + (sum2 + sum3);
-        }
-        return 0.0;
-    }
-};
-
-} // end namespace sc
-
-/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/arraysumAVX.cxx b/sc/source/core/tool/arraysumAVX.cxx
new file mode 100644
index 000000000000..af12a08a8cf5
--- /dev/null
+++ b/sc/source/core/tool/arraysumAVX.cxx
@@ -0,0 +1,114 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#include <arraysumfunctor.hxx>
+#include <sal/log.hxx>
+#include <tools/simd.hxx>
+#include <tools/simdsupport.hxx>
+
+namespace sc::op
+{
+#ifdef LO_AVX_AVAILABLE // Old processors
+
+const __m256d ANNULATE_SIGN_BIT = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
+
+/** Kahan sum with AVX.
+  */
+static inline void sumAVX(__m256d& sum, __m256d& err, const __m256d& value)
+{
+    // Temporal parameter
+    __m256d t = _mm256_add_pd(sum, value);
+    // Absolute value of the total sum
+    __m256d asum = _mm256_and_pd(sum, ANNULATE_SIGN_BIT);
+    // Absolute value of the value to add
+    __m256d avalue = _mm256_and_pd(value, ANNULATE_SIGN_BIT);
+    // Comaprate the absolute values sum >= value
+    __m256d mask = _mm256_cmp_pd(asum, avalue, _CMP_GE_OQ);
+    // The following code has this form ( a - t + b)
+    // Case 1: a = sum b = value
+    // Case 2: a = value b = sum
+    __m256d a = _mm256_add_pd(_mm256_and_pd(mask, sum), _mm256_andnot_pd(mask, value));
+    __m256d b = _mm256_add_pd(_mm256_and_pd(mask, value), _mm256_andnot_pd(mask, sum));
+    err = _mm256_add_pd(err, _mm256_add_pd(_mm256_sub_pd(a, t), b));
+    // Store result
+    sum = t;
+}
+
+#endif
+
+/** Execute Kahan sum with AVX.
+  */
+KahanSum executeAVX(size_t& i, size_t nSize, const double* pCurrent)
+{
+#ifdef LO_AVX_AVAILABLE
+    // Make sure we don't fall out of bounds.
+    // This works by sums of 8 terms.
+    // So the 8'th term is i+7
+    // If we iterate untill nSize won't fall out of bounds
+    if (nSize > i + 7)
+    {
+        // Setup sums and errors as 0
+        __m256d sum1 = _mm256_setzero_pd();
+        __m256d err1 = _mm256_setzero_pd();
+        __m256d sum2 = _mm256_setzero_pd();
+        __m256d err2 = _mm256_setzero_pd();
+
+        for (; i + 7 < nSize; i += 8)
+        {
+            // Kahan sum 1
+            __m256d load1 = _mm256_loadu_pd(pCurrent);
+            sumAVX(sum1, err1, load1);
+            pCurrent += 4;
+
+            // Kahan sum 2
+            __m256d load2 = _mm256_loadu_pd(pCurrent);
+            sumAVX(sum2, err2, load2);
+            pCurrent += 4;
+        }
+
+        // Now we combine pairwise summation with Kahan summation
+
+        // sum 1 + sum 2 -> sum 1
+        sumAVX(sum1, err1, sum2);
+        sumAVX(sum1, err1, err2);
+
+        // Store results
+        double sums[4];
+        double errs[4];
+        _mm256_storeu_pd(&sums[0], sum1);
+        _mm256_storeu_pd(&errs[0], err1);
+
+        // First Kahan & pairwise summation
+        // 0+1 1+2 -> 0, 2
+        sumNeumanierNormal(sums[0], errs[0], sums[1]);
+        sumNeumanierNormal(sums[2], errs[2], sums[3]);
+        sumNeumanierNormal(sums[0], errs[0], errs[1]);
+        sumNeumanierNormal(sums[2], errs[2], errs[3]);
+
+        // 0+2 -> 0
+        sumNeumanierNormal(sums[0], errs[0], sums[2]);
+        sumNeumanierNormal(sums[0], errs[0], errs[2]);
+
+        // Store result
+        return KahanSum(sums[0], errs[0]);
+    }
+    return 0.0;
+#else
+    SAL_WARN("sc", "Failed to use AVX");
+    (void)i;
+    (void)nSize;
+    (void)pCurrent;
+    return 0.0;
+#endif
+}
+
+} // end namespace sc::op
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/arraysumAVX512.cxx b/sc/source/core/tool/arraysumAVX512.cxx
new file mode 100644
index 000000000000..55764849edfb
--- /dev/null
+++ b/sc/source/core/tool/arraysumAVX512.cxx
@@ -0,0 +1,140 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#include <arraysumfunctor.hxx>
+#include <sal/log.hxx>
+#include <tools/simd.hxx>
+#include <tools/simdsupport.hxx>
+
+/* TODO Remove this once GCC updated and AVX512 can work. */
+#ifdef __GNUC__
+#if __GNUC__ < 9
+#ifdef LO_AVX512F_AVAILABLE
+#define HAS_LO_AVX512F_AVAILABLE
+#undef LO_AVX512F_AVAILABLE
+#endif
+#endif
+#endif
+
+#ifdef LO_AVX512F_AVAILABLE
+const bool sc::op::hasAVX512F = cpuid::hasAVX512F();
+#else
+const bool sc::op::hasAVX512F = false;
+#endif
+
+namespace sc::op
+{
+#ifdef LO_AVX512F_AVAILABLE // New processors
+
+/** Kahan sum with AVX512.
+  */
+static inline void sumAVX512(__m512d& sum, __m512d& err, const __m512d& value)
+{
+    // Temporal parameter
+    __m512d t = _mm512_add_pd(sum, value);
+    // Absolute value of the total sum
+    __m512d asum = _mm512_abs_pd(sum);
+    // Absolute value of the value to add
+    __m512d avalue = _mm512_abs_pd(value);
+    // Comaprate the absolute values sum >= value
+    __mmask8 mask = _mm512_cmp_pd_mask(avalue, asum, _CMP_GE_OQ);
+    // The following code has this form ( a - t + b)
+    // Case 1: a = sum b = value
+    // Case 2: a = value b = sum
+    __m512d a = _mm512_mask_blend_pd(mask, sum, value);
+    __m512d b = _mm512_mask_blend_pd(mask, value, sum);
+    err = _mm512_add_pd(err, _mm512_add_pd(_mm512_sub_pd(a, t), b));
+    // Store result
+    sum = t;
+}
+
+#endif
+
+/** Execute Kahan sum with AVX512.
+  */
+KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent)
+{
+#ifdef LO_AVX512F_AVAILABLE // New processors
+    // Make sure we don't fall out of bounds.
+    // This works by sums of 8 terms.
+    // So the 8'th term is i+7
+    // If we iterate untill nSize won't fall out of bounds
+    if (nSize > i + 7)
+    {
+        // Setup sums and errors as 0
+        __m512d sum = _mm512_setzero_pd();
+        __m512d err = _mm512_setzero_pd();
+
+        // Sum the stuff
+        for (; i + 7 < nSize; i += 8)
+        {
+            // Kahan sum
+            __m512d load = _mm512_loadu_pd(pCurrent);
+            sumAVX512(sum, err, load);
+            pCurrent += 8;
+        }
+
+        // Store result
+        static_assert(sizeof(double) == 8);
+        double sums[8];
+        double errs[8];
+        _mm512_storeu_pd(static_cast<void*>(&sums[0]), sum);
+        _mm512_storeu_pd(static_cast<void*>(&errs[0]), err);
+
+        // First Kahan & pairwise summation
+        // 0+1 1+2 3+4 4+5 6+7 -> 0, 2, 4, 6
+        sumNeumanierNormal(sums[0], errs[0], sums[1]);
+        sumNeumanierNormal(sums[2], errs[2], sums[3]);
+        sumNeumanierNormal(sums[4], errs[4], sums[5]);
+        sumNeumanierNormal(sums[6], errs[6], sums[7]);
+        sumNeumanierNormal(sums[0], errs[0], errs[1]);
+        sumNeumanierNormal(sums[2], errs[2], errs[3]);
+        sumNeumanierNormal(sums[4], errs[4], errs[5]);
+        sumNeumanierNormal(sums[6], errs[6], errs[7]);
+
+        // Second Kahan & pairwise summation
+        // 0+2 4+6 -> 0, 4
+        sumNeumanierNormal(sums[0], errs[0], sums[2]);
+        sumNeumanierNormal(sums[4], errs[4], sums[6]);
+        sumNeumanierNormal(sums[0], errs[0], errs[2]);
+        sumNeumanierNormal(sums[4], errs[4], errs[6]);
+
+        // Third Kahan & pairwise summation
+        // 0+4 -> 0
+        sumNeumanierNormal(sums[0], errs[0], sums[4]);
+        sumNeumanierNormal(sums[0], errs[0], errs[4]);
+
+        // Return final result
+        return KahanSum(sums[0], errs[0]);
+    }
+    else
+        return 0.0;
+#else
+    SAL_WARN("sc", "Failed to use AVX 512");
+    (void)i;
+    (void)nSize;
+    (void)pCurrent;
+    return 0.0;
+#endif
+}
+
+} // end namespace sc::op
+
+/* TODO Remove this once GCC updated and AVX512 can work. */
+#ifdef __GNUC__
+#if __GNUC__ < 9
+#ifdef HAS_LO_AVX512F_AVAILABLE
+#define LO_AVX512F_AVAILABLE
+#undef HAS_LO_AVX512F_AVAILABLE
+#endif
+#endif
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx
index e69f672b6014..4f6a4b47a11d 100644
--- a/sc/source/core/tool/arraysumSSE2.cxx
+++ b/sc/source/core/tool/arraysumSSE2.cxx
@@ -9,97 +9,121 @@
  */
 
 #include <arraysumfunctor.hxx>
+#include <sal/log.hxx>
+#include <tools/simd.hxx>
 #include <tools/simdsupport.hxx>
 
-namespace sc
+//AVX512VL + AVX512F + KNCNI
+
+namespace sc::op
 {
-double ArraySumFunctor::executeSSE2(size_t& i, const double* pCurrent) const
+#ifdef LO_SSE2_AVAILABLE // Old processors
+
+const __m128d ANNULATE_SIGN_BIT = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
+
+/** Kahan sum with SSE4.2.
+  */
+static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value)
 {
-#if defined(LO_SSE2_AVAILABLE)
-    double fSum = 0.0;
-    size_t nRealSize = mnSize - i;
-    size_t nUnrolledSize = nRealSize - (nRealSize % 8);
+    // Temporal parameter
+    __m128d t = _mm_add_pd(sum, value);
+    // Absolute value of the total sum
+    __m128d asum = _mm_and_pd(sum, ANNULATE_SIGN_BIT);
+    // Absolute value of the value to add
+    __m128d avalue = _mm_and_pd(value, ANNULATE_SIGN_BIT);
+    // Comaprate the absolute values sum >= value
+    __m128d mask = _mm_cmpge_pd(asum, avalue);
+    // The following code has this form ( a - t + b)
+    // Case 1: a = sum b = value
+    // Case 2: a = value b = sum
+    __m128d a = _mm_add_pd(_mm_and_pd(mask, sum), _mm_andnot_pd(mask, value));
+    __m128d b = _mm_add_pd(_mm_and_pd(mask, value), _mm_andnot_pd(mask, sum));
+    err = _mm_add_pd(err, _mm_add_pd(_mm_sub_pd(a, t), b));
+    // Store result
+    sum = t;
+}
+
+#endif
 
-    if (nUnrolledSize > 0)
+/** Execute Kahan sum with SSE2.
+  */
+KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
+{
+#ifdef LO_SSE2_AVAILABLE
+    // Make sure we don't fall out of bounds.
+    // This works by sums of 8 terms.
+    // So the 8'th term is i+7
+    // If we iterate untill nSize won't fall out of bounds
+    if (nSize > i + 7)
     {
+        // Setup sums and errors as 0
         __m128d sum1 = _mm_setzero_pd();
-        __m128d sum2 = _mm_setzero_pd();
-        __m128d sum3 = _mm_setzero_pd();
-        __m128d sum4 = _mm_setzero_pd();
-
         __m128d err1 = _mm_setzero_pd();
+        __m128d sum2 = _mm_setzero_pd();
         __m128d err2 = _mm_setzero_pd();
+        __m128d sum3 = _mm_setzero_pd();
         __m128d err3 = _mm_setzero_pd();
+        __m128d sum4 = _mm_setzero_pd();
         __m128d err4 = _mm_setzero_pd();
 
-        __m128d y, t;
-
-        for (; i < nUnrolledSize; i += 8)
+        for (; i + 7 < nSize; i += 8)
         {
             // Kahan sum 1
-            __m128d load1 = _mm_load_pd(pCurrent);
-            y = _mm_sub_pd(load1, err1);
-            t = _mm_add_pd(sum1, y);
-            err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y);
-            sum1 = t;
+            __m128d load1 = _mm_loadu_pd(pCurrent);
+            sumSSE2(sum1, err1, load1);
             pCurrent += 2;
 
             // Kahan sum 2
-            __m128d load2 = _mm_load_pd(pCurrent);
-            y = _mm_sub_pd(load2, err2);
-            t = _mm_add_pd(sum2, y);
-            err2 = _mm_sub_pd(_mm_sub_pd(t, sum2), y);
-            sum2 = t;
+            __m128d load2 = _mm_loadu_pd(pCurrent);
+            sumSSE2(sum2, err2, load2);
             pCurrent += 2;
 
             // Kahan sum 3
-            __m128d load3 = _mm_load_pd(pCurrent);
-            y = _mm_sub_pd(load3, err3);
-            t = _mm_add_pd(sum3, y);
-            err3 = _mm_sub_pd(_mm_sub_pd(t, sum3), y);
-            sum3 = t;
+            __m128d load3 = _mm_loadu_pd(pCurrent);
+            sumSSE2(sum3, err3, load3);
             pCurrent += 2;
 
             // Kahan sum 4
-            __m128d load4 = _mm_load_pd(pCurrent);
-            y = _mm_sub_pd(load4, err4);
-            t = _mm_add_pd(sum4, y);
-            err4 = _mm_sub_pd(_mm_sub_pd(t, sum4), y);
-            sum4 = t;
+            __m128d load4 = _mm_loadu_pd(pCurrent);
+            sumSSE2(sum4, err4, load4);
             pCurrent += 2;
         }
 
         // Now we combine pairwise summation with Kahan summation
 
-        // sum 1 + sum 2
-        y = _mm_sub_pd(sum2, err1);
-        t = _mm_add_pd(sum1, y);
-        err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y);
-        sum1 = t;
-
-        // sum 3 + sum 4
-        y = _mm_sub_pd(sum4, err3);
-        t = _mm_add_pd(sum3, y);
-        sum3 = t;
-
-        // sum 1 + sum 3
-        y = _mm_sub_pd(sum3, err1);
-        t = _mm_add_pd(sum1, y);
-        sum1 = t;
-
-        double temp;
-
-        _mm_storel_pd(&temp, sum1);
-        fSum += temp;
-
-        _mm_storeh_pd(&temp, sum1);
-        fSum += temp;
+        // 1+2 3+4 -> 1, 3
+        sumSSE2(sum1, err1, sum2);
+        sumSSE2(sum1, err1, err2);
+        sumSSE2(sum3, err3, sum4);
+        sumSSE2(sum3, err3, err4);
+
+        // 1+3 -> 1
+        sumSSE2(sum1, err1, sum3);
+        sumSSE2(sum1, err1, err3);
+
+        // Store results
+        double sums[2];
+        double errs[2];
+        _mm_storeu_pd(&sums[0], sum1);
+        _mm_storeu_pd(&errs[0], err1);
+
+        // First Kahan & pairwise summation
+        // 0+1 -> 0
+        sumNeumanierNormal(sums[0], errs[0], sums[1]);
+        sumNeumanierNormal(sums[0], errs[0], errs[1]);
+
+        // Store result
+        return KahanSum(sums[0], errs[0]);
     }
-    return fSum;
+    return 0.0;
 #else
+    SAL_WARN("sc", "Failed to use SSE2");
     (void)i;
+    (void)nSize;
     (void)pCurrent;
     return 0.0;
 #endif
 }
 }
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/interpr6.cxx b/sc/source/core/tool/interpr6.cxx
index a291b444ecdc..dbeab67fe35c 100644
--- a/sc/source/core/tool/interpr6.cxx
+++ b/sc/source/core/tool/interpr6.cxx
@@ -223,9 +223,7 @@ public:
                     return;
 
                 const double *p = &sc::numeric_block::at(*rNode.data, nOffset);
-                sc::ArraySumFunctor functor(p, nDataSize);
-
-                maSum += functor();
+                maSum += sc::op::sumArray(p, nDataSize);
                 break;
             }
 
diff --git a/tools/source/misc/cpuid.cxx b/tools/source/misc/cpuid.cxx
index 0395d0e0f001..855b87e6da63 100644
--- a/tools/source/misc/cpuid.cxx
+++ b/tools/source/misc/cpuid.cxx
@@ -56,6 +56,7 @@ bool checkAVXSupportInOS()
 #define XSAVE_bit (1 << 27)
 #define AVX_bit (1 << 28)
 #define AVX2_bit (1 << 5)
+#define AVX512F_bit (1 << 16)
 
 InstructionSetFlags getCpuInstructionSetFlags()
 {
@@ -98,6 +99,8 @@ InstructionSetFlags getCpuInstructionSetFlags()
 
                     if ((aExtendedInfo[1] & AVX2_bit) != 0)
                         eInstructions |= InstructionSetFlags::AVX2;
+                    if ((aExtendedInfo[1] & AVX512F_bit) != 0)
+                        eInstructions |= InstructionSetFlags::AVX512F;
                 }
             }
         }
@@ -127,6 +130,8 @@ OUString instructionSetSupportedString()
         aString += "AVX ";
     if (isCpuInstructionSetSupported(InstructionSetFlags::AVX2))
         aString += "AVX2 ";
+    if (isCpuInstructionSetSupported(InstructionSetFlags::AVX512F))
+        aString += "AVX512F ";
     return aString;
 }
author	dante <dante19031999@gmail.com>	2021-05-16 16:04:20 +0200
committer	Mike Kaganski <mike.kaganski@collabora.com>	2021-08-26 08:48:35 +0200
commit	5b9cf5881ef53fac5f1d8376f687dbadf9d3cf2b (patch)
tree	ff16b0dcf62236c48fed6ca87e7d584959ed1841
parent	fb1233d77500413ba237c335a84773d8a6f8e381 (diff)