diff options
author | Dennis Francis <dennis.francis@collabora.com> | 2019-10-07 18:02:04 +0530 |
---|---|---|
committer | Dennis Francis <dennis.francis@collabora.com> | 2019-10-17 08:07:32 +0200 |
commit | 3c2587a152476cbb0ca4a83138a4c34ec8065b32 (patch) | |
tree | a43ba7aea0b91c58aa896bf2a82449ca6822e81c | |
parent | a5b394a4c6ce5aa93654ff6d57fc497bcea93001 (diff) |
Move the SSE2 sum code to a separate .cxx file
and compile it with -arch:SSE2 on Windows.
This code is only called when cpuid::hasSSE2()
returns true, so it does not cause problems on machines
without SSE2 support.
Change-Id: Ice23ac71d4c577b8811b08c74a3ca500a94fdc09
Reviewed-on: https://gerrit.libreoffice.org/80847
Tested-by: Jenkins
Reviewed-by: Luboš Luňák <l.lunak@collabora.com>
-rw-r--r-- | sc/Library_sc.mk | 11 |
-rw-r--r-- | sc/source/core/inc/arraysumfunctor.hxx | 51 |
-rw-r--r-- | sc/source/core/tool/arraysumSSE2.cxx | 65 |
3 files changed, 77 insertions, 50 deletions
diff --git a/sc/Library_sc.mk b/sc/Library_sc.mk
index d594ca110f6d..9c057eaf6a6d 100644
--- a/sc/Library_sc.mk
+++ b/sc/Library_sc.mk
@@ -98,6 +98,17 @@ $(eval $(call gb_Library_use_libraries,sc,\
     xo \
 ))
 
+ifeq ($(OS),WNT)
+$(eval $(call gb_Library_add_exception_objects,sc,\
+    sc/source/core/tool/arraysumSSE2, -arch:SSE2 \
+))
+
+else
+$(eval $(call gb_Library_add_exception_objects,sc,\
+    sc/source/core/tool/arraysumSSE2 \
+))
+endif
+
 $(eval $(call gb_Library_add_exception_objects,sc,\
     sc/source/core/data/attarray \
     sc/source/core/data/attrib \
diff --git a/sc/source/core/inc/arraysumfunctor.hxx b/sc/source/core/inc/arraysumfunctor.hxx
index 7ef8a7face05..f1182874b1a1 100644
--- a/sc/source/core/inc/arraysumfunctor.hxx
+++ b/sc/source/core/inc/arraysumfunctor.hxx
@@ -14,7 +14,6 @@
 #include <cstdint>
 #include <rtl/math.hxx>
 
-#include <tools/simdsupport.hxx>
 #include <tools/simd.hxx>
 #include <tools/cpuid.hxx>
 
@@ -87,56 +86,8 @@ public:
     }
 
 private:
-    double executeSSE2(size_t& i, const double* pCurrent) const
-    {
-#if defined(LO_SSE2_AVAILABLE)
-        double fSum = 0.0;
-        size_t nRealSize = mnSize - i;
-        size_t nUnrolledSize = nRealSize - (nRealSize % 8);
-
-        if (nUnrolledSize > 0)
-        {
-            __m128d sum1 = _mm_setzero_pd();
-            __m128d sum2 = _mm_setzero_pd();
-            __m128d sum3 = _mm_setzero_pd();
-            __m128d sum4 = _mm_setzero_pd();
-
-            for (; i < nUnrolledSize; i += 8)
-            {
-                __m128d load1 = _mm_load_pd(pCurrent);
-                sum1 = _mm_add_pd(sum1, load1);
-                pCurrent += 2;
-
-                __m128d load2 = _mm_load_pd(pCurrent);
-                sum2 = _mm_add_pd(sum2, load2);
-                pCurrent += 2;
-
-                __m128d load3 = _mm_load_pd(pCurrent);
-                sum3 = _mm_add_pd(sum3, load3);
-                pCurrent += 2;
-
-                __m128d load4 = _mm_load_pd(pCurrent);
-                sum4 = _mm_add_pd(sum4, load4);
-                pCurrent += 2;
-            }
-            sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4));
-
-            double temp;
-
-            _mm_storel_pd(&temp, sum1);
-            fSum += temp;
-
-            _mm_storeh_pd(&temp, sum1);
-            fSum += temp;
-        }
-        return fSum;
-#else
-        (void) i;
-        (void) pCurrent;
-        return 0.0;
-#endif
-    }
+    double executeSSE2(size_t& i, const double* pCurrent) const;
 
     double executeUnrolled(size_t& i, const double* pCurrent) const
     {
         size_t nRealSize = mnSize - i;
diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx
new file mode 100644
index 000000000000..894675335834
--- /dev/null
+++ b/sc/source/core/tool/arraysumSSE2.cxx
@@ -0,0 +1,65 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#include <arraysumfunctor.hxx>
+#include <tools/simdsupport.hxx>
+
+namespace sc
+{
+double ArraySumFunctor::executeSSE2(size_t& i, const double* pCurrent) const
+{
+#if defined(LO_SSE2_AVAILABLE)
+    double fSum = 0.0;
+    size_t nRealSize = mnSize - i;
+    size_t nUnrolledSize = nRealSize - (nRealSize % 8);
+
+    if (nUnrolledSize > 0)
+    {
+        __m128d sum1 = _mm_setzero_pd();
+        __m128d sum2 = _mm_setzero_pd();
+        __m128d sum3 = _mm_setzero_pd();
+        __m128d sum4 = _mm_setzero_pd();
+
+        for (; i < nUnrolledSize; i += 8)
+        {
+            __m128d load1 = _mm_load_pd(pCurrent);
+            sum1 = _mm_add_pd(sum1, load1);
+            pCurrent += 2;
+
+            __m128d load2 = _mm_load_pd(pCurrent);
+            sum2 = _mm_add_pd(sum2, load2);
+            pCurrent += 2;
+
+            __m128d load3 = _mm_load_pd(pCurrent);
+            sum3 = _mm_add_pd(sum3, load3);
+            pCurrent += 2;
+
+            __m128d load4 = _mm_load_pd(pCurrent);
+            sum4 = _mm_add_pd(sum4, load4);
+            pCurrent += 2;
+        }
+        sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4));
+
+        double temp;
+
+        _mm_storel_pd(&temp, sum1);
+        fSum += temp;
+
+        _mm_storeh_pd(&temp, sum1);
+        fSum += temp;
+    }
+    return fSum;
+#else
+    (void)i;
+    (void)pCurrent;
+    return 0.0;
+#endif
+}
+}
\ No newline at end of file