Improve multi-threaded scaling in BitmapScaleSuperFilter

The old approach was to calculate the number of stripes of the bitmap per thread and later create exactly as many tasks (ScaleTask) as there are threads, where each task would process the stripes it had been given. This is needlessly complicated, as the job of a thread pool is to properly delegate the tasks between threads. This has now been changed so that we create one stripe per ScaleTask and let the threadpool delegate the tasks to its threads (that are available). The old code also tried to be clever and use the main thread to do some of the work as well, but this had a major flaw. The threadpool started to process the tasks only when the "waitUntilDone" method was called, but the code first processed its own slices and only then called the threadpool method to start processing. Because of this the performance of scaling wasn't as good as it could be. This behaviour has now been changed so that the main thread isn't involved in processing. It just creates the tasks, runs the threadpool and waits until the tasks are finished. Change-Id: I1e8c733bdbced8867d0a7f1190f0421a0cc3e067 Reviewed-on: https://gerrit.libreoffice.org/70668 Reviewed-by: Tomaž Vajngerl <quikee@gmail.com> Tested-by: Tomaž Vajngerl <quikee@gmail.com>
author: Tomaž Vajngerl <tomaz.vajngerl@collabora.co.uk> 2019-04-12 22:38:13 +0900
committer: Tomaž Vajngerl <quikee@gmail.com> 2019-04-13 04:23:01 +0200
commit: 9695896d8f0e3d4b2961c7a753c279a70f5bbaf2 (patch)
tree: 1d7286c0a0d66a727ef464fff6f8cc16493617cf /vcl/source
parent: 7a092e254111a2d98446e7140ef24c652c245bfa (diff)
1 files changed, 35 insertions, 37 deletions
diff --git a/vcl/source/bitmap/BitmapScaleSuperFilter.cxx b/vcl/source/bitmap/BitmapScaleSuperFilter.cxx
index ea73f3b10a04..a10bd8ccc17c 100644
--- a/vcl/source/bitmap/BitmapScaleSuperFilter.cxx
+++ b/vcl/source/bitmap/BitmapScaleSuperFilter.cxx
@@ -77,29 +77,32 @@ struct ScaleContext {
     }
 };
 
-#define SCALE_THREAD_STRIP 32
-struct ScaleRangeContext {
-    ScaleContext *mrCtx;
-    long mnStartY, mnEndY;
-    ScaleRangeContext( ScaleContext *rCtx, long nStartY )
-        : mrCtx( rCtx ), mnStartY( nStartY ),
-          mnEndY( nStartY + SCALE_THREAD_STRIP ) {}
-};
+constexpr long constScaleThreadStrip = 32;
 
-typedef void (*ScaleRangeFn)(ScaleContext &rCtx, long nStartY, long nEndY);
+typedef void (*ScaleRangeFn)(ScaleContext &rContext, long nStartY, long nEndY);
 
 class ScaleTask : public comphelper::ThreadTask
 {
-    ScaleRangeFn const mpFn;
-    std::vector< ScaleRangeContext > maStrips;
+    ScaleRangeFn const mpScaleRangeFunction;
+    ScaleContext& mrContext;
+    const long mnStartY;
+    const long mnEndY;
+
 public:
-    explicit ScaleTask( const std::shared_ptr<comphelper::ThreadTaskTag>& pTag, ScaleRangeFn pFn )
-        : comphelper::ThreadTask(pTag), mpFn( pFn ) {}
-    void push( ScaleRangeContext const &aRC ) { maStrips.push_back( aRC ); }
+    explicit ScaleTask(const std::shared_ptr<comphelper::ThreadTaskTag>& pTag,
+                       ScaleRangeFn pScaleRangeFunction,
+                       ScaleContext& rContext,
+                       long nStartY, long nEndY)
+        : comphelper::ThreadTask(pTag)
+        , mpScaleRangeFunction(pScaleRangeFunction)
+        , mrContext(rContext)
+        , mnStartY(nStartY)
+        , mnEndY(nEndY)
+    {}
+
     virtual void doWork() override
     {
-        for (auto const& strip : maStrips)
-            mpFn( *(strip.mrCtx), strip.mnStartY, strip.mnEndY );
+        mpScaleRangeFunction(mrContext, mnStartY, mnEndY);
     }
 };
 
@@ -1026,12 +1029,11 @@ BitmapEx BitmapScaleSuperFilter::execute(BitmapEx const& rBitmap) const
             // We want to thread - only if there is a lot of work to do:
             // We work hard when there is a large destination image, or
             // A large source image.
-            bool bHorizontalWork = pReadAccess->Width() > 512 || pWriteAccess->Width() > 512;
+            bool bHorizontalWork = pReadAccess->Height() >= 512 && pReadAccess->Width() >= 512;
             bool bUseThreads = true;
 
             static bool bDisableThreadedScaling = getenv ("VCL_NO_THREAD_SCALE");
-            if ( bDisableThreadedScaling || !bHorizontalWork ||
-                 nEndY - nStartY < SCALE_THREAD_STRIP )
+            if (bDisableThreadedScaling || !bHorizontalWork)
             {
                 SAL_INFO("vcl.gdi", "Scale in main thread");
                 bUseThreads = false;
@@ -1044,26 +1046,22 @@ BitmapEx BitmapScaleSuperFilter::execute(BitmapEx const& rBitmap) const
                     // partition and queue work
                     comphelper::ThreadPool &rShared = comphelper::ThreadPool::getSharedOptimalPool();
                     std::shared_ptr<comphelper::ThreadTaskTag> pTag = comphelper::ThreadPool::createThreadTaskTag();
-                    sal_uInt32 nThreads = rShared.getWorkerCount();
-                    assert( nThreads > 0 );
-                    sal_uInt32 nStrips = ((nEndY - nStartY) + SCALE_THREAD_STRIP - 1) / SCALE_THREAD_STRIP;
-                    sal_uInt32 nStripsPerThread = nStrips / nThreads;
-                    SAL_INFO("vcl.gdi", "Scale in " << nStrips << " strips " << nStripsPerThread << " per thread we have " << nThreads << " CPU threads ");
-                    long nStripY = nStartY;
-                    for ( sal_uInt32 t = 0; t < nThreads - 1; t++ )
+
+                    long nStripYStart = nStartY;
+                    long nStripYEnd = nStripYStart + constScaleThreadStrip - 1;
+
+                    while (nStripYEnd < nEndY)
                     {
-                        std::unique_ptr<ScaleTask> pTask(new ScaleTask( pTag, pScaleRangeFn ));
-                        for ( sal_uInt32 j = 0; j < nStripsPerThread; j++ )
-                        {
-                            ScaleRangeContext aRC( &aContext, nStripY );
-                            pTask->push( aRC );
-                            nStripY += SCALE_THREAD_STRIP;
-                        }
-                        rShared.pushTask( std::move(pTask) );
+                        std::unique_ptr<ScaleTask> pTask(new ScaleTask(pTag, pScaleRangeFn, aContext, nStripYStart, nStripYEnd));
+                        rShared.pushTask(std::move(pTask));
+                        nStripYStart += constScaleThreadStrip;
+                        nStripYEnd += constScaleThreadStrip;
+                    }
+                    if (nStripYStart <= nEndY)
+                    {
+                        std::unique_ptr<ScaleTask> pTask(new ScaleTask(pTag, pScaleRangeFn, aContext, nStripYStart, nEndY));
+                        rShared.pushTask(std::move(pTask));
                     }
-                    // finish any remaining bits here
-                    pScaleRangeFn( aContext, nStripY, nEndY );
-
                     rShared.waitUntilDone(pTag);
                     SAL_INFO("vcl.gdi", "All threaded scaling tasks complete");
                 }
author	Tomaž Vajngerl <tomaz.vajngerl@collabora.co.uk>	2019-04-12 22:38:13 +0900
committer	Tomaž Vajngerl <quikee@gmail.com>	2019-04-13 04:23:01 +0200
commit	9695896d8f0e3d4b2961c7a753c279a70f5bbaf2 (patch)
tree	1d7286c0a0d66a727ef464fff6f8cc16493617cf /vcl/source
parent	7a092e254111a2d98446e7140ef24c652c245bfa (diff)