From 38112060dc2da351c6dde8f12d0ee5cfaeac5084 Mon Sep 17 00:00:00 2001
From: Marco Costalba
Date: Sun, 22 Feb 2015 14:59:55 +0100
Subject: [PATCH] Use spinlock instead of mutex for Threads and SplitPoint

It is reported to be definitely faster with an increasing number of
threads: we go from +3.5% with 4 threads to +15% with 16 threads.

The only drawback is that now, when testing with more threads than
physically available cores, the speed slows down to a crawl. This is
expected and is similar to what we had when setting the old
sleepingThreads to false.

No functional change.
---
 src/search.cpp | 36 ++++++++++++++++++------------------
 src/thread.cpp | 16 ++++++++--------
 src/thread.h   | 30 +++++++++++++++---------------
 3 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 89330895..5dacc877 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -765,7 +765,7 @@ moves_loop: // When in check and at SpNode search starts from here
               continue;
 
           moveCount = ++splitPoint->moveCount;
-          splitPoint->mutex.unlock();
+          splitPoint->spinlock.release();
       }
       else
           ++moveCount;
@@ -834,7 +834,7 @@ moves_loop: // When in check and at SpNode search starts from here
                   && moveCount >= FutilityMoveCounts[improving][depth])
               {
                   if (SpNode)
-                      splitPoint->mutex.lock();
+                      splitPoint->spinlock.acquire();
 
                   continue;
               }
@@ -853,7 +853,7 @@ moves_loop: // When in check and at SpNode search starts from here
 
                   if (SpNode)
                   {
-                      splitPoint->mutex.lock();
+                      splitPoint->spinlock.acquire();
                       if (bestValue > splitPoint->bestValue)
                           splitPoint->bestValue = bestValue;
                   }
@@ -865,7 +865,7 @@ moves_loop: // When in check and at SpNode search starts from here
               if (predictedDepth < 4 * ONE_PLY && pos.see_sign(move) < VALUE_ZERO)
               {
                   if (SpNode)
-                      splitPoint->mutex.lock();
+                      splitPoint->spinlock.acquire();
 
                   continue;
               }
@@ -965,7 +965,7 @@ moves_loop: // When in check and at SpNode search starts from here
       // Step 18. Check for new best move
       if (SpNode)
       {
-          splitPoint->mutex.lock();
+          splitPoint->spinlock.acquire();
           bestValue = splitPoint->bestValue;
           alpha = splitPoint->alpha;
       }
@@ -1526,13 +1526,13 @@ void Thread::idle_loop() {
       // If this thread has been assigned work, launch a search
       while (searching)
       {
-          Threads.mutex.lock();
+          Threads.spinlock.acquire();
 
           assert(activeSplitPoint);
 
           SplitPoint* sp = activeSplitPoint;
 
-          Threads.mutex.unlock();
+          Threads.spinlock.release();
 
           Stack stack[MAX_PLY+4], *ss = stack+2; // To allow referencing (ss-2) and (ss+2)
           Position pos(*sp->pos, this);
@@ -1540,7 +1540,7 @@ void Thread::idle_loop() {
           std::memcpy(ss-2, sp->ss-2, 5 * sizeof(Stack));
           ss->splitPoint = sp;
 
-          sp->mutex.lock();
+          sp->spinlock.acquire();
 
           assert(activePosition == nullptr);
 
@@ -1578,7 +1578,7 @@ void Thread::idle_loop() {
           // After releasing the lock we can't access any SplitPoint related data
           // in a safe way because it could have been released under our feet by
           // the sp master.
-          sp->mutex.unlock();
+          sp->spinlock.release();
 
           // Try to late join to another split point if none of its slaves has
           // already finished.
@@ -1593,7 +1593,7 @@ void Thread::idle_loop() {
               if (   sp
                   && sp->allSlavesSearching
                   && sp->slavesMask.count() < MAX_SLAVES_PER_SPLITPOINT
-                  && available_to(th))
+                  && available_to(sp->master))
               {
                   assert(this != th);
                   assert(!(this_sp && this_sp->slavesMask.none()));
@@ -1618,8 +1618,8 @@ void Thread::idle_loop() {
               sp = bestSp;
 
               // Recheck the conditions under lock protection
-              Threads.mutex.lock();
-              sp->mutex.lock();
+              Threads.spinlock.acquire();
+              sp->spinlock.acquire();
 
               if (   sp->allSlavesSearching
                   && sp->slavesMask.count() < MAX_SLAVES_PER_SPLITPOINT
@@ -1630,8 +1630,8 @@ void Thread::idle_loop() {
                   searching = true;
               }
 
-              sp->mutex.unlock();
-              Threads.mutex.unlock();
+              sp->spinlock.release();
+              Threads.spinlock.release();
           }
       }
 
@@ -1687,7 +1687,7 @@ void check_time() {
 
   else if (Limits.nodes)
   {
-      Threads.mutex.lock();
+      Threads.spinlock.acquire();
 
       int64_t nodes = RootPos.nodes_searched();
 
@@ -1698,7 +1698,7 @@ void check_time() {
           {
               SplitPoint& sp = th->splitPoints[i];
 
-              sp.mutex.lock();
+              sp.spinlock.acquire();
 
               nodes += sp.nodes;
 
@@ -1706,10 +1706,10 @@ void check_time() {
                   if (sp.slavesMask.test(idx) && Threads[idx]->activePosition)
                       nodes += Threads[idx]->activePosition->nodes_searched();
 
-              sp.mutex.unlock();
+              sp.spinlock.release();
           }
 
-      Threads.mutex.unlock();
+      Threads.spinlock.release();
 
       if (nodes >= Limits.nodes)
           Signals.stop = true;
diff --git a/src/thread.cpp b/src/thread.cpp
index a466df87..1217e3ab 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -165,8 +165,8 @@ void Thread::split(Position& pos, Stack* ss, Value alpha, Value beta, Value* bes
   // Try to allocate available threads and ask them to start searching setting
   // 'searching' flag. This must be done under lock protection to avoid concurrent
   // allocation of the same slave by another master.
-  Threads.mutex.lock();
-  sp.mutex.lock();
+  Threads.spinlock.acquire();
+  sp.spinlock.acquire();
 
   sp.allSlavesSearching = true; // Must be set under lock protection
   ++splitPointsSize;
@@ -188,8 +188,8 @@ void Thread::split(Position& pos, Stack* ss, Value alpha, Value beta, Value* bes
   // it will instantly launch a search, because its 'searching' flag is set.
   // The thread will return from the idle loop when all slaves have finished
   // their work at this split point.
-  sp.mutex.unlock();
-  Threads.mutex.unlock();
+  sp.spinlock.release();
+  Threads.spinlock.release();
 
   Thread::idle_loop(); // Force a call to base class idle_loop()
 
@@ -202,8 +202,8 @@ void Thread::split(Position& pos, Stack* ss, Value alpha, Value beta, Value* bes
   // We have returned from the idle loop, which means that all threads are
   // finished. Note that setting 'searching' and decreasing splitPointsSize must
   // be done under lock protection to avoid a race with Thread::available_to().
-  Threads.mutex.lock();
-  sp.mutex.lock();
+  Threads.spinlock.acquire();
+  sp.spinlock.acquire();
 
   searching = true;
   --splitPointsSize;
@@ -213,8 +213,8 @@ void Thread::split(Position& pos, Stack* ss, Value alpha, Value beta, Value* bes
   *bestMove = sp.bestMove;
   *bestValue = sp.bestValue;
 
-  sp.mutex.unlock();
-  Threads.mutex.unlock();
+  sp.spinlock.release();
+  Threads.spinlock.release();
 }
 
 
diff --git a/src/thread.h b/src/thread.h
index 166ea80f..f8e65941 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -39,6 +39,19 @@ const size_t MAX_THREADS = 128;
 const size_t MAX_SPLITPOINTS_PER_THREAD = 8;
 const size_t MAX_SLAVES_PER_SPLITPOINT = 4;
 
 
+/// Spinlock class wraps low level atomic operations to provide spin lock functionality
+
+class Spinlock {
+
+  std::atomic_flag lock;
+
+public:
+  Spinlock() { std::atomic_flag_clear(&lock); }
+  void acquire() { while (lock.test_and_set(std::memory_order_acquire)) {} }
+  void release() { lock.clear(std::memory_order_release); }
+};
+
+
 /// SplitPoint struct stores information shared by the threads searching in
 /// parallel below the same split point. It is populated at splitting time.
@@ -58,7 +71,7 @@ struct SplitPoint {
   SplitPoint* parentSplitPoint;
 
   // Shared variable data
-  std::mutex mutex;
+  Spinlock spinlock;
   std::bitset<MAX_THREADS> slavesMask;
   volatile bool allSlavesSearching;
   volatile uint64_t nodes;
@@ -70,19 +83,6 @@ struct SplitPoint {
 };
 
 
-/// Spinlock class wraps low level atomic operations to provide spin lock functionality
-
-class Spinlock {
-
-  std::atomic_flag lock;
-
-public:
-  Spinlock() { std::atomic_flag_clear(&lock); }
-  void acquire() { while (lock.test_and_set(std::memory_order_acquire)) {} }
-  void release() { lock.clear(std::memory_order_release); }
-};
-
-
 /// ThreadBase struct is the base of the hierarchy from where we derive all the
 /// specialized thread classes.
 
@@ -162,7 +162,7 @@ struct ThreadPool : public std::vector<Thread*> {
   void start_thinking(const Position&, const Search::LimitsType&, Search::StateStackPtr&);
 
   Depth minimumSplitDepth;
-  std::mutex mutex;
+  Spinlock spinlock;
   std::condition_variable sleepCondition;
   TimerThread* timer;
 };
-- 
2.39.2
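
Not part of the patch itself: below is a minimal, self-contained sketch of the usage pattern the Spinlock relies on, a busy-wait acquire around a very short critical section. The Spinlock body mirrors the class added to thread.h, while the surrounding harness (the counter, worker and pool names and the iteration counts) is purely illustrative and not taken from Stockfish.

// Standalone sketch; compile with a C++11 compiler and -pthread.
#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

class Spinlock {

  std::atomic_flag lock;

public:
  Spinlock() { std::atomic_flag_clear(&lock); }
  void acquire() { while (lock.test_and_set(std::memory_order_acquire)) {} } // spin until the flag is clear
  void release() { lock.clear(std::memory_order_release); }                  // clear the flag, publishing our writes
};

Spinlock sl;
long counter = 0; // shared data protected by sl

void worker(int iterations) {

  for (int i = 0; i < iterations; ++i)
  {
      sl.acquire();  // the critical section is a handful of instructions, so spinning
      ++counter;     // avoids the sleep/wakeup round trip a mutex would pay...
      sl.release();  // ...as long as every waiter has its own core to spin on
  }
}

int main() {

  const int NUM_THREADS = 4;
  std::vector<std::thread> pool;

  for (int i = 0; i < NUM_THREADS; ++i)
      pool.emplace_back(worker, 100000);

  for (std::thread& t : pool)
      t.join();

  std::cout << counter << std::endl; // prints 400000: no increments are lost
  return 0;
}

The sketch also shows why the commit message warns about oversubscription: a waiting thread keeps burning its core instead of sleeping, so with more threads than physical cores the spinners compete with the lock holder for CPU time and overall speed drops sharply.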