Improve smp performance for high number of threads
authorJoona Kiiski <joona.kiiski@gmail.com>
Sat, 14 Feb 2015 20:46:00 +0000 (20:46 +0000)
committerJoona Kiiski <joona.kiiski@gmail.com>
Mon, 16 Feb 2015 20:36:13 +0000 (20:36 +0000)
Balance threads between split points.

There are huge differences between different machines and autopurging makes it very difficult to measure the improvement in fishtest, but the following was recorded for 16 threads at 15+0.05:

    For Bravone (1000 games): 0 ELO
    For Glinscott (1000 games): +20 ELO
    For bKingUs (1000 games): +50 ELO
    For fastGM (1500 games): +50 ELO

The change was regression for no one, and a big improvement for some, so it should be fine to commit it.
Also for 8 threads at 15+0.05 we measured a statistically significant improvement:
ELO: 6.19 +-3.9 (95%) LOS: 99.9%
Total: 10325 W: 1824 L: 1640 D: 6861

Finally it was verified that there was no (significant) regression for

4 threads:
ELO: 0.09 +-2.8 (95%) LOS: 52.4%
Total: 19908 W: 3422 L: 3417 D: 13069

2 threads:
ELO: 0.38 +-3.0 (95%) LOS: 60.0%
Total: 19044 W: 3480 L: 3459 D: 12105

1 thread:
ELO: -1.27 +-2.1 (95%) LOS: 12.3%
Total: 40000 W: 7829 L: 7975 D: 24196

Resolves #258

src/search.cpp
src/thread.cpp
src/thread.h

index ffe361268e6b0a113468e15b4ac18b758e32c977..29820a1f45ac884a6c77edeb5abaa46b0677dd4d 100644 (file)
@@ -1042,7 +1042,9 @@ moves_loop: // When in check and at SpNode search starts from here
           &&  Threads.size() >= 2
           &&  depth >= Threads.minimumSplitDepth
           &&  (   !thisThread->activeSplitPoint
-               || !thisThread->activeSplitPoint->allSlavesSearching)
+               || !thisThread->activeSplitPoint->allSlavesSearching
+               || (   int(Threads.size()) > MAX_SLAVES_PER_SPLITPOINT
+                   && thisThread->activeSplitPoint->slavesCount == MAX_SLAVES_PER_SPLITPOINT))
           &&  thisThread->splitPointsSize < MAX_SPLITPOINTS_PER_THREAD)
       {
           assert(bestValue > -VALUE_INFINITE && bestValue < beta);
@@ -1587,6 +1589,11 @@ void Thread::idle_loop() {
           // Try to late join to another split point if none of its slaves has
           // already finished.
           if (Threads.size() > 2)
+          {
+              SplitPoint *bestSp = NULL;
+              int bestThread = 0;
+              int bestScore = INT_MAX;
+
               for (size_t i = 0; i < Threads.size(); ++i)
               {
                   const int size = Threads[i]->splitPointsSize; // Local copy
@@ -1594,26 +1601,42 @@ void Thread::idle_loop() {
 
                   if (   sp
                       && sp->allSlavesSearching
+                      && sp->slavesCount < MAX_SLAVES_PER_SPLITPOINT
                       && available_to(Threads[i]))
                   {
-                      // Recheck the conditions under lock protection
-                      Threads.mutex.lock();
-                      sp->mutex.lock();
+                      int score = sp->spLevel * 256 * 256 + sp->slavesCount * 256 - sp->depth * 1;
 
-                      if (   sp->allSlavesSearching
-                          && available_to(Threads[i]))
+                      if (score < bestScore)
                       {
-                           sp->slavesMask.set(idx);
-                           activeSplitPoint = sp;
-                           searching = true;
+                           bestSp = sp;
+                           bestThread = i;
+                           bestScore = score;
                       }
+                  }
+              }
 
-                      sp->mutex.unlock();
-                      Threads.mutex.unlock();
-
-                      break; // Just a single attempt
+              if (bestSp)
+              {
+                  sp = bestSp;
+                  // Recheck the conditions under lock protection
+                  Threads.mutex.lock();
+                  sp->mutex.lock();
+
+                  if (   sp->allSlavesSearching
+                      && sp->slavesCount < MAX_SLAVES_PER_SPLITPOINT
+                      && available_to(Threads[bestThread]))
+                  {
+                      sp->slavesMask.set(idx);
+                      sp->slavesCount++;
+                      activeSplitPoint = sp;
+                      searching = true;
                   }
+
+                  sp->mutex.unlock();
+                  Threads.mutex.unlock();
               }
+          }
       }
 
       // Grab the lock to avoid races with Thread::notify_one()
index b8571dcd20283c9b9b7ef502559e7ca91ff05db5..02cb45629dff48b8048890911139efe0eae5d46a 100644 (file)
@@ -154,7 +154,9 @@ void Thread::split(Position& pos, Stack* ss, Value alpha, Value beta, Value* bes
 
   sp.masterThread = this;
   sp.parentSplitPoint = activeSplitPoint;
+  sp.spLevel = activeSplitPoint ? activeSplitPoint->spLevel + 1 : 0;
   sp.slavesMask = 0, sp.slavesMask.set(idx);
+  sp.slavesCount = 1;
   sp.depth = depth;
   sp.bestValue = *bestValue;
   sp.bestMove = *bestMove;
@@ -182,9 +184,11 @@ void Thread::split(Position& pos, Stack* ss, Value alpha, Value beta, Value* bes
 
   Thread* slave;
 
-  while ((slave = Threads.available_slave(this)) != NULL)
+  while (    sp.slavesCount < MAX_SLAVES_PER_SPLITPOINT 
+         && (slave = Threads.available_slave(this)) != NULL)
   {
       sp.slavesMask.set(slave->idx);
+      sp.slavesCount++;
       slave->activeSplitPoint = &sp;
       slave->searching = true; // Slave leaves idle_loop()
       slave->notify_one(); // Could be sleeping
index 9750ed7ba020be8008007f482f7ab9e146e4c9dd..949f87ad28ae086fe75e0577b98e859029cfc4ed 100644 (file)
@@ -33,6 +33,7 @@ struct Thread;
 
 const int MAX_THREADS = 128;
 const int MAX_SPLITPOINTS_PER_THREAD = 8;
+const int MAX_SLAVES_PER_SPLITPOINT = 4;
 
 /// Mutex and ConditionVariable struct are wrappers of the low level locking
 /// machinery and are modeled after the corresponding C++11 classes.
@@ -72,6 +73,7 @@ struct SplitPoint {
   const Position* pos;
   Search::Stack* ss;
   Thread* masterThread;
+  int spLevel;
   Depth depth;
   Value beta;
   int nodeType;
@@ -84,6 +86,7 @@ struct SplitPoint {
   // Shared variable data
   Mutex mutex;
   std::bitset<MAX_THREADS> slavesMask;
+  int slavesCount;
   volatile bool allSlavesSearching;
   volatile uint64_t nodes;
   volatile Value alpha;