The heuristic to avoid thread binding if fewer than 8 threads are requested resulted in the first 7 threads never being bound, even when 8 or more threads were requested.
The branch was verified to yield a roughly 13% speedup by @CoffeeOne on the appropriate hardware and OS, and an earlier version of this patch tested well on his machine:
http://tests.stockfishchess.org/tests/view/5a3693480ebc590ccbb8be5a
ELO: 9.24 +-4.6 (95%) LOS: 100.0%
Total: 5000 W: 634 L: 501 D: 3865
To make sure all threads (including mainThread) are bound as soon as the total number exceeds 7, recreate all threads on a change of thread number.
To do this, Threads::init, Threads::exit and Threads::set are unified in a single Threads::set function that goes through the needed steps.
The code includes several suggestions from @joergoster.
Fixes issue #1312
No functional change
Pawns::init();
Tablebases::init(Options["SyzygyPath"]);
TT.resize(Options["Hash"]);
Pawns::init();
Tablebases::init(Options["SyzygyPath"]);
TT.resize(Options["Hash"]);
- Threads.init(Options["Threads"]);
+ Threads.set(Options["Threads"]);
Search::clear(); // After threads are up
UCI::loop(argc, argv);
Search::clear(); // After threads are up
UCI::loop(argc, argv);
void bindThisThread(size_t idx) {
void bindThisThread(size_t idx) {
- // If OS already scheduled us on a different group than 0 then don't overwrite
- // the choice, eventually we are one of many one-threaded processes running on
- // some Windows NUMA hardware, for instance in fishtest. To make it simple,
- // just check if running threads are below a threshold, in this case all this
- // NUMA machinery is not needed.
- if (Threads.size() < 8)
- return;
-
// Use only local variables to be thread-safe
int group = get_group(idx);
// Use only local variables to be thread-safe
int group = get_group(idx);
Time.availableNodes = 0;
TT.clear();
Time.availableNodes = 0;
TT.clear();
-
- for (Thread* th : Threads)
- th->clear();
-
- Threads.main()->callsCnt = 0;
- Threads.main()->previousScore = VALUE_INFINITE;
- Threads.main()->previousTimeReduction = 1;
#include "movegen.h"
#include "search.h"
#include "thread.h"
#include "movegen.h"
#include "search.h"
#include "thread.h"
#include "syzygy/tbprobe.h"
ThreadPool Threads; // Global object
#include "syzygy/tbprobe.h"
ThreadPool Threads; // Global object
Thread::Thread(size_t n) : idx(n), stdThread(&Thread::idle_loop, this) {
wait_for_search_finished();
Thread::Thread(size_t n) : idx(n), stdThread(&Thread::idle_loop, this) {
wait_for_search_finished();
- clear(); // Zero-init histories (based on std::array)
void Thread::idle_loop() {
void Thread::idle_loop() {
- WinProcGroup::bindThisThread(idx);
+ // If OS already scheduled us on a different group than 0 then don't overwrite
+ // the choice, eventually we are one of many one-threaded processes running on
+ // some Windows NUMA hardware, for instance in fishtest. To make it simple,
+ // just check if running threads are below a threshold, in this case all this
+ // NUMA machinery is not needed.
+ if (Options["Threads"] >= 8)
+ WinProcGroup::bindThisThread(idx);
+/// ThreadPool::set() creates/destroys threads to match the requested number.
+/// Created and launched threads will go immediately to sleep in idle_loop.
+/// Upon resizing, threads are recreated to allow for binding if necessary.
-/// ThreadPool::init() creates and launches the threads that will go
-/// immediately to sleep in idle_loop. We cannot use the constructor because
-/// Threads is a static object and we need a fully initialized engine at
-/// this point due to allocation of Endgames in the Thread constructor.
-
-void ThreadPool::init(size_t requested) {
-
- push_back(new MainThread(0));
- set(requested);
-}
+void ThreadPool::set(size_t requested) {
+ if (size() > 0) { // destroy any existing thread(s)
+ main()->wait_for_search_finished();
-/// ThreadPool::exit() terminates threads before the program exits. Cannot be
-/// done in the destructor because threads must be terminated before deleting
-/// any static object, so before main() returns.
+ while (size() > 0)
+ delete back(), pop_back();
+ }
-void ThreadPool::exit() {
+ if (requested > 0) { // create new thread(s)
+ push_back(new MainThread(0));
- main()->wait_for_search_finished();
- set(0);
+ while (size() < requested)
+ push_back(new Thread(size()));
+ clear();
+ }
+/// ThreadPool::clear() sets threadPool data to initial values.
-/// ThreadPool::set() creates/destroys threads to match the requested number
+void ThreadPool::clear() {
-void ThreadPool::set(size_t requested) {
+ for (Thread* th : *this)
+ th->clear();
- while (size() < requested)
- push_back(new Thread(size()));
-
- while (size() > requested)
- delete back(), pop_back();
+ main()->callsCnt = 0;
+ main()->previousScore = VALUE_INFINITE;
+ main()->previousTimeReduction = 1;
/// ThreadPool::start_thinking() wakes up main thread waiting in idle_loop() and
/// returns immediately. Main thread will wake up other threads and start the search.
/// ThreadPool::start_thinking() wakes up main thread waiting in idle_loop() and
/// returns immediately. Main thread will wake up other threads and start the search.
// is shared by threads but is accessed in read-only mode.
StateInfo tmp = setupStates->back();
// is shared by threads but is accessed in read-only mode.
StateInfo tmp = setupStates->back();
- for (Thread* th : Threads)
+ for (Thread* th : *this)
{
th->nodes = th->tbHits = 0;
th->rootDepth = th->completedDepth = DEPTH_ZERO;
{
th->nodes = th->tbHits = 0;
th->rootDepth = th->completedDepth = DEPTH_ZERO;
struct ThreadPool : public std::vector<Thread*> {
struct ThreadPool : public std::vector<Thread*> {
- void init(size_t); // No constructor and destructor, threads rely on globals that should
- void exit(); // be initialized and valid during the whole thread lifetime.
void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
void set(size_t);
MainThread* main() const { return static_cast<MainThread*>(front()); }
void set(size_t);
MainThread* main() const { return static_cast<MainThread*>(front()); }