#include "movegen.h"
#include "tt.h"
-
-/// This is the number of TTEntry slots for each position
-static const int ClusterSize = 5;
-
+// The main transposition table
+TranspositionTable TT;
////
//// Functions
// We store a cluster of ClusterSize TTEntry objects for each position,
// and newSize is the maximum number of storable positions.
- while ((2 * newSize) * ClusterSize * (sizeof(TTEntry)) <= (mbSize << 20))
+ while ((2 * newSize) * sizeof(TTCluster) <= (mbSize << 20))
newSize *= 2;
if (newSize != size)
{
size = newSize;
delete [] entries;
- entries = new TTEntry[size * ClusterSize];
+ entries = new TTCluster[size];
if (!entries)
{
std::cerr << "Failed to allocate " << mbSize
void TranspositionTable::clear() {
- memset(entries, 0, size * ClusterSize * sizeof(TTEntry));
+ memset(entries, 0, size * sizeof(TTCluster));
}
inline TTEntry* TranspositionTable::first_entry(const Key posKey) const {
- return entries + ((uint32_t(posKey) & (size - 1)) * ClusterSize);
+ return entries[uint32_t(posKey) & (size - 1)].data;
}
/// blocking function and does not stall the CPU waiting for data
/// to be loaded from RAM, that can be very slow. When we will
/// subsequently call retrieve() the TT data will be already
-/// quickly accessible in L1/l2 CPU cache.
+/// quickly accessible in L1/L2 CPU cache.
void TranspositionTable::prefetch(const Key posKey) const {
- _mm_prefetch((char*)first_entry(posKey), _MM_HINT_T0);
+#if defined(__INTEL_COMPILER) || defined(__ICL)
+ // This hack prevents prefetches from being optimized away by the
+ // Intel compiler. Both MSVC and gcc seem not to be affected.
+ __asm__ ("");
+#endif
+
+ char const* addr = (char*)first_entry(posKey);
+ _mm_prefetch(addr, _MM_HINT_T2);
+ _mm_prefetch(addr+64, _MM_HINT_T2); // 64 bytes ahead
}