Optimize pop_1st_bit() on 32 bits x86
author Marco Costalba <mcostalba@gmail.com>
Thu, 18 Sep 2008 14:09:19 +0000 (16:09 +0200)
committer Marco Costalba <mcostalba@gmail.com>
Thu, 18 Sep 2008 14:09:19 +0000 (16:09 +0200)
Operations on 64-bit Bitboard types are slow on
32-bit x86 when compiled with gcc, so optimize this case.

BTW, profiling shows that pop_1st_bit() is a
very performance-critical path!

Signed-off-by: Marco Costalba <mcostalba@gmail.com>
src/bitboard.cpp

index 0bbd155b10e0d71a1f3d52d83f25a95da72a86ec..5dd0137da39d225d34b4a41bb48279851fa29644 100644 (file)
@@ -6,12 +6,12 @@
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
-  
+
   Glaurung is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-  
+
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -38,7 +38,7 @@ const Bitboard FileBB[8] = {
 };
 
 const Bitboard NeighboringFilesBB[8] = {
-  FileBBB, FileABB|FileCBB, FileBBB|FileDBB, FileCBB|FileEBB, 
+  FileBBB, FileABB|FileCBB, FileBBB|FileDBB, FileCBB|FileEBB,
   FileDBB|FileFBB, FileEBB|FileGBB, FileFBB|FileHBB, FileGBB
 };
 
@@ -47,7 +47,7 @@ const Bitboard ThisAndNeighboringFilesBB[8] = {
   FileBBB|FileCBB|FileDBB, FileCBB|FileDBB|FileEBB,
   FileDBB|FileEBB|FileFBB, FileEBB|FileFBB|FileGBB,
   FileFBB|FileGBB|FileHBB, FileGBB|FileHBB
-};  
+};
 
 const Bitboard RankBB[8] = {
   Rank1BB, Rank2BB, Rank3BB, Rank4BB, Rank5BB, Rank6BB, Rank7BB, Rank8BB
@@ -126,27 +126,27 @@ const int RShift[64] = {
 #else // if defined(USE_32BIT_ATTACKS)
 
 const uint64_t RMult[64] = {
-  0xa8002c000108020ULL, 0x4440200140003000ULL, 0x8080200010011880ULL, 
-  0x380180080141000ULL, 0x1a00060008211044ULL, 0x410001000a0c0008ULL, 
-  0x9500060004008100ULL, 0x100024284a20700ULL, 0x802140008000ULL, 
-  0x80c01002a00840ULL, 0x402004282011020ULL, 0x9862000820420050ULL, 
-  0x1001448011100ULL, 0x6432800200800400ULL, 0x40100010002000cULL, 
-  0x2800d0010c080ULL, 0x90c0008000803042ULL, 0x4010004000200041ULL, 
-  0x3010010200040ULL, 0xa40828028001000ULL, 0x123010008000430ULL, 
-  0x24008004020080ULL, 0x60040001104802ULL, 0x582200028400d1ULL, 
-  0x4000802080044000ULL, 0x408208200420308ULL, 0x610038080102000ULL, 
-  0x3601000900100020ULL, 0x80080040180ULL, 0xc2020080040080ULL, 
-  0x80084400100102ULL, 0x4022408200014401ULL, 0x40052040800082ULL, 
-  0xb08200280804000ULL, 0x8a80a008801000ULL, 0x4000480080801000ULL, 
-  0x911808800801401ULL, 0x822a003002001894ULL, 0x401068091400108aULL, 
-  0x4a10a00004cULL, 0x2000800640008024ULL, 0x1486408102020020ULL, 
-  0x100a000d50041ULL, 0x810050020b0020ULL, 0x204000800808004ULL, 
-  0x20048100a000cULL, 0x112000831020004ULL, 0x9000040810002ULL, 
-  0x440490200208200ULL, 0x8910401000200040ULL, 0x6404200050008480ULL, 
-  0x4b824a2010010100ULL, 0x4080801810c0080ULL, 0x400802a0080ULL, 
-  0x8224080110026400ULL, 0x40002c4104088200ULL, 0x1002100104a0282ULL, 
-  0x1208400811048021ULL, 0x3201014a40d02001ULL, 0x5100019200501ULL, 
-  0x101000208001005ULL, 0x2008450080702ULL, 0x1002080301d00cULL, 
+  0xa8002c000108020ULL, 0x4440200140003000ULL, 0x8080200010011880ULL,
+  0x380180080141000ULL, 0x1a00060008211044ULL, 0x410001000a0c0008ULL,
+  0x9500060004008100ULL, 0x100024284a20700ULL, 0x802140008000ULL,
+  0x80c01002a00840ULL, 0x402004282011020ULL, 0x9862000820420050ULL,
+  0x1001448011100ULL, 0x6432800200800400ULL, 0x40100010002000cULL,
+  0x2800d0010c080ULL, 0x90c0008000803042ULL, 0x4010004000200041ULL,
+  0x3010010200040ULL, 0xa40828028001000ULL, 0x123010008000430ULL,
+  0x24008004020080ULL, 0x60040001104802ULL, 0x582200028400d1ULL,
+  0x4000802080044000ULL, 0x408208200420308ULL, 0x610038080102000ULL,
+  0x3601000900100020ULL, 0x80080040180ULL, 0xc2020080040080ULL,
+  0x80084400100102ULL, 0x4022408200014401ULL, 0x40052040800082ULL,
+  0xb08200280804000ULL, 0x8a80a008801000ULL, 0x4000480080801000ULL,
+  0x911808800801401ULL, 0x822a003002001894ULL, 0x401068091400108aULL,
+  0x4a10a00004cULL, 0x2000800640008024ULL, 0x1486408102020020ULL,
+  0x100a000d50041ULL, 0x810050020b0020ULL, 0x204000800808004ULL,
+  0x20048100a000cULL, 0x112000831020004ULL, 0x9000040810002ULL,
+  0x440490200208200ULL, 0x8910401000200040ULL, 0x6404200050008480ULL,
+  0x4b824a2010010100ULL, 0x4080801810c0080ULL, 0x400802a0080ULL,
+  0x8224080110026400ULL, 0x40002c4104088200ULL, 0x1002100104a0282ULL,
+  0x1208400811048021ULL, 0x3201014a40d02001ULL, 0x5100019200501ULL,
+  0x101000208001005ULL, 0x2008450080702ULL, 0x1002080301d00cULL,
   0x410201ce5c030092ULL
 };
 
@@ -190,7 +190,7 @@ const uint64_t BMult[64] = {
   0x881c7c67fcbfc4f6ULL, 0x47ca41e7e440d423ULL, 0xeb0c88112048d004ULL,
   0x51c60e04359aef1aULL, 0x1aa1fe0e957a5554ULL, 0xdd9448db4f5e3104ULL,
   0xdc01f6dca4bebbdcULL,
-}; 
+};
 
 const int BShift[64] = {
   26, 27, 27, 27, 27, 27, 27, 26, 27, 27, 27, 27, 27, 27, 27, 27,
@@ -202,27 +202,27 @@ const int BShift[64] = {
 #else // if defined(USE_32BIT_ATTACKS)
 
 const uint64_t BMult[64] = {
-  0x440049104032280ULL, 0x1021023c82008040ULL, 0x404040082000048ULL, 
-  0x48c4440084048090ULL, 0x2801104026490000ULL, 0x4100880442040800ULL, 
-  0x181011002e06040ULL, 0x9101004104200e00ULL, 0x1240848848310401ULL, 
-  0x2000142828050024ULL, 0x1004024d5000ULL, 0x102044400800200ULL, 
-  0x8108108820112000ULL, 0xa880818210c00046ULL, 0x4008008801082000ULL, 
-  0x60882404049400ULL, 0x104402004240810ULL, 0xa002084250200ULL, 
-  0x100b0880801100ULL, 0x4080201220101ULL, 0x44008080a00000ULL, 
-  0x202200842000ULL, 0x5006004882d00808ULL, 0x200045080802ULL, 
-  0x86100020200601ULL, 0xa802080a20112c02ULL, 0x80411218080900ULL, 
-  0x200a0880080a0ULL, 0x9a01010000104000ULL, 0x28008003100080ULL, 
-  0x211021004480417ULL, 0x401004188220806ULL, 0x825051400c2006ULL, 
-  0x140c0210943000ULL, 0x242800300080ULL, 0xc2208120080200ULL, 
-  0x2430008200002200ULL, 0x1010100112008040ULL, 0x8141050100020842ULL, 
-  0x822081014405ULL, 0x800c049e40400804ULL, 0x4a0404028a000820ULL, 
-  0x22060201041200ULL, 0x360904200840801ULL, 0x881a08208800400ULL, 
-  0x60202c00400420ULL, 0x1204440086061400ULL, 0x8184042804040ULL, 
-  0x64040315300400ULL, 0xc01008801090a00ULL, 0x808010401140c00ULL, 
-  0x4004830c2020040ULL, 0x80005002020054ULL, 0x40000c14481a0490ULL, 
-  0x10500101042048ULL, 0x1010100200424000ULL, 0x640901901040ULL, 
-  0xa0201014840ULL, 0x840082aa011002ULL, 0x10010840084240aULL, 
-  0x420400810420608ULL, 0x8d40230408102100ULL, 0x4a00200612222409ULL, 
+  0x440049104032280ULL, 0x1021023c82008040ULL, 0x404040082000048ULL,
+  0x48c4440084048090ULL, 0x2801104026490000ULL, 0x4100880442040800ULL,
+  0x181011002e06040ULL, 0x9101004104200e00ULL, 0x1240848848310401ULL,
+  0x2000142828050024ULL, 0x1004024d5000ULL, 0x102044400800200ULL,
+  0x8108108820112000ULL, 0xa880818210c00046ULL, 0x4008008801082000ULL,
+  0x60882404049400ULL, 0x104402004240810ULL, 0xa002084250200ULL,
+  0x100b0880801100ULL, 0x4080201220101ULL, 0x44008080a00000ULL,
+  0x202200842000ULL, 0x5006004882d00808ULL, 0x200045080802ULL,
+  0x86100020200601ULL, 0xa802080a20112c02ULL, 0x80411218080900ULL,
+  0x200a0880080a0ULL, 0x9a01010000104000ULL, 0x28008003100080ULL,
+  0x211021004480417ULL, 0x401004188220806ULL, 0x825051400c2006ULL,
+  0x140c0210943000ULL, 0x242800300080ULL, 0xc2208120080200ULL,
+  0x2430008200002200ULL, 0x1010100112008040ULL, 0x8141050100020842ULL,
+  0x822081014405ULL, 0x800c049e40400804ULL, 0x4a0404028a000820ULL,
+  0x22060201041200ULL, 0x360904200840801ULL, 0x881a08208800400ULL,
+  0x60202c00400420ULL, 0x1204440086061400ULL, 0x8184042804040ULL,
+  0x64040315300400ULL, 0xc01008801090a00ULL, 0x808010401140c00ULL,
+  0x4004830c2020040ULL, 0x80005002020054ULL, 0x40000c14481a0490ULL,
+  0x10500101042048ULL, 0x1010100200424000ULL, 0x640901901040ULL,
+  0xa0201014840ULL, 0x840082aa011002ULL, 0x10010840084240aULL,
+  0x420400810420608ULL, 0x8d40230408102100ULL, 0x4a00200612222409ULL,
   0xa08520292120600ULL
 };
 
@@ -320,9 +320,9 @@ void init_bitboards() {
 #if defined(USE_FOLDED_BITSCAN)
 
 static const int BitTable[64] = {
-  63, 30, 3, 32, 25, 41, 22, 33, 15, 50, 42, 13, 11, 53, 19, 34, 61, 29, 2, 
-  51, 21, 43, 45, 10, 18, 47, 1, 54, 9, 57, 0, 35, 62, 31, 40, 4, 49, 5, 52, 
-  26, 60, 6, 23, 44, 46, 27, 56, 16, 7, 39, 48, 24, 59, 14, 12, 55, 38, 28, 
+  63, 30, 3, 32, 25, 41, 22, 33, 15, 50, 42, 13, 11, 53, 19, 34, 61, 29, 2,
+  51, 21, 43, 45, 10, 18, 47, 1, 54, 9, 57, 0, 35, 62, 31, 40, 4, 49, 5, 52,
+  26, 60, 6, 23, 44, 46, 27, 56, 16, 7, 39, 48, 24, 59, 14, 12, 55, 38, 28,
   58, 20, 37, 17, 36, 8
 };
 
@@ -339,6 +339,26 @@ Square first_1(Bitboard b) {
 /// pop_1st_bit() finds and clears the least significant nonzero bit in a
 /// nonzero bitboard.
 
+#if defined(USE_32BIT_ATTACKS)
+
+Square pop_1st_bit(Bitboard *bb) {
+  // Works on one 32-bit half of *bb at a time rather than on the full 64-bit value.
+  uint32_t  t = uint32_t(*bb); // low 32 bits of *bb
+  uint32_t* p = t ? (uint32_t*)bb : (uint32_t*)bb + 1; // half holding the lowest set bit; assumes the low half is stored first (little endian)
+  uint32_t  b = t ? t : *p;
+  // NOTE(review): *bb is read and written through a uint32_t* here; confirm this is safe under the compiler's aliasing rules.
+  *p = b & (b -1); // clear the lowest set bit of that half in place
+
+  if (t)
+     b ^= (b - 1); // mask of all bits up to and including the lowest set bit
+  else
+     b = ~(b ^ (b - 1)); // same mask, complemented when the bit lives in the high half
+
+  return Square(BitTable[(b * 0x783a9b23) >> 26]); // top 6 bits of the product index BitTable
+}
+
+#else
+
 Square pop_1st_bit(Bitboard *b) {
   Bitboard bb = *b ^ (*b - 1);
   uint32_t fold = int(bb) ^ int(bb >> 32);
@@ -346,6 +366,8 @@ Square pop_1st_bit(Bitboard *b) {
   return Square(BitTable[(fold * 0x783a9b23) >> 26]);
 }
 
+#endif
+
 #else
 
 static const int BitTable[64] = {
@@ -369,7 +391,7 @@ Square first_1(Bitboard b) {
 Square pop_1st_bit(Bitboard *b) {
   Bitboard bb = *b;
   *b &= (*b - 1);
-  return Square(BitTable[((bb & -bb) * 0x218a392cd3d5dbfULL) >> 58]); 
+  return Square(BitTable[((bb & -bb) * 0x218a392cd3d5dbfULL) >> 58]);
 }
 
 #endif // defined(USE_FOLDED_BITSCAN)
@@ -417,7 +439,7 @@ namespace {
       {-7,-9,0}, {17,15,10,6,-6,-10,-15,-17}, {9,7,-7,-9,0}, {8,1,-1,-8,0},
       {9,7,-7,-9,8,1,-1,-8}, {9,7,-7,-9,8,1,-1,-8}
     };
-    
+
     for(i = 0; i < 64; i++) {
       for(j = 0; j <= int(BK); j++) {
         StepAttackBB[j][i] = EmptyBoardBB;
@@ -483,14 +505,14 @@ namespace {
     Bitboard b;
     for(i = 0; i < 64; i++) {
       attackIndex[i] = index;
-      mask[i] = sliding_attacks(i, 0ULL, 4, deltas, 1, 6, 1, 6);      
+      mask[i] = sliding_attacks(i, 0ULL, 4, deltas, 1, 6, 1, 6);
       j = (1 << (64 - shift[i]));
       for(k = 0; k < j; k++) {
 #if defined(USE_32BIT_ATTACKS)
         b = index_to_bitboard(k, mask[i]);
-        attacks[index + 
-                 (unsigned(int(b) * int(mult[i]) ^ 
-                           int(b >> 32) * int(mult[i] >> 32)) 
+        attacks[index +
+                 (unsigned(int(b) * int(mult[i]) ^
+                           int(b >> 32) * int(mult[i] >> 32))
                   >> shift[i])] =
           sliding_attacks(i, b, 4, deltas);
 #else
@@ -502,7 +524,7 @@ namespace {
       index += j;
     }
   }
-  
+
 
   void init_pseudo_attacks() {
     Square s;
@@ -537,5 +559,5 @@ namespace {
     }
   }
 #endif // defined(USE_COMPACT_ROOK_ATTACKS)
-    
+
 }