From: Marco Costalba Date: Mon, 25 May 2009 06:52:59 +0000 (+0100) Subject: Merge hardware POPCNT detection and use X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=3d0b60b0653852198011306a4c8d34f8ef98fc5e;hp=bdb1bfecfb5665329a7d66f4c366a9736bab6c0b;p=stockfish Merge hardware POPCNT detection and use Tests on Joona luxury iCore7 QUAD show that speed increase against standard 64bit routine is between 3% and 4%. So it seems a good thing to have. Also the user feedback at startup regarding the compile and the hardware detection can be a useful debug tool. No functional change. Signed-off-by: Marco Costalba --- diff --git a/src/bitboard.cpp b/src/bitboard.cpp index 474e321c..a73c5a2d 100644 --- a/src/bitboard.cpp +++ b/src/bitboard.cpp @@ -35,6 +35,7 @@ #include #include "bitboard.h" +#include "bitcount.h" #include "direction.h" @@ -339,7 +340,7 @@ Square pop_1st_bit(Bitboard *b) { #endif -#else +#else // defined(USE_FOLDED_BITSCAN) static const int BitTable[64] = { 0, 1, 2, 7, 3, 13, 8, 19, 4, 25, 14, 28, 9, 34, 20, 40, 5, 17, 26, 38, 15, diff --git a/src/bitboard.h b/src/bitboard.h index d3a4ae53..54ed5053 100644 --- a/src/bitboard.h +++ b/src/bitboard.h @@ -22,7 +22,6 @@ #if !defined(BITBOARD_H_INCLUDED) #define BITBOARD_H_INCLUDED - //// //// Defines //// @@ -47,15 +46,10 @@ //#define USE_32BIT_ATTACKS #define USE_FOLDED_BITSCAN -#define BITCOUNT_SWAR_64 -//#define BITCOUNT_SWAR_32 -//#define BITCOUNT_LOOP - #else #define USE_32BIT_ATTACKS #define USE_FOLDED_BITSCAN -#define BITCOUNT_SWAR_32 #endif @@ -429,65 +423,6 @@ inline Bitboard isolated_pawn_mask(Square s) { } -/// count_1s() counts the number of nonzero bits in a bitboard. 
- -#if defined(BITCOUNT_LOOP) - -inline int count_1s(Bitboard b) { - int r; - for(r = 0; b; r++, b &= b - 1); - return r; -} - -inline int count_1s_max_15(Bitboard b) { - return count_1s(b); -} - -#elif defined(BITCOUNT_SWAR_32) - -inline int count_1s(Bitboard b) { - unsigned w = unsigned(b >> 32), v = unsigned(b); - v -= (v >> 1) & 0x55555555; // 0-2 in 2 bits - w -= (w >> 1) & 0x55555555; - v = ((v >> 2) & 0x33333333) + (v & 0x33333333); // 0-4 in 4 bits - w = ((w >> 2) & 0x33333333) + (w & 0x33333333); - v = ((v >> 4) + v) & 0x0F0F0F0F; // 0-8 in 8 bits - v += (((w >> 4) + w) & 0x0F0F0F0F); // 0-16 in 8 bits - v *= 0x01010101; // mul is fast on amd procs - return int(v >> 24); -} - -inline int count_1s_max_15(Bitboard b) { - unsigned w = unsigned(b >> 32), v = unsigned(b); - v -= (v >> 1) & 0x55555555; // 0-2 in 2 bits - w -= (w >> 1) & 0x55555555; - v = ((v >> 2) & 0x33333333) + (v & 0x33333333); // 0-4 in 4 bits - w = ((w >> 2) & 0x33333333) + (w & 0x33333333); - v += w; // 0-8 in 4 bits - v *= 0x11111111; - return int(v >> 28); -} - -#elif defined(BITCOUNT_SWAR_64) - -inline int count_1s(Bitboard b) { - b -= ((b>>1) & 0x5555555555555555ULL); - b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); - b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL; - b *= 0x0101010101010101ULL; - return int(b >> 56); -} - -inline int count_1s_max_15(Bitboard b) { - b -= (b>>1) & 0x5555555555555555ULL; - b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); - b *= 0x1111111111111111ULL; - return int(b >> 60); -} - -#endif // BITCOUNT - - //// //// Prototypes //// diff --git a/src/bitcount.h b/src/bitcount.h new file mode 100644 index 00000000..d57b3f40 --- /dev/null +++ b/src/bitcount.h @@ -0,0 +1,186 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2008 Tord Romstad (Glaurung author) + Copyright (C) 2008-2009 Marco Costalba + + Stockfish is free software: you can redistribute it and/or modify + it under the terms 
of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + + +#if !defined(BITCOUNT_H_INCLUDED) +#define BITCOUNT_H_INCLUDED + +// To disable POPCNT support uncomment following line. You should do it only +// in PGO compiling to exercise the default fallback path. Don't forget to +// re-comment the line for the final optimized compile though ;-) +//#define DISABLE_POPCNT_SUPPORT + + +#include "bitboard.h" + + +// Select type of software bit count function to use + +#if !defined(AUTO_CONFIGURATION) || defined(IS_64BIT) + +//#define USE_COMPACT_ROOK_ATTACKS +//#define USE_32BIT_ATTACKS +#define USE_FOLDED_BITSCAN + +#define BITCOUNT_SWAR_64 +//#define BITCOUNT_SWAR_32 +//#define BITCOUNT_LOOP + +#else + +#define USE_32BIT_ATTACKS +#define USE_FOLDED_BITSCAN +#define BITCOUNT_SWAR_32 + +#endif + + +// Select type of intrinsic bit count instruction to use + +#if defined(_MSC_VER) && defined(_WIN64) // Microsoft compiler + +#include + +inline bool cpu_has_popcnt() { + + int CPUInfo[4] = {-1}; + __cpuid(CPUInfo, 0x00000001); + return (CPUInfo[2] >> 23) & 1; +} + +#define POPCNT_INTRINSIC(x) __popcnt64(x) + +#elif defined(__INTEL_COMPILER) && (defined(__x86_64) || defined(_M_X64)) // Intel compiler + +#include + +inline bool cpu_has_popcnt() { + + int CPUInfo[4] = {-1}; + __cpuid(CPUInfo, 0x00000001); + return (CPUInfo[2] >> 23) & 1; +} + +#define POPCNT_INTRINSIC(x) _mm_popcnt_u64(x) + +#else // Safe fallback for unsupported compilers + +inline bool cpu_has_popcnt() { return false; } + +#define POPCNT_INTRINSIC(x) 
count_1s(x) + +#endif + + +/// Software implementation of bit count functions + +#if defined(BITCOUNT_LOOP) + +inline int count_1s(Bitboard b) { + int r; + for(r = 0; b; r++, b &= b - 1); + return r; +} + +inline int count_1s_max_15(Bitboard b) { + return count_1s(b); +} + +#elif defined(BITCOUNT_SWAR_32) + +inline int count_1s(Bitboard b) { + unsigned w = unsigned(b >> 32), v = unsigned(b); + v -= (v >> 1) & 0x55555555; // 0-2 in 2 bits + w -= (w >> 1) & 0x55555555; + v = ((v >> 2) & 0x33333333) + (v & 0x33333333); // 0-4 in 4 bits + w = ((w >> 2) & 0x33333333) + (w & 0x33333333); + v = ((v >> 4) + v) & 0x0F0F0F0F; // 0-8 in 8 bits + v += (((w >> 4) + w) & 0x0F0F0F0F); // 0-16 in 8 bits + v *= 0x01010101; // mul is fast on amd procs + return int(v >> 24); +} + +inline int count_1s_max_15(Bitboard b) { + unsigned w = unsigned(b >> 32), v = unsigned(b); + v -= (v >> 1) & 0x55555555; // 0-2 in 2 bits + w -= (w >> 1) & 0x55555555; + v = ((v >> 2) & 0x33333333) + (v & 0x33333333); // 0-4 in 4 bits + w = ((w >> 2) & 0x33333333) + (w & 0x33333333); + v += w; // 0-8 in 4 bits + v *= 0x11111111; + return int(v >> 28); +} + +#elif defined(BITCOUNT_SWAR_64) + +inline int count_1s(Bitboard b) { + b -= ((b>>1) & 0x5555555555555555ULL); + b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); + b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL; + b *= 0x0101010101010101ULL; + return int(b >> 56); +} + +inline int count_1s_max_15(Bitboard b) { + b -= (b>>1) & 0x5555555555555555ULL; + b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL); + b *= 0x1111111111111111ULL; + return int(b >> 60); +} + +#endif // BITCOUNT + + +/// count_1s() counts the number of nonzero bits in a bitboard. +/// If template parameter is true an intrinsic is called, otherwise +/// we fallback on a software implementation. + +template +inline int count_1s(Bitboard b) { + + return UseIntrinsic ? 
POPCNT_INTRINSIC(b) : count_1s(b); +} + +template +inline int count_1s_max_15(Bitboard b) { + + return UseIntrinsic ? POPCNT_INTRINSIC(b) : count_1s_max_15(b); +} + + +// Global variable initialized at startup that is set to true if +// CPU on which application runs supports POPCNT intrinsic. Unless +// DISABLE_POPCNT_SUPPORT is defined. +#if defined(DISABLE_POPCNT_SUPPORT) +const bool CpuHasPOPCNT = false; +#else +const bool CpuHasPOPCNT = cpu_has_popcnt(); +#endif + + +// Global variable used to print info about the use of 64 optimized +// functions to verify that a 64bit compile has been correctly built. +#if defined(BITCOUNT_SWAR_64) +const bool CpuHas64BitPath = true; +#else +const bool CpuHas64BitPath = false; +#endif + +#endif // !defined(BITCOUNT_H_INCLUDED) diff --git a/src/endgame.cpp b/src/endgame.cpp index 3f3bdcce..3258c6fc 100644 --- a/src/endgame.cpp +++ b/src/endgame.cpp @@ -25,6 +25,7 @@ #include #include "bitbase.h" +#include "bitcount.h" #include "endgame.h" diff --git a/src/evaluate.cpp b/src/evaluate.cpp index c0b4866b..9fd4040e 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -25,6 +25,7 @@ #include #include +#include "bitcount.h" #include "evaluate.h" #include "material.h" #include "pawns.h" @@ -267,11 +268,14 @@ namespace { uint8_t BitCount8Bit[256]; // Function prototypes - template + template + Value do_evaluate(const Position& pos, EvalInfo& ei, int threadID); + + template void evaluate_pieces(const Position& p, Color us, EvalInfo& ei); template<> - void evaluate_pieces(const Position& p, Color us, EvalInfo &ei); + void evaluate_pieces(const Position& p, Color us, EvalInfo &ei); void evaluate_passed_pawns(const Position &pos, EvalInfo &ei); void evaluate_trapped_bishop_a7h7(const Position &pos, Square s, Color us, @@ -294,11 +298,19 @@ namespace { //// Functions //// -/// evaluate() is the main evaluation function. It always computes two +/// evaluate() is the main evaluation function. 
It always computes two /// values, an endgame score and a middle game score, and interpolates /// between them based on the remaining material. +Value evaluate(const Position& pos, EvalInfo& ei, int threadID) { + + return CpuHasPOPCNT ? do_evaluate(pos, ei, threadID) + : do_evaluate(pos, ei, threadID); +} + +namespace { -Value evaluate(const Position &pos, EvalInfo &ei, int threadID) { +template +Value do_evaluate(const Position& pos, EvalInfo& ei, int threadID) { assert(pos.is_ok()); assert(threadID >= 0 && threadID < THREAD_MAX); @@ -339,16 +351,16 @@ Value evaluate(const Position &pos, EvalInfo &ei, int threadID) { // Initialize pawn attack bitboards for both sides ei.attackedBy[WHITE][PAWN] = ((pos.pawns(WHITE) << 9) & ~FileABB) | ((pos.pawns(WHITE) << 7) & ~FileHBB); ei.attackedBy[BLACK][PAWN] = ((pos.pawns(BLACK) >> 7) & ~FileABB) | ((pos.pawns(BLACK) >> 9) & ~FileHBB); - ei.kingAttackersCount[WHITE] = count_1s_max_15(ei.attackedBy[WHITE][PAWN] & ei.attackedBy[BLACK][KING])/2; - ei.kingAttackersCount[BLACK] = count_1s_max_15(ei.attackedBy[BLACK][PAWN] & ei.attackedBy[WHITE][KING])/2; + ei.kingAttackersCount[WHITE] = count_1s_max_15(ei.attackedBy[WHITE][PAWN] & ei.attackedBy[BLACK][KING])/2; + ei.kingAttackersCount[BLACK] = count_1s_max_15(ei.attackedBy[BLACK][PAWN] & ei.attackedBy[WHITE][KING])/2; // Evaluate pieces for (Color c = WHITE; c <= BLACK; c++) { - evaluate_pieces(pos, c, ei); - evaluate_pieces(pos, c, ei); - evaluate_pieces(pos, c, ei); - evaluate_pieces(pos, c, ei); + evaluate_pieces(pos, c, ei); + evaluate_pieces(pos, c, ei); + evaluate_pieces(pos, c, ei); + evaluate_pieces(pos, c, ei); // Sum up all attacked squares ei.attackedBy[c][0] = ei.attackedBy[c][PAWN] | ei.attackedBy[c][KNIGHT] @@ -360,7 +372,7 @@ Value evaluate(const Position &pos, EvalInfo &ei, int threadID) { // because we need complete attack information for all pieces when computing // the king safety evaluation. 
for (Color c = WHITE; c <= BLACK; c++) - evaluate_pieces(pos, c, ei); + evaluate_pieces(pos, c, ei); // Evaluate passed pawns. We evaluate passed pawns for both sides at once, // because we need to know which side promotes first in positions where @@ -436,6 +448,7 @@ Value evaluate(const Position &pos, EvalInfo &ei, int threadID) { return (ei.mateThreat[stm] == MOVE_NONE ? v : 8 * QueenValueMidgame - v); } +} // namespace /// quick_evaluate() does a very approximate evaluation of the current position. /// It currently considers only material and piece square table scores. Perhaps @@ -527,7 +540,7 @@ namespace { // evaluate_common() computes terms common to all pieces attack - template + template int evaluate_common(const Position& p, const Bitboard& b, Color us, EvalInfo& ei, Square s = SQ_NONE) { static const int AttackWeight[] = { 0, 0, KnightAttackWeight, BishopAttackWeight, RookAttackWeight, QueenAttackWeight }; @@ -547,15 +560,15 @@ namespace { ei.kingAttackersWeight[us] += AttackWeight[Piece]; Bitboard bb = (b & ei.attackedBy[them][KING]); if (bb) - ei.kingAdjacentZoneAttacksCount[us] += count_1s_max_15(bb); + ei.kingAdjacentZoneAttacksCount[us] += count_1s_max_15(bb); } // Remove squares protected by enemy pawns Bitboard bb = (b & ~ei.attackedBy[them][PAWN]); // Mobility - int mob = (Piece != QUEEN ? count_1s_max_15(bb & ~p.pieces_of_color(us)) - : count_1s(bb & ~p.pieces_of_color(us))); + int mob = (Piece != QUEEN ? count_1s_max_15(bb & ~p.pieces_of_color(us)) + : count_1s(bb & ~p.pieces_of_color(us))); ei.mgMobility += Sign[us] * MgBonus[Piece][mob]; ei.egMobility += Sign[us] * EgBonus[Piece][mob]; @@ -587,7 +600,7 @@ namespace { // evaluate_pieces<>() assigns bonuses and penalties to the pieces of a given // color. 
- template + template void evaluate_pieces(const Position& pos, Color us, EvalInfo& ei) { Bitboard b; @@ -608,7 +621,7 @@ namespace { b = rook_attacks_bb(s, pos.occupied_squares() & ~pos.rooks_and_queens(us)); // Attacks, mobility and outposts - mob = evaluate_common(pos, b, us, ei, s); + mob = evaluate_common(pos, b, us, ei, s); // Special patterns: trapped bishops on a7/h7/a2/h2 // and trapped bishops on a1/h1/a8/h8 in Chess960. @@ -691,7 +704,7 @@ namespace { // color. template<> - void evaluate_pieces(const Position& p, Color us, EvalInfo& ei) { + void evaluate_pieces(const Position& p, Color us, EvalInfo& ei) { int shelter = 0, sign = Sign[us]; Square s = p.king_square(us); diff --git a/src/main.cpp b/src/main.cpp index e009fd96..fc970a0e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -29,6 +29,7 @@ #include #include "benchmark.h" +#include "bitcount.h" #include "misc.h" #include "uci.h" @@ -74,9 +75,12 @@ int main(int argc, char *argv[]) { } // Print copyright notice - cout << engine_name() << ". Copyright (C) " + cout << engine_name() << ". Copyright (C) " << "2004-2009 Tord Romstad, Marco Costalba. " << endl; + if (CpuHasPOPCNT) + cout << "Good! CPU has hardware POPCNT. We will use it." << endl; + // Enter UCI mode uci_main_loop(); return 0; diff --git a/src/misc.cpp b/src/misc.cpp index ba4da568..149cfb45 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -65,6 +65,7 @@ static int gettimeofday(struct timeval* tp, struct timezone*) #include #include +#include "bitcount.h" #include "misc.h" using namespace std; @@ -162,8 +163,10 @@ void dbg_print_mean(ofstream& logFile) { const string engine_name() { + const string cpu64(CpuHas64BitPath ? 
" 64bit" : ""); + if (!EngineVersion.empty()) - return "Stockfish " + EngineVersion; + return AppName+ " " + EngineVersion + cpu64; string date(__DATE__); // From compiler, format is "Sep 21 2008" string months("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec"); @@ -176,7 +179,7 @@ const string engine_name() { string name = AppName + " " + AppTag + " "; s << name << date.substr(date.length() - 2) << setfill('0') - << setw(2) << mon << setw(2) << day; + << setw(2) << mon << setw(2) << day << cpu64; return s.str(); } diff --git a/src/movegen.cpp b/src/movegen.cpp index 3845cd97..995f54f0 100644 --- a/src/movegen.cpp +++ b/src/movegen.cpp @@ -24,6 +24,7 @@ #include +#include "bitcount.h" #include "movegen.h" // Simple macro to wrap a very common while loop, no facny, no flexibility, diff --git a/src/pawns.cpp b/src/pawns.cpp index 3cfe3750..19bf6c51 100644 --- a/src/pawns.cpp +++ b/src/pawns.cpp @@ -25,6 +25,7 @@ #include #include +#include "bitcount.h" #include "pawns.h" #include "position.h" diff --git a/src/position.cpp b/src/position.cpp index f4752c5e..dd6ec05b 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -27,6 +27,7 @@ #include #include +#include "bitcount.h" #include "mersenne.h" #include "movegen.h" #include "movepick.h"