This slightly improves the speed of NNUE on the old hardware that the MMX code
path is intended for, such as a Pentium III 1.13 GHz:
10 repeats of "./stockfish bench 16 1 13 default depth NNUE":
Before:
54 642 504 897 cycles (± 0.12%)
62 301 937 829 instructions (± 0.03%)
After:
54 320 821 928 cycles (± 0.13%)
62 084 742 699 instructions (± 0.02%)
Speed of "go depth 20" from startpos:
Before: 53103 nps
After: 53856 nps
closes https://github.com/official-stockfish/Stockfish/pull/3476
No functional change.
#elif USE_MMX
typedef __m64 vec_t;
#elif USE_MMX
typedef __m64 vec_t;
- typedef std::int32_t psqt_vec_t;
+ typedef __m64 psqt_vec_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
- #define vec_add_psqt_32(a,b) a+b
- #define vec_sub_psqt_32(a,b) a-b
- #define vec_zero_psqt() 0
+ #define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
+ #define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
+ #define vec_zero_psqt() _mm_setzero_si64()
static constexpr IndexType NumRegs = 8;
static constexpr IndexType NumRegs = 8;
- static constexpr IndexType NumPsqtRegs = 8;
+ static constexpr IndexType NumPsqtRegs = 4;
#elif USE_NEON
typedef int16x8_t vec_t;
#elif USE_NEON
typedef int16x8_t vec_t;