X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=src%2Fsimd.h;h=ffa54d9627bd684a01057bd21bf3a0104f19a0cf;hb=d579db34a3f7a26cd5502e79dafde2a80614d645;hp=584148f126023c758704f4d0826f1740fa9e1b13;hpb=18dcf1f09754284325157f2d270df10a09297958;p=stockfish diff --git a/src/simd.h b/src/simd.h index 584148f1..ffa54d96 100644 --- a/src/simd.h +++ b/src/simd.h @@ -46,6 +46,13 @@ #define USE_INLINE_ASM #endif +// Use either the AVX512 or AVX-VNNI version of the VNNI instructions. +#if defined(USE_AVXVNNI) +#define VNNI_PREFIX "%{vex%} " +#else +#define VNNI_PREFIX "" +#endif + namespace Stockfish::Simd { #if defined (USE_AVX512) @@ -208,7 +215,7 @@ namespace Stockfish::Simd { # if defined (USE_VNNI) # if defined (USE_INLINE_ASM) asm( - "vpdpbusd %[b], %[a], %[acc]\n\t" + VNNI_PREFIX "vpdpbusd %[b], %[a], %[acc]\n\t" : [acc]"+v"(acc) : [a]"v"(a), [b]"vm"(b) ); @@ -240,8 +247,8 @@ namespace Stockfish::Simd { # if defined (USE_VNNI) # if defined (USE_INLINE_ASM) asm( - "vpdpbusd %[b0], %[a0], %[acc]\n\t" - "vpdpbusd %[b1], %[a1], %[acc]\n\t" + VNNI_PREFIX "vpdpbusd %[b0], %[a0], %[acc]\n\t" + VNNI_PREFIX "vpdpbusd %[b1], %[a1], %[acc]\n\t" : [acc]"+v"(acc) : [a0]"v"(a0), [b0]"vm"(b0), [a1]"v"(a1), [b1]"vm"(b1) ); @@ -336,6 +343,45 @@ namespace Stockfish::Simd { #endif +#if defined (USE_NEON) + + [[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) { +# if USE_NEON >= 8 + return vaddvq_s32(s); +# else + return s[0] + s[1] + s[2] + s[3]; +# endif + } + + [[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) { + return neon_m128_reduce_add_epi32(sum) + bias; + } + + [[maybe_unused]] static int32x4_t neon_m128_haddx4( + int32x4_t sum0, int32x4_t sum1, int32x4_t sum2, int32x4_t sum3, + int32x4_t bias) { + + int32x4_t hsums { + neon_m128_reduce_add_epi32(sum0), + neon_m128_reduce_add_epi32(sum1), + neon_m128_reduce_add_epi32(sum2), + neon_m128_reduce_add_epi32(sum3) + }; + return vaddq_s32(hsums, bias); + } + + [[maybe_unused]] static void neon_m128_add_dpbusd_epi32x2( + int32x4_t& acc, + int8x8_t a0, int8x8_t b0, + int8x8_t a1, int8x8_t b1) { + + int16x8_t product = vmull_s8(a0, b0); + product = vmlal_s8(product, a1, b1); + acc = vpadalq_s16(acc, product); + } + +#endif + } #endif // STOCKFISH_SIMD_H_INCLUDED