#define vec_store(a,b) _mm512_store_si512(a,b)
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+ #define vec_mul_16(a,b) _mm512_mullo_epi16(a,b)
+ #define vec_zero() _mm512_setzero_epi32()
+ #define vec_set_16(a) _mm512_set1_epi16(a)
+ #define vec_max_16(a,b) _mm512_max_epi16(a,b)
+ #define vec_min_16(a,b) _mm512_min_epi16(a,b)
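+ // vec_msb_pack_16: shift each 16-bit lane right by 7, saturate-pack both
+ // inputs down to 8-bit lanes, then permute 64-bit blocks to undo the
+ // interleaving of _mm512_packs_epi16, which packs within 128-bit lanes.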
+ inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+ vec_t compacted = _mm512_packs_epi16(_mm512_srli_epi16(a,7),_mm512_srli_epi16(b,7));
+ return _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), compacted);
+ }
#define vec_load_psqt(a) _mm256_load_si256(a)
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
#define vec_zero_psqt() _mm256_setzero_si256()
#define NumRegistersSIMD 32
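+ // MaxChunkSize: width of one SIMD register in bytes, i.e. the number of
+ // int8 outputs produced per vec_msb_pack_16 store.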
+ #define MaxChunkSize 64
#elif USE_AVX2
typedef __m256i vec_t;
#define vec_store(a,b) _mm256_store_si256(a,b)
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+ #define vec_mul_16(a,b) _mm256_mullo_epi16(a,b)
+ #define vec_zero() _mm256_setzero_si256()
+ #define vec_set_16(a) _mm256_set1_epi16(a)
+ #define vec_max_16(a,b) _mm256_max_epi16(a,b)
+ #define vec_min_16(a,b) _mm256_min_epi16(a,b)
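+ // _mm256_packs_epi16 packs within 128-bit lanes, so the 64-bit blocks come
+ // out as a0,b0,a1,b1; the 0b11011000 permute restores a0,a1,b0,b1 order.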
+ inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+ vec_t compacted = _mm256_packs_epi16(_mm256_srli_epi16(a,7), _mm256_srli_epi16(b,7));
+ return _mm256_permute4x64_epi64(compacted, 0b11011000);
+ }
#define vec_load_psqt(a) _mm256_load_si256(a)
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
#define vec_zero_psqt() _mm256_setzero_si256()
#define NumRegistersSIMD 16
+ #define MaxChunkSize 32
#elif USE_SSE2
typedef __m128i vec_t;
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_epi16(a,b)
#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+ #define vec_mul_16(a,b) _mm_mullo_epi16(a,b)
+ #define vec_zero() _mm_setzero_si128()
+ #define vec_set_16(a) _mm_set1_epi16(a)
+ #define vec_max_16(a,b) _mm_max_epi16(a,b)
+ #define vec_min_16(a,b) _mm_min_epi16(a,b)
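+ // On SSE2, _mm_packs_epi16 packs across the whole 128-bit register, so no
+ // reordering shuffle is needed after packing.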
+ #define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7))
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
#define vec_zero_psqt() _mm_setzero_si128()
#define NumRegistersSIMD (Is64Bit ? 16 : 8)
+ #define MaxChunkSize 16
#elif USE_MMX
typedef __m64 vec_t;
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+ #define vec_mul_16(a,b) _mm_mullo_pi16(a,b)
+ #define vec_zero() _mm_setzero_si64()
+ #define vec_set_16(a) _mm_set1_pi16(a)
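+ // Base MMX has no pmaxsw/pminsw (those arrived with SSE), so max/min are
+ // emulated with a compare mask and an and/andnot blend.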
+ inline vec_t vec_max_16(vec_t a,vec_t b){
+ vec_t comparison = _mm_cmpgt_pi16(a,b);
+ return _mm_or_si64(_mm_and_si64(comparison, a), _mm_andnot_si64(comparison, b));
+ }
+ inline vec_t vec_min_16(vec_t a,vec_t b){
+ vec_t comparison = _mm_cmpgt_pi16(a,b);
+ return _mm_or_si64(_mm_and_si64(comparison, b), _mm_andnot_si64(comparison, a));
+ }
+ #define vec_msb_pack_16(a,b) _mm_packs_pi16(_mm_srli_pi16(a,7),_mm_srli_pi16(b,7))
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
#define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
#define vec_zero_psqt() _mm_setzero_si64()
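+ // MMX aliases the x87 register stack, so _mm_empty() must be issued before
+ // any floating-point code runs again; transform() calls it via vec_cleanup().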
+ #define vec_cleanup() _mm_empty()
#define NumRegistersSIMD 8
+ #define MaxChunkSize 8
#elif USE_NEON
typedef int16x8_t vec_t;
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) vaddq_s16(a,b)
#define vec_sub_16(a,b) vsubq_s16(a,b)
+ #define vec_mul_16(a,b) vmulq_s16(a,b)
+ #define vec_zero() vec_t{0}
+ #define vec_set_16(a) vdupq_n_s16(a)
+ #define vec_max_16(a,b) vmaxq_s16(a,b)
+ #define vec_min_16(a,b) vminq_s16(a,b)
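+ // NEON's vshrn_n_s16 shifts right and narrows to 8 bits in one instruction,
+ // so packing reduces to combining the two halves. Truncation is safe here:
+ // the inputs are pre-clamped, so the shifted products already fit in int8.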
+ inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+ const int8x8_t shifta = vshrn_n_s16(a, 7);
+ const int8x8_t shiftb = vshrn_n_s16(b, 7);
+ const int8x16_t compacted = vcombine_s8(shifta,shiftb);
+ return *reinterpret_cast<const vec_t*>(&compacted);
+ }
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) vaddq_s32(a,b)
#define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
#define vec_zero_psqt() psqt_vec_t{0}
#define NumRegistersSIMD 16
+ #define MaxChunkSize 16
#else
#undef VECTOR
// We use __m* types as template arguments, which causes GCC to emit warnings
// about losing some attribute information. This is irrelevant to us as we
// only take their size, so the following pragmas are harmless.
+ #if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
+ #endif
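// BestRegisterCount picks the largest register count, at most MaxRegisters
// (and NumRegistersSIMD), that evenly divides the data to be processed.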
template <typename SIMDRegisterType,
          typename LaneType,
          int NumLanes,
          int MaxRegisters>
static constexpr int BestRegisterCount();
static constexpr int NumRegs = BestRegisterCount<vec_t, WeightType, TransformedFeatureDimensions, NumRegistersSIMD>();
static constexpr int NumPsqtRegs = BestRegisterCount<psqt_vec_t, PSQTWeightType, PSQTBuckets, NumRegistersSIMD>();
-
+ #if defined(__GNUC__)
#pragma GCC diagnostic pop
-
+ #endif
#endif
{
const IndexType offset = (HalfDimensions / 2) * p;
-#if defined(USE_AVX512)
-
- constexpr IndexType OutputChunkSize = 512 / 8;
- static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
- constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-
- const __m512i Zero = _mm512_setzero_si512();
- const __m512i One = _mm512_set1_epi16(127);
- const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
-
- const __m512i* in0 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][0]));
- const __m512i* in1 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- __m512i* out = reinterpret_cast< __m512i*>(output + offset);
-
- for (IndexType j = 0; j < NumOutputChunks; j += 1)
- {
- const __m512i sum0a = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 0], One), Zero);
- const __m512i sum0b = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 1], One), Zero);
- const __m512i sum1a = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 0], One), Zero);
- const __m512i sum1b = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 1], One), Zero);
-
- const __m512i pa = _mm512_srli_epi16(_mm512_mullo_epi16(sum0a, sum1a), 7);
- const __m512i pb = _mm512_srli_epi16(_mm512_mullo_epi16(sum0b, sum1b), 7);
-
- out[j] = _mm512_permutexvar_epi64(Control, _mm512_packs_epi16(pa, pb));
- }
-
-#elif defined(USE_AVX2)
-
- constexpr IndexType OutputChunkSize = 256 / 8;
- static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
- constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-
- const __m256i Zero = _mm256_setzero_si256();
- const __m256i One = _mm256_set1_epi16(127);
- constexpr int Control = 0b11011000;
-
- const __m256i* in0 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][0]));
- const __m256i* in1 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- __m256i* out = reinterpret_cast< __m256i*>(output + offset);
-
- for (IndexType j = 0; j < NumOutputChunks; j += 1)
- {
- const __m256i sum0a = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 0], One), Zero);
- const __m256i sum0b = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 1], One), Zero);
- const __m256i sum1a = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 0], One), Zero);
- const __m256i sum1b = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 1], One), Zero);
-
- const __m256i pa = _mm256_srli_epi16(_mm256_mullo_epi16(sum0a, sum1a), 7);
- const __m256i pb = _mm256_srli_epi16(_mm256_mullo_epi16(sum0b, sum1b), 7);
-
- out[j] = _mm256_permute4x64_epi64(_mm256_packs_epi16(pa, pb), Control);
- }
-
-#elif defined(USE_SSE2)
-
- constexpr IndexType OutputChunkSize = 128 / 8;
- static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
- constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-
- const __m128i Zero = _mm_setzero_si128();
- const __m128i One = _mm_set1_epi16(127);
+#if defined(VECTOR)
- const __m128i* in0 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][0]));
- const __m128i* in1 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- __m128i* out = reinterpret_cast< __m128i*>(output + offset);
-
- for (IndexType j = 0; j < NumOutputChunks; j += 1)
- {
- const __m128i sum0a = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 0], One), Zero);
- const __m128i sum0b = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 1], One), Zero);
- const __m128i sum1a = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 0], One), Zero);
- const __m128i sum1b = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 1], One), Zero);
-
- const __m128i pa = _mm_srli_epi16(_mm_mullo_epi16(sum0a, sum1a), 7);
- const __m128i pb = _mm_srli_epi16(_mm_mullo_epi16(sum0b, sum1b), 7);
-
- out[j] = _mm_packs_epi16(pa, pb);
- }
-
-#elif defined(USE_NEON)
-
- constexpr IndexType OutputChunkSize = 128 / 8;
+ constexpr IndexType OutputChunkSize = MaxChunkSize;
static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
- const int16x8_t Zero = vdupq_n_s16(0);
- const int16x8_t One = vdupq_n_s16(127);
+ const vec_t Zero = vec_zero();
+ const vec_t One = vec_set_16(127);
- const int16x8_t* in0 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][0]));
- const int16x8_t* in1 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- int8x16_t* out = reinterpret_cast< int8x16_t*>(output + offset);
+ const vec_t* in0 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][0]));
+ const vec_t* in1 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
+ vec_t* out = reinterpret_cast< vec_t*>(output + offset);
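+
+ // For each chunk, lane k of the first half of the accumulator is paired
+ // with lane k of the second half: both are clamped to [0, 127], multiplied,
+ // and bits 7..14 of the 16-bit product (at most (127 * 127) >> 7 == 126)
+ // are kept and packed down to int8.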
for (IndexType j = 0; j < NumOutputChunks; j += 1)
{
- const int16x8_t sum0a = vmaxq_s16(vminq_s16(in0[j * 2 + 0], One), Zero);
- const int16x8_t sum0b = vmaxq_s16(vminq_s16(in0[j * 2 + 1], One), Zero);
- const int16x8_t sum1a = vmaxq_s16(vminq_s16(in1[j * 2 + 0], One), Zero);
- const int16x8_t sum1b = vmaxq_s16(vminq_s16(in1[j * 2 + 1], One), Zero);
+ const vec_t sum0a = vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero);
+ const vec_t sum0b = vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero);
+ const vec_t sum1a = vec_max_16(vec_min_16(in1[j * 2 + 0], One), Zero);
+ const vec_t sum1b = vec_max_16(vec_min_16(in1[j * 2 + 1], One), Zero);
- const int8x8_t pa = vshrn_n_s16(vmulq_s16(sum0a, sum1a), 7);
- const int8x8_t pb = vshrn_n_s16(vmulq_s16(sum0b, sum1b), 7);
+ const vec_t pa = vec_mul_16(sum0a, sum1a);
+ const vec_t pb = vec_mul_16(sum0b, sum1b);
- out[j] = vcombine_s8(pa, pb);
+ out[j] = vec_msb_pack_16(pa, pb);
}
#else
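    // A sketch of the scalar fallback (assuming <algorithm> for std::clamp),
    // mirroring the vector path: clamp, multiply pairwise, divide by 128.
    for (IndexType j = 0; j < HalfDimensions / 2; ++j)
    {
        BiasType sum0 = accumulation[perspectives[p]][j + 0];
        BiasType sum1 = accumulation[perspectives[p]][j + HalfDimensions / 2];
        sum0 = std::clamp<BiasType>(sum0, 0, 127);
        sum1 = std::clamp<BiasType>(sum1, 0, 127);
        output[offset + j] = static_cast<OutputType>(unsigned(sum0 * sum1) / 128);
    }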
#endif
}
+#if defined(vec_cleanup)
+ vec_cleanup();
+#endif
+
return psqt;
} // end of function transform()