Remove handcrafted MMX code

[stockfish] / src / nnue / nnue_feature_transformer.h
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h

index 13f1604fe1365d1b07a7bc2ef6a4cd111dade85f..77a175f50c921cc3d90c9f09bc7f51179533a1eb 100644 (file)
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -21,11 +21,18 @@
  #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
  #define NNUE_FEATURE_TRANSFORMER_H_INCLUDED
  
-#include "nnue_common.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <utility>
+
+#include "../position.h"
+#include "../types.h"
+#include "nnue_accumulator.h"
  #include "nnue_architecture.h"
-
-#include <cstring> // std::memset()
-#include <utility> // std::pair
+#include "nnue_common.h"
  
  namespace Stockfish::Eval::NNUE {
  
@@ -42,8 +49,8 @@ namespace Stockfish::Eval::NNUE {
      "Per feature PSQT values cannot be processed at granularity lower than 8 at a time.");
  
    #ifdef USE_AVX512
-  typedef __m512i vec_t;
-  typedef __m256i psqt_vec_t;
+  using vec_t = __m512i;
+  using psqt_vec_t = __m256i;
    #define vec_load(a) _mm512_load_si512(a)
    #define vec_store(a,b) _mm512_store_si512(a,b)
    #define vec_add_16(a,b) _mm512_add_epi16(a,b)
@@ -62,12 +69,12 @@ namespace Stockfish::Eval::NNUE {
    #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
    #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
    #define vec_zero_psqt() _mm256_setzero_si256()
-  #define NumRegistersSIMD 32
+  #define NumRegistersSIMD 16
    #define MaxChunkSize 64
  
    #elif USE_AVX2
-  typedef __m256i vec_t;
-  typedef __m256i psqt_vec_t;
+  using vec_t = __m256i;
+  using psqt_vec_t = __m256i;
    #define vec_load(a) _mm256_load_si256(a)
    #define vec_store(a,b) _mm256_store_si256(a,b)
    #define vec_add_16(a,b) _mm256_add_epi16(a,b)
@@ -90,8 +97,8 @@ namespace Stockfish::Eval::NNUE {
    #define MaxChunkSize 32
  
    #elif USE_SSE2
-  typedef __m128i vec_t;
-  typedef __m128i psqt_vec_t;
+  using vec_t = __m128i;
+  using psqt_vec_t = __m128i;
    #define vec_load(a) (*(a))
    #define vec_store(a,b) *(a)=(b)
    #define vec_add_16(a,b) _mm_add_epi16(a,b)
@@ -110,37 +117,9 @@ namespace Stockfish::Eval::NNUE {
    #define NumRegistersSIMD (Is64Bit ? 16 : 8)
    #define MaxChunkSize 16
  
-  #elif USE_MMX
-  typedef __m64 vec_t;
-  typedef __m64 psqt_vec_t;
-  #define vec_load(a) (*(a))
-  #define vec_store(a,b) *(a)=(b)
-  #define vec_add_16(a,b) _mm_add_pi16(a,b)
-  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
-  #define vec_mul_16(a,b) _mm_mullo_pi16(a,b)
-  #define vec_zero() _mm_setzero_si64()
-  #define vec_set_16(a) _mm_set1_pi16(a)
-  inline vec_t vec_max_16(vec_t a,vec_t b){
-    vec_t comparison = _mm_cmpgt_pi16(a,b);
-    return _mm_or_si64(_mm_and_si64(comparison, a), _mm_andnot_si64(comparison, b));
-  }
-  inline vec_t vec_min_16(vec_t a,vec_t b){
-    vec_t comparison = _mm_cmpgt_pi16(a,b);
-    return _mm_or_si64(_mm_and_si64(comparison, b), _mm_andnot_si64(comparison, a));
-  }
-  #define vec_msb_pack_16(a,b) _mm_packs_pi16(_mm_srli_pi16(a,7),_mm_srli_pi16(b,7))
-  #define vec_load_psqt(a) (*(a))
-  #define vec_store_psqt(a,b) *(a)=(b)
-  #define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
-  #define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
-  #define vec_zero_psqt() _mm_setzero_si64()
-  #define vec_cleanup() _mm_empty()
-  #define NumRegistersSIMD 8
-  #define MaxChunkSize 8
-
    #elif USE_NEON
-  typedef int16x8_t vec_t;
-  typedef int32x4_t psqt_vec_t;
+  using vec_t = int16x8_t;
+  using psqt_vec_t = int32x4_t;
    #define vec_load(a) (*(a))
    #define vec_store(a,b) *(a)=(b)
    #define vec_add_16(a,b) vaddq_s16(a,b)
@@ -253,9 +232,9 @@ namespace Stockfish::Eval::NNUE {
      // Read network parameters
      bool read_parameters(std::istream& stream) {
  
-      read_little_endian<BiasType      >(stream, biases     , HalfDimensions                  );
-      read_little_endian<WeightType    >(stream, weights    , HalfDimensions * InputDimensions);
-      read_little_endian<PSQTWeightType>(stream, psqtWeights, PSQTBuckets    * InputDimensions);
+      read_leb_128<BiasType      >(stream, biases     , HalfDimensions                  );
+      read_leb_128<WeightType    >(stream, weights    , HalfDimensions * InputDimensions);
+      read_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets    * InputDimensions);
  
        return !stream.fail();
      }
@@ -263,9 +242,9 @@ namespace Stockfish::Eval::NNUE {
      // Write network parameters
      bool write_parameters(std::ostream& stream) const {
  
-      write_little_endian<BiasType      >(stream, biases     , HalfDimensions                  );
-      write_little_endian<WeightType    >(stream, weights    , HalfDimensions * InputDimensions);
-      write_little_endian<PSQTWeightType>(stream, psqtWeights, PSQTBuckets    * InputDimensions);
+      write_leb_128<BiasType      >(stream, biases     , HalfDimensions                  );
+      write_leb_128<WeightType    >(stream, weights    , HalfDimensions * InputDimensions);
+      write_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets    * InputDimensions);
  
        return !stream.fail();
      }
@@ -320,18 +299,14 @@ namespace Stockfish::Eval::NNUE {
            for (IndexType j = 0; j < HalfDimensions / 2; ++j) {
                BiasType sum0 = accumulation[static_cast<int>(perspectives[p])][j + 0];
                BiasType sum1 = accumulation[static_cast<int>(perspectives[p])][j + HalfDimensions / 2];
-              sum0 = std::max<int>(0, std::min<int>(127, sum0));
-              sum1 = std::max<int>(0, std::min<int>(127, sum1));
-              output[offset + j] = static_cast<OutputType>(sum0 * sum1 / 128);
+              sum0 = std::clamp<BiasType>(sum0, 0, 127);
+              sum1 = std::clamp<BiasType>(sum1, 0, 127);
+              output[offset + j] = static_cast<OutputType>(unsigned(sum0 * sum1) / 128);
            }
  
  #endif
        }
  
-#if defined(vec_cleanup)
-      vec_cleanup();
-#endif
-
        return psqt;
      } // end of function transform()
  
@@ -363,9 +338,9 @@ namespace Stockfish::Eval::NNUE {
      // NOTE: The parameter states_to_update is an array of position states, ending with nullptr.
      //       All states must be sequential, that is states_to_update[i] must either be reachable
      //       by repeatedly applying ->previous from states_to_update[i+1] or states_to_update[i] == nullptr.
-    //       computed_st must be reachable by repeatadly applying ->previous on states_to_update[0], if not nullptr.
+    //       computed_st must be reachable by repeatedly applying ->previous on states_to_update[0], if not nullptr.
      template<Color Perspective, size_t N>
-    void update_accumulator_incremetal(const Position& pos, StateInfo* computed_st, StateInfo* states_to_update[N]) const {
+    void update_accumulator_incremental(const Position& pos, StateInfo* computed_st, StateInfo* states_to_update[N]) const {
        static_assert(N > 0);
        assert(states_to_update[N-1] == nullptr);
  
@@ -522,10 +497,6 @@ namespace Stockfish::Eval::NNUE {
          }
        }
  #endif
-
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
      }
  
      template<Color Perspective>
@@ -606,10 +577,6 @@ namespace Stockfish::Eval::NNUE {
            accumulator.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
        }
  #endif
-
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
      }
  
      template<Color Perspective>
@@ -630,7 +597,7 @@ namespace Stockfish::Eval::NNUE {
        {
          // Only update current position accumulator to minimize work.
          StateInfo* states_to_update[2] = { pos.state(), nullptr };
-        update_accumulator_incremetal<Perspective, 2>(pos, oldest_st, states_to_update);
+        update_accumulator_incremental<Perspective, 2>(pos, oldest_st, states_to_update);
        }
        else
        {
@@ -656,7 +623,7 @@ namespace Stockfish::Eval::NNUE {
          StateInfo *states_to_update[3] =
            { next, next == pos.state() ? nullptr : pos.state(), nullptr };
  
-        update_accumulator_incremetal<Perspective, 3>(pos, oldest_st, states_to_update);
+        update_accumulator_incremental<Perspective, 3>(pos, oldest_st, states_to_update);
        }
        else
        {