Clean up and optimize SSE/AVX code

[stockfish] / src / nnue / nnue_feature_transformer.h
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h

index f899d7617655cf086274aeee8ecb021434c06bca..3818e444b6af9710110dff8eba49b4148d55b53b 100644 (file)
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -109,13 +109,11 @@ namespace Eval::NNUE {
    #if defined(USE_AVX2)
          auto out = reinterpret_cast<__m256i*>(&output[offset]);
          for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 =
-            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 =
-            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 1]);
-          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+          __m256i sum0 = _mm256_loadA_si256(
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m256i sum1 = _mm256_loadA_si256(
+            &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
                _mm256_packs_epi16(sum0, sum1), kZero), kControl));
          }
  
@@ -171,38 +169,41 @@ namespace Eval::NNUE {
                     kHalfDimensions * sizeof(BiasType));
          for (const auto index : active_indices[perspective]) {
            const IndexType offset = kHalfDimensions * index;
+  #if defined(USE_AVX512)
+          auto accumulation = reinterpret_cast<__m512i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+          for (IndexType j = 0; j < kNumChunks; ++j)
+            _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
  
-  #if defined(USE_AVX2)
+  #elif defined(USE_AVX2)
            auto accumulation = reinterpret_cast<__m256i*>(
                &accumulator.accumulation[perspective][i][0]);
            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
-            accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-          }
+          for (IndexType j = 0; j < kNumChunks; ++j)
+            _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
  
    #elif defined(USE_SSE2)
            auto accumulation = reinterpret_cast<__m128i*>(
                &accumulator.accumulation[perspective][i][0]);
            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
+          for (IndexType j = 0; j < kNumChunks; ++j)
              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-          }
  
    #elif defined(USE_NEON)
            auto accumulation = reinterpret_cast<int16x8_t*>(
                &accumulator.accumulation[perspective][i][0]);
            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
+          for (IndexType j = 0; j < kNumChunks; ++j)
              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-          }
  
    #else
-          for (IndexType j = 0; j < kHalfDimensions; ++j) {
+          for (IndexType j = 0; j < kHalfDimensions; ++j)
              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-          }
    #endif
  
          }