+ const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation;
+
+ const auto psqt = (
+ psqtAccumulation[perspectives[0]][bucket]
+ - psqtAccumulation[perspectives[1]][bucket]
+ ) / 2;
+
+
+ #if defined(USE_AVX512)
+
+ constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2);
+ static_assert(HalfDimensions % (SimdWidth * 2) == 0);
+ const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+ const __m512i Zero = _mm512_setzero_si512();
+
+ for (IndexType p = 0; p < 2; ++p)
+ {
+ const IndexType offset = HalfDimensions * p;
+ auto out = reinterpret_cast<__m512i*>(&output[offset]);
+ for (IndexType j = 0; j < NumChunks; ++j)
+ {
+ __m512i sum0 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
+ (accumulation[perspectives[p]])[j * 2 + 0]);
+ __m512i sum1 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
+ (accumulation[perspectives[p]])[j * 2 + 1]);
+
+ _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
+ _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
+ }
+ }
+ return psqt;
+
+ #elif defined(USE_AVX2)
+
+ constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
+ constexpr int Control = 0b11011000;
+ const __m256i Zero = _mm256_setzero_si256();
+
+ for (IndexType p = 0; p < 2; ++p)
+ {
+ const IndexType offset = HalfDimensions * p;
+ auto out = reinterpret_cast<__m256i*>(&output[offset]);
+ for (IndexType j = 0; j < NumChunks; ++j)
+ {
+ __m256i sum0 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
+ (accumulation[perspectives[p]])[j * 2 + 0]);
+ __m256i sum1 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
+ (accumulation[perspectives[p]])[j * 2 + 1]);
+
+ _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(
+ _mm256_max_epi8(_mm256_packs_epi16(sum0, sum1), Zero), Control));
+ }
+ }
+ return psqt;