-#if defined(USE_AVX512)
- // AVX-512 path: clamp each 16-bit lane to [0, 127], multiply the lanes read via in0 and in1 pairwise, shift right by 7, and store the results as packed 8-bit lanes.
- constexpr IndexType OutputChunkSize = 512 / 8;  // 64: bytes in one 512-bit register, i.e. bytes written by each out[j] store
- static_assert((HalfDimensions / 2) % OutputChunkSize == 0);  // the loop below has no tail handling for a partial chunk
- constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
- // Clamp bounds, plus the 64-bit lane order used after packing (see the store at the end of the loop body).
- const __m512i Zero = _mm512_setzero_si512();
- const __m512i One = _mm512_set1_epi16(127);  // upper clamp bound; presumably stands for 1.0 in the fixed-point scale - TODO confirm
- const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
- // in0 / in1 start at element 0 and element HalfDimensions / 2 of accumulation[perspectives[p]]; out starts at output + offset.
- const __m512i* in0 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][0]));
- const __m512i* in1 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- __m512i* out = reinterpret_cast< __m512i*>(output + offset);  // NOTE(review): direct __m512i loads/stores assume 64-byte alignment - confirm
- // Each iteration reads two 512-bit vectors through each input pointer and writes one 512-bit vector of 8-bit results.
- for (IndexType j = 0; j < NumOutputChunks; j += 1)
- {
- const __m512i sum0a = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 0], One), Zero);  // min with 127, then max with 0
- const __m512i sum0b = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 1], One), Zero);
- const __m512i sum1a = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 0], One), Zero);
- const __m512i sum1b = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 1], One), Zero);
- // Low 16 bits of each lane product, then a logical right shift by 7; with clamped inputs the result is at most (127 * 127) >> 7 = 126.
- const __m512i pa = _mm512_srli_epi16(_mm512_mullo_epi16(sum0a, sum1a), 7);
- const __m512i pb = _mm512_srli_epi16(_mm512_mullo_epi16(sum0b, sum1b), 7);
- // The pack interleaves pa and pb per 128-bit lane (qwords pa0 pb0 pa1 pb1 ...); Control reorders them to pa0..pa3 pb0..pb3.
- out[j] = _mm512_permutexvar_epi64(Control, _mm512_packs_epi16(pa, pb));
- }
-
-#elif defined(USE_AVX2)
- // AVX2 path: clamp each 16-bit lane to [0, 127], multiply the lanes read via in0 and in1 pairwise, shift right by 7, and store the results as packed 8-bit lanes.
- constexpr IndexType OutputChunkSize = 256 / 8;  // 32: bytes in one 256-bit register, i.e. bytes written by each out[j] store
- static_assert((HalfDimensions / 2) % OutputChunkSize == 0);  // the loop below has no tail handling for a partial chunk
- constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
- // Clamp bounds, plus the qword-selection immediate used after packing (see the store at the end of the loop body).
- const __m256i Zero = _mm256_setzero_si256();
- const __m256i One = _mm256_set1_epi16(127);  // upper clamp bound; presumably stands for 1.0 in the fixed-point scale - TODO confirm
- constexpr int Control = 0b11011000;  // two bits per destination qword, low bits first: selects source qwords 0, 2, 1, 3
- // in0 / in1 start at element 0 and element HalfDimensions / 2 of accumulation[perspectives[p]]; out starts at output + offset.
- const __m256i* in0 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][0]));
- const __m256i* in1 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- __m256i* out = reinterpret_cast< __m256i*>(output + offset);  // NOTE(review): direct __m256i loads/stores assume 32-byte alignment - confirm
- // Each iteration reads two 256-bit vectors through each input pointer and writes one 256-bit vector of 8-bit results.
- for (IndexType j = 0; j < NumOutputChunks; j += 1)
- {
- const __m256i sum0a = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 0], One), Zero);  // min with 127, then max with 0
- const __m256i sum0b = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 1], One), Zero);
- const __m256i sum1a = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 0], One), Zero);
- const __m256i sum1b = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 1], One), Zero);
- // Low 16 bits of each lane product, then a logical right shift by 7; with clamped inputs the result is at most (127 * 127) >> 7 = 126.
- const __m256i pa = _mm256_srli_epi16(_mm256_mullo_epi16(sum0a, sum1a), 7);
- const __m256i pb = _mm256_srli_epi16(_mm256_mullo_epi16(sum0b, sum1b), 7);
- // The pack interleaves pa and pb per 128-bit lane (qwords pa0 pb0 pa1 pb1); the permute reorders them to pa0 pa1 pb0 pb1.
- out[j] = _mm256_permute4x64_epi64(_mm256_packs_epi16(pa, pb), Control);
- }
-
-#elif defined(USE_SSE2)
-
- constexpr IndexType OutputChunkSize = 128 / 8;
- static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
- constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-
- const __m128i Zero = _mm_setzero_si128();
- const __m128i One = _mm_set1_epi16(127);