- out[j] = _mm256_permute4x64_epi64(_mm256_packs_epi16(pa, pb), Control);
- }
-
-#elif defined(USE_SSE2)
-
- constexpr IndexType OutputChunkSize = 128 / 8;
- static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
- constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-
- const __m128i Zero = _mm_setzero_si128();
- const __m128i One = _mm_set1_epi16(127);
-
- const __m128i* in0 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][0]));
- const __m128i* in1 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- __m128i* out = reinterpret_cast< __m128i*>(output + offset);
-
- for (IndexType j = 0; j < NumOutputChunks; j += 1)
- {
- const __m128i sum0a = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 0], One), Zero);
- const __m128i sum0b = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 1], One), Zero);
- const __m128i sum1a = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 0], One), Zero);
- const __m128i sum1b = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 1], One), Zero);
-
- const __m128i pa = _mm_srli_epi16(_mm_mullo_epi16(sum0a, sum1a), 7);
- const __m128i pb = _mm_srli_epi16(_mm_mullo_epi16(sum0b, sum1b), 7);
-
- out[j] = _mm_packs_epi16(pa, pb);
- }
-
-#elif defined(USE_NEON)
-
- constexpr IndexType OutputChunkSize = 128 / 8;
- static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
- constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-
- const int16x8_t Zero = vdupq_n_s16(0);
- const int16x8_t One = vdupq_n_s16(127);
-
- const int16x8_t* in0 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][0]));
- const int16x8_t* in1 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
- int8x16_t* out = reinterpret_cast< int8x16_t*>(output + offset);
-
- for (IndexType j = 0; j < NumOutputChunks; j += 1)
- {
- const int16x8_t sum0a = vmaxq_s16(vminq_s16(in0[j * 2 + 0], One), Zero);
- const int16x8_t sum0b = vmaxq_s16(vminq_s16(in0[j * 2 + 1], One), Zero);
- const int16x8_t sum1a = vmaxq_s16(vminq_s16(in1[j * 2 + 0], One), Zero);
- const int16x8_t sum1b = vmaxq_s16(vminq_s16(in1[j * 2 + 1], One), Zero);
-
- const int8x8_t pa = vshrn_n_s16(vmulq_s16(sum0a, sum1a), 7);
- const int8x8_t pb = vshrn_n_s16(vmulq_s16(sum0b, sum1b), 7);
-
- out[j] = vcombine_s8(pa, pb);