- );
- }
-
- #elif defined(USE_MMX)
- auto out = reinterpret_cast<__m64*>(&output[offset]);
- for (IndexType j = 0; j < NumChunks; ++j) {
- __m64 sum0 = *(&reinterpret_cast<const __m64*>(
- accumulation[perspectives[p]])[j * 2 + 0]);
- __m64 sum1 = *(&reinterpret_cast<const __m64*>(
- accumulation[perspectives[p]])[j * 2 + 1]);
- const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
- out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
- }
-
- #elif defined(USE_NEON)
- const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
- for (IndexType j = 0; j < NumChunks; ++j) {
- int16x8_t sum = reinterpret_cast<const int16x8_t*>(
- accumulation[perspectives[p]])[j];
- out[j] = vmax_s8(vqmovn_s16(sum), Zero);
- }
+ for (IndexType p = 0; p < 2; ++p)
+ {
+ const IndexType offset = HalfDimensions * p;
+ for (IndexType j = 0; j < HalfDimensions; ++j)
+ {
+ BiasType sum = accumulation[perspectives[p]][j];
+ output[offset + j] = static_cast<OutputType>(std::max<int>(0, std::min<int>(127, sum)));
+ }
+ }
+ return psqt;