+ #endif
+
+ for (IndexType p = 0; p < 2; ++p) // one pass per entry of perspectives[]
+ {
+ const IndexType offset = HalfDimensions * p; // pass p writes output starting at element p * HalfDimensions
+ auto out = reinterpret_cast<__m128i*>(&output[offset]); // 128-bit view of that destination for the aligned stores below
+ for (IndexType j = 0; j < NumChunks; ++j) // each iteration reads two 128-bit vectors and writes one
+ {
+ __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>
+ (accumulation[perspectives[p]])[j * 2 + 0]); // aligned load of input vector 2*j
+ __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>
+ (accumulation[perspectives[p]])[j * 2 + 1]); // aligned load of input vector 2*j + 1
+ const __m128i packedbytes = _mm_packs_epi16(sum0, sum1); // narrow the 16-bit lanes of both vectors to 8-bit with signed saturation
+
+ #ifdef USE_SSE41
+ _mm_store_si128(&out[j], _mm_max_epi8(packedbytes, Zero)); // per-byte signed max against Zero; presumably Zero is all-zero so negatives clamp to 0 - confirm at its definition
+ #else
+ _mm_store_si128(&out[j], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)); // fallback without _mm_max_epi8: saturating add then saturating subtract of k0x80s; presumably every byte is 0x80, which yields max(x, 0) - confirm at its definition
+ #endif
+ }
+ }
+ return psqt;