+ #elif defined(USE_MMX)
+ auto out = reinterpret_cast<__m64*>(&output[offset]);  // MMX path: treat output starting at `offset` as an array of 64-bit __m64 vectors
+ for (IndexType j = 0; j < kNumChunks; ++j) {  // each iteration reads two __m64 inputs and stores one __m64 result
+ __m64 sum0 = *(&reinterpret_cast<const __m64*>(  // load __m64 element 2j from accumulation[perspectives[p]][0]; *(&x[i]) is just x[i]
+ accumulation[perspectives[p]][0])[j * 2 + 0]);
+ __m64 sum1 = *(&reinterpret_cast<const __m64*>(  // load the adjacent __m64 element 2j+1 from the same array
+ accumulation[perspectives[p]][0])[j * 2 + 1]);
+ const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);  // narrow the 4+4 signed 16-bit lanes to 8 signed 8-bit lanes with signed saturation (sum0 in the low half)
+ out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);  // saturating add then saturating subtract of k0x80s; NOTE(review): presumably k0x80s is 0x80 in every byte, which would clamp negative lanes to 0 -- confirm against its definition
+ }
+