+ // If vector instructions are enabled, we update and refresh the
+ // accumulator tile by tile such that each tile fits in the CPU's
+ // vector registers (see the illustrative sketch after these definitions).
+ #define TILING
+
+ #ifdef USE_AVX512
+ typedef __m512i vec_t;
+ #define vec_load(a) _mm512_loadA_si512(a)
+ #define vec_store(a,b) _mm512_storeA_si512(a,b)
+ #define vec_add_16(a,b) _mm512_add_epi16(a,b)
+ #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+ static constexpr IndexType kNumRegs = 8; // only 8 of the 32 zmm registers are needed
+
+ #elif defined(USE_AVX2)
+ typedef __m256i vec_t;
+ #define vec_load(a) _mm256_loadA_si256(a)
+ #define vec_store(a,b) _mm256_storeA_si256(a,b)
+ #define vec_add_16(a,b) _mm256_add_epi16(a,b)
+ #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+ static constexpr IndexType kNumRegs = 16; // the 16 ymm registers available on x86-64
+
+ #elif defined(USE_SSE2)
+ typedef __m128i vec_t;
+ #define vec_load(a) (*(a))
+ #define vec_store(a,b) *(a)=(b)
+ #define vec_add_16(a,b) _mm_add_epi16(a,b)
+ #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+ static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8; // 16 xmm registers in 64-bit mode, 8 in 32-bit mode
+
+ #elif defined(USE_MMX)
+ typedef __m64 vec_t;
+ #define vec_load(a) (*(a))
+ #define vec_store(a,b) *(a)=(b)
+ #define vec_add_16(a,b) _mm_add_pi16(a,b)
+ #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+ static constexpr IndexType kNumRegs = 8; // MMX has exactly 8 mm registers
+
+ #elif defined(USE_NEON)
+ typedef int16x8_t vec_t;
+ #define vec_load(a) (*(a))
+ #define vec_store(a,b) *(a)=(b)
+ #define vec_add_16(a,b) vaddq_s16(a,b)
+ #define vec_sub_16(a,b) vsubq_s16(a,b)
+ static constexpr IndexType kNumRegs = 16; // 16 quad-word registers (the full ARMv7 NEON set)
+
+ #else
+ #undef TILING
+
+ #endif
+
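+ // A minimal sketch of how the macros above combine into a tiled refresh of
+ // one accumulator half. This is an illustration, not the patch's actual
+ // update code: `accumulation`, `biases`, `weights`, `activeIndices` and
+ // `HalfDims` are hypothetical stand-ins for the real NNUE members. HalfDims
+ // is assumed to be a multiple of kTileHeight, all buffers are assumed
+ // aligned for vec_t, and <cstdint> plus the surrounding file's IndexType
+ // alias are assumed available.
+ #ifdef TILING
+ template<IndexType HalfDims>
+ void refresh_sketch(std::int16_t* accumulation,
+                     const std::int16_t* biases,
+                     const std::int16_t* weights,
+                     const IndexType* activeIndices, IndexType numActive) {
+
+   // One tile is the largest slice of int16 values that the register file
+   // can hold at once: kNumRegs vectors of sizeof(vec_t) / 2 elements each.
+   constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
+
+   for (IndexType i = 0; i < HalfDims / kTileHeight; ++i) {
+     vec_t acc[kNumRegs];
+
+     // Seed the tile with the first-layer biases.
+     auto biasesTile = reinterpret_cast<const vec_t*>(&biases[i * kTileHeight]);
+     for (IndexType j = 0; j < kNumRegs; ++j)
+       acc[j] = vec_load(&biasesTile[j]);
+
+     // Add the weight column of every active feature while the whole tile
+     // stays resident in vector registers.
+     for (IndexType k = 0; k < numActive; ++k) {
+       auto column = reinterpret_cast<const vec_t*>(
+           &weights[HalfDims * activeIndices[k] + i * kTileHeight]);
+       for (IndexType j = 0; j < kNumRegs; ++j)
+         acc[j] = vec_add_16(acc[j], vec_load(&column[j]));
+     }
+
+     // Write the finished tile back to memory exactly once.
+     auto accTile = reinterpret_cast<vec_t*>(&accumulation[i * kTileHeight]);
+     for (IndexType j = 0; j < kNumRegs; ++j)
+       vec_store(&accTile[j], acc[j]);
+   }
+ }
+ #endif
+
+ // An incremental update would mirror this loop, applying vec_sub_16 for the
+ // columns of features that were removed and vec_add_16 for those that were
+ // added (and, in the MMX case, issuing _mm_empty() before returning to
+ // floating-point code).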