kHalfDimensions * sizeof(BiasType));
for (const auto index : active_indices[perspective]) {
const IndexType offset = kHalfDimensions * index;
+ #if defined(USE_AVX512)
+ auto accumulation = reinterpret_cast<__m512i*>(
+ &accumulator.accumulation[perspective][i][0]);
+ auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
+ constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+ for (IndexType j = 0; j < kNumChunks; ++j)
+ _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
- #if defined(USE_AVX2)
+ #elif defined(USE_AVX2)
auto accumulation = reinterpret_cast<__m256i*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
- for (IndexType j = 0; j < kNumChunks; ++j) {
+ for (IndexType j = 0; j < kNumChunks; ++j)
_mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
- }
#elif defined(USE_SSE2)
auto accumulation = reinterpret_cast<__m128i*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
- for (IndexType j = 0; j < kNumChunks; ++j) {
+ for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
- }
#elif defined(USE_NEON)
auto accumulation = reinterpret_cast<int16x8_t*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
- for (IndexType j = 0; j < kNumChunks; ++j) {
+ for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = vaddq_s16(accumulation[j], column[j]);
- }
#else
- for (IndexType j = 0; j < kHalfDimensions; ++j) {
+ for (IndexType j = 0; j < kHalfDimensions; ++j)
accumulator.accumulation[perspective][i][j] += weights_[offset + j];
- }
#endif
}