- output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
-
- #elif defined(USE_AVX2)
- __m256i sum = _mm256_setzero_si256();
- const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
- for (IndexType j = 0; j < kNumChunks; ++j) {
- __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
- product = _mm256_madd_epi16(product, kOnes);
- sum = _mm256_add_epi32(sum, product);
- }
- __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
- sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
- sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
- output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
-
- #elif defined(USE_SSSE3)
- __m128i sum = _mm_setzero_si128();
- const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
- for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
- __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
- product0 = _mm_madd_epi16(product0, kOnes);
- sum = _mm_add_epi32(sum, product0);
- __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
- product1 = _mm_madd_epi16(product1, kOnes);
- sum = _mm_add_epi32(sum, product1);
- }
- if (kNumChunks & 0x1) {
- __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
- product = _mm_madd_epi16(product, kOnes);
- sum = _mm_add_epi32(sum, product);
- }
- sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
- sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
- output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
-
- #elif defined(USE_SSE2)
- __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
- __m128i sum_hi = kZeros;
- const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
- for (IndexType j = 0; j < kNumChunks; ++j) {
- __m128i row_j = _mm_load_si128(&row[j]);
- __m128i input_j = _mm_load_si128(&input_vector[j]);
- __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
- __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
- __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
- __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
- __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
- __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
- __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
- sum_lo = _mm_add_epi32(sum_lo, product_lo);
- sum_hi = _mm_add_epi32(sum_hi, product_hi);
- }
- __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
- __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
- sum = _mm_add_epi32(sum, sum_high_64);
- __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
- sum = _mm_add_epi32(sum, sum_second_32);
- output[i] = _mm_cvtsi128_si32(sum);
-
- #elif defined(USE_MMX)
- __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
- __m64 sum_hi = kZeros;
- const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
- for (IndexType j = 0; j < kNumChunks; ++j) {
- __m64 row_j = row[j];
- __m64 input_j = input_vector[j];
- __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
- __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
- __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
- __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
- __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
- __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
- __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
- sum_lo = _mm_add_pi32(sum_lo, product_lo);
- sum_hi = _mm_add_pi32(sum_hi, product_hi);
- }
- __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
- sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
- output[i] = _mm_cvtsi64_si32(sum);
-
- #elif defined(USE_NEON)
- int32x4_t sum = {biases_[i]};
- const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
- for (IndexType j = 0; j < kNumChunks; ++j) {
- int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
- product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
- sum = vpadalq_s16(sum, product);
- }
- output[i] = sum[0] + sum[1] + sum[2] + sum[3];