+ sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+ sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+ output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
+
+ #elif defined(USE_SSE2)
+ __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+ __m128i sum_hi = kZeros;
+ const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+ __m128i row_j = _mm_load_si128(&row[j]);
+ __m128i input_j = _mm_load_si128(&input_vector[j]);
+ __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+ __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+ __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+ __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+ __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+ __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+ __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+ sum_lo = _mm_add_epi32(sum_lo, product_lo);
+ sum_hi = _mm_add_epi32(sum_hi, product_hi);
+ }
+ __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
+ __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+ sum = _mm_add_epi32(sum, sum_high_64);
+ __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+ sum = _mm_add_epi32(sum, sum_second_32);