X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=src%2Fnnue%2Flayers%2Faffine_transform.h;h=fc65c34339ff35bb44f69dee3a85285de69eb51b;hb=8a912951de6d4bff78d3ff5258213a0c7e6f494e;hp=61cdb7818661286cde3c2be0c55f87035746fcad;hpb=1461d861c8240e29df690f1e34dc50eee37ae1b5;p=stockfish diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 61cdb781..fc65c343 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -45,17 +45,13 @@ namespace Stockfish::Eval::NNUE::Layers { template static void affine_transform_non_ssse3(std::int32_t* output, const std::int8_t* weights, const std::int32_t* biases, const std::uint8_t* input) { +# if defined(USE_SSE2) || defined(USE_NEON_DOTPROD) || defined(USE_NEON) # if defined(USE_SSE2) // At least a multiple of 16, with SSE2. constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; const __m128i Zeros = _mm_setzero_si128(); const auto inputVector = reinterpret_cast(input); -# elif defined(USE_MMX) - constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 8; - const __m64 Zeros = _mm_setzero_si64(); - const auto inputVector = reinterpret_cast(input); - # elif defined(USE_NEON_DOTPROD) constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; const auto inputVector = reinterpret_cast(input); @@ -91,26 +87,6 @@ namespace Stockfish::Eval::NNUE::Layers { sum = _mm_add_epi32(sum, sum_second_32); output[i] = _mm_cvtsi128_si32(sum); -# elif defined(USE_MMX) - __m64 sumLo = _mm_cvtsi32_si64(biases[i]); - __m64 sumHi = Zeros; - const auto row = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - __m64 row_j = row[j]; - __m64 input_j = inputVector[j]; - __m64 extendedRowLo = _mm_srai_pi16(_mm_unpacklo_pi8(row_j, row_j), 8); - __m64 extendedRowHi = _mm_srai_pi16(_mm_unpackhi_pi8(row_j, row_j), 8); - __m64 extendedInputLo = _mm_unpacklo_pi8(input_j, Zeros); - __m64 extendedInputHi = _mm_unpackhi_pi8(input_j, Zeros); - __m64 productLo = _mm_madd_pi16(extendedRowLo, extendedInputLo); - __m64 productHi = _mm_madd_pi16(extendedRowHi, extendedInputHi); - sumLo = _mm_add_pi32(sumLo, productLo); - sumHi = _mm_add_pi32(sumHi, productHi); - } - __m64 sum = _mm_add_pi32(sumLo, sumHi); - sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum)); - output[i] = _mm_cvtsi64_si32(sum); - # elif defined(USE_NEON_DOTPROD) int32x4_t sum = {biases[i]}; const auto row = reinterpret_cast(&weights[offset]); @@ -129,17 +105,19 @@ namespace Stockfish::Eval::NNUE::Layers { } output[i] = sum[0] + sum[1] + sum[2] + sum[3]; -# else - std::int32_t sum = biases[i]; - for (IndexType j = 0; j < InputDimensions; ++j) { - sum += weights[offset + j] * input[j]; - } - output[i] = sum; # endif } - -# if defined(USE_MMX) - _mm_empty(); +# else + std::memcpy(output, biases, sizeof(std::int32_t) * OutputDimensions); + + // Traverse weights in transpose order to take advantage of input sparsity + for (IndexType i = 0; i < InputDimensions; ++i) + if (input[i]) { + const std::int8_t* w = &weights[i]; + const int in = input[i]; + for (IndexType j = 0; j < OutputDimensions; ++j) + output[j] += w[j * PaddedInputDimensions] * in; + } # endif } #endif @@ -302,7 +280,7 @@ namespace Stockfish::Eval::NNUE::Layers { vec_t sum0 = vec_setzero(); const auto row0 = reinterpret_cast(&weights[0]); - for (int j = 0; j < (int)NumChunks; ++j) + for (int j = 0; j < int(NumChunks); ++j) { const vec_t in = inputVector[j]; vec_add_dpbusd_32(sum0, in, row0[j]);