X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=src%2Fnnue%2Flayers%2Fclipped_relu.h;h=44d8a7def4c92ff55f7fcf1150b9553b37a90458;hb=21df37d7fd4dcc9b4a9c319382cc43685c0259c8;hp=13196ec28b49d133afeb0c0f704e644b86583b8d;hpb=f948cd008d3a289ebbadc463271f84888e8069ba;p=stockfish diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 13196ec2..44d8a7de 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -84,7 +84,7 @@ namespace Eval::NNUE::Layers { } constexpr IndexType kStart = kNumChunks * kSimdWidth; - #elif defined(USE_SSSE3) + #elif defined(USE_SSE2) constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; #ifdef USE_SSE41 @@ -115,6 +115,24 @@ namespace Eval::NNUE::Layers { } constexpr IndexType kStart = kNumChunks * kSimdWidth; + #elif defined(USE_MMX) + constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; + const __m64 k0x80s = _mm_set1_pi8(-128); + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m64*>(output); + for (IndexType i = 0; i < kNumChunks; ++i) { + const __m64 words0 = _mm_srai_pi16( + _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]), + kWeightScaleBits); + const __m64 words1 = _mm_srai_pi16( + _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]), + kWeightScaleBits); + const __m64 packedbytes = _mm_packs_pi16(words0, words1); + out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); + } + _mm_empty(); + constexpr IndexType kStart = kNumChunks * kSimdWidth; + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2); const int8x8_t kZero = {0};