git.sesse.net Git - stockfish/blob - src/nnue/layers/affine_transform.h

   1 /*
   2   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   3   Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
   4
   5   Stockfish is free software: you can redistribute it and/or modify
   6   it under the terms of the GNU General Public License as published by
   7   the Free Software Foundation, either version 3 of the License, or
   8   (at your option) any later version.
   9
  10   Stockfish is distributed in the hope that it will be useful,
  11   but WITHOUT ANY WARRANTY; without even the implied warranty of
  12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13   GNU General Public License for more details.
  14
  15   You should have received a copy of the GNU General Public License
  16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
  17 */
  18
  19 // Definition of layer AffineTransform of NNUE evaluation function
  20
  21 #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
  22 #define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
  23
  24 #include <iostream>
  25 #include "../nnue_common.h"
  26
  27 namespace Eval::NNUE::Layers {
  28
  29   // Affine transformation layer
  30   template <typename PreviousLayer, IndexType OutputDimensions>
  31   class AffineTransform {
  32    public:
  33     // Input/output type
  34     using InputType = typename PreviousLayer::OutputType;
  35     using OutputType = std::int32_t;
  36     static_assert(std::is_same<InputType, std::uint8_t>::value, "");
  37
  38     // Number of input/output dimensions
  39     static constexpr IndexType kInputDimensions =
  40         PreviousLayer::kOutputDimensions;
  41     static constexpr IndexType kOutputDimensions = OutputDimensions;
  42     static constexpr IndexType kPaddedInputDimensions =
  43         CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
  44
  45     // Size of forward propagation buffer used in this layer
  46     static constexpr std::size_t kSelfBufferSize =
  47         CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
  48
  49     // Size of the forward propagation buffer used from the input layer to this layer
  50     static constexpr std::size_t kBufferSize =
  51         PreviousLayer::kBufferSize + kSelfBufferSize;
  52
  53     // Hash value embedded in the evaluation file
  54     static constexpr std::uint32_t GetHashValue() {
  55       std::uint32_t hash_value = 0xCC03DAE4u;
  56       hash_value += kOutputDimensions;
  57       hash_value ^= PreviousLayer::GetHashValue() >> 1;
  58       hash_value ^= PreviousLayer::GetHashValue() << 31;
  59       return hash_value;
  60     }
  61
  62    // Read network parameters
  63     bool ReadParameters(std::istream& stream) {
  64       if (!previous_layer_.ReadParameters(stream)) return false;
  65       for (std::size_t i = 0; i < kOutputDimensions; ++i)
  66         biases_[i] = read_little_endian<BiasType>(stream);
  67       for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
  68         weights_[i] = read_little_endian<WeightType>(stream);
  69       return !stream.fail();
  70     }
  71
  72     // Forward propagation
  73     const OutputType* Propagate(
  74         const TransformedFeatureType* transformed_features, char* buffer) const {
  75       const auto input = previous_layer_.Propagate(
  76           transformed_features, buffer + kSelfBufferSize);
  77
  78 #if defined (USE_AVX512)
  79
  80       [[maybe_unused]] const __m512i kOnes512 = _mm512_set1_epi16(1);
  81
  82       [[maybe_unused]] auto m512_hadd = [](__m512i sum, int bias) -> int {
  83         return _mm512_reduce_add_epi32(sum) + bias;
  84       };
  85
  86       [[maybe_unused]] auto m512_haddx4 = [](__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i {
  87         __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1);
  88         __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1);
  89
  90         __m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3);
  91         __m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3);
  92
  93         __m512i sum01 = _mm512_add_epi32(sum01a, sum01b);
  94         __m512i sum23 = _mm512_add_epi32(sum23a, sum23b);
  95
  96         __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23);
  97         __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23);
  98
  99         __m512i sum = _mm512_add_epi32(sum0123a, sum0123b);
 100
 101         __m256i sum256lo = _mm512_castsi512_si256(sum);
 102         __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
 103
 104         sum256lo = _mm256_add_epi32(sum256lo, sum256hi);
 105
 106         __m128i sum128lo = _mm256_castsi256_si128(sum256lo);
 107         __m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1);
 108
 109         return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
 110       };
 111
 112       [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
 113 #if defined (USE_VNNI)
 114         acc = _mm512_dpbusd_epi32(acc, a, b);
 115 #else
 116         __m512i product0 = _mm512_maddubs_epi16(a, b);
 117         product0 = _mm512_madd_epi16(product0, kOnes512);
 118         acc = _mm512_add_epi32(acc, product0);
 119 #endif
 120       };
 121
 122 #endif
 123 #if defined (USE_AVX2)
 124
 125       [[maybe_unused]] const __m256i kOnes256 = _mm256_set1_epi16(1);
 126
 127       [[maybe_unused]] auto m256_hadd = [](__m256i sum, int bias) -> int {
 128         __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
 129         sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
 130         sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
 131         return _mm_cvtsi128_si32(sum128) + bias;
 132       };
 133
 134       [[maybe_unused]] auto m256_haddx4 = [](__m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3, __m128i bias) -> __m128i {
 135         sum0 = _mm256_hadd_epi32(sum0, sum1);
 136         sum2 = _mm256_hadd_epi32(sum2, sum3);
 137
 138         sum0 = _mm256_hadd_epi32(sum0, sum2);
 139
 140         __m128i sum128lo = _mm256_castsi256_si128(sum0);
 141         __m128i sum128hi = _mm256_extracti128_si256(sum0, 1);
 142
 143         return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
 144       };
 145
 146       [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
 147 #if defined (USE_VNNI)
 148         acc = _mm256_dpbusd_epi32(acc, a, b);
 149 #else
 150         __m256i product0 = _mm256_maddubs_epi16(a, b);
 151         product0 = _mm256_madd_epi16(product0, kOnes256);
 152         acc = _mm256_add_epi32(acc, product0);
 153 #endif
 154       };
 155
 156 #endif
 157
 158 #if defined (USE_SSSE3)
 159
 160       [[maybe_unused]] const __m128i kOnes128 = _mm_set1_epi16(1);
 161
 162       [[maybe_unused]] auto m128_hadd = [](__m128i sum, int bias) -> int {
 163         sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
 164         sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
 165         return _mm_cvtsi128_si32(sum) + bias;
 166       };
 167
 168       [[maybe_unused]] auto m128_haddx4 = [](__m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3, __m128i bias) -> __m128i {
 169         sum0 = _mm_hadd_epi32(sum0, sum1);
 170         sum2 = _mm_hadd_epi32(sum2, sum3);
 171
 172         sum0 = _mm_hadd_epi32(sum0, sum2);
 173
 174         return _mm_add_epi32(sum0, bias);
 175       };
 176
 177       [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) {
 178         __m128i product0 = _mm_maddubs_epi16(a, b);
 179         product0 = _mm_madd_epi16(product0, kOnes128);
 180         acc = _mm_add_epi32(acc, product0);
 181       };
 182
 183 #endif
 184
 185 #if defined (USE_AVX512)
 186
 187       constexpr IndexType kNumChunks512 = kPaddedInputDimensions / (kSimdWidth * 2);
 188       constexpr IndexType kNumChunks256 = kPaddedInputDimensions / kSimdWidth;
 189
 190       const auto output = reinterpret_cast<OutputType*>(buffer);
 191
 192       // Since to saturate a zmm register it takes 64 bytes we
 193       // cannot use AVX512 for the smaller affine transforms.
 194       // Instead we fallback to a AVX2 implementation if the
 195       // kInputDimensions isn't a multiple of 64.
 196       // Note that this means that for example for
 197       // kInputDimensions of 96 we fallback to AVX2 even though
 198       // the first 64 elements could be processed with AVX512.
 199       // This is caused by mixing the __m256 and __m512 variables
 200       // required to better handle that case and it would
 201       // require handling more cases statically not to lose performance.
 202       // This should be revisited if such input dimensions are to be considered.
 203       [[maybe_unused]] const auto input_vector512 = reinterpret_cast<const __m512i*>(input);
 204       [[maybe_unused]] const auto input_vector256 = reinterpret_cast<const __m256i*>(input);
 205
 206       // kOutputDimensions is either 1 or a multiple of kSimdWidth
 207       // because then it is also an input dimension.
 208       if constexpr (kOutputDimensions % 4 == 0)
 209       {
 210         for (IndexType i = 0; i < kOutputDimensions; i += 4)
 211         {
 212           const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
 213           const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
 214           const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
 215           const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
 216
 217           const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
 218           __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
 219
 220           if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
 221           {
 222             __m512i sum0 = _mm512_setzero_si512();
 223             __m512i sum1 = _mm512_setzero_si512();
 224             __m512i sum2 = _mm512_setzero_si512();
 225             __m512i sum3 = _mm512_setzero_si512();
 226
 227             const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
 228             const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
 229             const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
 230             const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);
 231
 232             for (IndexType j = 0; j < kNumChunks512; ++j)
 233             {
 234               const __m512i in = input_vector512[j];
 235
 236               m512_add_dpbusd_epi32(sum0, in, row0[j]);
 237               m512_add_dpbusd_epi32(sum1, in, row1[j]);
 238               m512_add_dpbusd_epi32(sum2, in, row2[j]);
 239               m512_add_dpbusd_epi32(sum3, in, row3[j]);
 240             }
 241
 242             *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
 243           }
 244           else
 245           {
 246             __m256i sum0 = _mm256_setzero_si256();
 247             __m256i sum1 = _mm256_setzero_si256();
 248             __m256i sum2 = _mm256_setzero_si256();
 249             __m256i sum3 = _mm256_setzero_si256();
 250
 251             const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
 252             const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
 253             const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
 254             const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
 255
 256             for (IndexType j = 0; j < kNumChunks256; ++j)
 257             {
 258               const __m256i in = input_vector256[j];
 259
 260               m256_add_dpbusd_epi32(sum0, in, row0[j]);
 261               m256_add_dpbusd_epi32(sum1, in, row1[j]);
 262               m256_add_dpbusd_epi32(sum2, in, row2[j]);
 263               m256_add_dpbusd_epi32(sum3, in, row3[j]);
 264             }
 265
 266             *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
 267           }
 268         }
 269       }
 270       else if constexpr (kOutputDimensions == 1)
 271       {
 272         if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
 273         {
 274           __m512i sum0 = _mm512_setzero_si512();
 275
 276           const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);
 277
 278           for (IndexType j = 0; j < kNumChunks512; ++j)
 279           {
 280             const __m512i in = input_vector512[j];
 281
 282             m512_add_dpbusd_epi32(sum0, in, row0[j]);
 283           }
 284
 285           output[0] = m512_hadd(sum0, biases_[0]);
 286         }
 287         else
 288         {
 289           __m256i sum0 = _mm256_setzero_si256();
 290
 291           const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
 292
 293           for (IndexType j = 0; j < kNumChunks256; ++j)
 294           {
 295             const __m256i in = input_vector256[j];
 296
 297             m256_add_dpbusd_epi32(sum0, in, row0[j]);
 298           }
 299
 300           output[0] = m256_hadd(sum0, biases_[0]);
 301         }
 302       }
 303       else
 304       {
 305         // This case can never happen because kOutputDimensions
 306         // is always 1 or a multiple of kSimdWidth.
 307         assert(false);
 308       }
 309
 310 #elif defined (USE_AVX2)
 311
 312       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
 313
 314       const auto output = reinterpret_cast<OutputType*>(buffer);
 315       const auto input_vector = reinterpret_cast<const __m256i*>(input);
 316
 317       // kOutputDimensions is either 1 or a multiple of kSimdWidth
 318       // because then it is also an input dimension.
 319       if constexpr (kOutputDimensions % 4 == 0)
 320       {
 321         for (IndexType i = 0; i < kOutputDimensions; i += 4)
 322         {
 323           const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
 324           const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
 325           const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
 326           const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
 327
 328           const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
 329           __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
 330
 331           __m256i sum0 = _mm256_setzero_si256();
 332           __m256i sum1 = _mm256_setzero_si256();
 333           __m256i sum2 = _mm256_setzero_si256();
 334           __m256i sum3 = _mm256_setzero_si256();
 335
 336           const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
 337           const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
 338           const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
 339           const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
 340
 341           for (IndexType j = 0; j < kNumChunks; ++j)
 342           {
 343             const __m256i in = input_vector[j];
 344
 345             m256_add_dpbusd_epi32(sum0, in, row0[j]);
 346             m256_add_dpbusd_epi32(sum1, in, row1[j]);
 347             m256_add_dpbusd_epi32(sum2, in, row2[j]);
 348             m256_add_dpbusd_epi32(sum3, in, row3[j]);
 349           }
 350
 351           *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
 352         }
 353       }
 354       else if constexpr (kOutputDimensions == 1)
 355       {
 356         __m256i sum0 = _mm256_setzero_si256();
 357
 358         const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
 359
 360         for (IndexType j = 0; j < kNumChunks; ++j)
 361         {
 362           const __m256i in = input_vector[j];
 363
 364             m256_add_dpbusd_epi32(sum0, in, row0[j]);
 365         }
 366
 367         output[0] = m256_hadd(sum0, biases_[0]);
 368       }
 369       else
 370       {
 371         // This case can never happen because kOutputDimensions
 372         // is always 1 or a multiple of kSimdWidth.
 373         assert(false);
 374       }
 375
 376 #elif defined (USE_SSSE3)
 377
 378       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
 379
 380       auto output = reinterpret_cast<OutputType*>(buffer);
 381       const auto input_vector = reinterpret_cast<const __m128i*>(input);
 382
 383       // kOutputDimensions is either 1 or a multiple of kSimdWidth
 384       // because then it is also an input dimension.
 385       if constexpr (kOutputDimensions % 4 == 0)
 386       {
 387         for (IndexType i = 0; i < kOutputDimensions; i += 4)
 388         {
 389           const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
 390           const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
 391           const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
 392           const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
 393
 394           const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
 395           __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
 396
 397           __m128i sum0 = _mm_setzero_si128();
 398           __m128i sum1 = _mm_setzero_si128();
 399           __m128i sum2 = _mm_setzero_si128();
 400           __m128i sum3 = _mm_setzero_si128();
 401
 402           const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
 403           const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
 404           const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
 405           const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);
 406
 407           for (int j = 0; j < (int)kNumChunks; j += 1)
 408           {
 409             const __m128i in = input_vector[j];
 410
 411             m128_add_dpbusd_epi32(sum0, in, row0[j]);
 412             m128_add_dpbusd_epi32(sum1, in, row1[j]);
 413             m128_add_dpbusd_epi32(sum2, in, row2[j]);
 414             m128_add_dpbusd_epi32(sum3, in, row3[j]);
 415           }
 416
 417           *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
 418         }
 419       }
 420       else if constexpr (kOutputDimensions == 1)
 421       {
 422         __m128i sum0 = _mm_setzero_si128();
 423
 424         const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);
 425
 426         for (int j = 0; j < (int)kNumChunks; j += 1)
 427         {
 428           const __m128i in = input_vector[j];
 429
 430           m128_add_dpbusd_epi32(sum0, in, row0[j]);
 431         }
 432
 433         output[0] = m128_hadd(sum0, biases_[0]);
 434       }
 435       else
 436       {
 437         // This case can never happen because kOutputDimensions
 438         // is always 1 or a multiple of kSimdWidth.
 439         assert(false);
 440       }
 441
 442 #else
 443
 444 // Use old implementation for the other architectures.
 445
 446       auto output = reinterpret_cast<OutputType*>(buffer);
 447
 448 #if defined(USE_SSE2)
 449       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
 450 #ifndef USE_SSSE3
 451       const __m128i kZeros = _mm_setzero_si128();
 452 #else
 453       const __m128i kOnes = _mm_set1_epi16(1);
 454 #endif
 455       const auto input_vector = reinterpret_cast<const __m128i*>(input);
 456
 457 #elif defined(USE_MMX)
 458       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
 459       const __m64 kZeros = _mm_setzero_si64();
 460       const auto input_vector = reinterpret_cast<const __m64*>(input);
 461
 462 #elif defined(USE_NEON)
 463       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
 464       const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
 465 #endif
 466
 467       for (IndexType i = 0; i < kOutputDimensions; ++i) {
 468         const IndexType offset = i * kPaddedInputDimensions;
 469
 470 #if defined(USE_SSE2)
 471         __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
 472         __m128i sum_hi = kZeros;
 473         const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
 474         for (IndexType j = 0; j < kNumChunks; ++j) {
 475           __m128i row_j = _mm_load_si128(&row[j]);
 476           __m128i input_j = _mm_load_si128(&input_vector[j]);
 477           __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
 478           __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
 479           __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
 480           __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
 481           __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
 482           __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
 483           __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
 484           sum_lo = _mm_add_epi32(sum_lo, product_lo);
 485           sum_hi = _mm_add_epi32(sum_hi, product_hi);
 486         }
 487         __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
 488         __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
 489         sum = _mm_add_epi32(sum, sum_high_64);
 490         __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
 491         sum = _mm_add_epi32(sum, sum_second_32);
 492         output[i] = _mm_cvtsi128_si32(sum);
 493
 494 #elif defined(USE_MMX)
 495         __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
 496         __m64 sum_hi = kZeros;
 497         const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
 498         for (IndexType j = 0; j < kNumChunks; ++j) {
 499           __m64 row_j = row[j];
 500           __m64 input_j = input_vector[j];
 501           __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
 502           __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
 503           __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
 504           __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
 505           __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
 506           __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
 507           __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
 508           sum_lo = _mm_add_pi32(sum_lo, product_lo);
 509           sum_hi = _mm_add_pi32(sum_hi, product_hi);
 510         }
 511         __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
 512         sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
 513         output[i] = _mm_cvtsi64_si32(sum);
 514
 515 #elif defined(USE_NEON)
 516         int32x4_t sum = {biases_[i]};
 517         const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
 518         for (IndexType j = 0; j < kNumChunks; ++j) {
 519           int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
 520           product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
 521           sum = vpadalq_s16(sum, product);
 522         }
 523         output[i] = sum[0] + sum[1] + sum[2] + sum[3];
 524
 525 #else
 526         OutputType sum = biases_[i];
 527         for (IndexType j = 0; j < kInputDimensions; ++j) {
 528           sum += weights_[offset + j] * input[j];
 529         }
 530         output[i] = sum;
 531 #endif
 532
 533       }
 534 #if defined(USE_MMX)
 535       _mm_empty();
 536 #endif
 537
 538 #endif
 539
 540       return output;
 541     }
 542
 543    private:
 544     using BiasType = OutputType;
 545     using WeightType = std::int8_t;
 546
 547     PreviousLayer previous_layer_;
 548
 549     alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
 550     alignas(kCacheLineSize)
 551         WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
 552   };
 553
 554 }  // namespace Eval::NNUE::Layers
 555
 556 #endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED