/*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)

  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

// Definition of layer AffineTransform of NNUE evaluation function

#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
#define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED

#include <iostream>
#include <algorithm>
#include <type_traits>
#include "../nnue_common.h"
#include "../../simd.h"

/*
  This file contains the definition for a fully connected layer (aka affine transform).
  Two approaches are employed, depending on the sizes of the transform.

  Approach 1:
    - used when PaddedInputDimensions >= 128
    - uses AVX512 if possible
    - processes inputs in batches of 2*InputSimdWidth
      - so in batches of 128 for AVX512
    - the weight blocks of size InputSimdWidth are transposed such that
      access is sequential
    - N columns of the weight matrix are processed at a time, where N
      depends on the architecture (the number of registers)
    - accumulate + hadd is used

  Approach 2:
    - used when PaddedInputDimensions < 128
    - expected use-case is when PaddedInputDimensions == 32 and InputDimensions <= 32
      - that's why AVX512 is hard to implement
    - expected use-case is small layers
    - not optimized as well as approach 1
    - inputs are processed in chunks of 4; weights are respectively transposed
    - accumulation happens directly to int32s
*/
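
/*
  For reference, both approaches compute the same affine map as this scalar
  sketch (an illustration only, not part of the build):

    // output[i] = biases[i] + dot(row i of weights, input)
    for (IndexType i = 0; i < OutputDimensions; ++i) {
        std::int32_t sum = biases[i];
        for (IndexType j = 0; j < InputDimensions; ++j)
            sum += weights[i * PaddedInputDimensions + j] * input[j];
        output[i] = sum;
    }

  with std::uint8_t inputs, std::int8_t weights, and std::int32_t biases and
  outputs. The SIMD variants below differ only in the order in which the
  weights are stored and the products are accumulated.
*/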

namespace Stockfish::Eval::NNUE::Layers {

// Fallback implementation for older/other architectures.
// Identical for both approaches. Requires the input to be padded to at least 16 values.
#if !defined(USE_SSSE3)
  template <IndexType InputDimensions, IndexType PaddedInputDimensions, IndexType OutputDimensions>
  static void affine_transform_non_ssse3(std::int32_t* output, const std::int8_t* weights, const std::int32_t* biases, const std::uint8_t* input)
  {
# if defined(USE_SSE2)
    // At least a multiple of 16, with SSE2.
    static_assert(PaddedInputDimensions % 16 == 0);
    constexpr IndexType NumChunks = PaddedInputDimensions / 16;
    const __m128i Zeros = _mm_setzero_si128();
    const auto inputVector = reinterpret_cast<const __m128i*>(input);

# elif defined(USE_MMX)
    static_assert(InputDimensions % 8 == 0);
    constexpr IndexType NumChunks = InputDimensions / 8;
    const __m64 Zeros = _mm_setzero_si64();
    const auto inputVector = reinterpret_cast<const __m64*>(input);

# elif defined(USE_NEON)
    static_assert(PaddedInputDimensions % 16 == 0);
    constexpr IndexType NumChunks = PaddedInputDimensions / 16;
    const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
# endif
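
    // Note on the SIMD paths below: weights are int8 and inputs uint8, so each
    // pair is widened to 16 bits and multiply-accumulated into 32-bit lanes
    // (madd), which are then reduced to a single int32 per output row. The
    // unpack-with-self followed by an arithmetic right shift by 8 is the
    // classic SSE2/MMX idiom for sign-extending packed int8 to int16.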

    for (IndexType i = 0; i < OutputDimensions; ++i) {
      const IndexType offset = i * PaddedInputDimensions;

# if defined(USE_SSE2)
      __m128i sumLo = _mm_cvtsi32_si128(biases[i]);
      __m128i sumHi = Zeros;
      const auto row = reinterpret_cast<const __m128i*>(&weights[offset]);
      for (IndexType j = 0; j < NumChunks; ++j) {
        __m128i row_j = _mm_load_si128(&row[j]);
        __m128i input_j = _mm_load_si128(&inputVector[j]);
        __m128i extendedRowLo = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8);
        __m128i extendedRowHi = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8);
        __m128i extendedInputLo = _mm_unpacklo_epi8(input_j, Zeros);
        __m128i extendedInputHi = _mm_unpackhi_epi8(input_j, Zeros);
        __m128i productLo = _mm_madd_epi16(extendedRowLo, extendedInputLo);
        __m128i productHi = _mm_madd_epi16(extendedRowHi, extendedInputHi);
        sumLo = _mm_add_epi32(sumLo, productLo);
        sumHi = _mm_add_epi32(sumHi, productHi);
      }
      __m128i sum = _mm_add_epi32(sumLo, sumHi);
      __m128i sumHigh_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
      sum = _mm_add_epi32(sum, sumHigh_64);
      __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
      sum = _mm_add_epi32(sum, sum_second_32);
      output[i] = _mm_cvtsi128_si32(sum);

# elif defined(USE_MMX)
      __m64 sumLo = _mm_cvtsi32_si64(biases[i]);
      __m64 sumHi = Zeros;
      const auto row = reinterpret_cast<const __m64*>(&weights[offset]);
      for (IndexType j = 0; j < NumChunks; ++j) {
        __m64 row_j = row[j];
        __m64 input_j = inputVector[j];
        __m64 extendedRowLo = _mm_srai_pi16(_mm_unpacklo_pi8(row_j, row_j), 8);
        __m64 extendedRowHi = _mm_srai_pi16(_mm_unpackhi_pi8(row_j, row_j), 8);
        __m64 extendedInputLo = _mm_unpacklo_pi8(input_j, Zeros);
        __m64 extendedInputHi = _mm_unpackhi_pi8(input_j, Zeros);
        __m64 productLo = _mm_madd_pi16(extendedRowLo, extendedInputLo);
        __m64 productHi = _mm_madd_pi16(extendedRowHi, extendedInputHi);
        sumLo = _mm_add_pi32(sumLo, productLo);
        sumHi = _mm_add_pi32(sumHi, productHi);
      }
      __m64 sum = _mm_add_pi32(sumLo, sumHi);
      sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
      output[i] = _mm_cvtsi64_si32(sum);

# elif defined(USE_NEON)
      int32x4_t sum = {biases[i]};
      const auto row = reinterpret_cast<const int8x8_t*>(&weights[offset]);
      for (IndexType j = 0; j < NumChunks; ++j) {
        int16x8_t product = vmull_s8(inputVector[j * 2], row[j * 2]);
        product = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]);
        sum = vpadalq_s16(sum, product);
      }
      output[i] = sum[0] + sum[1] + sum[2] + sum[3];

# else
      std::int32_t sum = biases[i];
      for (IndexType j = 0; j < InputDimensions; ++j) {
        sum += weights[offset + j] * input[j];
      }
      output[i] = sum;
# endif
    }

# if defined(USE_MMX)
    _mm_empty();
# endif
  }
#endif

  template <typename PreviousLayer, IndexType OutDims, typename Enabled = void>
  class AffineTransform;

  // A specialization for large inputs.
  template <typename PreviousLayer, IndexType OutDims>
  class AffineTransform<PreviousLayer, OutDims, std::enable_if_t<(PreviousLayer::OutputDimensions >= 2*64-1)>> {
   public:
    // Input/output type
    using InputType = typename PreviousLayer::OutputType;
    using OutputType = std::int32_t;
    static_assert(std::is_same<InputType, std::uint8_t>::value, "");

    // Number of input/output dimensions
    static constexpr IndexType InputDimensions = PreviousLayer::OutputDimensions;
    static constexpr IndexType OutputDimensions = OutDims;

    static constexpr IndexType PaddedInputDimensions =
        ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);

    static_assert(PaddedInputDimensions >= 128, "Something went wrong. This specialization should not have been chosen.");

#if defined (USE_AVX512)
    static constexpr const IndexType InputSimdWidth = 64;
    static constexpr const IndexType MaxNumOutputRegs = 16;
#elif defined (USE_AVX2)
    static constexpr const IndexType InputSimdWidth = 32;
    static constexpr const IndexType MaxNumOutputRegs = 8;
#elif defined (USE_SSSE3)
    static constexpr const IndexType InputSimdWidth = 16;
    static constexpr const IndexType MaxNumOutputRegs = 8;
#else
    // The fallback implementation will not have permuted weights.
    // We define these to avoid a lot of ifdefs later.
    static constexpr const IndexType InputSimdWidth = 1;
    static constexpr const IndexType MaxNumOutputRegs = 1;
#endif

    // A big block is a region in the weight matrix of size [PaddedInputDimensions, NumOutputRegs].
    // A small block is a region of size [InputSimdWidth, 1].

    static constexpr const IndexType NumOutputRegs = std::min(MaxNumOutputRegs, OutputDimensions);
    static constexpr const IndexType SmallBlockSize = InputSimdWidth;
    static constexpr const IndexType BigBlockSize = NumOutputRegs * PaddedInputDimensions;
    static constexpr const IndexType NumSmallBlocksInBigBlock = BigBlockSize / SmallBlockSize;
    static constexpr const IndexType NumSmallBlocksPerOutput = PaddedInputDimensions / SmallBlockSize;
    static constexpr const IndexType NumBigBlocks = OutputDimensions / NumOutputRegs;
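
    // Illustrative sizes only: with AVX2 (InputSimdWidth == 32, MaxNumOutputRegs == 8)
    // and a hypothetical 512x32 layer, NumOutputRegs == 8, SmallBlockSize == 32,
    // BigBlockSize == 8 * 512 == 4096, NumSmallBlocksInBigBlock == 128,
    // NumSmallBlocksPerOutput == 16, and NumBigBlocks == 4.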

    static_assert(OutputDimensions % NumOutputRegs == 0);

    // Size of forward propagation buffer used in this layer
    static constexpr std::size_t SelfBufferSize =
        ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize);

    // Size of the forward propagation buffer used from the input layer to this layer
    static constexpr std::size_t BufferSize =
        PreviousLayer::BufferSize + SelfBufferSize;

    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t get_hash_value() {
      std::uint32_t hashValue = 0xCC03DAE4u;
      hashValue += OutputDimensions;
      hashValue ^= PreviousLayer::get_hash_value() >> 1;
      hashValue ^= PreviousLayer::get_hash_value() << 31;
      return hashValue;
    }

    /*
      Transposes the small blocks within a big block.
      Effectively means that weights can be traversed sequentially during inference.
    */
    static IndexType get_weight_index(IndexType i)
    {
      const IndexType smallBlock = (i / SmallBlockSize) % NumSmallBlocksInBigBlock;
      const IndexType smallBlockCol = smallBlock / NumSmallBlocksPerOutput;
      const IndexType smallBlockRow = smallBlock % NumSmallBlocksPerOutput;
      const IndexType bigBlock = i / BigBlockSize;
      const IndexType rest = i % SmallBlockSize;

      const IndexType idx =
          bigBlock * BigBlockSize
          + smallBlockRow * SmallBlockSize * NumOutputRegs
          + smallBlockCol * SmallBlockSize
          + rest;

      return idx;
    }
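
    // Continuing the illustrative AVX2 512x32 example above: the row-major
    // weight at (row r, column c) has flat index i == r * 512 + c and lands at
    // bigBlock * 4096 + (c / 32) * 256 + (r % 8) * 32 + c % 32. In other words,
    // the blocks of all NumOutputRegs rows for one input chunk become adjacent,
    // which is exactly the order in which propagate() reads them
    // (weightvec[k] and weightvec[k + NumOutputRegs]).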

    // Read network parameters
    bool read_parameters(std::istream& stream) {
      if (!previousLayer.read_parameters(stream)) return false;
      for (std::size_t i = 0; i < OutputDimensions; ++i)
        biases[i] = read_little_endian<BiasType>(stream);

      for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
        weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);

      return !stream.fail();
    }

    // Write network parameters
    bool write_parameters(std::ostream& stream) const {
      if (!previousLayer.write_parameters(stream)) return false;
      for (std::size_t i = 0; i < OutputDimensions; ++i)
        write_little_endian<BiasType>(stream, biases[i]);

      for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
        write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);

      return !stream.fail();
    }

    // Forward propagation
    const OutputType* propagate(
        const TransformedFeatureType* transformedFeatures, char* buffer) const {
      const auto input = previousLayer.propagate(
          transformedFeatures, buffer + SelfBufferSize);
      OutputType* output = reinterpret_cast<OutputType*>(buffer);

#if defined (USE_AVX512)
      using vec_t = __m512i;
      #define vec_setzero _mm512_setzero_si512
      #define vec_set_32 _mm512_set1_epi32
      #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
      #define vec_add_dpbusd_32x2 Simd::m512_add_dpbusd_epi32x2
      #define vec_hadd Simd::m512_hadd
      #define vec_haddx4 Simd::m512_haddx4
#elif defined (USE_AVX2)
      using vec_t = __m256i;
      #define vec_setzero _mm256_setzero_si256
      #define vec_set_32 _mm256_set1_epi32
      #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
      #define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2
      #define vec_hadd Simd::m256_hadd
      #define vec_haddx4 Simd::m256_haddx4
#elif defined (USE_SSSE3)
      using vec_t = __m128i;
      #define vec_setzero _mm_setzero_si128
      #define vec_set_32 _mm_set1_epi32
      #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
      #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
      #define vec_hadd Simd::m128_hadd
      #define vec_haddx4 Simd::m128_haddx4
#endif

#if defined (USE_SSSE3)
      const vec_t* invec = reinterpret_cast<const vec_t*>(input);
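
      // A note on the helpers above (see simd.h): vec_add_dpbusd_32x2(acc, a0, b0, a1, b1)
      // accumulates into each 32-bit lane of acc the dot products of four
      // uint8/int8 pairs from (a0, b0) and (a1, b1), using VNNI's vpdpbusd where
      // available and a maddubs+madd emulation otherwise. Each acc[k] below thus
      // collects partial sums for output row bigBlock * NumOutputRegs + k.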

      // Perform accumulation to registers for each big block
      for (IndexType bigBlock = 0; bigBlock < NumBigBlocks; ++bigBlock)
      {
        vec_t acc[NumOutputRegs] = { vec_setzero() };

        // Each big block has NumOutputRegs small blocks in each "row", one per register.
        // We process two small blocks at a time to save on one addition without VNNI.
        for (IndexType smallBlock = 0; smallBlock < NumSmallBlocksPerOutput; smallBlock += 2)
        {
          const vec_t* weightvec =
              reinterpret_cast<const vec_t*>(
                  weights
                  + bigBlock * BigBlockSize
                  + smallBlock * SmallBlockSize * NumOutputRegs);

          const vec_t in0 = invec[smallBlock + 0];
          const vec_t in1 = invec[smallBlock + 1];

          for (IndexType k = 0; k < NumOutputRegs; ++k)
            vec_add_dpbusd_32x2(acc[k], in0, weightvec[k], in1, weightvec[k + NumOutputRegs]);
        }

        // Horizontally add all accumulators.
        if constexpr (NumOutputRegs % 4 == 0)
        {
          __m128i* outputvec = reinterpret_cast<__m128i*>(output);
          const __m128i* biasvec = reinterpret_cast<const __m128i*>(biases);

          for (IndexType k = 0; k < NumOutputRegs; k += 4)
          {
            const IndexType idx = (bigBlock * NumOutputRegs + k) / 4;
            outputvec[idx] = vec_haddx4(acc[k+0], acc[k+1], acc[k+2], acc[k+3], biasvec[idx]);
          }
        }
        else
        {
          for (IndexType k = 0; k < NumOutputRegs; ++k)
          {
            const IndexType idx = (bigBlock * NumOutputRegs + k);
            output[idx] = vec_hadd(acc[k], biases[idx]);
          }
        }
      }

# undef vec_setzero
# undef vec_set_32
# undef vec_add_dpbusd_32
# undef vec_add_dpbusd_32x2
# undef vec_hadd
# undef vec_haddx4
#else
      // Use the old implementation for the other architectures.
      affine_transform_non_ssse3<
          InputDimensions,
          PaddedInputDimensions,
          OutputDimensions>(output, weights, biases, input);
#endif

      return output;
    }

   private:
    using BiasType = OutputType;
    using WeightType = std::int8_t;

    PreviousLayer previousLayer;

    alignas(CacheLineSize) BiasType biases[OutputDimensions];
    alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
  };

  // A specialization for small inputs.
  template <typename PreviousLayer, IndexType OutDims>
  class AffineTransform<PreviousLayer, OutDims, std::enable_if_t<(PreviousLayer::OutputDimensions < 2*64-1)>> {
   public:
    // Input/output type
    using InputType = typename PreviousLayer::OutputType;
    using OutputType = std::int32_t;
    static_assert(std::is_same<InputType, std::uint8_t>::value, "");

    // Number of input/output dimensions
    static constexpr IndexType InputDimensions =
        PreviousLayer::OutputDimensions;
    static constexpr IndexType OutputDimensions = OutDims;
    static constexpr IndexType PaddedInputDimensions =
        ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);

    static_assert(PaddedInputDimensions < 128, "Something went wrong. This specialization should not have been chosen.");

#if defined (USE_SSSE3)
    static constexpr const IndexType OutputSimdWidth = SimdWidth / 4;
    static constexpr const IndexType InputSimdWidth = SimdWidth;
#endif

    // Size of forward propagation buffer used in this layer
    static constexpr std::size_t SelfBufferSize =
        ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize);

    // Size of the forward propagation buffer used from the input layer to this layer
    static constexpr std::size_t BufferSize =
        PreviousLayer::BufferSize + SelfBufferSize;

    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t get_hash_value() {
      std::uint32_t hashValue = 0xCC03DAE4u;
      hashValue += OutputDimensions;
      hashValue ^= PreviousLayer::get_hash_value() >> 1;
      hashValue ^= PreviousLayer::get_hash_value() << 31;
      return hashValue;
    }

    static IndexType get_weight_index_scrambled(IndexType i)
    {
      return
          (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 +
          i / PaddedInputDimensions * 4 +
          i % 4;
    }
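
    // Illustration with hypothetical sizes PaddedInputDimensions == 32 and
    // OutputDimensions == 32: the row-major weight at (row r, column c), i.e.
    // i == r * 32 + c, is stored at (c / 4) * 128 + r * 4 + c % 4. Weights are
    // grouped by 4-wide input chunks, with each output row's 4 weights for a
    // chunk stored contiguously, matching the broadcast scheme in propagate().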

    static IndexType get_weight_index(IndexType i)
    {
#if defined (USE_SSSE3)
      return get_weight_index_scrambled(i);
#else
      return i;
#endif
    }

    // Read network parameters
    bool read_parameters(std::istream& stream) {
      if (!previousLayer.read_parameters(stream)) return false;
      for (std::size_t i = 0; i < OutputDimensions; ++i)
        biases[i] = read_little_endian<BiasType>(stream);
      for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
        weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);

      return !stream.fail();
    }

    // Write network parameters
    bool write_parameters(std::ostream& stream) const {
      if (!previousLayer.write_parameters(stream)) return false;
      for (std::size_t i = 0; i < OutputDimensions; ++i)
        write_little_endian<BiasType>(stream, biases[i]);

      for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
        write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);

      return !stream.fail();
    }

    // Forward propagation
    const OutputType* propagate(
        const TransformedFeatureType* transformedFeatures, char* buffer) const {
      const auto input = previousLayer.propagate(
          transformedFeatures, buffer + SelfBufferSize);
      const auto output = reinterpret_cast<OutputType*>(buffer);

#if defined (USE_AVX2)
      using vec_t = __m256i;
      #define vec_setzero _mm256_setzero_si256
      #define vec_set_32 _mm256_set1_epi32
      #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
      #define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2
      #define vec_add_dpbusd_32x4 Simd::m256_add_dpbusd_epi32x4
      #define vec_hadd Simd::m256_hadd
      #define vec_haddx4 Simd::m256_haddx4
#elif defined (USE_SSSE3)
      using vec_t = __m128i;
      #define vec_setzero _mm_setzero_si128
      #define vec_set_32 _mm_set1_epi32
      #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
      #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
      #define vec_add_dpbusd_32x4 Simd::m128_add_dpbusd_epi32x4
      #define vec_hadd Simd::m128_hadd
      #define vec_haddx4 Simd::m128_haddx4
#endif

#if defined (USE_SSSE3)
      const auto inputVector = reinterpret_cast<const vec_t*>(input);

      static_assert(InputDimensions % 8 == 0);
      static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1);
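
      // Sketch of the main path below: each 32-bit word of the input packs 4
      // consecutive uint8 activations; vec_set_32 broadcasts that word to every
      // lane, and get_weight_index_scrambled() has arranged col0[k]/col1[k] to
      // hold the matching 4 int8 weights of OutputSimdWidth consecutive output
      // rows, so one dpbusd-style step advances that many dot products by 4 inputs.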

      if constexpr (OutputDimensions % OutputSimdWidth == 0)
      {
        constexpr IndexType NumChunks = InputDimensions / 4;
        constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;

        const auto input32 = reinterpret_cast<const std::int32_t*>(input);
        const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
        vec_t acc[NumRegs];
        for (IndexType k = 0; k < NumRegs; ++k)
          acc[k] = biasvec[k];

        for (IndexType i = 0; i < NumChunks; i += 2)
        {
          const vec_t in0 = vec_set_32(input32[i + 0]);
          const vec_t in1 = vec_set_32(input32[i + 1]);
          const auto col0 = reinterpret_cast<const vec_t*>(&weights[(i + 0) * OutputDimensions * 4]);
          const auto col1 = reinterpret_cast<const vec_t*>(&weights[(i + 1) * OutputDimensions * 4]);
          for (IndexType k = 0; k < NumRegs; ++k)
            vec_add_dpbusd_32x2(acc[k], in0, col0[k], in1, col1[k]);
        }

        vec_t* outptr = reinterpret_cast<vec_t*>(output);
        for (IndexType k = 0; k < NumRegs; ++k)
          outptr[k] = acc[k];
      }
      else if constexpr (OutputDimensions == 1)
      {
        constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
        vec_t sum0 = vec_setzero();
        const auto row0 = reinterpret_cast<const vec_t*>(&weights[0]);

        for (int j = 0; j < (int)NumChunks; ++j)
        {
          const vec_t in = inputVector[j];
          vec_add_dpbusd_32(sum0, in, row0[j]);
        }
        output[0] = vec_hadd(sum0, biases[0]);
      }

# undef vec_setzero
# undef vec_set_32
# undef vec_add_dpbusd_32
# undef vec_add_dpbusd_32x2
# undef vec_add_dpbusd_32x4
# undef vec_hadd
# undef vec_haddx4
#else
      // Use the old implementation for the other architectures.
      affine_transform_non_ssse3<
          InputDimensions,
          PaddedInputDimensions,
          OutputDimensions>(output, weights, biases, input);
#endif

      return output;
    }

   private:
    using BiasType = OutputType;
    using WeightType = std::int8_t;

    PreviousLayer previousLayer;

    alignas(CacheLineSize) BiasType biases[OutputDimensions];
    alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
  };

}  // namespace Stockfish::Eval::NNUE::Layers

#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED