/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
- Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
+ Copyright (C) 2004-2023 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include <algorithm>
#include <type_traits>
#include "../nnue_common.h"
-#include "../../simd.h"
+#include "simd.h"
/*
This file contains the definition for a fully connected layer (aka affine transform).
const __m64 Zeros = _mm_setzero_si64();
const auto inputVector = reinterpret_cast<const __m64*>(input);
+# elif defined(USE_NEON_DOTPROD)
+ constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
+ const auto inputVector = reinterpret_cast<const int8x16_t*>(input);
+
# elif defined(USE_NEON)
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
output[i] = _mm_cvtsi64_si32(sum);
+# elif defined(USE_NEON_DOTPROD)
+ int32x4_t sum = {biases[i]};
+ const auto row = reinterpret_cast<const int8x16_t*>(&weights[offset]);
+ for (IndexType j = 0; j < NumChunks; ++j) {
+ sum = vdotq_s32(sum, inputVector[j], row[j]);
+ }
+ output[i] = vaddvq_s32(sum);
+
# elif defined(USE_NEON)
int32x4_t sum = {biases[i]};
const auto row = reinterpret_cast<const int8x8_t*>(&weights[offset]);
template <IndexType InDims, IndexType OutDims, typename Enabled = void>
class AffineTransform;
+#if defined (USE_AVX512)  // LargeInputSize: threshold on the padded input size tested by the enable_if conditions of the two AffineTransform specializations
+ constexpr IndexType LargeInputSize = 2 * 64;
+#else  // here only a padded input size equal to the IndexType maximum can satisfy `>= LargeInputSize`
+ constexpr IndexType LargeInputSize = std::numeric_limits<IndexType>::max();  // NOTE(review): std::numeric_limits lives in <limits>; no such include is visible in this excerpt - confirm it is included
+#endif
+
// A specialization for large inputs.
template <IndexType InDims, IndexType OutDims>
- class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) >= 2*64)>> {
+ class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) >= LargeInputSize)>> {
public:
// Input/output type
using InputType = std::uint8_t;
using OutputBuffer = OutputType[PaddedOutputDimensions];
- static_assert(PaddedInputDimensions >= 128, "Something went wrong. This specialization should not have been chosen.");
+ static_assert(PaddedInputDimensions >= LargeInputSize, "Something went wrong. This specialization should not have been chosen.");
#if defined (USE_AVX512)
- static constexpr const IndexType InputSimdWidth = 64;
- static constexpr const IndexType MaxNumOutputRegs = 16;
+ static constexpr IndexType InputSimdWidth = 64;
+ static constexpr IndexType MaxNumOutputRegs = 16;
#elif defined (USE_AVX2)
- static constexpr const IndexType InputSimdWidth = 32;
- static constexpr const IndexType MaxNumOutputRegs = 8;
+ static constexpr IndexType InputSimdWidth = 32;
+ static constexpr IndexType MaxNumOutputRegs = 8;
#elif defined (USE_SSSE3)
- static constexpr const IndexType InputSimdWidth = 16;
- static constexpr const IndexType MaxNumOutputRegs = 8;
+ static constexpr IndexType InputSimdWidth = 16;
+ static constexpr IndexType MaxNumOutputRegs = 8;
+#elif defined (USE_NEON_DOTPROD)  // tested ahead of USE_NEON, as in the other #elif chains of this file; presumably dot-product builds also define USE_NEON - confirm against the build flags
+ static constexpr IndexType InputSimdWidth = 16;
+ static constexpr IndexType MaxNumOutputRegs = 8;
#elif defined (USE_NEON)
- static constexpr const IndexType InputSimdWidth = 8;
- static constexpr const IndexType MaxNumOutputRegs = 8;
+ static constexpr IndexType InputSimdWidth = 8;
+ static constexpr IndexType MaxNumOutputRegs = 8;
#else
// The fallback implementation will not have permuted weights.
// We define these to avoid a lot of ifdefs later.
- static constexpr const IndexType InputSimdWidth = 1;
- static constexpr const IndexType MaxNumOutputRegs = 1;
+ static constexpr IndexType InputSimdWidth = 1;
+ static constexpr IndexType MaxNumOutputRegs = 1;
#endif
// A big block is a region in the weight matrix of the size [PaddedInputDimensions, NumOutputRegs].
// A small block is a region of size [InputSimdWidth, 1]
- static constexpr const IndexType NumOutputRegs = std::min(MaxNumOutputRegs, OutputDimensions);
- static constexpr const IndexType SmallBlockSize = InputSimdWidth;
- static constexpr const IndexType BigBlockSize = NumOutputRegs * PaddedInputDimensions;
- static constexpr const IndexType NumSmallBlocksInBigBlock = BigBlockSize / SmallBlockSize;
- static constexpr const IndexType NumSmallBlocksPerOutput = PaddedInputDimensions / SmallBlockSize;
- static constexpr const IndexType NumBigBlocks = OutputDimensions / NumOutputRegs;
+ static constexpr IndexType NumOutputRegs = std::min(MaxNumOutputRegs, OutputDimensions);
+ static constexpr IndexType SmallBlockSize = InputSimdWidth;
+ static constexpr IndexType BigBlockSize = NumOutputRegs * PaddedInputDimensions;
+ static constexpr IndexType NumSmallBlocksInBigBlock = BigBlockSize / SmallBlockSize;
+ static constexpr IndexType NumSmallBlocksPerOutput = PaddedInputDimensions / SmallBlockSize;
+ static constexpr IndexType NumBigBlocks = OutputDimensions / NumOutputRegs;
static_assert(OutputDimensions % NumOutputRegs == 0);
// Read network parameters
bool read_parameters(std::istream& stream) {
- for (std::size_t i = 0; i < OutputDimensions; ++i)
+ for (IndexType i = 0; i < OutputDimensions; ++i)
biases[i] = read_little_endian<BiasType>(stream);
- for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+ for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);
return !stream.fail();
// Write network parameters
bool write_parameters(std::ostream& stream) const {
- for (std::size_t i = 0; i < OutputDimensions; ++i)
+ for (IndexType i = 0; i < OutputDimensions; ++i)
write_little_endian<BiasType>(stream, biases[i]);
- for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+ for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);
return !stream.fail();
#define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
#define vec_hadd Simd::m128_hadd
#define vec_haddx4 Simd::m128_haddx4
+#elif defined (USE_NEON_DOTPROD)
+ using acc_vec_t = int32x4_t;
+ using bias_vec_t = int32x4_t;
+ using weight_vec_t = int8x16_t;
+ using in_vec_t = int8x16_t;
+ #define vec_zero {0}
+ #define vec_add_dpbusd_32x2 Simd::dotprod_m128_add_dpbusd_epi32x2
+ #define vec_hadd Simd::neon_m128_hadd
+ #define vec_haddx4 Simd::neon_m128_haddx4
#elif defined (USE_NEON)
using acc_vec_t = int32x4_t;
using bias_vec_t = int32x4_t;
};
template <IndexType InDims, IndexType OutDims>
- class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) < 2*64)>> {
+ class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) < LargeInputSize)>> {
public:
// Input/output type
// Input/output type
using OutputBuffer = OutputType[PaddedOutputDimensions];
- static_assert(PaddedInputDimensions < 128, "Something went wrong. This specialization should not have been chosen.");
+ static_assert(PaddedInputDimensions < LargeInputSize, "Something went wrong. This specialization should not have been chosen.");
#if defined (USE_SSSE3)
- static constexpr const IndexType OutputSimdWidth = SimdWidth / 4;
- static constexpr const IndexType InputSimdWidth = SimdWidth;
+ static constexpr IndexType OutputSimdWidth = SimdWidth / 4;
+ static constexpr IndexType InputSimdWidth = SimdWidth;
#endif
// Hash value embedded in the evaluation file
// Read network parameters
bool read_parameters(std::istream& stream) {
- for (std::size_t i = 0; i < OutputDimensions; ++i)
+ for (IndexType i = 0; i < OutputDimensions; ++i)
biases[i] = read_little_endian<BiasType>(stream);
- for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+ for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);
return !stream.fail();
// Write network parameters
bool write_parameters(std::ostream& stream) const {
- for (std::size_t i = 0; i < OutputDimensions; ++i)
+ for (IndexType i = 0; i < OutputDimensions; ++i)
write_little_endian<BiasType>(stream, biases[i]);
- for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+ for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);
return !stream.fail();