always selecting AffineTransform specialization for small inputs.
A related patch was initially tested as a simplification:
STC https://tests.stockfishchess.org/tests/view/6317c3f437f41b13973d6dff
LLR: 2.95 (-2.94,2.94) <-1.75,0.25>
Total: 58072 W: 15619 L: 15425 D: 27028
Ptnml(0-2): 241, 6191, 15992, 6357, 255
Elo gain speedup test
STC https://tests.stockfishchess.org/tests/view/63181c1b37f41b13973d79dc
LLR: 2.94 (-2.94,2.94) <0.00,2.00>
Total: 184496 W: 49922 L: 49401 D: 85173
Ptnml(0-2): 851, 19397, 51208, 19964, 828
and this patch gained in testing
speedup = +0.0071
P(speedup > 0) = 1.0000
on CPU: 16 x AMD Ryzen 9 3950X
closes https://github.com/official-stockfish/Stockfish/pull/4158
No functional change
#include <algorithm>
#include <limits>
#include <type_traits>

#include "../nnue_common.h"
/*
  This file contains the definition for a fully connected layer (aka affine transform).
*/
// Forward declaration of the fully connected layer. The third (SFINAE)
// parameter selects between the large-input and small-input
// specializations below; it must default to void exactly once —
// repeating a default template argument in the same scope is ill-formed,
// so this declaration appears only here.
template <IndexType InDims, IndexType OutDims, typename Enabled = void>
class AffineTransform;
+#if defined (USE_AVX512)
+ constexpr IndexType LargeInputSize = 2 * 64;
+#else
+ constexpr IndexType LargeInputSize = std::numeric_limits<IndexType>::max();
+#endif
+
// A specialization for large inputs.
// (Duplicated comment/template-header lines from a garbled merge removed.)
template <IndexType InDims, IndexType OutDims>
- class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) >= 2*64)>> {
+ class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) >= LargeInputSize)>> {
public:
// Input/output type
using InputType = std::uint8_t;
public:
// Input/output type
using InputType = std::uint8_t;
using OutputBuffer = OutputType[PaddedOutputDimensions];
using OutputBuffer = OutputType[PaddedOutputDimensions];
- static_assert(PaddedInputDimensions >= 128, "Something went wrong. This specialization should not have been chosen.");
+ static_assert(PaddedInputDimensions >= LargeInputSize, "Something went wrong. This specialization should not have been chosen.");
// AVX512 processes 64 input bytes per SIMD register.
// (Duplicated lines removed; the matching #endif lies outside this chunk.)
#if defined (USE_AVX512)
  static constexpr const IndexType InputSimdWidth = 64;
};

// (Duplicated close-brace/template-header pair from a garbled merge removed.)
template <IndexType InDims, IndexType OutDims>
- class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) < 2*64)>> {
+ class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) < LargeInputSize)>> {
public:
// Input/output type
// Input/output type
public:
// Input/output type
// Input/output type
using OutputBuffer = OutputType[PaddedOutputDimensions];
using OutputBuffer = OutputType[PaddedOutputDimensions];
- static_assert(PaddedInputDimensions < 128, "Something went wrong. This specialization should not have been chosen.");
+ static_assert(PaddedInputDimensions < LargeInputSize, "Something went wrong. This specialization should not have been chosen.");
// SSSE3 path: four 32-bit accumulators per SIMD register.
// (Duplicated lines removed; the matching #endif lies outside this chunk.)
#if defined (USE_SSSE3)
  static constexpr const IndexType OutputSimdWidth = SimdWidth / 4;