+#endif
+
+ return output;
+ }
+
+ private:
+ using BiasType = OutputType;
+ using WeightType = std::int8_t;
+
+ PreviousLayer previousLayer;
+
+ alignas(CacheLineSize) BiasType biases[OutputDimensions];
+ alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
+ };
+
+ // Specialization of AffineTransform for "small" previous layers: chosen via
+ // SFINAE when the input dimension is below the threshold in the enable_if,
+ // which permits the denser, scrambled SIMD weight layout used below.
+ template <typename PreviousLayer, IndexType OutDims>
+ class AffineTransform<PreviousLayer, OutDims, std::enable_if_t<(PreviousLayer::OutputDimensions < 2*64-1)>> {
+ public:
+ // Input/output type
+ using InputType = typename PreviousLayer::OutputType;
+ using OutputType = std::int32_t;
+ static_assert(std::is_same<InputType, std::uint8_t>::value, "");
+
+ // Number of input/output dimensions
+ static constexpr IndexType InputDimensions =
+ PreviousLayer::OutputDimensions;
+ static constexpr IndexType OutputDimensions = OutDims;
+ // Inputs are padded up to a whole number of SIMD registers.
+ static constexpr IndexType PaddedInputDimensions =
+ ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
+
+ static_assert(PaddedInputDimensions < 128, "Something went wrong. This specialization should not have been chosen.");
+
+#if defined (USE_SSSE3)
+ // A SIMD register holds SimdWidth input bytes, or SimdWidth/4 int32 accumulators.
+ static constexpr const IndexType OutputSimdWidth = SimdWidth / 4;
+ static constexpr const IndexType InputSimdWidth = SimdWidth;
+#endif
+
+ // Size of forward propagation buffer used in this layer
+ static constexpr std::size_t SelfBufferSize =
+ ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize);
+
+ // Size of the forward propagation buffer used from the input layer to this layer
+ static constexpr std::size_t BufferSize =
+ PreviousLayer::BufferSize + SelfBufferSize;
+
+ // Hash value embedded in the evaluation file.
+ // Mixes this layer's tag and output count with the previous layer's hash,
+ // XORed in twice (shifted right by 1 and left by 31, i.e. rotated).
+ static constexpr std::uint32_t get_hash_value() {
+ const std::uint32_t prevHash = PreviousLayer::get_hash_value();
+ return (0xCC03DAE4u + OutputDimensions) ^ (prevHash >> 1) ^ (prevHash << 31);
+ }
+
+ // Maps a flat row-major weight index to the scrambled storage layout used
+ // by the SSSE3/AVX2 kernels: weights are grouped in 4-byte input chunks,
+ // with all output rows for one chunk stored contiguously.
+ static IndexType get_weight_index_scrambled(IndexType i)
+ {
+ const IndexType lane = i % 4; // byte position within a 4-input chunk
+ const IndexType chunk = (i / 4) % (PaddedInputDimensions / 4); // which input chunk
+ const IndexType row = i / PaddedInputDimensions; // which output row
+ return (chunk * OutputDimensions + row) * 4 + lane;
+ }
+
+ // Returns the storage index for flat weight index i: the scrambled layout
+ // when the SSSE3 SIMD kernels are compiled in, the identity mapping otherwise.
+ static IndexType get_weight_index(IndexType i)
+ {
+#if defined (USE_SSSE3)
+ return get_weight_index_scrambled(i);
+#else
+ return i;
+#endif
+ }
+
+ // Read network parameters.
+ // The previous layer's parameters precede this layer's in the stream;
+ // biases come first, then the weights (stored at their scrambled index).
+ bool read_parameters(std::istream& stream) {
+ if (!previousLayer.read_parameters(stream))
+ return false;
+
+ for (IndexType b = 0; b < OutputDimensions; ++b)
+ biases[b] = read_little_endian<BiasType>(stream);
+
+ constexpr std::size_t NumWeights =
+ std::size_t(OutputDimensions) * PaddedInputDimensions;
+ for (std::size_t w = 0; w < NumWeights; ++w)
+ weights[get_weight_index(w)] = read_little_endian<WeightType>(stream);
+
+ return !stream.fail();
+ }
+
+ // Write network parameters.
+ // Emits the previous layer first, then biases, then weights, in exactly
+ // the order (and scrambled indexing) that read_parameters() expects.
+ bool write_parameters(std::ostream& stream) const {
+ if (!previousLayer.write_parameters(stream))
+ return false;
+
+ for (IndexType b = 0; b < OutputDimensions; ++b)
+ write_little_endian<BiasType>(stream, biases[b]);
+
+ constexpr std::size_t NumWeights =
+ std::size_t(OutputDimensions) * PaddedInputDimensions;
+ for (std::size_t w = 0; w < NumWeights; ++w)
+ write_little_endian<WeightType>(stream, weights[get_weight_index(w)]);
+
+ return !stream.fail();
+ }
+ // Forward propagation
+ const OutputType* propagate(
+ const TransformedFeatureType* transformedFeatures, char* buffer) const {
+ const auto input = previousLayer.propagate(
+ transformedFeatures, buffer + SelfBufferSize);
+ const auto output = reinterpret_cast<OutputType*>(buffer);
+
+#if defined (USE_AVX2)
+ using vec_t = __m256i;
+ #define vec_setzero _mm256_setzero_si256
+ #define vec_set_32 _mm256_set1_epi32
+ #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
+ #define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2
+ #define vec_add_dpbusd_32x4 Simd::m256_add_dpbusd_epi32x4
+ #define vec_hadd Simd::m256_hadd
+ #define vec_haddx4 Simd::m256_haddx4
+#elif defined (USE_SSSE3)
+ using vec_t = __m128i;
+ #define vec_setzero _mm_setzero_si128
+ #define vec_set_32 _mm_set1_epi32
+ #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
+ #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
+ #define vec_add_dpbusd_32x4 Simd::m128_add_dpbusd_epi32x4
+ #define vec_hadd Simd::m128_hadd
+ #define vec_haddx4 Simd::m128_haddx4
+#endif
+
+#if defined (USE_SSSE3)
+ const auto inputVector = reinterpret_cast<const vec_t*>(input);
+
+ static_assert(InputDimensions % 8 == 0);
+ static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1);
+
+ if constexpr (OutputDimensions % OutputSimdWidth == 0)
+ {
+ constexpr IndexType NumChunks = InputDimensions / 4;
+ constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
+
+ const auto input32 = reinterpret_cast<const std::int32_t*>(input);
+ const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
+ vec_t acc[NumRegs];
+ for (IndexType k = 0; k < NumRegs; ++k)
+ acc[k] = biasvec[k];
+
+ for (IndexType i = 0; i < NumChunks; i += 2)
+ {
+ const vec_t in0 = vec_set_32(input32[i + 0]);
+ const vec_t in1 = vec_set_32(input32[i + 1]);
+ const auto col0 = reinterpret_cast<const vec_t*>(&weights[(i + 0) * OutputDimensions * 4]);
+ const auto col1 = reinterpret_cast<const vec_t*>(&weights[(i + 1) * OutputDimensions * 4]);
+ for (IndexType k = 0; k < NumRegs; ++k)
+ vec_add_dpbusd_32x2(acc[k], in0, col0[k], in1, col1[k]);