Linmiao Xu (linrock)
Fabian Beuke (madnight)
Fabian Fichter (ianfab)
+Fanael Linithien (Fanael)
fanon
Fauzi Akram Dabat (FauziAkram)
Felix Wittmann
bits = 64
prefetch = no
popcnt = no
+mmx = no
sse = no
ssse3 = no
sse41 = no
arch = i386
bits = 32
prefetch = yes
+ mmx = yes
sse = yes
endif
ifneq ($(KERNEL),Darwin)
LDFLAGS += -Wl,--no-as-needed
endif
-
+
gccversion = $(shell $(CXX) --version)
gccisclang = $(findstring clang,$(gccversion))
endif
endif
endif
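+# mmx: enable the 64-bit MMX code path (on by default for x86-32 builds)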
+ifeq ($(mmx),yes)
+ CXXFLAGS += -DUSE_MMX
+ ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+ CXXFLAGS += -mmmx
+ endif
+endif
+
ifeq ($(neon),yes)
CXXFLAGS += -DUSE_NEON
endif
@echo "x86-64-ssse3 > x86 64-bit with ssse3 support"
@echo "x86-64-sse3-popcnt > x86 64-bit with sse3 and popcnt support"
@echo "x86-64 > x86 64-bit generic"
- @echo "x86-32 > x86 32-bit (also enables SSE)"
+ @echo "x86-32 > x86 32-bit (also enables MMX and SSE)"
@echo "x86-32-old > x86 32-bit fall back for old hardware"
@echo "ppc-64 > PPC 64-bit"
@echo "ppc-32 > PPC 32-bit"
#endif
compiler += (HasPext ? " BMI2" : "");
compiler += (HasPopCnt ? " POPCNT" : "");
+ #if defined(USE_MMX)
+ compiler += " MMX";
+ #endif
#if !defined(NDEBUG)
compiler += " DEBUG";
#endif
const __m256i kOnes = _mm256_set1_epi16(1);
const auto input_vector = reinterpret_cast<const __m256i*>(input);
- #elif defined(USE_SSSE3)
+ #elif defined(USE_SSE2)
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+ #ifndef USE_SSSE3
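+ // Plain SSE2 needs kZeros below for manual sign/zero extension of int8 values.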
+ const __m128i kZeros = _mm_setzero_si128();
+ #else
const __m128i kOnes = _mm_set1_epi16(1);
+ #endif
const auto input_vector = reinterpret_cast<const __m128i*>(input);
+ #elif defined(USE_MMX)
+ constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+ const __m64 kZeros = _mm_setzero_si64();
+ const auto input_vector = reinterpret_cast<const __m64*>(input);
+
#elif defined(USE_NEON)
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
+ #elif defined(USE_SSE2)
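+ // SSE2 lacks SSSE3's int8 multiply-add (maddubs), so widen to int16
+ // instead: unpack the int8 weights against their sign mask, zero-extend
+ // the unsigned inputs, and let _mm_madd_epi16 form pairwise int32 sums.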
+ __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+ __m128i sum_hi = kZeros;
+ const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+ __m128i row_j = _mm_load_si128(&row[j]);
+ __m128i input_j = _mm_load_si128(&input_vector[j]);
+ __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+ __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+ __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+ __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+ __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+ __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+ __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+ sum_lo = _mm_add_epi32(sum_lo, product_lo);
+ sum_hi = _mm_add_epi32(sum_hi, product_hi);
+ }
+ __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
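+ // Horizontal reduction: fold the high 64 bits of the vector onto the
+ // low 64, then the second 32-bit lane onto the first.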
+ __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+ sum = _mm_add_epi32(sum, sum_high_64);
+ __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+ sum = _mm_add_epi32(sum, sum_second_32);
+ output[i] = _mm_cvtsi128_si32(sum);
+
+ #elif defined(USE_MMX)
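+ // Same widening scheme as the SSE2 path above, on 64-bit MMX registers.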
+ __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
+ __m64 sum_hi = kZeros;
+ const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+ __m64 row_j = row[j];
+ __m64 input_j = input_vector[j];
+ __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
+ __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
+ __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
+ __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
+ __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
+ __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
+ __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
+ sum_lo = _mm_add_pi32(sum_lo, product_lo);
+ sum_hi = _mm_add_pi32(sum_hi, product_hi);
+ }
+ __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
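+ // unpackhi duplicates the high 32-bit lane; the add folds it onto the low one.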
+ sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
+ output[i] = _mm_cvtsi64_si32(sum);
+
#elif defined(USE_NEON)
int32x4_t sum = {biases_[i]};
const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
#endif
}
+ #if defined(USE_MMX)
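+ // MMX aliases the x87 register stack, so EMMS must be issued before
+ // any floating-point code runs.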
+ _mm_empty();
+ #endif
return output;
}
}
constexpr IndexType kStart = kNumChunks * kSimdWidth;
- #elif defined(USE_SSSE3)
+ #elif defined(USE_SSE2)
constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
#ifdef USE_SSE41
}
constexpr IndexType kStart = kNumChunks * kSimdWidth;
+ #elif defined(USE_MMX)
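+ // Narrow the int32 sums to int8: saturating 32->16 pack, arithmetic
+ // shift to drop the weight scaling, saturating 16->8 pack, then the
+ // 0x80 add/sub trick clamps negatives to zero (the packs already cap
+ // positive values at 127).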
+ constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+ const __m64 k0x80s = _mm_set1_pi8(-128);
+ const auto in = reinterpret_cast<const __m64*>(input);
+ const auto out = reinterpret_cast<__m64*>(output);
+ for (IndexType i = 0; i < kNumChunks; ++i) {
+ const __m64 words0 = _mm_srai_pi16(
+ _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
+ kWeightScaleBits);
+ const __m64 words1 = _mm_srai_pi16(
+ _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
+ kWeightScaleBits);
+ const __m64 packedbytes = _mm_packs_pi16(words0, words1);
+ out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+ }
+ _mm_empty();
+ constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
#elif defined(USE_NEON)
constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
const int8x8_t kZero = {0};
#elif defined(USE_SSE2)
#include <emmintrin.h>
+#elif defined(USE_MMX)
+#include <mmintrin.h>
+
#elif defined(USE_NEON)
#include <arm_neon.h>
#endif
#elif defined(USE_SSE2)
constexpr std::size_t kSimdWidth = 16;
+ #elif defined(USE_MMX)
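+ // MMX registers are 64 bits (8 bytes) wide.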
+ constexpr std::size_t kSimdWidth = 8;
+
#elif defined(USE_NEON)
constexpr std::size_t kSimdWidth = 16;
#endif
constexpr int kControl = 0b11011000;
const __m256i kZero = _mm256_setzero_si256();
- #elif defined(USE_SSSE3)
+ #elif defined(USE_SSE2)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
#ifdef USE_SSE41
const __m128i k0x80s = _mm_set1_epi8(-128);
#endif
+ #elif defined(USE_MMX)
+ constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+ const __m64 k0x80s = _mm_set1_pi8(-128);
+
#elif defined(USE_NEON)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
const int8x8_t kZero = {0};
_mm256_packs_epi16(sum0, sum1), kZero), kControl));
}
- #elif defined(USE_SSSE3)
+ #elif defined(USE_SSE2)
auto out = reinterpret_cast<__m128i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
);
}
+ #elif defined(USE_MMX)
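+ // Like the SSE2 branch above: pack int16 accumulator values to int8
+ // and clip to [0, 127] with the saturating 0x80 add/sub.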
+ auto out = reinterpret_cast<__m64*>(&output[offset]);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+ __m64 sum0 = reinterpret_cast<const __m64*>(
+ accumulation[perspectives[p]][0])[j * 2 + 0];
+ __m64 sum1 = reinterpret_cast<const __m64*>(
+ accumulation[perspectives[p]][0])[j * 2 + 1];
+ const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+ out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+ }
+
#elif defined(USE_NEON)
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
#endif
}
+ #if defined(USE_MMX)
+ _mm_empty();
+ #endif
}
private:
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+ #elif defined(USE_MMX)
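+ // A __m64 holds only four int16 lanes, hence twice as many chunks as
+ // the 128-bit paths.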
+ auto accumulation = reinterpret_cast<__m64*>(
+ &accumulator.accumulation[perspective][i][0]);
+ auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+ constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+ accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+ }
+
#elif defined(USE_NEON)
auto accumulation = reinterpret_cast<int16x8_t*>(
&accumulator.accumulation[perspective][i][0]);
}
}
+ #if defined(USE_MMX)
+ _mm_empty();
+ #endif
accumulator.computed_accumulation = true;
accumulator.computed_score = false;
auto accumulation = reinterpret_cast<__m128i*>(
&accumulator.accumulation[perspective][i][0]);
+ #elif defined(USE_MMX)
+ constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+ auto accumulation = reinterpret_cast<__m64*>(
+ &accumulator.accumulation[perspective][i][0]);
+
#elif defined(USE_NEON)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
auto accumulation = reinterpret_cast<int16x8_t*>(
accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
}
+ #elif defined(USE_MMX)
+ auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+ accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
+ }
+
#elif defined(USE_NEON)
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
}
+ #elif defined(USE_MMX)
+ auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+ for (IndexType j = 0; j < kNumChunks; ++j) {
+ accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+ }
+
#elif defined(USE_NEON)
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
}
}
}
+ #if defined(USE_MMX)
+ _mm_empty();
+ #endif
accumulator.computed_accumulation = true;
accumulator.computed_score = false;