+
+#if defined (USE_SSSE3)
+ // Determine whether quadruplets of weight-and-input products can be summed using 16 bits
+ // without saturation. We assume worst-case combinations of 0 and 127 for all inputs.
+ if (!stream.fail())
+ {
+ // Returns true when the four weights w[idx[0]] .. w[idx[3]] could push a 16-bit
+ // accumulator past its limits. Strictly positive and non-positive weights are
+ // totalled separately, so each total is a worst case in its own direction.
+ // Assuming every input lies in [0, 127], a magnitude of 258 is the largest that
+ // is still safe: 258 * 127 = 32766 fits in a signed 16-bit value, 259 * 127 does not.
+ auto can_saturate = [](const WeightType* w, int idx[4]) {
+     int positiveTotal = 0;
+     int negativeTotal = 0;
+     for (int slot = 0; slot < 4; ++slot)
+     {
+         const auto weight = w[idx[slot]];
+         // Zero weights land in the negative total, where they contribute nothing.
+         (weight > 0 ? positiveTotal : negativeTotal) += weight;
+     }
+
+     const bool fitsIn16Bits = positiveTotal <= 258 && negativeTotal >= -258;
+     return !fitsIn16Bits;
+ };
+
+ for (IndexType i = 0; i < kOutputDimensions; ++i)
+ {
+ canSaturate16[i] = false;
+ const WeightType* w = &weights_[i * kPaddedInputDimensions];
+#if defined (USE_AVX512)
+ for (IndexType j = 0; j < (kPaddedInputDimensions & ~127) && !canSaturate16[i]; j += 128)
+ for (int k = 0; k < 64 && !canSaturate16[i]; k += 2)
+ {
+ int spacing[4] = { 0, 1, 64, 65 };
+ canSaturate16[i] = can_saturate(&w[j + k], spacing);
+ }
+#elif defined (USE_AVX2)
+ for (IndexType j = 0; j < (kPaddedInputDimensions & ~63) && !canSaturate16[i]; j += 64)
+ for (int k = 0; k < 32 && !canSaturate16[i]; k += 2)
+ {
+ int spacing[4] = { 0, 1, 32, 33 };
+ canSaturate16[i] = can_saturate(&w[j + k], spacing);
+ }
+#elif defined (USE_SSSE3)
+ for (IndexType j = 0; j < (kPaddedInputDimensions & ~31) && !canSaturate16[i]; j += 32)
+ for (int k = 0; k < 16 && !canSaturate16[i]; k += 2)
+ {
+ int spacing[4] = { 0, 1, 16, 17 };
+ canSaturate16[i] = can_saturate(&w[j + k], spacing);
+ }
+#endif
+ }
+ }
+#endif
+