+ using vec128_t = __m128i; // 128-bit integer vector, used below as eight 16-bit lanes
+ #define vec128_zero _mm_setzero_si128() // vector with all bits cleared
+ #define vec128_set_16(a) _mm_set1_epi16(a) // broadcast the 16-bit value a to every lane
+ #define vec128_load(a) _mm_load_si128(a) // aligned load: a must point to 16-byte-aligned memory
+ #define vec128_storeu(a, b) _mm_storeu_si128(a, b) // unaligned store of b to address a
+ #define vec128_add(a, b) _mm_add_epi16(a, b) // lane-wise 16-bit addition (wraps on overflow)
+#elif defined (USE_NEON)
+ using vec_t = uint32x4_t; // unsigned lanes: vtstq_u32 in vec_nnz takes uint32x4_t operands, so no implicit vector conversion is needed
+ static const std::uint32_t Mask[4] = {1, 2, 4, 8}; // one distinct bit per 32-bit lane, used by vec_nnz
+ #define vec_nnz(a) vaddvq_u32(vandq_u32(vtstq_u32(a, a), vld1q_u32(Mask))) // 4-bit mask: bit i is set iff 32-bit lane i of a is nonzero
+ using vec128_t = uint16x8_t; // same type the *_u16 intrinsics below produce and consume (was int16x8_t, which required lax vector conversions)
+ #define vec128_zero vdupq_n_u16(0) // all eight 16-bit lanes set to zero
+ #define vec128_set_16(a) vdupq_n_u16(a) // broadcast the 16-bit value a to every lane
+ #define vec128_load(a) vld1q_u16(reinterpret_cast<const std::uint16_t*>(a)) // load eight 16-bit lanes from address a
+ #define vec128_storeu(a, b) vst1q_u16(reinterpret_cast<std::uint16_t*>(a), b) // store the eight lanes of b to address a
+ #define vec128_add(a, b) vaddq_u16(a, b) // lane-wise 16-bit addition (wraps on overflow)