// SIMD register wrappers over the SSE intrinsics used by the cpu accelerator.
#pragma once

#include <intrin.h>

#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <vector>

#include <tbb/cache_aligned_allocator.h>

namespace caspar { namespace accelerator { namespace cpu {

typedef std::vector<float, tbb::cache_aligned_allocator<float>> vector_ps;

class xmm_epi8;
class xmm_epi16;
class xmm_epi32;

// Wrapper around __m128: four packed single-precision floats.
class xmm_ps
{
    __m128 value_;

    template<typename> friend struct xmm_cast_impl;
public:
    xmm_ps()
    {
    }

    xmm_ps(float value)
        : value_(_mm_set1_ps(value))
    {
    }

    xmm_ps(__m128 value)
        : value_(value)
    {
    }

    xmm_ps& operator+=(const xmm_ps& other)
    {
        value_ = _mm_add_ps(value_, other.value_);
        return *this;
    }

    xmm_ps& operator-=(const xmm_ps& other)
    {
        value_ = _mm_sub_ps(value_, other.value_);
        return *this;
    }

    xmm_ps& operator*=(const xmm_ps& other)
    {
        value_ = _mm_mul_ps(value_, other.value_);
        return *this;
    }

    xmm_ps& operator/=(const xmm_ps& other)
    {
        value_ = _mm_div_ps(value_, other.value_);
        return *this;
    }

    // SSE3: pairwise sums of adjacent lanes from *this and other.
    xmm_ps& horizontal_add(const xmm_ps& other)
    {
        value_ = _mm_hadd_ps(value_, other.value_);
        return *this;
    }

    // SSE3: pairwise differences of adjacent lanes from *this and other.
    xmm_ps& horizontal_sub(const xmm_ps& other)
    {
        value_ = _mm_hsub_ps(value_, other.value_);
        return *this;
    }

    // Interleaves the two low lanes of *this and other.
    xmm_ps unpack_low(const xmm_ps& other)
    {
        value_ = _mm_unpacklo_ps(value_, other.value_);
        return *this;
    }

    // Interleaves the two high lanes of *this and other.
    xmm_ps unpack_high(const xmm_ps& other)
    {
        value_ = _mm_unpackhi_ps(value_, other.value_);
        return *this;
    }

    // Lane access via the MSVC-specific __m128 union members.
    float operator[](int index) const
    {
        return value_.m128_f32[index];
    }

    float& operator[](int index)
    {
        return value_.m128_f32[index];
    }

    static xmm_ps zero()
    {
        return _mm_setzero_ps();
    }

    // Requires a 16-byte aligned pointer.
    static xmm_ps load(const float* ptr)
    {
        return _mm_load_ps(ptr);
    }

    static xmm_ps loadu(const float* ptr)
    {
        return _mm_loadu_ps(ptr);
    }

    // Non-temporal store: bypasses the cache; dest must be 16-byte aligned.
    static void stream(const xmm_ps& source, float* dest)
    {
        _mm_stream_ps(dest, source.value_);
    }

    static xmm_ps horizontal_add(const xmm_ps& lhs, const xmm_ps& rhs)
    {
        return xmm_ps(lhs).horizontal_add(rhs);
    }

    static xmm_ps horizontal_sub(const xmm_ps& lhs, const xmm_ps& rhs)
    {
        return xmm_ps(lhs).horizontal_sub(rhs);
    }

    static xmm_ps unpack_low(const xmm_ps& lhs, const xmm_ps& rhs)
    {
        return xmm_ps(lhs).unpack_low(rhs);
    }

    static xmm_ps unpack_high(const xmm_ps& lhs, const xmm_ps& rhs)
    {
        return xmm_ps(lhs).unpack_high(rhs);
    }
};

inline xmm_ps operator+(const xmm_ps& lhs, const xmm_ps& rhs)
{
    return xmm_ps(lhs) += rhs;
}

inline xmm_ps operator-(const xmm_ps& lhs, const xmm_ps& rhs)
{
    return xmm_ps(lhs) -= rhs;
}

inline xmm_ps operator*(const xmm_ps& lhs, const xmm_ps& rhs)
{
    return xmm_ps(lhs) *= rhs;
}

inline xmm_ps operator/(const xmm_ps& lhs, const xmm_ps& rhs)
{
    return xmm_ps(lhs) /= rhs;
}
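
// Usage sketch (illustrative, not part of the original header): summing two
// float buffers four lanes at a time. The function name is ours; it assumes
// lhs, rhs and dest are 16-byte aligned and count is a multiple of four.
inline void example_add_ps(const float* lhs, const float* rhs, float* dest, std::size_t count)
{
    for (std::size_t n = 0; n < count; n += 4)
        xmm_ps::stream(xmm_ps::load(lhs + n) + xmm_ps::load(rhs + n), dest + n);
}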

// Wrapper around __m128i treated as four packed 32-bit integers.
class xmm_epi32
{
    __m128i value_;

    template<typename> friend struct xmm_cast_impl;
public:
    typedef xmm_epi32 xmm_epi_tag;

    xmm_epi32()
    {
    }

    xmm_epi32(__m128i value)
        : value_(value)
    {
    }

    // Logical (zero-filling) right shift of each 32-bit lane.
    xmm_epi32& operator>>=(int count)
    {
        value_ = _mm_srli_epi32(value_, count);
        return *this;
    }

    xmm_epi32& operator<<=(int count)
    {
        value_ = _mm_slli_epi32(value_, count);
        return *this;
    }

    xmm_epi32& operator|=(const xmm_epi32& other)
    {
        value_ = _mm_or_si128(value_, other.value_);
        return *this;
    }

    xmm_epi32& operator&=(const xmm_epi32& other)
    {
        value_ = _mm_and_si128(value_, other.value_);
        return *this;
    }

    // Requires a 16-byte aligned pointer.
    static xmm_epi32 load(const void* source)
    {
        return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
    }

    static xmm_epi32 loadu(const void* source)
    {
        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));
    }

    // Lane access via the MSVC-specific __m128i union members.
    int32_t operator[](int index) const
    {
        return value_.m128i_i32[index];
    }

    int32_t& operator[](int index)
    {
        return value_.m128i_i32[index];
    }

    static xmm_epi32 zero()
    {
        return _mm_setzero_si128();
    }
};

inline xmm_epi32 operator>>(const xmm_epi32& lhs, int count)
{
    return xmm_epi32(lhs) >>= count;
}

inline xmm_epi32 operator<<(const xmm_epi32& lhs, int count)
{
    return xmm_epi32(lhs) <<= count;
}

inline xmm_epi32 operator|(const xmm_epi32& lhs, const xmm_epi32& rhs)
{
    return xmm_epi32(lhs) |= rhs;
}

inline xmm_epi32 operator&(const xmm_epi32& lhs, const xmm_epi32& rhs)
{
    return xmm_epi32(lhs) &= rhs;
}
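
// Usage sketch (illustrative, not part of the original header): isolating the
// second byte of each 32-bit lane, e.g. the G channel of packed BGRA pixels.
// The function name is ours; the mask is built with a raw intrinsic since
// xmm_epi32 has no broadcasting constructor.
inline xmm_epi32 example_extract_green(const xmm_epi32& bgra)
{
    return (bgra >> 8) & xmm_epi32(_mm_set1_epi32(0xff));
}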

// Wrapper around __m128i treated as eight packed 16-bit integers.
class xmm_epi16
{
    __m128i value_;

    template<typename> friend struct xmm_cast_impl;
    friend xmm_epi32 horizontal_add(const xmm_epi16&);
public:
    typedef xmm_epi16 xmm_epi_tag;

    xmm_epi16()
    {
    }

    xmm_epi16(__m128i value)
        : value_(value)
    {
    }

    xmm_epi16(short value)
        : value_(_mm_set1_epi16(value))
    {
    }

    xmm_epi16& operator+=(const xmm_epi16& other)
    {
        value_ = _mm_add_epi16(value_, other.value_);
        return *this;
    }

    xmm_epi16& operator-=(const xmm_epi16& other)
    {
        value_ = _mm_sub_epi16(value_, other.value_);
        return *this;
    }

    // Logical (zero-filling) right shift of each 16-bit lane.
    xmm_epi16& operator>>=(int count)
    {
        value_ = _mm_srli_epi16(value_, count);
        return *this;
    }

    xmm_epi16& operator<<=(int count)
    {
        value_ = _mm_slli_epi16(value_, count);
        return *this;
    }

    xmm_epi16& operator|=(const xmm_epi16& other)
    {
        value_ = _mm_or_si128(value_, other.value_);
        return *this;
    }

    xmm_epi16& operator&=(const xmm_epi16& other)
    {
        value_ = _mm_and_si128(value_, other.value_);
        return *this;
    }

    // Low 16 bits of each signed 32-bit product.
    xmm_epi16 multiply_low(const xmm_epi16& other)
    {
        value_ = _mm_mullo_epi16(value_, other.value_);
        return *this;
    }

    // High 16 bits of each signed 32-bit product.
    xmm_epi16 multiply_high(const xmm_epi16& other)
    {
        value_ = _mm_mulhi_epi16(value_, other.value_);
        return *this;
    }

    // Low 16 bits of each product; identical for signed and unsigned operands.
    xmm_epi16 umultiply_low(const xmm_epi16& other)
    {
        value_ = _mm_mullo_epi16(value_, other.value_);
        return *this;
    }

    // High 16 bits of each unsigned 32-bit product.
    xmm_epi16 umultiply_high(const xmm_epi16& other)
    {
        value_ = _mm_mulhi_epu16(value_, other.value_);
        return *this;
    }

    // Computes *this & ~other.
    xmm_epi16 and_not(const xmm_epi16& other)
    {
        value_ = _mm_andnot_si128(other.value_, value_);
        return *this;
    }

    xmm_epi16 unpack_low(const xmm_epi16& other)
    {
        value_ = _mm_unpacklo_epi16(value_, other.value_);
        return *this;
    }

    xmm_epi16 unpack_high(const xmm_epi16& other)
    {
        value_ = _mm_unpackhi_epi16(value_, other.value_);
        return *this;
    }

    // Packs the 16-bit lanes of *this and other into signed 8-bit lanes with
    // saturation.
    xmm_epi16 pack(const xmm_epi16& other)
    {
        value_ = _mm_packs_epi16(value_, other.value_);
        return *this;
    }

    xmm_epi16 max(const xmm_epi16& other)
    {
        value_ = _mm_max_epi16(value_, other.value_);
        return *this;
    }

    xmm_epi16 min(const xmm_epi16& other)
    {
        value_ = _mm_min_epi16(value_, other.value_);
        return *this;
    }

    // Lane access via the MSVC-specific __m128i union members.
    int16_t operator[](int index) const
    {
        return value_.m128i_i16[index];
    }

    int16_t& operator[](int index)
    {
        return value_.m128i_i16[index];
    }

    // Requires a 16-byte aligned pointer.
    static xmm_epi16 load(const void* source)
    {
        return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
    }

    static xmm_epi16 loadu(const void* source)
    {
        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));
    }

    // Sums adjacent 16-bit lane pairs into 32-bit lanes. _mm_madd_epi16 with a
    // broadcast one is the portable SSE2 form; _mm_haddd_epi16 is AMD XOP only.
    static xmm_epi32 horizontal_add(const xmm_epi16& lhs)
    {
        // return _mm_haddd_epi16(lhs.value_);
        return _mm_madd_epi16(lhs.value_, _mm_set1_epi16(1));
    }

    static xmm_epi16 multiply_low(const xmm_epi16& lhs, const xmm_epi16& rhs)
    {
        return xmm_epi16(lhs).multiply_low(rhs);
    }

    static xmm_epi16 multiply_high(const xmm_epi16& lhs, const xmm_epi16& rhs)
    {
        return xmm_epi16(lhs).multiply_high(rhs);
    }

    static xmm_epi16 umultiply_low(const xmm_epi16& lhs, const xmm_epi16& rhs)
    {
        return xmm_epi16(lhs).umultiply_low(rhs);
    }

    static xmm_epi16 umultiply_high(const xmm_epi16& lhs, const xmm_epi16& rhs)
    {
        return xmm_epi16(lhs).umultiply_high(rhs);
    }

    static xmm_epi16 unpack_low(const xmm_epi16& lhs, const xmm_epi16& rhs)
    {
        return xmm_epi16(lhs).unpack_low(rhs);
    }

    static xmm_epi16 unpack_high(const xmm_epi16& lhs, const xmm_epi16& rhs)
    {
        return xmm_epi16(lhs).unpack_high(rhs);
    }

    static xmm_epi16 pack(const xmm_epi16& lhs, const xmm_epi16& rhs)
    {
        return xmm_epi16(lhs).pack(rhs);
    }

    static xmm_epi16 and_not(const xmm_epi16& lhs, const xmm_epi16& rhs)
    {
        return xmm_epi16(lhs).and_not(rhs);
    }

    static xmm_epi16 max(const xmm_epi16& lhs, const xmm_epi16& rhs)
    {
        return xmm_epi16(lhs).max(rhs);
    }

    static xmm_epi16 min(const xmm_epi16& lhs, const xmm_epi16& rhs)
    {
        return xmm_epi16(lhs).min(rhs);
    }

    static xmm_epi16 zero()
    {
        return _mm_setzero_si128();
    }
};

inline xmm_epi16 operator+(const xmm_epi16& lhs, const xmm_epi16& rhs)
{
    return xmm_epi16(lhs) += rhs;
}

inline xmm_epi16 operator-(const xmm_epi16& lhs, const xmm_epi16& rhs)
{
    return xmm_epi16(lhs) -= rhs;
}

inline xmm_epi16 operator>>(const xmm_epi16& lhs, int count)
{
    return xmm_epi16(lhs) >>= count;
}

inline xmm_epi16 operator<<(const xmm_epi16& lhs, int count)
{
    return xmm_epi16(lhs) <<= count;
}

inline xmm_epi16 operator|(const xmm_epi16& lhs, const xmm_epi16& rhs)
{
    return xmm_epi16(lhs) |= rhs;
}

inline xmm_epi16 operator&(const xmm_epi16& lhs, const xmm_epi16& rhs)
{
    return xmm_epi16(lhs) &= rhs;
}
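
// Usage sketch (illustrative, not part of the original header): scaling eight
// unsigned 16-bit samples by a 0.16 fixed-point factor; umultiply_high keeps
// the high half of each 32-bit product, which is the scaled value. The
// function name and the fixed-point interpretation are ours.
inline xmm_epi16 example_scale_u16(const xmm_epi16& samples, short factor)
{
    return xmm_epi16::umultiply_high(samples, xmm_epi16(factor));
}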

// Wrapper around __m128i treated as sixteen packed 8-bit integers.
class xmm_epi8
{
    __m128i value_;

    template<typename> friend struct xmm_cast_impl;
    friend xmm_epi16 multiply_add(const xmm_epi8&, const xmm_epi8&);
public:
    typedef xmm_epi8 xmm_epi_tag;

    xmm_epi8()
    {
    }

    xmm_epi8(__m128i value)
        : value_(value)
    {
    }

    xmm_epi8(char b)
        : value_(_mm_set1_epi8(b))
    {
    }

    // Broadcasts the four-byte pattern b3..b0 into all four 32-bit lanes.
    xmm_epi8(char b3, char b2, char b1, char b0)
        : value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))
    {
    }

    xmm_epi8(char b15, char b14, char b13, char b12,
             char b11, char b10, char b9,  char b8,
             char b7,  char b6,  char b5,  char b4,
             char b3,  char b2,  char b1,  char b0)
        : value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))
    {
    }

    xmm_epi8& operator+=(const xmm_epi8& other)
    {
        value_ = _mm_add_epi8(value_, other.value_);
        return *this;
    }

    xmm_epi8& operator-=(const xmm_epi8& other)
    {
        value_ = _mm_sub_epi8(value_, other.value_);
        return *this;
    }

    // SSSE3: permutes the bytes of *this according to the indices in other.
    xmm_epi8& shuffle(const xmm_epi8& other)
    {
        value_ = _mm_shuffle_epi8(value_, other.value_);
        return *this;
    }

    // SSE4.1: signed byte max/min.
    xmm_epi8& max(const xmm_epi8& other)
    {
        value_ = _mm_max_epi8(value_, other.value_);
        return *this;
    }

    xmm_epi8& min(const xmm_epi8& other)
    {
        value_ = _mm_min_epi8(value_, other.value_);
        return *this;
    }

    // SSE2: unsigned byte max/min.
    xmm_epi8& umax(const xmm_epi8& other)
    {
        value_ = _mm_max_epu8(value_, other.value_);
        return *this;
    }

    xmm_epi8& umin(const xmm_epi8& other)
    {
        value_ = _mm_min_epu8(value_, other.value_);
        return *this;
    }

    // SSE4.1: selects bytes from other where the mask byte's high bit is set.
    xmm_epi8& blend(const xmm_epi8& other, const xmm_epi8& mask)
    {
        value_ = _mm_blendv_epi8(value_, other.value_, mask.value_);
        return *this;
    }

    // Non-temporal store: bypasses the cache; dest must be 16-byte aligned.
    const xmm_epi8& stream(void* dest) const
    {
        _mm_stream_si128(reinterpret_cast<__m128i*>(dest), value_);
        return *this;
    }

    // Lane access via the MSVC-specific __m128i union members.
    char operator[](int index) const
    {
        return value_.m128i_i8[index];
    }

    char& operator[](int index)
    {
        return value_.m128i_i8[index];
    }

    static const xmm_epi8& stream(const xmm_epi8& source, void* dest)
    {
        return source.stream(dest);
    }

    // Requires a 16-byte aligned pointer.
    static xmm_epi8 load(const void* source)
    {
        return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
    }

    static xmm_epi8 loadu(const void* source)
    {
        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));
    }

    // SSSE3: multiplies unsigned bytes of lhs by signed bytes of rhs and sums
    // adjacent pairs into saturated signed 16-bit lanes.
    static xmm_epi16 multiply_add(const xmm_epi8& lhs, const xmm_epi8& rhs)
    {
        return xmm_epi16(_mm_maddubs_epi16(lhs.value_, rhs.value_));
    }

    // These helpers return by value: the result is a temporary copy of lhs,
    // so a reference return would dangle.
    static xmm_epi8 shuffle(const xmm_epi8& lhs, const xmm_epi8& rhs)
    {
        return xmm_epi8(lhs).shuffle(rhs);
    }

    static xmm_epi8 max(const xmm_epi8& lhs, const xmm_epi8& rhs)
    {
        return xmm_epi8(lhs).max(rhs);
    }

    static xmm_epi8 min(const xmm_epi8& lhs, const xmm_epi8& rhs)
    {
        return xmm_epi8(lhs).min(rhs);
    }

    static xmm_epi8 umax(const xmm_epi8& lhs, const xmm_epi8& rhs)
    {
        return xmm_epi8(lhs).umax(rhs);
    }

    static xmm_epi8 umin(const xmm_epi8& lhs, const xmm_epi8& rhs)
    {
        return xmm_epi8(lhs).umin(rhs);
    }

    static xmm_epi8 blend(const xmm_epi8& lhs, const xmm_epi8& rhs, const xmm_epi8& mask)
    {
        return xmm_epi8(lhs).blend(rhs, mask);
    }

    static xmm_epi8 zero()
    {
        return _mm_setzero_si128();
    }
};

inline xmm_epi8 operator+(const xmm_epi8& lhs, const xmm_epi8& rhs)
{
    return xmm_epi8(lhs) += rhs;
}

inline xmm_epi8 operator-(const xmm_epi8& lhs, const xmm_epi8& rhs)
{
    return xmm_epi8(lhs) -= rhs;
}
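
// Usage sketch (illustrative, not part of the original header): clamping
// sixteen unsigned bytes to broadcast-safe video range [16, 235]. The function
// name and the range choice are ours.
inline xmm_epi8 example_clamp_video_range(const xmm_epi8& pixels)
{
    return xmm_epi8::umin(xmm_epi8::umax(pixels, xmm_epi8(static_cast<char>(16))),
                          xmm_epi8(static_cast<char>(235)));
}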

// Generic cast: reinterprets the underlying register via the target's
// xmm_epi_tag constructor.
template<typename T>
struct xmm_cast_impl
{
    template<typename U>
    T operator()(const U& other)
    {
        return typename T::xmm_epi_tag(other.value_);
    }
};

// Casting xmm_epi32 to xmm_ps converts the integer lanes to floats rather
// than reinterpreting the bits.
template<>
struct xmm_cast_impl<xmm_ps>
{
    xmm_ps operator()(const xmm_epi32& other)
    {
        return _mm_cvtepi32_ps(other.value_);
    }
};

template<typename T, typename U>
T xmm_cast(const U& other)
{
    return xmm_cast_impl<T>()(other);
}

}}}
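
// Usage sketch (illustrative, not part of the original header): converting
// four packed 32-bit integers to normalized floats goes through the
// xmm_cast_impl<xmm_ps> specialization above, e.g.
//
//     using namespace caspar::accelerator::cpu;
//     xmm_ps floats = xmm_cast<xmm_ps>(xmm_epi32::load(source)) / xmm_ps(255.0f);
//
// where source is assumed to be a 16-byte aligned buffer of int32 values.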