6 namespace caspar { namespace accelerator { namespace cpu { namespace xmm {
// Tag constant 0x01. NOTE(review): the struct that owns this constant is
// declared above this line; presumably it is the temporal counterpart of
// nontemporal_tag -- confirm.
10 static const int value = 0x01;
// Tag type requesting non-temporal (streaming) memory access. store()
// compares temporal::value with nontemporal_tag::value to choose
// _mm_stream_si128, and load() rejects it with a static_assert.
12 struct nontemporal_tag
14 static const int value = 0x02;
// Tag constants 0x01 and 0x02 of two further tag types. NOTE(review):
// presumably aligned_tag and unaligned_tag, whose ::value load()/store()
// test to pick aligned vs. unaligned intrinsics -- confirm.
18 static const int value = 0x01;
22 static const int value = 0x02;
// Reads one 128-bit vector from `source` and returns it as T. `temporal` and
// `alignment` are tag types; their ::value selects the load intrinsic
// (aligned vs. unaligned). The definition rejects nontemporal_tag at compile
// time.
34 template<typename temporal, typename alignment>
35 static T load(const void* source);
// Writes the 128-bit vector held by `source` to `dest`. The tag types select
// a streaming, an aligned, or an unaligned store.
37 template<typename temporal,typename alignment>
38 static void store(const T& source, void* dest);
// Wrapper around a 128-bit SSE register whose operations use the 32-bit-lane
// (epi32) integer intrinsics; element access is typed int32_t.
43 class s32_x : public base_x<s32_x>
// Grants every base_x specialization access to this class's members
// (base_x<T>::store reads source.value_).
46 template<typename> friend class base_x;
51 typedef s32_x xmm_epi_tag;
// Explicit constructors from the other lane widths. Their definitions copy
// the source register unchanged (a reinterpretation, not a numeric widening).
54 explicit s32_x(const s16_x& other);
55 explicit s32_x(const s8_x& other);
56 explicit s32_x(const u8_x& other);
// Implicit construction from a raw SSE register.
57 s32_x(const __m128i& value);
// Per-lane shift by `count` bits.
59 s32_x& operator>>=(int count);
60 s32_x& operator<<=(int count);
// Bitwise or / and with another vector (whole 128 bits).
61 s32_x& operator|=(const s32_x& other);
62 s32_x& operator&=(const s32_x& other);
// Read and read/write access to the 32-bit lane at `index`; the definitions
// index the register directly without a range check.
63 int32_t operator[](int index) const;
64 int32_t& operator[](int index);
// Wrapper around a 128-bit SSE register whose operations use the 16-bit-lane
// (epi16) integer intrinsics; element access is typed int16_t.
67 class s16_x : public base_x<s16_x>
// Grants every base_x specialization access to this class's members
// (base_x<T>::store reads source.value_).
72 template<typename> friend class base_x;
77 typedef s16_x xmm_epi_tag;
// Explicit constructors from the other lane widths. Their definitions copy
// the source register unchanged (a reinterpretation, not a numeric conversion).
80 explicit s16_x(const s32_x& other);
81 explicit s16_x(const s8_x& other);
82 explicit s16_x(const u8_x& other);
// Implicit construction from a raw SSE register.
83 s16_x(const __m128i& value);
// Per-lane add / subtract (defined with _mm_add_epi16 / _mm_sub_epi16).
86 s16_x& operator+=(const s16_x& other);
87 s16_x& operator-=(const s16_x& other);
// Per-lane shift by `count` bits.
88 s16_x& operator>>=(int count);
89 s16_x& operator<<=(int count);
// Bitwise or / and with another vector (whole 128 bits).
90 s16_x& operator|=(const s16_x& other);
91 s16_x& operator&=(const s16_x& other);
// Read and read/write access to the 16-bit lane at `index`; no range check.
92 int16_t operator[](int index) const;
93 int16_t& operator[](int index);
// Interleave the low / high eight bytes of two byte vectors into 16-bit lanes.
95 static s16_x unpack_low(const s8_x& lhs, const s8_x& rhs);
96 static s16_x unpack_high(const s8_x& lhs, const s8_x& rhs);
// Sums adjacent pairs of 16-bit lanes into 32-bit lanes.
97 static s32_x horizontal_add(const s16_x& lhs);
// Low / high 16 bits of the per-lane signed 32-bit product.
98 static s16_x multiply_low(const s16_x& lhs, const s16_x& rhs);
99 static s16_x multiply_high(const s16_x& lhs, const s16_x& rhs);
// NOTE(review): presumably the unsigned counterparts of multiply_low/high.
// No definition for either appears among the s16_x definitions in this file;
// confirm they are defined elsewhere or remove the declarations.
100 static s16_x umultiply_low(const s16_x& lhs, const s16_x& rhs);
101 static s16_x umultiply_high(const s16_x& lhs, const s16_x& rhs);
// Interleave the low / high four 16-bit lanes of two vectors.
102 static s16_x unpack_low(const s16_x& lhs, const s16_x& rhs);
103 static s16_x unpack_high(const s16_x& lhs, const s16_x& rhs);
// Bitwise (~lhs) & rhs.
104 static s16_x and_not(const s16_x& lhs, const s16_x& rhs);
// Per-lane signed maximum / minimum.
105 static s16_x max(const s16_x& lhs, const s16_x& rhs);
106 static s16_x min(const s16_x& lhs, const s16_x& rhs);
// Declares char-typed element access on top of base_x<s8_x>.
// NOTE(review): neither s8_x nor u8_x derives from this class in the visible
// declarations (both derive from base_x directly) -- confirm whether it is
// still used.
110 class base8_x : public base_x<s8_x>
112 char operator[](int index) const;
113 char& operator[](int index);
// Wrapper around a 128-bit SSE register treated as sixteen signed 8-bit lanes
// (operations use the epi8 intrinsics); element access is typed char.
116 class s8_x : public base_x<s8_x>
// Grants every base_x specialization access to this class's members
// (base_x<T>::store reads source.value_).
120 template<typename> friend class base_x;
125 typedef s8_x xmm_epi_tag;
// Explicit constructors from the other vector types. Their definitions copy
// the source register unchanged (a reinterpretation, not a numeric narrowing).
128 explicit s8_x(const s32_x& other);
129 explicit s8_x(const s16_x& other);
130 explicit s8_x(const u8_x& other);
// Implicit construction from a raw SSE register.
131 s8_x(const __m128i& value);
// Four-byte pattern repeated across the register; b0 is the lowest byte of
// each group of four.
133 s8_x(char b3, char b2, char b1, char b0);
// All sixteen bytes given explicitly; b0 is the lowest byte, b15 the highest.
134 s8_x(char b15, char b14, char b13, char b12,
135 char b11, char b10, char b9, char b8,
136 char b7, char b6, char b5, char b4,
137 char b3, char b2, char b1, char b0);
// Per-byte add / subtract (defined with _mm_add_epi8 / _mm_sub_epi8).
139 s8_x& operator+=(const s8_x& other);
140 s8_x& operator-=(const s8_x& other);
// Read and read/write access to the byte at `index`; no range check.
141 char operator[](int index) const;
142 char& operator[](int index);
// Packs two 16-bit-lane vectors into one byte vector (defined with
// _mm_packus_epi16, i.e. with unsigned saturation).
144 static s8_x upack(const s16_x& lhs, const s16_x& rhs);
// Multiplies unsigned bytes of lhs by signed bytes of rhs and adds adjacent
// products into 16-bit lanes (defined with _mm_maddubs_epi16).
146 static s16_x multiply_add(const u8_x& lhs, const s8_x& rhs);
// Per-byte signed maximum / minimum.
147 static s8_x max(const s8_x& lhs, const s8_x& rhs);
148 static s8_x min(const s8_x& lhs, const s8_x& rhs);
// Byte shuffle of lhs controlled by rhs (defined with _mm_shuffle_epi8).
150 static s8_x shuffle(const s8_x& lhs, const s8_x& rhs);
// Per-byte select between lhs and rhs controlled by mask (defined with
// _mm_blendv_epi8).
151 static s8_x blend(const s8_x& lhs, const s8_x& rhs, const s8_x& mask);
// Wrapper around a 128-bit SSE register treated as sixteen unsigned 8-bit
// lanes (max/min use the epu8 intrinsics).
154 class u8_x : public base_x<u8_x>
// Grants every base_x specialization access to this class's members
// (base_x<T>::store reads source.value_).
158 template<typename> friend class base_x;
// NOTE(review): the sibling classes name this typedef xmm_epi_tag; confirm
// that the different name (xmm_epu_tag) is intentional.
163 typedef u8_x xmm_epu_tag;
// Explicit constructors from the other vector types. Their definitions copy
// the source register unchanged (a reinterpretation, not a numeric conversion).
166 explicit u8_x(const s32_x& other);
167 explicit u8_x(const s16_x& other);
168 explicit u8_x(const s8_x& other);
// Implicit construction from a raw SSE register.
169 u8_x(const __m128i& value);
// Four-byte pattern repeated across the register; b0 is the lowest byte of
// each group of four.
171 u8_x(char b3, char b2, char b1, char b0);
// All sixteen bytes given explicitly; b0 is the lowest byte, b15 the highest.
172 u8_x(char b15, char b14, char b13, char b12,
173 char b11, char b10, char b9, char b8,
174 char b7, char b6, char b5, char b4,
175 char b3, char b2, char b1, char b0);
// Read and read/write access to the byte at `index`; no range check.
// NOTE(review): the element type is char although the lanes are treated as
// unsigned elsewhere in this class -- confirm this is intended.
177 char operator[](int index) const;
178 char& operator[](int index);
// Per-byte unsigned maximum / minimum.
180 static u8_x max(const u8_x& lhs, const u8_x& rhs);
181 static u8_x min(const u8_x& lhs, const u8_x& rhs);
// Byte shuffle of lhs controlled by rhs (defined with _mm_shuffle_epi8).
183 static u8_x shuffle(const u8_x& lhs, const u8_x& rhs);
// Per-byte select between lhs and rhs controlled by mask (defined with
// _mm_blendv_epi8).
184 static u8_x blend(const u8_x& lhs, const u8_x& rhs, const u8_x& mask);
// Loads 16 bytes from `source` and returns them as a T (via T's constructor
// from __m128i).
//   temporal  - tag type; nontemporal_tag is rejected at compile time.
//   alignment - tag type; aligned_tag selects the aligned load.
190 template<typename temporal, typename alignment>
191 T base_x<T>::load(const void* source)
// Streaming (non-temporal) loads are not implemented: fail the build instead
// of silently doing an ordinary load.
193 static_assert(temporal::value != nontemporal_tag::value, "streaming loads not supported");
// Aligned load: _mm_load_si128 requires `source` to be 16-byte aligned.
194 if(alignment::value == aligned_tag::value)
195 return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
// Any other alignment tag: unaligned load, no alignment requirement.
197 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));
// Stores the 16 bytes held in source.value_ to `dest`.
//   temporal  - tag type; nontemporal_tag requests a streaming store.
//   alignment - tag type; aligned_tag promises `dest` is 16-byte aligned.
201 template<typename temporal, typename alignment>
202 void base_x<T>::store(const T& source, void* dest)
// Streaming store is used only when the destination is also declared
// aligned, because _mm_stream_si128 requires 16-byte alignment.
204 if(temporal::value == nontemporal_tag::value && alignment::value == aligned_tag::value)
205 _mm_stream_si128(reinterpret_cast<__m128i*>(dest), source.value_);
// Ordinary aligned store.
206 else if(alignment::value == aligned_tag::value)
207 _mm_store_si128(reinterpret_cast<__m128i*>(dest), source.value_);
// Unaligned store. A non-temporal request combined with an unaligned tag
// also reaches this store, i.e. it is written as a regular store.
209 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest), source.value_);
// Returns a register with all 128 bits cleared.
215 return _mm_setzero_si128();
// Reinterpreting constructors: each copies the 128-bit register of `other`
// as-is. Lane contents are not widened or converted.
224 s32_x::s32_x(const s16_x& other)
225 : value_(other.value_)
229 s32_x::s32_x(const s8_x& other)
230 : value_(other.value_)
234 s32_x::s32_x(const u8_x& other)
235 : value_(other.value_)
// Constructs from a raw SSE register.
239 s32_x::s32_x(const __m128i& value)
// Shifts every 32-bit lane right by `count` bits with the logical shift
// (_mm_srli_epi32): zeros, not copies of the sign bit, are shifted in.
// NOTE(review): the lanes are exposed as int32_t; confirm that a logical
// rather than an arithmetic (_mm_srai_epi32) shift is intended.
244 s32_x& s32_x::operator>>=(int count)
246 value_ = _mm_srli_epi32(value_, count);
// Shifts every 32-bit lane left by `count` bits, shifting in zeros.
250 s32_x& s32_x::operator<<=(int count)
252 value_ = _mm_slli_epi32(value_, count);
// Bitwise OR of the full 128 bits with `other`.
256 s32_x& s32_x::operator|=(const s32_x& other)
258 value_ = _mm_or_si128(value_, other.value_);
// Bitwise AND of the full 128 bits with `other`.
262 s32_x& s32_x::operator&=(const s32_x& other)
264 value_ = _mm_and_si128(value_, other.value_);
// Returns the 32-bit lane at `index` by value. `index` is used directly as an
// array subscript with no range check.
// NOTE(review): m128i_i32 is a member of the MSVC definition of __m128i;
// confirm the target toolchains provide it.
268 int32_t s32_x::operator[](int index) const
270 return value_.m128i_i32[index];
// Mutable overload: returns a reference to the lane inside the register.
273 int32_t& s32_x::operator[](int index)
275 return value_.m128i_i32[index];
// Non-mutating binary operators for s32_x. Each copies `lhs`, applies the
// matching compound-assignment operator to the copy and returns the result.
278 inline s32_x operator>>(const s32_x& lhs, int count)
280 return s32_x(lhs) >>= count;
283 inline s32_x operator<<(const s32_x& lhs, int count)
285 return s32_x(lhs) <<= count;
288 inline s32_x operator|(const s32_x& lhs, const s32_x& rhs)
290 return s32_x(lhs) |= rhs;
293 inline s32_x operator&(const s32_x& lhs, const s32_x& rhs)
295 return s32_x(lhs) &= rhs;
// Reinterpreting constructors: each copies the 128-bit register of `other`
// as-is. Lane contents are not narrowed or converted.
304 s16_x::s16_x(const s32_x& other)
305 : value_(other.value_)
309 s16_x::s16_x(const s8_x& other)
310 : value_(other.value_)
314 s16_x::s16_x(const u8_x& other)
315 : value_(other.value_)
// Constructs from a raw SSE register.
319 s16_x::s16_x(const __m128i& value)
// Broadcast constructor: every 16-bit lane is set to `value`.
324 s16_x::s16_x(short value)
325 : value_(_mm_set1_epi16(value))
// Per-lane 16-bit addition; _mm_add_epi16 wraps on overflow (no saturation).
329 s16_x& s16_x::operator+=(const s16_x& other)
331 value_ = _mm_add_epi16(value_, other.value_);
// Per-lane 16-bit subtraction; _mm_sub_epi16 wraps on overflow.
335 s16_x& s16_x::operator-=(const s16_x& other)
337 value_ = _mm_sub_epi16(value_, other.value_);
// Shifts every 16-bit lane right by `count` bits with the logical shift
// (_mm_srli_epi16): zeros, not copies of the sign bit, are shifted in.
// NOTE(review): the lanes are exposed as int16_t; confirm that a logical
// rather than an arithmetic (_mm_srai_epi16) shift is intended.
341 s16_x& s16_x::operator>>=(int count)
343 value_ = _mm_srli_epi16(value_, count);
// Shifts every 16-bit lane left by `count` bits, shifting in zeros.
347 s16_x& s16_x::operator<<=(int count)
349 value_ = _mm_slli_epi16(value_, count);
// Bitwise OR of the full 128 bits with `other`.
353 s16_x& s16_x::operator|=(const s16_x& other)
355 value_ = _mm_or_si128(value_, other.value_);
// Bitwise AND of the full 128 bits with `other`.
359 s16_x& s16_x::operator&=(const s16_x& other)
361 value_ = _mm_and_si128(value_, other.value_);
// Returns the 16-bit lane at `index` by value. `index` is used directly as an
// array subscript with no range check.
// NOTE(review): m128i_i16 is a member of the MSVC definition of __m128i;
// confirm the target toolchains provide it.
365 int16_t s16_x::operator[](int index) const
367 return value_.m128i_i16[index];
// Mutable overload: returns a reference to the lane inside the register.
370 int16_t& s16_x::operator[](int index)
372 return value_.m128i_i16[index];
// Interleaves the low eight bytes of the two byte vectors into eight 16-bit
// lanes. The operands are passed to the intrinsic as (rhs, lhs), so in every
// resulting lane the rhs byte is the low half and the lhs byte the high half.
375 s16_x s16_x::unpack_low(const s8_x& lhs, const s8_x& rhs)
377 return _mm_unpacklo_epi8(rhs.value_, lhs.value_);
// Same interleave for the high eight bytes of each operand.
380 s16_x s16_x::unpack_high(const s8_x& lhs, const s8_x& rhs)
382 return _mm_unpackhi_epi8(rhs.value_, lhs.value_);
// Pairwise horizontal add: sums each adjacent pair of signed 16-bit lanes of
// `lhs` into one signed 32-bit lane and returns the four sums as an s32_x.
// Two equivalent forms are present: _mm_haddd_epi16 (an AMD XOP intrinsic) and
// the SSE2 form, which multiplies every lane by 1 and lets _mm_madd_epi16 add
// the adjacent products.
// This function is declared static in s16_x, so there is no `this` object:
// the operand has to be read from `lhs.value_`. The XOP form previously named
// a bare `value_`, which does not refer to any object in a static member.
385 s32_x s16_x::horizontal_add(const s16_x& lhs)
388 return _mm_haddd_epi16(lhs.value_);
390 return _mm_madd_epi16(lhs.value_, _mm_set1_epi16(1));
// Per-lane 16x16 multiply keeping the low 16 bits of each 32-bit product.
394 s16_x s16_x::multiply_low(const s16_x& lhs, const s16_x& rhs)
396 return _mm_mullo_epi16(lhs.value_, rhs.value_);
// Per-lane signed 16x16 multiply keeping the high 16 bits of each 32-bit
// product.
399 s16_x s16_x::multiply_high(const s16_x& lhs, const s16_x& rhs)
401 return _mm_mulhi_epi16(lhs.value_, rhs.value_);
// Interleaves the low four 16-bit lanes of lhs and rhs (lhs lane first).
404 s16_x s16_x::unpack_low(const s16_x& lhs, const s16_x& rhs)
406 return _mm_unpacklo_epi16(lhs.value_, rhs.value_);
// Interleaves the high four 16-bit lanes of lhs and rhs (lhs lane first).
409 s16_x s16_x::unpack_high(const s16_x& lhs, const s16_x& rhs)
411 return _mm_unpackhi_epi16(lhs.value_, rhs.value_);
// Bitwise (~lhs) & rhs over the full 128 bits. Note that it is the FIRST
// operand that _mm_andnot_si128 inverts.
414 s16_x s16_x::and_not(const s16_x& lhs, const s16_x& rhs)
416 return _mm_andnot_si128(lhs.value_, rhs.value_);
// Per-lane maximum of signed 16-bit values.
419 s16_x s16_x::max(const s16_x& lhs, const s16_x& rhs)
421 return _mm_max_epi16(lhs.value_, rhs.value_);
// Per-lane minimum of signed 16-bit values.
424 s16_x s16_x::min(const s16_x& lhs, const s16_x& rhs)
426 return _mm_min_epi16(lhs.value_, rhs.value_);
// Non-mutating binary operators for s16_x. Each copies `lhs`, applies the
// matching compound-assignment operator to the copy and returns the result.
429 inline s16_x operator+(const s16_x& lhs, const s16_x& rhs)
431 return s16_x(lhs) += rhs;
434 inline s16_x operator-(const s16_x& lhs, const s16_x& rhs)
436 return s16_x(lhs) -= rhs;
439 inline s16_x operator>>(const s16_x& lhs, int count)
441 return s16_x(lhs) >>= count;
444 inline s16_x operator<<(const s16_x& lhs, int count)
446 return s16_x(lhs) <<= count;
449 inline s16_x operator|(const s16_x& lhs, const s16_x& rhs)
451 return s16_x(lhs) |= rhs;
454 inline s16_x operator&(const s16_x& lhs, const s16_x& rhs)
456 return s16_x(lhs) &= rhs;
// Reinterpreting constructors: each copies the 128-bit register of `other`
// as-is. Lane contents are not narrowed or converted.
465 s8_x::s8_x(const s32_x& other)
466 : value_(other.value_)
470 s8_x::s8_x(const s16_x& other)
471 : value_(other.value_)
475 s8_x::s8_x(const u8_x& other)
476 : value_(other.value_)
// Constructs from a raw SSE register.
480 s8_x::s8_x(const __m128i& value)
// Broadcast initializer: every byte of the register is set to `b`.
486 : value_(_mm_set1_epi8(b))
// Repeats the four-byte pattern b3..b0 four times across the register.
// _mm_set_epi8 takes its arguments from the highest byte to the lowest, so
// b0 lands in bytes 0, 4, 8 and 12.
490 s8_x::s8_x(char b3, char b2, char b1, char b0)
491 : value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))
// Sets all sixteen bytes individually; b15 is the highest byte, b0 the lowest.
495 s8_x::s8_x(char b15, char b14, char b13, char b12,
496 char b11, char b10, char b9, char b8,
497 char b7, char b6, char b5, char b4,
498 char b3, char b2, char b1, char b0)
499 : value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))
// Per-byte addition; _mm_add_epi8 wraps on overflow (no saturation).
503 s8_x& s8_x::operator+=(const s8_x& other)
505 value_ = _mm_add_epi8(value_, other.value_);
// Per-byte subtraction; _mm_sub_epi8 wraps on overflow.
509 s8_x& s8_x::operator-=(const s8_x& other)
511 value_ = _mm_sub_epi8(value_, other.value_);
// Returns the byte at `index` by value. `index` is used directly as an array
// subscript with no range check.
// NOTE(review): m128i_i8 is a member of the MSVC definition of __m128i;
// confirm the target toolchains provide it.
515 char s8_x::operator[](int index) const
517 return value_.m128i_i8[index];
// Mutable overload: returns a reference to the byte inside the register.
520 char& s8_x::operator[](int index)
522 return value_.m128i_i8[index];
// Packs the sixteen signed 16-bit lanes of lhs and rhs into sixteen bytes
// using unsigned saturation (each value is clamped to 0..255). lhs supplies
// the low eight bytes, rhs the high eight.
525 s8_x s8_x::upack(const s16_x& lhs, const s16_x& rhs)
527 return _mm_packus_epi16(lhs.value_, rhs.value_);
// Multiplies each unsigned byte of lhs by the corresponding signed byte of
// rhs and adds adjacent pairs of products, with signed saturation, into
// eight 16-bit lanes.
530 s16_x s8_x::multiply_add(const u8_x& lhs, const s8_x& rhs)
532 return _mm_maddubs_epi16(lhs.value_, rhs.value_);
// Per-byte maximum of signed 8-bit values.
535 s8_x s8_x::max(const s8_x& lhs, const s8_x& rhs)
537 return _mm_max_epi8(lhs.value_, rhs.value_);
// Per-byte minimum of signed 8-bit values.
540 s8_x s8_x::min(const s8_x& lhs, const s8_x& rhs)
542 return _mm_min_epi8(lhs.value_, rhs.value_);
// Non-mutating add / subtract for s8_x. Each copies `lhs`, applies the
// matching compound-assignment operator to the copy and returns the result.
545 inline s8_x operator+(const s8_x& lhs, const s8_x& rhs)
547 return s8_x(lhs) += rhs;
550 inline s8_x operator-(const s8_x& lhs, const s8_x& rhs)
552 return s8_x(lhs) -= rhs;
// Byte shuffle: each result byte is the byte of lhs selected by the low four
// bits of the corresponding control byte in rhs, or zero when that control
// byte has its top bit set.
555 s8_x s8_x::shuffle(const s8_x& lhs, const s8_x& rhs)
557 return _mm_shuffle_epi8(lhs.value_, rhs.value_);
// Per-byte select: takes the byte from rhs where the top bit of the
// corresponding mask byte is set, otherwise the byte from lhs.
560 s8_x s8_x::blend(const s8_x& lhs, const s8_x& rhs, const s8_x& mask)
562 return _mm_blendv_epi8(lhs.value_, rhs.value_, mask.value_);
// Reinterpreting constructors: each copies the 128-bit register of `other`
// as-is. Lane contents are not narrowed or converted.
571 u8_x::u8_x(const s32_x& other)
572 : value_(other.value_)
576 u8_x::u8_x(const s16_x& other)
577 : value_(other.value_)
581 u8_x::u8_x(const s8_x& other)
582 : value_(other.value_)
// Constructs from a raw SSE register.
586 u8_x::u8_x(const __m128i& value)
// Broadcast initializer: every byte of the register is set to `b`.
592 : value_(_mm_set1_epi8(b))
// Repeats the four-byte pattern b3..b0 four times across the register.
// _mm_set_epi8 takes its arguments from the highest byte to the lowest, so
// b0 lands in bytes 0, 4, 8 and 12.
596 u8_x::u8_x(char b3, char b2, char b1, char b0)
597 : value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))
// Sets all sixteen bytes individually; b15 is the highest byte, b0 the lowest.
601 u8_x::u8_x(char b15, char b14, char b13, char b12,
602 char b11, char b10, char b9, char b8,
603 char b7, char b6, char b5, char b4,
604 char b3, char b2, char b1, char b0)
605 : value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))
// Returns the byte at `index` by value. `index` is used directly as an array
// subscript with no range check.
// NOTE(review): the byte is read through the signed view (m128i_i8) and
// returned as char even though u8_x::max/min treat the lanes as unsigned;
// confirm callers expect that. m128i_i8 is a member of the MSVC definition of
// __m128i.
609 char u8_x::operator[](int index) const
611 return value_.m128i_i8[index];
// Mutable overload: returns a reference to the byte inside the register.
614 char& u8_x::operator[](int index)
616 return value_.m128i_i8[index];
// Per-byte maximum of unsigned 8-bit values.
619 u8_x u8_x::max(const u8_x& lhs, const u8_x& rhs)
621 return _mm_max_epu8(lhs.value_, rhs.value_);
// Per-byte minimum of unsigned 8-bit values.
624 u8_x u8_x::min(const u8_x& lhs, const u8_x& rhs)
626 return _mm_min_epu8(lhs.value_, rhs.value_);
// Byte shuffle: each result byte is the byte of lhs selected by the low four
// bits of the corresponding control byte in rhs, or zero when that control
// byte has its top bit set.
629 u8_x u8_x::shuffle(const u8_x& lhs, const u8_x& rhs)
631 return _mm_shuffle_epi8(lhs.value_, rhs.value_);
// Per-byte select: takes the byte from rhs where the top bit of the
// corresponding mask byte is set, otherwise the byte from lhs.
634 u8_x u8_x::blend(const u8_x& lhs, const u8_x& rhs, const u8_x& mask)
636 return _mm_blendv_epi8(lhs.value_, rhs.value_, mask.value_);
641 //template<typename T>
642 //struct xmm_cast_impl
644 // template<typename U>
645 // T operator()(const U& other)
647 // return typename T::xmm_epi_tag(other.value_);
652 //struct xmm_cast_impl<xmm_ps>
654 // xmm_ps operator()(const s32_x& other)
656 // return _mm_cvtepi32_ps(other.value_);
660 //template<typename T, typename U>
661 //T xmm_cast(const U& other)
663 // return xmm_cast_impl<T>()(other);