// CasparCG: accelerator/cpu/util/simd.h
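// Thin C++ wrappers around x86 SSE intrinsics (__m128 / __m128i) used by the
// CPU image mixer: a packed-float type (xmm_ps), packed 32/16/8-bit integer
// types (xmm_epi32, xmm_epi16, xmm_epi8) and xmm_cast for converting between them.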
#pragma once

#include <intrin.h>

#include <cstdint>
#include <type_traits>
#include <vector>
#include <tbb/cache_aligned_allocator.h>

namespace caspar { namespace accelerator { namespace cpu {

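// Float vector with cache-line-aligned storage, suitable for the aligned
// xmm_ps::load()/stream() operations below.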
typedef std::vector<float, tbb::cache_aligned_allocator<float>> vector_ps;

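// Value-type wrapper around __m128: four packed single-precision floats.
// The arithmetic operators map directly onto the corresponding *_ps intrinsics;
// operator[] relies on MSVC's __m128 union member (m128_f32), matching the
// <intrin.h> include above.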
class xmm_ps
{
	__m128 value_;
public:
	xmm_ps()
	{
	}

	xmm_ps(float value_)
		: value_(_mm_set1_ps(value_))
	{
	}

	xmm_ps(__m128 value_)
		: value_(value_)
	{
	}

	xmm_ps& operator+=(const xmm_ps& other)
	{
		value_ = _mm_add_ps(value_, other.value_);
		return *this;
	}

	xmm_ps& operator-=(const xmm_ps& other)
	{
		value_ = _mm_sub_ps(value_, other.value_);
		return *this;
	}

	xmm_ps& operator*=(const xmm_ps& other)
	{
		value_ = _mm_mul_ps(value_, other.value_);
		return *this;
	}

	xmm_ps& operator/=(const xmm_ps& other)
	{
		value_ = _mm_div_ps(value_, other.value_);
		return *this;
	}

	xmm_ps& horizontal_add(const xmm_ps& other)
	{
		value_ = _mm_hadd_ps(value_, other.value_);
		return *this;
	}

	xmm_ps& horizontal_sub(const xmm_ps& other)
	{
		value_ = _mm_hsub_ps(value_, other.value_);
		return *this;
	}

	xmm_ps unpack_low(const xmm_ps& other)
	{
		value_ = _mm_unpacklo_ps(value_, other.value_);
		return *this;
	}

	xmm_ps unpack_high(const xmm_ps& other)
	{
		value_ = _mm_unpackhi_ps(value_, other.value_);
		return *this;
	}

	float operator[](int index) const
	{
		return value_.m128_f32[index];
	}

	float& operator[](int index)
	{
		return value_.m128_f32[index];
	}

	static xmm_ps zero()
	{
		return _mm_setzero_ps();
	}

	static xmm_ps load(const float* ptr)
	{
		return _mm_load_ps(ptr);
	}

	static xmm_ps loadu(const float* ptr)
	{
		return _mm_loadu_ps(ptr);
	}

	static void stream(const xmm_ps& source, float* dest)
	{
		_mm_stream_ps(dest, source.value_);
	}

	static xmm_ps horizontal_add(const xmm_ps& lhs, const xmm_ps& rhs)
	{
		return xmm_ps(lhs).horizontal_add(rhs);
	}

	static xmm_ps horizontal_sub(const xmm_ps& lhs, const xmm_ps& rhs)
	{
		return xmm_ps(lhs).horizontal_sub(rhs);
	}

	static xmm_ps unpack_low(const xmm_ps& lhs, const xmm_ps& rhs)
	{
		return xmm_ps(lhs).unpack_low(rhs);
	}

	static xmm_ps unpack_high(const xmm_ps& lhs, const xmm_ps& rhs)
	{
		return xmm_ps(lhs).unpack_high(rhs);
	}
};

inline xmm_ps operator+(const xmm_ps& lhs, const xmm_ps& rhs)
{
	return xmm_ps(lhs) += rhs;
}

inline xmm_ps operator-(const xmm_ps& lhs, const xmm_ps& rhs)
{
	return xmm_ps(lhs) -= rhs;
}

inline xmm_ps operator*(const xmm_ps& lhs, const xmm_ps& rhs)
{
	return xmm_ps(lhs) *= rhs;
}

inline xmm_ps operator/(const xmm_ps& lhs, const xmm_ps& rhs)
{
	return xmm_ps(lhs) /= rhs;
}

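// Value-type wrapper around __m128i interpreted as four packed 32-bit integers.
// Note that operator>>= uses the logical (unsigned) shift _mm_srli_epi32.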
class xmm_epi32
{
	__m128i value_;
	template<typename> friend struct xmm_cast_impl;
public:
	typedef xmm_epi32 xmm_epi_tag;

	xmm_epi32()
	{
	}

	xmm_epi32(__m128i value)
		: value_(value)
	{
	}

	xmm_epi32& operator>>=(int count)
	{
		value_ = _mm_srli_epi32(value_, count);
		return *this;
	}

	xmm_epi32& operator<<=(int count)
	{
		value_ = _mm_slli_epi32(value_, count);
		return *this;
	}

	xmm_epi32& operator|=(const xmm_epi32& other)
	{
		value_ = _mm_or_si128(value_, other.value_);
		return *this;
	}

	xmm_epi32& operator&=(const xmm_epi32& other)
	{
		value_ = _mm_and_si128(value_, other.value_);
		return *this;
	}

	static xmm_epi32 load(const void* source)
	{
		return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
	}

	static xmm_epi32 loadu(const void* source)
	{
		return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));
	}

	int32_t operator[](int index) const
	{
		return value_.m128i_i32[index];
	}

	int32_t& operator[](int index)
	{
		return value_.m128i_i32[index];
	}

	static xmm_epi32 zero()
	{
		return _mm_setzero_si128();
	}
};

inline xmm_epi32 operator>>(const xmm_epi32& lhs, int count)
{
	return xmm_epi32(lhs) >>= count;
}

inline xmm_epi32 operator<<(const xmm_epi32& lhs, int count)
{
	return xmm_epi32(lhs) <<= count;
}

inline xmm_epi32 operator|(const xmm_epi32& lhs, const xmm_epi32& rhs)
{
	return xmm_epi32(lhs) |= rhs;
}

inline xmm_epi32 operator&(const xmm_epi32& lhs, const xmm_epi32& rhs)
{
	return xmm_epi32(lhs) &= rhs;
}

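// Value-type wrapper around __m128i interpreted as eight packed 16-bit integers.
// multiply_low/multiply_high return the low/high 16 bits of the 32-bit products,
// and pack() performs signed saturation (_mm_packs_epi16).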
class xmm_epi16
{
	__m128i value_;
	template<typename> friend struct xmm_cast_impl;
	friend xmm_epi32 horizontal_add(const xmm_epi16&);
public:
	typedef xmm_epi16 xmm_epi_tag;

	xmm_epi16()
	{
	}

	xmm_epi16(__m128i value)
		: value_(value)
	{
	}

	xmm_epi16(short value)
		: value_(_mm_set1_epi16(value))
	{
	}

	xmm_epi16& operator+=(const xmm_epi16& other)
	{
		value_ = _mm_add_epi16(value_, other.value_);
		return *this;
	}

	xmm_epi16& operator-=(const xmm_epi16& other)
	{
		value_ = _mm_sub_epi16(value_, other.value_);
		return *this;
	}

	xmm_epi16& operator>>=(int count)
	{
		value_ = _mm_srli_epi16(value_, count);
		return *this;
	}

	xmm_epi16& operator<<=(int count)
	{
		value_ = _mm_slli_epi16(value_, count);
		return *this;
	}

	xmm_epi16& operator|=(const xmm_epi16& other)
	{
		value_ = _mm_or_si128(value_, other.value_);
		return *this;
	}

	xmm_epi16& operator&=(const xmm_epi16& other)
	{
		value_ = _mm_and_si128(value_, other.value_);
		return *this;
	}

	xmm_epi16 multiply_low(const xmm_epi16& other)
	{
		value_ = _mm_mullo_epi16(value_, other.value_);
		return *this;
	}

	xmm_epi16 multiply_high(const xmm_epi16& other)
	{
		value_ = _mm_mulhi_epi16(value_, other.value_);
		return *this;
	}

	xmm_epi16 umultiply_low(const xmm_epi16& other)
	{
		// The low 16 bits of the product are the same for signed and unsigned operands.
		value_ = _mm_mullo_epi16(value_, other.value_);
		return *this;
	}

	xmm_epi16 umultiply_high(const xmm_epi16& other)
	{
		// Unsigned high multiply: use the epu16 intrinsic (epi16 would treat the operands as signed).
		value_ = _mm_mulhi_epu16(value_, other.value_);
		return *this;
	}

	xmm_epi16 and_not(const xmm_epi16& other)
	{
		value_ = _mm_andnot_si128(other.value_, value_);
		return *this;
	}

	xmm_epi16 unpack_low(const xmm_epi16& other)
	{
		value_ = _mm_unpacklo_epi16(value_, other.value_);
		return *this;
	}

	xmm_epi16 unpack_high(const xmm_epi16& other)
	{
		value_ = _mm_unpackhi_epi16(value_, other.value_);
		return *this;
	}

	xmm_epi16 pack(const xmm_epi16& other)
	{
		value_ = _mm_packs_epi16(value_, other.value_);
		return *this;
	}

	xmm_epi16 max(const xmm_epi16& other)
	{
		value_ = _mm_max_epi16(value_, other.value_);
		return *this;
	}

	xmm_epi16 min(const xmm_epi16& other)
	{
		value_ = _mm_min_epi16(value_, other.value_);
		return *this;
	}

	int16_t operator[](int index) const
	{
		return value_.m128i_i16[index];
	}

	int16_t& operator[](int index)
	{
		return value_.m128i_i16[index];
	}

	static xmm_epi16 load(const void* source)
	{
		return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
	}

	static xmm_epi16 loadu(const void* source)
	{
		return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));
	}

	static xmm_epi32 horizontal_add(const xmm_epi16& lhs)
	{
		#ifdef SSIM_XOP
			return _mm_haddd_epi16(lhs.value_);	// AMD XOP horizontal add of adjacent 16-bit lanes.
		#else
			return _mm_madd_epi16(lhs.value_, _mm_set1_epi16(1));
		#endif
	}

	static xmm_epi16 multiply_low(const xmm_epi16& lhs, const xmm_epi16& rhs)
	{
		return xmm_epi16(lhs).multiply_low(rhs);
	}

	static xmm_epi16 multiply_high(const xmm_epi16& lhs, const xmm_epi16& rhs)
	{
		return xmm_epi16(lhs).multiply_high(rhs);
	}

	static xmm_epi16 umultiply_low(const xmm_epi16& lhs, const xmm_epi16& rhs)
	{
		return xmm_epi16(lhs).umultiply_low(rhs);
	}

	static xmm_epi16 umultiply_high(const xmm_epi16& lhs, const xmm_epi16& rhs)
	{
		return xmm_epi16(lhs).umultiply_high(rhs);
	}

	static xmm_epi16 unpack_low(const xmm_epi16& lhs, const xmm_epi16& rhs)
	{
		return xmm_epi16(lhs).unpack_low(rhs);
	}

	static xmm_epi16 unpack_high(const xmm_epi16& lhs, const xmm_epi16& rhs)
	{
		return xmm_epi16(lhs).unpack_high(rhs);
	}

	static xmm_epi16 pack(const xmm_epi16& lhs, const xmm_epi16& rhs)
	{
		return xmm_epi16(lhs).pack(rhs);
	}

	static xmm_epi16 and_not(const xmm_epi16& lhs, const xmm_epi16& rhs)
	{
		return xmm_epi16(lhs).and_not(rhs);
	}

	static xmm_epi16 max(const xmm_epi16& lhs, const xmm_epi16& rhs)
	{
		return xmm_epi16(lhs).max(rhs);
	}

	static xmm_epi16 min(const xmm_epi16& lhs, const xmm_epi16& rhs)
	{
		return xmm_epi16(lhs).min(rhs);
	}

	static xmm_epi16 zero()
	{
		return _mm_setzero_si128();
	}
};

inline xmm_epi16 operator+(const xmm_epi16& lhs, const xmm_epi16& rhs)
{
	return xmm_epi16(lhs) += rhs;
}

inline xmm_epi16 operator-(const xmm_epi16& lhs, const xmm_epi16& rhs)
{
	return xmm_epi16(lhs) -= rhs;
}

inline xmm_epi16 operator>>(const xmm_epi16& lhs, int count)
{
	return xmm_epi16(lhs) >>= count;
}

inline xmm_epi16 operator<<(const xmm_epi16& lhs, int count)
{
	return xmm_epi16(lhs) <<= count;
}

inline xmm_epi16 operator|(const xmm_epi16& lhs, const xmm_epi16& rhs)
{
	return xmm_epi16(lhs) |= rhs;
}

inline xmm_epi16 operator&(const xmm_epi16& lhs, const xmm_epi16& rhs)
{
	return xmm_epi16(lhs) &= rhs;
}

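// Value-type wrapper around __m128i interpreted as sixteen packed 8-bit integers.
// max/min use the signed SSE4.1 intrinsics (_mm_max_epi8/_mm_min_epi8) while
// umax/umin use the unsigned variants; shuffle() and multiply_add() map onto the
// SSSE3 intrinsics _mm_shuffle_epi8 and _mm_maddubs_epi16.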
class xmm_epi8
{
	__m128i value_;
	template<typename> friend struct xmm_cast_impl;
	friend xmm_epi16 multiply_add(const xmm_epi8&, const xmm_epi8&);
public:
	typedef xmm_epi8 xmm_epi_tag;

	xmm_epi8()
	{
	}

	xmm_epi8(__m128i value)
		: value_(value)
	{
	}

	xmm_epi8(char b)
		: value_(_mm_set1_epi8(b))
	{
	}

	xmm_epi8(char b3,  char b2,  char b1,  char b0)
		: value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))
	{
	}

	xmm_epi8(char b15, char b14, char b13, char b12,
			 char b11, char b10, char b9,  char b8,
			 char b7,  char b6,  char b5,  char b4,
			 char b3,  char b2,  char b1,  char b0)
		: value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))
	{
	}

	xmm_epi8& operator+=(const xmm_epi8& other)
	{
		value_ = _mm_add_epi8(value_, other.value_);
		return *this;
	}

	xmm_epi8& operator-=(const xmm_epi8& other)
	{
		value_ = _mm_sub_epi8(value_, other.value_);
		return *this;
	}

	xmm_epi8& shuffle(const xmm_epi8& other)
	{
		value_ = _mm_shuffle_epi8(value_, other.value_);
		return *this;
	}

	xmm_epi8& max(const xmm_epi8& other)
	{
		value_ = _mm_max_epi8(value_, other.value_);
		return *this;
	}

	xmm_epi8& min(const xmm_epi8& other)
	{
		value_ = _mm_min_epi8(value_, other.value_);
		return *this;
	}

	xmm_epi8& umax(const xmm_epi8& other)
	{
		value_ = _mm_max_epu8(value_, other.value_);
		return *this;
	}

	xmm_epi8& umin(const xmm_epi8& other)
	{
		value_ = _mm_min_epu8(value_, other.value_);
		return *this;
	}

	xmm_epi8& blend(const xmm_epi8& other, const xmm_epi8& mask)
	{
		value_ = _mm_blendv_epi8(value_, other.value_, mask.value_);
		return *this;
	}

	const xmm_epi8& stream(void* dest) const
	{
		_mm_stream_si128(reinterpret_cast<__m128i*>(dest), value_);
		return *this;
	}

	char operator[](int index) const
	{
		return value_.m128i_i8[index];
	}

	char& operator[](int index)
	{
		return value_.m128i_i8[index];
	}

	static const xmm_epi8& stream(const xmm_epi8& source, void* dest)
	{
		source.stream(dest);
		return source;
	}

	static xmm_epi8 load(const void* source)
	{
		return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
	}

	static xmm_epi8 loadu(const void* source)
	{
		return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));
	}

	static xmm_epi16 multiply_add(const xmm_epi8& lhs, const xmm_epi8& rhs)
	{
		return xmm_epi16(_mm_maddubs_epi16(lhs.value_, rhs.value_));
	}

	// These helpers return by value: they operate on a local temporary, so a
	// reference return type would dangle.
	static xmm_epi8 shuffle(const xmm_epi8& lhs, const xmm_epi8& rhs)
	{
		return xmm_epi8(lhs).shuffle(rhs);
	}

	static xmm_epi8 max(const xmm_epi8& lhs, const xmm_epi8& rhs)
	{
		return xmm_epi8(lhs).max(rhs);
	}

	static xmm_epi8 min(const xmm_epi8& lhs, const xmm_epi8& rhs)
	{
		return xmm_epi8(lhs).min(rhs);
	}

	static xmm_epi8 umax(const xmm_epi8& lhs, const xmm_epi8& rhs)
	{
		return xmm_epi8(lhs).umax(rhs);
	}

	static xmm_epi8 umin(const xmm_epi8& lhs, const xmm_epi8& rhs)
	{
		return xmm_epi8(lhs).umin(rhs);
	}

	static xmm_epi8 blend(const xmm_epi8& lhs, const xmm_epi8& rhs, const xmm_epi8& mask)
	{
		return xmm_epi8(lhs).blend(rhs, mask);
	}

	static xmm_epi8 zero()
	{
		return _mm_setzero_si128();
	}
};

inline xmm_epi8 operator+(const xmm_epi8& lhs, const xmm_epi8& rhs)
{
	return xmm_epi8(lhs) += rhs;
}

inline xmm_epi8 operator-(const xmm_epi8& lhs, const xmm_epi8& rhs)
{
	return xmm_epi8(lhs) -= rhs;
}

// xmm_cast

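// The generic overload reinterprets the raw 128-bit register as the target
// integer wrapper; the xmm_ps specialization is the only value-changing
// conversion and maps epi32 -> ps via _mm_cvtepi32_ps.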
template<typename T>
struct xmm_cast_impl
{
	template<typename U>
	T operator()(const U& other)
	{
		return typename T::xmm_epi_tag(other.value_);
	}
};

template<>
struct xmm_cast_impl<xmm_ps>
{
	xmm_ps operator()(const xmm_epi32& other)
	{
		return _mm_cvtepi32_ps(other.value_);
	}
};

template<typename T, typename U>
T xmm_cast(const U& other)
{
	return xmm_cast_impl<T>()(other);
}
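
// Usage sketch (illustrative only, not part of the original header): convert
// four packed 32-bit integers to floats, scale them and stream the result.
//
//   xmm_epi32 i = xmm_epi32::load(src);                       // src: 16-byte aligned
//   xmm_ps    f = xmm_cast<xmm_ps>(i) * xmm_ps(1.0f / 255.0f);
//   xmm_ps::stream(f, dest);                                  // dest: 16-byte aligned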

}}}