git.sesse.net Git - casparcg/blob - accelerator/cpu/util/xmm.h

   1 #pragma once
   2
   3 #include <intrin.h>
   4 #include <type_traits>
   5
   6 namespace caspar { namespace accelerator { namespace cpu { namespace xmm {
   7
   8 struct temporal_tag
   9 {
  10         static const int value = 0x01;
  11 };
  12 struct nontemporal_tag
  13 {
  14         static const int value = 0x02;
  15 };
  16 struct aligned_tag
  17 {
  18         static const int value = 0x01;
  19 };
  20 struct unaligned_tag
  21 {
  22         static const int value = 0x02;
  23 };
  24
  25 class s32_x;
  26 class s16_x;
  27 class  s8_x;
  28 class  u8_x;
  29
  30 template<typename T>
  31 class base_x
  32 {
  33 public:
  34         template<typename temporal, typename alignment>
  35         static T load(const void* source);
  36
  37         template<typename temporal,typename alignment>
  38         static void store(const T& source, void* dest);
  39
  40         static T zero();
  41 };
  42
  43 class s32_x : public base_x<s32_x>
  44 {
  45         __m128i value_;
  46         template<typename> friend class base_x;
  47         friend class s16_x;
  48         friend class s8_x;
  49         friend class u8_x;
  50 public:
  51         typedef s32_x xmm_epi_tag;
  52
  53         s32_x();
  54         explicit s32_x(const s16_x& other);
  55         explicit s32_x(const s8_x& other);
  56         explicit s32_x(const u8_x& other);
  57         s32_x(const __m128i& value);
  58
  59         s32_x& operator>>=(int count);
  60         s32_x& operator<<=(int count);
  61         s32_x& operator|=(const s32_x& other);
  62         s32_x& operator&=(const s32_x& other);
  63         int32_t operator[](int index) const;
  64         int32_t& operator[](int index);
  65 };
  66
  67 class s16_x : public base_x<s16_x>
  68 {
  69         __m128i value_;
  70
  71 private:
  72         template<typename> friend class base_x;
  73         friend class s32_x;
  74         friend class s8_x;
  75         friend class u8_x;
  76 public:
  77         typedef s16_x xmm_epi_tag;
  78
  79         s16_x();
  80         explicit s16_x(const s32_x& other);
  81         explicit s16_x(const s8_x& other);
  82         explicit s16_x(const u8_x& other);
  83         s16_x(const __m128i& value);
  84         s16_x(short value);
  85
  86         s16_x& operator+=(const s16_x& other);
  87         s16_x& operator-=(const s16_x& other);
  88         s16_x& operator>>=(int count);
  89         s16_x& operator<<=(int count);
  90         s16_x& operator|=(const s16_x& other);
  91         s16_x& operator&=(const s16_x& other);
  92         int16_t operator[](int index) const;
  93         int16_t& operator[](int index);
  94
  95         static s16_x unpack_low(const s8_x& lhs, const s8_x& rhs);
  96         static s16_x unpack_high(const s8_x& lhs, const s8_x& rhs);
  97         static s32_x horizontal_add(const s16_x& lhs);
  98         static s16_x multiply_low(const s16_x& lhs, const s16_x& rhs);
  99         static s16_x multiply_high(const s16_x& lhs, const s16_x& rhs);
 100         static s16_x umultiply_low(const s16_x& lhs, const s16_x& rhs);
 101         static s16_x umultiply_high(const s16_x& lhs, const s16_x& rhs);
 102         static s16_x unpack_low(const s16_x& lhs, const s16_x& rhs);
 103         static s16_x unpack_high(const s16_x& lhs, const s16_x& rhs);
 104         static s16_x and_not(const s16_x& lhs, const s16_x& rhs);
 105         static s16_x max(const s16_x& lhs, const s16_x& rhs);
 106         static s16_x min(const s16_x& lhs, const s16_x& rhs);
 107 };
 108
 109 template<typename T>
 110 class base8_x : public base_x<s8_x>
 111 {
 112         char operator[](int index) const;
 113         char& operator[](int index);
 114 };
 115
 116 class s8_x : public base_x<s8_x>
 117 {
 118         __m128i value_;
 119 private:
 120         template<typename> friend class base_x;
 121         friend class s32_x;
 122         friend class s16_x;
 123         friend class u8_x;
 124 public:
 125         typedef s8_x xmm_epi_tag;
 126
 127         s8_x();
 128         explicit s8_x(const s32_x& other);
 129         explicit s8_x(const s16_x& other);
 130         explicit s8_x(const u8_x& other);
 131         s8_x(const __m128i& value);
 132         s8_x(char b);
 133         s8_x(char b3,  char b2,  char b1,  char b0);
 134         s8_x(char b15, char b14, char b13, char b12,
 135                  char b11, char b10, char b9,  char b8,
 136                  char b7,  char b6,  char b5,  char b4,
 137                  char b3,  char b2,  char b1,  char b0);
 138
 139         s8_x& operator+=(const s8_x& other);
 140         s8_x& operator-=(const s8_x& other);
 141         char operator[](int index) const;
 142         char& operator[](int index);
 143
 144         static s8_x upack(const s16_x& lhs, const s16_x& rhs);
 145
 146         static s16_x multiply_add(const u8_x& lhs, const s8_x& rhs);
 147         static s8_x max(const s8_x& lhs, const s8_x& rhs);
 148         static s8_x min(const s8_x& lhs, const s8_x& rhs);
 149
 150         static s8_x shuffle(const s8_x& lhs, const s8_x& rhs);
 151         static s8_x blend(const s8_x& lhs, const s8_x& rhs, const s8_x& mask);
 152 };
 153
 154 class u8_x : public base_x<u8_x>
 155 {
 156         __m128i value_;
 157 private:
 158         template<typename> friend class base_x;
 159         friend class s32_x;
 160         friend class s16_x;
 161         friend class s8_x;
 162 public:
 163         typedef u8_x xmm_epu_tag;
 164
 165         u8_x();
 166         explicit u8_x(const s32_x& other);
 167         explicit u8_x(const s16_x& other);
 168         explicit u8_x(const s8_x& other);
 169         u8_x(const __m128i& value);
 170         u8_x(char b);
 171         u8_x(char b3,  char b2,  char b1,  char b0);
 172         u8_x(char b15, char b14, char b13, char b12,
 173                  char b11, char b10, char b9,  char b8,
 174                  char b7,  char b6,  char b5,  char b4,
 175                  char b3,  char b2,  char b1,  char b0);
 176
 177         char operator[](int index) const;
 178         char& operator[](int index);
 179
 180         static u8_x max(const u8_x& lhs, const u8_x& rhs);
 181         static u8_x min(const u8_x& lhs, const u8_x& rhs);
 182
 183         static u8_x shuffle(const u8_x& lhs, const u8_x& rhs);
 184         static u8_x blend(const u8_x& lhs, const u8_x& rhs, const u8_x& mask);
 185 };
 186
 187 // base_x
 188
 189 template<typename T>
 190 template<typename temporal, typename alignment>
 191 T base_x<T>::load(const void* source)
 192 {
 193         static_assert(temporal::value != nontemporal_tag::value, "streaming loads not supported");
 194         if(alignment::value == aligned_tag::value)
 195                 return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
 196         else
 197                 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));
 198 }
 199
 200 template<typename T>
 201 template<typename temporal, typename alignment>
 202 void base_x<T>::store(const T& source, void* dest)
 203 {
 204         if(temporal::value == nontemporal_tag::value && alignment::value == aligned_tag::value)
 205                 _mm_stream_si128(reinterpret_cast<__m128i*>(dest), source.value_);
 206         else if(alignment::value == aligned_tag::value)
 207                 _mm_store_si128(reinterpret_cast<__m128i*>(dest), source.value_);
 208         else
 209                 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest), source.value_);
 210 }
 211
 212 template<typename T>
 213 T base_x<T>::zero()
 214 {
 215         return _mm_setzero_si128();
 216 }
 217
 218 // s32_x
 219
 220 s32_x::s32_x()
 221 {
 222 }
 223
 224 s32_x::s32_x(const s16_x& other)
 225         : value_(other.value_)
 226 {
 227 }
 228
 229 s32_x::s32_x(const s8_x& other)
 230         : value_(other.value_)
 231 {
 232 }
 233
 234 s32_x::s32_x(const u8_x& other)
 235         : value_(other.value_)
 236 {
 237 }
 238
 239 s32_x::s32_x(const __m128i& value)
 240         : value_(value)
 241 {
 242 }
 243
 244 s32_x& s32_x::operator>>=(int count)
 245 {
 246         value_ = _mm_srli_epi32(value_, count);
 247         return *this;
 248 }
 249
 250 s32_x& s32_x::operator<<=(int count)
 251 {
 252         value_ = _mm_slli_epi32(value_, count);
 253         return *this;
 254 }
 255
 256 s32_x& s32_x::operator|=(const s32_x& other)
 257 {
 258         value_ = _mm_or_si128(value_, other.value_);
 259         return *this;
 260 }
 261
 262 s32_x& s32_x::operator&=(const s32_x& other)
 263 {
 264         value_ = _mm_and_si128(value_, other.value_);
 265         return *this;
 266 }
 267
 268 int32_t s32_x::operator[](int index) const
 269 {
 270         return value_.m128i_i32[index];
 271 }
 272
 273 int32_t& s32_x::operator[](int index)
 274 {
 275         return value_.m128i_i32[index];
 276 }
 277
 278 inline s32_x operator>>(const s32_x& lhs, int count)
 279 {
 280         return s32_x(lhs) >>= count;
 281 }
 282
 283 inline s32_x operator<<(const s32_x& lhs, int count)
 284 {
 285         return s32_x(lhs) <<= count;
 286 }
 287
 288 inline s32_x operator|(const s32_x& lhs, const s32_x& rhs)
 289 {
 290         return s32_x(lhs) |= rhs;
 291 }
 292
 293 inline s32_x operator&(const s32_x& lhs, const s32_x& rhs)
 294 {
 295         return s32_x(lhs) &= rhs;
 296 }
 297
 298 // s16_x
 299
 300 s16_x::s16_x()
 301 {
 302 }
 303
 304 s16_x::s16_x(const s32_x& other)
 305         : value_(other.value_)
 306 {
 307 }
 308
 309 s16_x::s16_x(const s8_x& other)
 310         : value_(other.value_)
 311 {
 312 }
 313
 314 s16_x::s16_x(const u8_x& other)
 315         : value_(other.value_)
 316 {
 317 }
 318
 319 s16_x::s16_x(const __m128i& value)
 320         : value_(value)
 321 {
 322 }
 323
 324 s16_x::s16_x(short value)
 325         : value_(_mm_set1_epi16(value))
 326 {
 327 }
 328
 329 s16_x& s16_x::operator+=(const s16_x& other)
 330 {
 331         value_ = _mm_add_epi16(value_, other.value_);
 332         return *this;
 333 }
 334
 335 s16_x& s16_x::operator-=(const s16_x& other)
 336 {
 337         value_ = _mm_sub_epi16(value_, other.value_);
 338         return *this;
 339 }
 340
 341 s16_x& s16_x::operator>>=(int count)
 342 {
 343         value_ = _mm_srli_epi16(value_, count);
 344         return *this;
 345 }
 346
 347 s16_x& s16_x::operator<<=(int count)
 348 {
 349         value_ = _mm_slli_epi16(value_, count);
 350         return *this;
 351 }
 352
 353 s16_x& s16_x::operator|=(const s16_x& other)
 354 {
 355         value_ = _mm_or_si128(value_, other.value_);
 356         return *this;
 357 }
 358
 359 s16_x& s16_x::operator&=(const s16_x& other)
 360 {
 361         value_ = _mm_and_si128(value_, other.value_);
 362         return *this;
 363 }
 364
 365 int16_t s16_x::operator[](int index) const
 366 {
 367         return value_.m128i_i16[index];
 368 }
 369
 370 int16_t& s16_x::operator[](int index)
 371 {
 372         return value_.m128i_i16[index];
 373 }
 374
 375 s16_x s16_x::unpack_low(const s8_x& lhs, const s8_x& rhs)
 376 {
 377         return _mm_unpacklo_epi8(rhs.value_, lhs.value_);
 378 }
 379
 380 s16_x s16_x::unpack_high(const s8_x& lhs, const s8_x& rhs)
 381 {
 382         return _mm_unpackhi_epi8(rhs.value_, lhs.value_);
 383 }
 384
 385 s32_x s16_x::horizontal_add(const s16_x& lhs)
 386 {
 387         #ifdef SSIM_XOP
 388                         return _mm_haddd_epi16(value_);
 389         #else
 390                         return _mm_madd_epi16(lhs.value_, _mm_set1_epi16(1));
 391         #endif
 392 }
 393
 394 s16_x s16_x::multiply_low(const s16_x& lhs, const s16_x& rhs)
 395 {
 396         return _mm_mullo_epi16(lhs.value_, rhs.value_);
 397 }
 398
 399 s16_x s16_x::multiply_high(const s16_x& lhs, const s16_x& rhs)
 400 {
 401         return _mm_mulhi_epi16(lhs.value_, rhs.value_);
 402 }
 403
 404 s16_x s16_x::unpack_low(const s16_x& lhs, const s16_x& rhs)
 405 {
 406         return _mm_unpacklo_epi16(lhs.value_, rhs.value_);
 407 }
 408
 409 s16_x s16_x::unpack_high(const s16_x& lhs, const s16_x& rhs)
 410 {
 411         return _mm_unpackhi_epi16(lhs.value_, rhs.value_);
 412 }
 413
 414 s16_x s16_x::and_not(const s16_x& lhs, const s16_x& rhs)
 415 {
 416         return _mm_andnot_si128(lhs.value_, rhs.value_);
 417 }
 418
 419 s16_x s16_x::max(const s16_x& lhs, const s16_x& rhs)
 420 {
 421         return _mm_max_epi16(lhs.value_, rhs.value_);
 422 }
 423
 424 s16_x s16_x::min(const s16_x& lhs, const s16_x& rhs)
 425 {
 426         return _mm_min_epi16(lhs.value_, rhs.value_);
 427 }
 428
 429 inline s16_x operator+(const s16_x& lhs, const s16_x& rhs)
 430 {
 431         return s16_x(lhs) += rhs;
 432 }
 433
 434 inline s16_x operator-(const s16_x& lhs, const s16_x& rhs)
 435 {
 436         return s16_x(lhs) -= rhs;
 437 }
 438
 439 inline s16_x operator>>(const s16_x& lhs, int count)
 440 {
 441         return s16_x(lhs) >>= count;
 442 }
 443
 444 inline s16_x operator<<(const s16_x& lhs, int count)
 445 {
 446         return s16_x(lhs) <<= count;
 447 }
 448
 449 inline s16_x operator|(const s16_x& lhs, const s16_x& rhs)
 450 {
 451         return s16_x(lhs) |= rhs;
 452 }
 453
 454 inline s16_x operator&(const s16_x& lhs, const s16_x& rhs)
 455 {
 456         return s16_x(lhs) &= rhs;
 457 }
 458
 459 // s8_x
 460
 461 s8_x::s8_x()
 462 {
 463 }
 464
 465 s8_x::s8_x(const s32_x& other)
 466         : value_(other.value_)
 467 {
 468 }
 469
 470 s8_x::s8_x(const s16_x& other)
 471         : value_(other.value_)
 472 {
 473 }
 474
 475 s8_x::s8_x(const u8_x& other)
 476         : value_(other.value_)
 477 {
 478 }
 479
 480 s8_x::s8_x(const __m128i& value)
 481         : value_(value)
 482 {
 483 }
 484
 485 s8_x::s8_x(char b)
 486         : value_(_mm_set1_epi8(b))
 487 {
 488 }
 489
 490 s8_x::s8_x(char b3,  char b2,  char b1,  char b0)
 491         : value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))
 492 {
 493 }
 494
 495 s8_x::s8_x(char b15, char b14, char b13, char b12,
 496                    char b11, char b10, char b9,  char b8,
 497                    char b7,  char b6,  char b5,  char b4,
 498                    char b3,  char b2,  char b1,  char b0)
 499         : value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))
 500 {
 501 }
 502
 503 s8_x& s8_x::operator+=(const s8_x& other)
 504 {
 505         value_ = _mm_add_epi8(value_, other.value_);
 506         return *this;
 507 }
 508
 509 s8_x& s8_x::operator-=(const s8_x& other)
 510 {
 511         value_ = _mm_sub_epi8(value_, other.value_);
 512         return *this;
 513 }
 514
 515 char s8_x::operator[](int index) const
 516 {
 517         return value_.m128i_i8[index];
 518 }
 519
 520 char& s8_x::operator[](int index)
 521 {
 522         return value_.m128i_i8[index];
 523 }
 524
 525 s8_x s8_x::upack(const s16_x& lhs, const s16_x& rhs)
 526 {
 527         return _mm_packus_epi16(lhs.value_, rhs.value_);
 528 }
 529
 530 s16_x s8_x::multiply_add(const u8_x& lhs, const s8_x& rhs)
 531 {
 532         return _mm_maddubs_epi16(lhs.value_, rhs.value_);
 533 }
 534
 535 s8_x s8_x::max(const s8_x& lhs, const s8_x& rhs)
 536 {
 537         return _mm_max_epi8(lhs.value_, rhs.value_);
 538 }
 539
 540 s8_x s8_x::min(const s8_x& lhs, const s8_x& rhs)
 541 {
 542         return _mm_min_epi8(lhs.value_, rhs.value_);
 543 }
 544
 545 inline s8_x operator+(const s8_x& lhs, const s8_x& rhs)
 546 {
 547         return s8_x(lhs) += rhs;
 548 }
 549
 550 inline s8_x operator-(const s8_x& lhs, const s8_x& rhs)
 551 {
 552         return s8_x(lhs) -= rhs;
 553 }
 554
 555 s8_x s8_x::shuffle(const s8_x& lhs, const s8_x& rhs)
 556 {
 557         return _mm_shuffle_epi8(lhs.value_, rhs.value_);
 558 }
 559
 560 s8_x s8_x::blend(const s8_x& lhs, const s8_x& rhs, const s8_x& mask)
 561 {
 562         return _mm_blendv_epi8(lhs.value_, rhs.value_, mask.value_);
 563 }
 564
 565 // u8_x
 566
 567 u8_x::u8_x()
 568 {
 569 }
 570
 571 u8_x::u8_x(const s32_x& other)
 572         : value_(other.value_)
 573 {
 574 }
 575
 576 u8_x::u8_x(const s16_x& other)
 577         : value_(other.value_)
 578 {
 579 }
 580
 581 u8_x::u8_x(const s8_x& other)
 582         : value_(other.value_)
 583 {
 584 }
 585
 586 u8_x::u8_x(const __m128i& value)
 587         : value_(value)
 588 {
 589 }
 590
 591 u8_x::u8_x(char b)
 592         : value_(_mm_set1_epi8(b))
 593 {
 594 }
 595
 596 u8_x::u8_x(char b3,  char b2,  char b1,  char b0)
 597         : value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))
 598 {
 599 }
 600
 601 u8_x::u8_x(char b15, char b14, char b13, char b12,
 602                    char b11, char b10, char b9,  char b8,
 603                    char b7,  char b6,  char b5,  char b4,
 604                    char b3,  char b2,  char b1,  char b0)
 605         : value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))
 606 {
 607 }
 608
 609 char u8_x::operator[](int index) const
 610 {
 611         return value_.m128i_i8[index];
 612 }
 613
 614 char& u8_x::operator[](int index)
 615 {
 616         return value_.m128i_i8[index];
 617 }
 618
 619 u8_x u8_x::max(const u8_x& lhs, const u8_x& rhs)
 620 {
 621         return _mm_max_epu8(lhs.value_, rhs.value_);
 622 }
 623
 624 u8_x u8_x::min(const u8_x& lhs, const u8_x& rhs)
 625 {
 626         return _mm_min_epu8(lhs.value_, rhs.value_);
 627 }
 628
 629 u8_x u8_x::shuffle(const u8_x& lhs, const u8_x& rhs)
 630 {
 631         return _mm_shuffle_epi8(lhs.value_, rhs.value_);
 632 }
 633
 634 u8_x u8_x::blend(const u8_x& lhs, const u8_x& rhs, const u8_x& mask)
 635 {
 636         return _mm_blendv_epi8(lhs.value_, rhs.value_, mask.value_);
 637 }
 638
 639 // xmm_cast
 640
 641 //template<typename T>
 642 //struct xmm_cast_impl
 643 //{
 644 //      template<typename U>
 645 //      T operator()(const U& other)
 646 //      {
 647 //              return typename T::xmm_epi_tag(other.value_);
 648 //      }
 649 //};
 650 //
 651 //template<>
 652 //struct xmm_cast_impl<xmm_ps>
 653 //{
 654 //      xmm_ps operator()(const s32_x& other)
 655 //      {
 656 //              return _mm_cvtepi32_ps(other.value_);
 657 //      }
 658 //};
 659 //
 660 //template<typename T, typename U>
 661 //T xmm_cast(const U& other)
 662 //{
 663 //      return xmm_cast_impl<T>()(other);
 664 //}
 665
 666 }}}}