git.sesse.net Git - casparcg/commitdiff
git-svn-id: https://casparcg.svn.sourceforge.net/svnroot/casparcg/server/branches...
author ronag <ronag@362d55ac-95cf-4e76-9f9a-cbaa9c17b72d>
Fri, 10 Feb 2012 10:18:36 +0000 (10:18 +0000)
committer ronag <ronag@362d55ac-95cf-4e76-9f9a-cbaa9c17b72d>
Fri, 10 Feb 2012 10:18:36 +0000 (10:18 +0000)
accelerator/accelerator.vcxproj
accelerator/accelerator.vcxproj.filters
accelerator/cpu/image/image_mixer.cpp
accelerator/cpu/util/simd.h [deleted file]
accelerator/cpu/util/xmm.h [new file with mode: 0644]

index 0702737c865262b252c182b3d4fdcd27098cb221..792bfef666ddcc89787cbe699aec0e3f517cf81c 100644 (file)
   <ItemGroup>\r
     <ClInclude Include="cpu\factory.h" />\r
     <ClInclude Include="cpu\image\image_mixer.h" />\r
-    <ClInclude Include="cpu\util\simd.h" />\r
+    <ClInclude Include="cpu\util\xmm.h" />\r
     <ClInclude Include="cpu\util\write_frame.h" />\r
     <ClInclude Include="factory.h" />\r
     <ClInclude Include="ogl\factory.h" />\r
index 55672d9839e7690871ced85d54e37f05cf2fec87..74a5d1d3b570ae45d1d441eab6038b16420b1a7f 100644 (file)
@@ -25,9 +25,6 @@
   </ItemGroup>\r
   <ItemGroup>\r
     <ClInclude Include="StdAfx.h" />\r
-    <ClInclude Include="cpu\util\simd.h">\r
-      <Filter>source\cpu\util</Filter>\r
-    </ClInclude>\r
     <ClInclude Include="ogl\image\image_mixer.h">\r
       <Filter>source\ogl\image</Filter>\r
     </ClInclude>\r
@@ -70,6 +67,9 @@
     <ClInclude Include="ogl\factory.h">\r
       <Filter>source\ogl</Filter>\r
     </ClInclude>\r
+    <ClInclude Include="cpu\util\xmm.h">\r
+      <Filter>source\cpu\util</Filter>\r
+    </ClInclude>\r
   </ItemGroup>\r
   <ItemGroup>\r
     <ClCompile Include="StdAfx.cpp" />\r
index 21ddadada0a8eac3947bf9059537032cfa5288df..054f7f59dcded928ac448bdb44a98c6d96661739 100644 (file)
@@ -24,7 +24,7 @@
 #include "image_mixer.h"\r
 \r
 #include "../util/write_frame.h"\r
-#include "../util/simd.h"\r
+#include "../util/xmm.h"\r
 \r
 #include <common/assert.h>\r
 #include <common/gl/gl_check.h>\r
@@ -94,50 +94,54 @@ bool operator!=(const item& lhs, const item& rhs)
        return !(lhs == rhs);\r
 }\r
        \r
-inline xmm_epi8 blend(xmm_epi8 dest, xmm_epi8 source)\r
+inline xmm::s8_x blend(xmm::s8_x dest, xmm::s8_x source)\r
 {      \r
-       auto s = xmm_cast<xmm_epi16>(source);\r
+       using namespace xmm;\r
+\r
+       auto s = s16_x(source);\r
        auto d = dest;\r
 \r
-       const xmm_epi16 round   = 128;\r
-       const xmm_epi16 lomask  = 0x00FF;\r
+       const s16_x round       = 128;\r
+       const s16_x lomask      = 0x00FF;\r
 \r
        // T(S, D) = S * D[A] + 0x80\r
-       auto aaaa   = xmm_epi8::shuffle(d, xmm_epi8(15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3));\r
-       d                       = xmm_epi8::umin(d, aaaa); // overflow guard\r
+       auto aaaa   = s8_x::shuffle(d, s8_x(15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3));\r
+       d                       = s8_x(u8_x::min(u8_x(d), u8_x(aaaa))); // overflow guard\r
 \r
-       auto xaxa       = xmm_cast<xmm_epi16>(aaaa) & lomask;           \r
+       auto xaxa       = s16_x(aaaa) & lomask;         \r
                              \r
        auto xrxb       = s & lomask;\r
-       auto t1         = xmm_epi16::multiply_low(xrxb, xaxa) + round;    \r
+       auto t1         = s16_x::multiply_low(xrxb, xaxa) + round;    \r
                        \r
        auto xaxg       = s >> 8;\r
-       auto t2         = xmm_epi16::multiply_low(xaxg, xaxa) + round;\r
+       auto t2         = s16_x::multiply_low(xaxg, xaxa) + round;\r
                \r
        // C(S, D) = S + D - (((T >> 8) + T) >> 8);\r
-       auto rxbx       = xmm_cast<xmm_epi8>(((t1 >> 8) + t1) >> 8);      \r
-       auto axgx       = xmm_cast<xmm_epi8>((t2 >> 8) + t2);    \r
-       auto argb   = xmm_epi8::blend(rxbx, axgx, xmm_epi8(-1, 0, -1, 0));\r
+       auto rxbx       = s8_x(((t1 >> 8) + t1) >> 8);      \r
+       auto axgx       = s8_x((t2 >> 8) + t2);    \r
+       auto argb   = s8_x::blend(rxbx, axgx, s8_x(-1, 0, -1, 0));\r
 \r
-       return xmm_cast<xmm_epi8>(s) + (d - argb);\r
+       return s8_x(s) + (d - argb);\r
 }\r
        \r
-template<typename write_op>\r
+template<typename write_tag>\r
 static void kernel(uint8_t* dest, const uint8_t* source, size_t count, const core::frame_transform& transform)\r
-{                              \r
+{                      \r
+       using namespace xmm;\r
+\r
        for(auto n = 0; n < count; n += 32)    \r
        {\r
-               auto s0 = xmm_epi8::load(dest+n+0);\r
-               auto s1 = xmm_epi8::load(dest+n+16);\r
+               auto s0 = s8_x::load(dest+n+0);\r
+               auto s1 = s8_x::load(dest+n+16);\r
 \r
-               auto d0 = xmm_epi8::load(source+n+0);\r
-               auto d1 = xmm_epi8::load(source+n+16);\r
+               auto d0 = s8_x::load(source+n+0);\r
+               auto d1 = s8_x::load(source+n+16);\r
                \r
                auto argb0 = blend(d0, s0);\r
                auto argb1 = blend(d1, s1);\r
 \r
-               xmm_epi8::write<write_op>(argb0, dest+n+0);\r
-               xmm_epi8::write<write_op>(argb1, dest+n+16);\r
+               s8_x::write(argb0, dest+n+0 , write_tag());\r
+               s8_x::write(argb1, dest+n+16, write_tag());\r
        } \r
 }\r
 \r
@@ -208,9 +212,9 @@ private:
 \r
                                auto it = items.begin();\r
                                for(; it != items.end()-1; ++it)                        \r
-                                       kernel<store_write>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
+                                       kernel<xmm::store_tag>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
 \r
-                               kernel<stream_write>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
+                               kernel<xmm::stream_tag>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
                        }\r
                });\r
        }\r
diff --git a/accelerator/cpu/util/simd.h b/accelerator/cpu/util/simd.h
deleted file mode 100644 (file)
index 8dd77fa..0000000
+++ /dev/null
@@ -1,701 +0,0 @@
-#pragma once\r
-\r
-#include <intrin.h>\r
-\r
-#include <type_traits>\r
-#include <vector>\r
-#include <tbb/cache_aligned_allocator.h>\r
-\r
-namespace caspar { namespace accelerator { namespace cpu {\r
-\r
-typedef std::vector<float, tbb::cache_aligned_allocator<float>> vector_ps;\r
-\r
-struct stream_write\r
-{\r
-       static const int value = 0x01;\r
-};\r
-struct store_write\r
-{\r
-       static const int value = 0x02;\r
-};\r
-\r
-class xmm_ps\r
-{\r
-       __m128 value_;\r
-public:\r
-       xmm_ps()\r
-       {\r
-       }\r
-       \r
-       xmm_ps(float value_)\r
-               : value_(_mm_set1_ps(value_))\r
-       {\r
-       }\r
-       \r
-       xmm_ps(__m128 value_)\r
-               : value_(value_)\r
-       {\r
-       }\r
-               \r
-       xmm_ps& operator+=(const xmm_ps& other)\r
-       {\r
-               value_ = _mm_add_ps(value_, other.value_);\r
-               return *this;\r
-       }\r
-\r
-       xmm_ps& operator-=(const xmm_ps& other)\r
-       {\r
-               value_ = _mm_sub_ps(value_, other.value_);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_ps& operator*=(const xmm_ps& other)\r
-       {\r
-               value_ = _mm_mul_ps(value_, other.value_);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_ps& operator/=(const xmm_ps& other)\r
-       {\r
-               value_ = _mm_div_ps(value_, other.value_);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_ps& horizontal_add(const xmm_ps& other)\r
-       {\r
-               value_ = _mm_hadd_ps(value_, other.value_);\r
-               return *this;\r
-       }\r
-\r
-       xmm_ps& horizontal_sub(const xmm_ps& other)\r
-       {\r
-               value_ = _mm_hsub_ps(value_, other.value_);\r
-               return *this;\r
-       }\r
-\r
-       xmm_ps unpack_low(const xmm_ps& other)\r
-       {               \r
-               value_ = _mm_unpacklo_ps(value_, other.value_);\r
-               return *this;\r
-       }\r
-\r
-       xmm_ps unpack_high(const xmm_ps& other)\r
-       {               \r
-               value_ = _mm_unpackhi_ps(value_, other.value_);\r
-               return *this;\r
-       }\r
-       \r
-       float operator[](int index) const\r
-       {\r
-               return value_.m128_f32[index];\r
-       }\r
-\r
-       float& operator[](int index)\r
-       {\r
-               return value_.m128_f32[index];\r
-       }\r
-\r
-       static xmm_ps zero()\r
-       {\r
-               return _mm_setzero_ps();\r
-       }\r
-\r
-       static xmm_ps load(const float* ptr)\r
-       {\r
-               return _mm_load_ps(ptr);\r
-       }\r
-       \r
-       static xmm_ps loadu(const float* ptr)\r
-       {\r
-               return _mm_loadu_ps(ptr);\r
-       }\r
-\r
-       static void stream(const xmm_ps& source, float* dest)\r
-       {\r
-               _mm_stream_ps(dest, source.value_);\r
-       }\r
-               \r
-       static xmm_ps horizontal_add(const xmm_ps& lhs, const xmm_ps& rhs)\r
-       {\r
-               return xmm_ps(lhs).horizontal_add(rhs);\r
-       }\r
-\r
-       static xmm_ps horizontal_sub(const xmm_ps& lhs, const xmm_ps& rhs)\r
-       {\r
-               return xmm_ps(lhs).horizontal_sub(rhs);\r
-       }\r
-\r
-       static xmm_ps unpack_low(const xmm_ps& lhs, const xmm_ps& rhs)\r
-       {\r
-               return xmm_ps(lhs).unpack_low(rhs);\r
-       }\r
-\r
-       static xmm_ps unpack_high(const xmm_ps& lhs, const xmm_ps& rhs)\r
-       {\r
-               return xmm_ps(lhs).unpack_high(rhs);\r
-       }\r
-};\r
-       \r
-inline xmm_ps operator+(const xmm_ps& lhs, const xmm_ps& rhs)\r
-{              \r
-       return xmm_ps(lhs) += rhs;\r
-}\r
-\r
-inline xmm_ps operator-(const xmm_ps& lhs, const xmm_ps& rhs)\r
-{              \r
-       return xmm_ps(lhs) -= rhs;\r
-}\r
-\r
-inline xmm_ps operator*(const xmm_ps& lhs, const xmm_ps& rhs)\r
-{              \r
-       return xmm_ps(lhs) *= rhs;\r
-}\r
-\r
-inline xmm_ps operator/(const xmm_ps& lhs, const xmm_ps& rhs)\r
-{              \r
-       return xmm_ps(lhs) /= rhs;\r
-}\r
-\r
-class xmm_epi32\r
-{\r
-       __m128i value_;\r
-       template<typename> friend struct xmm_cast_impl;\r
-public:\r
-       typedef xmm_epi32 xmm_epi_tag;\r
-\r
-       xmm_epi32()\r
-       {\r
-       }\r
-\r
-       xmm_epi32(__m128i value)\r
-               : value_(value)\r
-       {\r
-       }\r
-       \r
-       xmm_epi32& operator>>=(int count)\r
-       {\r
-               value_ = _mm_srli_epi32(value_, count);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_epi32& operator<<=(int count)\r
-       {\r
-               value_ = _mm_slli_epi32(value_, count);\r
-               return *this;\r
-       }\r
-               \r
-       xmm_epi32& operator|=(const xmm_epi32& other)\r
-       {\r
-               value_ = _mm_or_si128(value_, other.value_);\r
-               return *this;\r
-       }       \r
-       \r
-       xmm_epi32& operator&=(const xmm_epi32& other)\r
-       {\r
-               value_ = _mm_and_si128(value_, other.value_);\r
-               return *this;\r
-       }       \r
-\r
-       static xmm_epi32 load(const void* source)\r
-       {\r
-               return _mm_load_si128(reinterpret_cast<const __m128i*>(source));\r
-       }\r
-       \r
-       static xmm_epi32 loadu(const void* source)\r
-       {\r
-               return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));\r
-       }\r
-               \r
-       int32_t operator[](int index) const\r
-       {\r
-               return value_.m128i_i32[index];\r
-       }\r
-\r
-       int32_t& operator[](int index)\r
-       {\r
-               return value_.m128i_i32[index];\r
-       }\r
-\r
-       static xmm_epi32 zero()\r
-       {\r
-               return _mm_setzero_si128();\r
-       }\r
-};\r
-\r
-inline xmm_epi32 operator>>(const xmm_epi32& lhs, int count)\r
-{              \r
-       return xmm_epi32(lhs) >>= count;\r
-}\r
-\r
-inline xmm_epi32 operator<<(const xmm_epi32& lhs, int count)\r
-{              \r
-       return xmm_epi32(lhs) <<= count;\r
-}\r
-\r
-inline xmm_epi32 operator|(const xmm_epi32& lhs, const xmm_epi32& rhs)\r
-{              \r
-       return xmm_epi32(lhs) |= rhs;\r
-}\r
-\r
-inline xmm_epi32 operator&(const xmm_epi32& lhs, const xmm_epi32& rhs)\r
-{              \r
-       return xmm_epi32(lhs) &= rhs;\r
-}\r
-\r
-class xmm_epi16\r
-{\r
-       __m128i value_;\r
-       template<typename> friend struct xmm_cast_impl;\r
-       friend xmm_epi32 horizontal_add(const xmm_epi16&);\r
-       friend class xmm_epi8;\r
-public:\r
-       typedef xmm_epi16 xmm_epi_tag;\r
-\r
-       xmm_epi16()\r
-       {\r
-       }\r
-\r
-       xmm_epi16(__m128i value)\r
-               : value_(value)\r
-       {\r
-       }\r
-\r
-       xmm_epi16(short value)\r
-               : value_(_mm_set1_epi16(value))\r
-       {\r
-       }\r
-\r
-       xmm_epi16& operator+=(const xmm_epi16& other)\r
-       {\r
-               value_ = _mm_add_epi16(value_, other.value_);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_epi16& operator-=(const xmm_epi16& other)\r
-       {\r
-               value_ = _mm_sub_epi16(value_, other.value_);\r
-               return *this;\r
-       }\r
-\r
-       xmm_epi16& operator>>=(int count)\r
-       {\r
-               value_ = _mm_srli_epi16(value_, count);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_epi16& operator<<=(int count)\r
-       {\r
-               value_ = _mm_slli_epi16(value_, count);\r
-               return *this;\r
-       }\r
-\r
-       xmm_epi16& operator|=(const xmm_epi16& other)\r
-       {\r
-               value_ = _mm_or_si128(value_, other.value_);\r
-               return *this;\r
-       }       \r
-       \r
-       xmm_epi16& operator&=(const xmm_epi16& other)\r
-       {\r
-               value_ = _mm_and_si128(value_, other.value_);\r
-               return *this;\r
-       }       \r
-               \r
-       xmm_epi16 multiply_low(const xmm_epi16& other)\r
-       {               \r
-               value_ = _mm_mullo_epi16(value_, other.value_);\r
-               return *this;\r
-       }\r
-\r
-       xmm_epi16 multiply_high(const xmm_epi16& other)\r
-       {               \r
-               value_ = _mm_mulhi_epi16(value_, other.value_);\r
-               return *this;\r
-       }\r
-\r
-       xmm_epi16 umultiply_low(const xmm_epi16& other)\r
-       {               \r
-               value_ = _mm_mullo_epi16(value_, other.value_);\r
-               return *this;\r
-       }\r
-\r
-       xmm_epi16 umultiply_high(const xmm_epi16& other)\r
-       {               \r
-               value_ = _mm_mulhi_epi16(value_, other.value_);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_epi16 and_not(const xmm_epi16& other)\r
-       {               \r
-               value_ = _mm_andnot_si128(other.value_, value_);\r
-               return *this;\r
-       }\r
-\r
-       xmm_epi16 unpack_low(const xmm_epi16& other)\r
-       {               \r
-               value_ = _mm_unpacklo_epi16 (value_, other.value_);\r
-               return *this;\r
-       }\r
-\r
-       xmm_epi16 unpack_high(const xmm_epi16& other)\r
-       {               \r
-               value_ = _mm_unpackhi_epi16 (value_, other.value_);\r
-               return *this;\r
-       }\r
-                       \r
-       xmm_epi16 max(const xmm_epi16& other)\r
-       {               \r
-               value_ = _mm_max_epi16(value_, other.value_);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_epi16 min(const xmm_epi16& other)\r
-       {               \r
-               value_ = _mm_min_epi16(value_, other.value_);\r
-               return *this;\r
-       }\r
-\r
-       int16_t operator[](int index) const\r
-       {\r
-               return value_.m128i_i16[index];\r
-       }\r
-\r
-       int16_t& operator[](int index)\r
-       {\r
-               return value_.m128i_i16[index];\r
-       }\r
-       \r
-       static xmm_epi16 load(const void* source)\r
-       {\r
-               return _mm_load_si128(reinterpret_cast<const __m128i*>(source));\r
-       }\r
-       \r
-       static xmm_epi16 loadu(const void* source)\r
-       {\r
-               return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));\r
-       }\r
-       \r
-       static xmm_epi32 horizontal_add(const xmm_epi16& lhs)\r
-       {\r
-               #ifdef SSIM_XOP\r
-                               return _mm_haddd_epi16(value_);\r
-               #else\r
-                               return _mm_madd_epi16(lhs.value_, _mm_set1_epi16(1));\r
-               #endif\r
-       }\r
-\r
-       static xmm_epi16 multiply_low(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-       {\r
-               return xmm_epi16(lhs).multiply_low(rhs);\r
-       }\r
-\r
-       static xmm_epi16 multiply_high(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-       {\r
-               return xmm_epi16(lhs).multiply_high(rhs);\r
-       }\r
-\r
-       static xmm_epi16 umultiply_low(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-       {\r
-               return xmm_epi16(lhs).umultiply_low(rhs);\r
-       }\r
-\r
-       static xmm_epi16 umultiply_high(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-       {\r
-               return xmm_epi16(lhs).umultiply_high(rhs);\r
-       }\r
-\r
-       static xmm_epi16 unpack_low(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-       {\r
-               return xmm_epi16(lhs).unpack_low(rhs);\r
-       }\r
-\r
-       static xmm_epi16 unpack_high(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-       {\r
-               return xmm_epi16(lhs).unpack_high(rhs);\r
-       }\r
-       \r
-       static xmm_epi16 and_not(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-       {\r
-               return xmm_epi16(lhs).and_not(rhs);\r
-       }\r
-       \r
-       static xmm_epi16 max(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-       {\r
-               return xmm_epi16(lhs).max(rhs);\r
-       }\r
-       \r
-       static xmm_epi16 min(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-       {\r
-               return xmm_epi16(lhs).min(rhs);\r
-       }\r
-\r
-       static xmm_epi16 zero()\r
-       {\r
-               return _mm_setzero_si128();\r
-       }\r
-};\r
-\r
-inline xmm_epi16 operator+(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-{\r
-       return xmm_epi16(lhs) += rhs;\r
-}\r
-\r
-inline xmm_epi16 operator-(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-{\r
-       return xmm_epi16(lhs) -= rhs;\r
-}\r
-\r
-inline xmm_epi16 operator>>(const xmm_epi16& lhs, int count)\r
-{              \r
-       return xmm_epi16(lhs) >>= count;\r
-}\r
-\r
-inline xmm_epi16 operator<<(const xmm_epi16& lhs, int count)\r
-{              \r
-       return xmm_epi16(lhs) <<= count;\r
-}\r
-\r
-inline xmm_epi16 operator|(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-{              \r
-       return xmm_epi16(lhs) |= rhs;\r
-}\r
-\r
-inline xmm_epi16 operator&(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-{              \r
-       return xmm_epi16(lhs) &= rhs;\r
-}\r
-\r
-class xmm_epi8\r
-{\r
-       __m128i value_;\r
-       template<typename> friend struct xmm_cast_impl;\r
-       friend xmm_epi16 multiply_add(const xmm_epi8&, const xmm_epi8&);\r
-public:\r
-       typedef xmm_epi8 xmm_epi_tag;\r
-\r
-       xmm_epi8()\r
-       {\r
-       }\r
-\r
-       xmm_epi8(__m128i value)\r
-               : value_(value)\r
-       {\r
-       }\r
-       \r
-       xmm_epi8(char b)\r
-               : value_(_mm_set1_epi8(b))\r
-       {\r
-       }\r
-\r
-       xmm_epi8(char b3,  char b2,  char b1,  char b0)\r
-               : value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))\r
-       {\r
-       }\r
-\r
-       xmm_epi8(char b15, char b14, char b13, char b12, \r
-                        char b11, char b10, char b9,  char b8,  \r
-                        char b7,  char b6,  char b5,  char b4,  \r
-                        char b3,  char b2,  char b1,  char b0)\r
-               : value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))\r
-       {\r
-       }\r
-       \r
-       xmm_epi8& operator+=(const xmm_epi8& other)\r
-       {\r
-               value_ = _mm_add_epi8(value_, other.value_);\r
-               return *this;\r
-       }\r
-\r
-       xmm_epi8& operator-=(const xmm_epi8& other)\r
-       {\r
-               value_ = _mm_sub_epi8(value_, other.value_);\r
-               return *this;\r
-       }\r
-                       \r
-       xmm_epi8& shuffle(const xmm_epi8& other)\r
-       {               \r
-               value_ = _mm_shuffle_epi8 (value_, other.value_);\r
-               return *this;\r
-       }\r
-               \r
-       xmm_epi8& max(const xmm_epi8& other)\r
-       {               \r
-               value_ = _mm_max_epi8 (value_, other.value_);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_epi8& min(const xmm_epi8& other)\r
-       {               \r
-               value_ = _mm_min_epi8 (value_, other.value_);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_epi8& umax(const xmm_epi8& other)\r
-       {               \r
-               value_ = _mm_max_epu8(value_, other.value_);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_epi8& umin(const xmm_epi8& other)\r
-       {               \r
-               value_ = _mm_min_epu8(value_, other.value_);\r
-               return *this;\r
-       }\r
-       \r
-       xmm_epi8& blend(const xmm_epi8& other, const xmm_epi8& mask)\r
-       {               \r
-               value_ = _mm_blendv_epi8(value_, other.value_, mask.value_);\r
-               return *this;\r
-       }\r
-       \r
-       const xmm_epi8& stream(void* dest) const\r
-       {\r
-               _mm_stream_si128(reinterpret_cast<__m128i*>(dest), value_);\r
-               return *this;\r
-       }\r
-\r
-       const xmm_epi8& store(void* dest) const\r
-       {\r
-               _mm_store_si128(reinterpret_cast<__m128i*>(dest), value_);\r
-               return *this;\r
-       }\r
-\r
-       template<typename write_op>\r
-       const xmm_epi8& write(void* dest) const\r
-       {\r
-               if(write_op::value == stream_write::value)\r
-                       return stream(dest);\r
-               else\r
-                       return store(dest);\r
-       }\r
-       \r
-       char operator[](int index) const\r
-       {\r
-               return value_.m128i_i8[index];\r
-       }\r
-\r
-       char& operator[](int index)\r
-       {\r
-               return value_.m128i_i8[index];\r
-       }\r
-       \r
-       static xmm_epi16 unpack_low(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-       {\r
-               return _mm_unpacklo_epi8(rhs.value_, lhs.value_);\r
-       }\r
-       \r
-       static xmm_epi16 unpack_high(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-       {\r
-               return _mm_unpackhi_epi8(rhs.value_, lhs.value_);\r
-       }\r
-       \r
-       static xmm_epi8 upack(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-       {\r
-               return _mm_packus_epi16(lhs.value_, rhs.value_);\r
-       }\r
-\r
-       static const xmm_epi8& stream(const xmm_epi8& source, void* dest)\r
-       {\r
-               return source.stream(dest);\r
-       }\r
-       \r
-       static const xmm_epi8& store(const xmm_epi8& source, void* dest)\r
-       {\r
-               return source.store(dest);\r
-       }\r
-       \r
-       template<typename write_op>\r
-       static const xmm_epi8& write(const xmm_epi8& source, void* dest)\r
-       {\r
-               return source.write<write_op>(dest);\r
-       }\r
-\r
-       static xmm_epi8 load(const void* source)\r
-       {\r
-               return _mm_load_si128(reinterpret_cast<const __m128i*>(source));\r
-       }\r
-       \r
-       static xmm_epi8 loadu(const void* source)\r
-       {\r
-               return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));\r
-       }\r
-\r
-       static xmm_epi16 multiply_add(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-       {               \r
-               return xmm_epi16(_mm_maddubs_epi16(lhs.value_, rhs.value_));\r
-       }\r
-\r
-       static xmm_epi8& shuffle(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-       {               \r
-               return xmm_epi8(lhs).shuffle(rhs);\r
-       }\r
-       \r
-       static xmm_epi8& max(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-       {               \r
-               return xmm_epi8(lhs).max(rhs);\r
-       }\r
-       \r
-       static xmm_epi8& min(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-       {               \r
-               return xmm_epi8(lhs).min(rhs);\r
-       }\r
-       \r
-       static xmm_epi8& umax(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-       {               \r
-               return xmm_epi8(lhs).umax(rhs);\r
-       }\r
-       \r
-       static xmm_epi8& umin(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-       {               \r
-               return xmm_epi8(lhs).umin(rhs);\r
-       }\r
-\r
-       static xmm_epi8& blend(const xmm_epi8& lhs, const xmm_epi8& rhs, const xmm_epi8& mask)\r
-       {               \r
-               return xmm_epi8(lhs).blend(rhs, mask);\r
-       }\r
-\r
-       static xmm_epi8 zero()\r
-       {\r
-               return _mm_setzero_si128();\r
-       }\r
-};\r
-\r
-inline xmm_epi8 operator+(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-{\r
-       return xmm_epi8(lhs) += rhs;\r
-}\r
-\r
-inline xmm_epi8 operator-(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-{\r
-       return xmm_epi8(lhs) -= rhs;\r
-}\r
-\r
-// xmm_cast\r
-\r
-template<typename T>\r
-struct xmm_cast_impl\r
-{              \r
-       template<typename U>\r
-       T operator()(const U& other)\r
-       {\r
-               return typename T::xmm_epi_tag(other.value_);\r
-       }\r
-};\r
-\r
-template<>\r
-struct xmm_cast_impl<xmm_ps>\r
-{\r
-       xmm_ps operator()(const xmm_epi32& other)\r
-       {\r
-               return _mm_cvtepi32_ps(other.value_);\r
-       }\r
-};\r
-\r
-template<typename T, typename U> \r
-T xmm_cast(const U& other)\r
-{\r
-       return xmm_cast_impl<T>()(other);\r
-}\r
-\r
-}}}
\ No newline at end of file
diff --git a/accelerator/cpu/util/xmm.h b/accelerator/cpu/util/xmm.h
new file mode 100644 (file)
index 0000000..3b9bf1d
--- /dev/null
@@ -0,0 +1,658 @@
+#pragma once\r
+\r
+#include <intrin.h>\r
+\r
+#include <type_traits>\r
+#include <vector>\r
+#include <tbb/cache_aligned_allocator.h>\r
+\r
+namespace caspar { namespace accelerator { namespace cpu { namespace xmm {\r
+\r
+typedef std::vector<float, tbb::cache_aligned_allocator<float>> vector_ps;\r
+\r
+// Tag type: passing one to base_x<T>::write selects the overload that forwards to stream().\r
+struct stream_tag\r
+{\r
+       static const int value = 1;\r
+};\r
+// Tag type: passing one to base_x<T>::write selects the overload that forwards to store().\r
+struct store_tag\r
+{\r
+       static const int value = 2;\r
+};\r
+\r
+class s32_x;\r
+class s16_x;\r
+class  s8_x;\r
+class  u8_x;\r
+\r
+// CRTP base supplying the load/store entry points shared by the 128-bit integer\r
+// wrappers. T is the derived wrapper: it must be constructible from __m128i and\r
+// must befriend base_x so that its value_ member can be read here.\r
+template<class T>\r
+class base_x\r
+{\r
+public:\r
+       // Construction helpers: all-zero register, aligned load, unaligned load.\r
+       static T zero();\r
+       static T load(const void* source);\r
+       static T loadu(const void* source);\r
+\r
+       // Writers: store() is a regular aligned store, stream() a non-temporal one.\r
+       static void store(const T& source, void* dest);\r
+       static void stream(const T& source, void* dest);\r
+\r
+       // Tag-dispatched front ends for store() and stream().\r
+       static void write(const T& source, void* dest, store_tag);\r
+       static void write(const T& source, void* dest, stream_tag);\r
+};\r
+\r
+// Wrapper around __m128i viewed as four signed 32-bit lanes.\r
+class s32_x : public base_x<s32_x>\r
+{\r
+public:\r
+       typedef s32_x xmm_epi_tag;\r
+\r
+       // Leaves the register uninitialised.\r
+       s32_x();\r
+       // Wraps a raw register.\r
+       s32_x(const __m128i& value);\r
+       // Bit-for-bit reinterpretations of the other lane types (no per-lane conversion).\r
+       s32_x(const s16_x& other);\r
+       s32_x(const s8_x& other);\r
+       s32_x(const u8_x& other);\r
+\r
+       // Per-lane shifts; the right shift is logical (zero-filling).\r
+       s32_x& operator<<=(int count);\r
+       s32_x& operator>>=(int count);\r
+       // Whole-register bitwise operations.\r
+       s32_x& operator&=(const s32_x& other);\r
+       s32_x& operator|=(const s32_x& other);\r
+\r
+       // Lane access; index is not range-checked.\r
+       int32_t operator[](int index) const;\r
+       int32_t& operator[](int index);\r
+\r
+private:\r
+       template<typename> friend class base_x;\r
+       friend class s16_x;\r
+       friend class s8_x;\r
+       friend class u8_x;\r
+\r
+       __m128i value_;\r
+};\r
+\r
+// Wrapper around __m128i viewed as eight signed 16-bit lanes.\r
+class s16_x : public base_x<s16_x>\r
+{\r
+public:\r
+       typedef s16_x xmm_epi_tag;\r
+\r
+       // Leaves the register uninitialised.\r
+       s16_x();\r
+       // Wraps a raw register.\r
+       s16_x(const __m128i& value);\r
+       // Broadcasts one 16-bit value to all eight lanes.\r
+       s16_x(short value);\r
+       // Bit-for-bit reinterpretations of the other lane types (no per-lane conversion).\r
+       s16_x(const s32_x& other);\r
+       s16_x(const s8_x& other);\r
+       s16_x(const u8_x& other);\r
+\r
+       // Lane-wise wrapping arithmetic.\r
+       s16_x& operator+=(const s16_x& other);\r
+       s16_x& operator-=(const s16_x& other);\r
+       // Per-lane shifts; the right shift is logical (zero-filling).\r
+       s16_x& operator<<=(int count);\r
+       s16_x& operator>>=(int count);\r
+       // Whole-register bitwise operations.\r
+       s16_x& operator&=(const s16_x& other);\r
+       s16_x& operator|=(const s16_x& other);\r
+\r
+       // Lane access; index is not range-checked.\r
+       int16_t operator[](int index) const;\r
+       int16_t& operator[](int index);\r
+\r
+       // Byte interleaves producing 16-bit lanes.\r
+       static s16_x unpack_low(const s8_x& lhs, const s8_x& rhs);\r
+       static s16_x unpack_high(const s8_x& lhs, const s8_x& rhs);\r
+       // 16-bit lane interleaves.\r
+       static s16_x unpack_low(const s16_x& lhs, const s16_x& rhs);\r
+       static s16_x unpack_high(const s16_x& lhs, const s16_x& rhs);\r
+\r
+       // Sums adjacent 16-bit lanes into 32-bit lanes.\r
+       static s32_x horizontal_add(const s16_x& lhs);\r
+\r
+       // Lane-wise products: low / high 16 bits, signed and unsigned flavours.\r
+       static s16_x multiply_low(const s16_x& lhs, const s16_x& rhs);\r
+       static s16_x multiply_high(const s16_x& lhs, const s16_x& rhs);\r
+       static s16_x umultiply_low(const s16_x& lhs, const s16_x& rhs);\r
+       static s16_x umultiply_high(const s16_x& lhs, const s16_x& rhs);\r
+\r
+       // (~lhs) & rhs over the whole register.\r
+       static s16_x and_not(const s16_x& lhs, const s16_x& rhs);\r
+       // Lane-wise signed maximum / minimum.\r
+       static s16_x max(const s16_x& lhs, const s16_x& rhs);\r
+       static s16_x min(const s16_x& lhs, const s16_x& rhs);\r
+\r
+private:\r
+       template<typename> friend class base_x;\r
+       friend class s32_x;\r
+       friend class s8_x;\r
+       friend class u8_x;\r
+\r
+       __m128i value_;\r
+};\r
+\r
+// Wrapper around __m128i viewed as sixteen signed 8-bit lanes.\r
+class s8_x : public base_x<s8_x>\r
+{\r
+       __m128i value_;\r
+private:\r
+       template<typename> friend class base_x;\r
+       friend class s32_x;\r
+       friend class s16_x;\r
+       friend class u8_x;\r
+public:\r
+       typedef s8_x xmm_epi_tag;\r
+\r
+       // Leaves the register uninitialised.\r
+       s8_x();\r
+       // Bit-for-bit reinterpretations of the other lane types (no per-lane conversion).\r
+       s8_x(const s32_x& other);\r
+       s8_x(const s16_x& other);\r
+       s8_x(const u8_x& other);\r
+       // Wraps a raw register.\r
+       s8_x(const __m128i& value);\r
+       // Broadcasts one byte to all sixteen lanes.\r
+       s8_x(char b);\r
+       // Repeats a four-byte pattern across the register (b0 is the lowest byte of each group).\r
+       s8_x(char b3,  char b2,  char b1,  char b0);\r
+       // Sets every lane explicitly (b0 is the lowest byte).\r
+       s8_x(char b15, char b14, char b13, char b12, \r
+                        char b11, char b10, char b9,  char b8,  \r
+                        char b7,  char b6,  char b5,  char b4,  \r
+                        char b3,  char b2,  char b1,  char b0);\r
+\r
+       // Lane-wise wrapping arithmetic.\r
+       s8_x& operator+=(const s8_x& other);\r
+       s8_x& operator-=(const s8_x& other);\r
+       // Lane access; index is not range-checked.\r
+       char operator[](int index) const;\r
+       char& operator[](int index);\r
+       \r
+       // Packs two vectors of 16-bit lanes into bytes with unsigned saturation.\r
+       static s8_x upack(const s16_x& lhs, const s16_x& rhs);\r
+\r
+       // Multiplies byte pairs and sums adjacent products into 16-bit lanes.\r
+       static s16_x multiply_add(const s8_x& lhs, const s8_x& rhs);\r
+       // Byte shuffle of lhs controlled by rhs.\r
+       static s8_x shuffle(const s8_x& lhs, const s8_x& rhs);\r
+       // Lane-wise signed maximum / minimum.\r
+       static s8_x max(const s8_x& lhs, const s8_x& rhs);\r
+       static s8_x min(const s8_x& lhs, const s8_x& rhs);\r
+       // Per-byte select between lhs and rhs driven by mask.\r
+       static s8_x blend(const s8_x& lhs, const s8_x& rhs, const s8_x& mask);\r
+\r
+       // zero() is not redeclared here: base_x<s8_x>::zero() already provides it.\r
+       // A redeclaration without a definition would hide the inherited function\r
+       // and leave s8_x::zero() unresolved at link time.\r
+};\r
+\r
+// Wrapper around __m128i viewed as sixteen unsigned 8-bit lanes.\r
+class u8_x : public base_x<u8_x>\r
+{\r
+public:\r
+       typedef u8_x xmm_epu_tag;\r
+\r
+       // Leaves the register uninitialised.\r
+       u8_x();\r
+       // Wraps a raw register.\r
+       u8_x(const __m128i& value);\r
+       // Bit-for-bit reinterpretations of the other lane types (no per-lane conversion).\r
+       u8_x(const s32_x& other);\r
+       u8_x(const s16_x& other);\r
+       u8_x(const s8_x& other);\r
+       // Broadcasts one byte to all sixteen lanes.\r
+       u8_x(char b);\r
+       // Repeats a four-byte pattern across the register (b0 is the lowest byte of each group).\r
+       u8_x(char b3, char b2, char b1, char b0);\r
+       // Sets every lane explicitly (b0 is the lowest byte).\r
+       u8_x(char b15, char b14, char b13, char b12,\r
+                char b11, char b10, char b9,  char b8,\r
+                char b7,  char b6,  char b5,  char b4,\r
+                char b3,  char b2,  char b1,  char b0);\r
+\r
+       // Lane access; index is not range-checked. Lanes are exposed as char.\r
+       char operator[](int index) const;\r
+       char& operator[](int index);\r
+\r
+       // Lane-wise unsigned maximum / minimum.\r
+       static u8_x max(const u8_x& lhs, const u8_x& rhs);\r
+       static u8_x min(const u8_x& lhs, const u8_x& rhs);\r
+\r
+private:\r
+       template<typename> friend class base_x;\r
+       friend class s32_x;\r
+       friend class s16_x;\r
+       friend class s8_x;\r
+\r
+       __m128i value_;\r
+};\r
+\r
+// base_x\r
+\r
+// Reads 16 bytes from source with an aligned load (source must be 16-byte aligned)\r
+// and wraps them in T through T's __m128i constructor.\r
+template<class T>\r
+T base_x<T>::load(const void* source)\r
+{\r
+       const __m128i* const address = static_cast<const __m128i*>(source);\r
+       return _mm_load_si128(address);\r
+}\r
+       \r
+// Reads 16 bytes from source with an unaligned load and wraps them in T\r
+// through T's __m128i constructor.\r
+template<class T>\r
+T base_x<T>::loadu(const void* source)\r
+{\r
+       const __m128i* const address = static_cast<const __m128i*>(source);\r
+       return _mm_loadu_si128(address);\r
+}\r
+\r
+// Returns a T whose 128 bits are all zero.\r
+template<class T>\r
+T base_x<T>::zero()\r
+{\r
+       const __m128i cleared = _mm_setzero_si128();\r
+       return cleared;\r
+}\r
+\r
+// Tag-dispatched writer: a store_tag argument routes to store().\r
+template<class T>\r
+void base_x<T>::write(const T& source, void* dest, store_tag)\r
+{\r
+       store(source, dest);\r
+}\r
+\r
+// Tag-dispatched writer: a stream_tag argument routes to stream().\r
+template<class T>\r
+void base_x<T>::write(const T& source, void* dest, stream_tag)\r
+{\r
+       stream(source, dest);\r
+}\r
+\r
+// Writes the 16 bytes of source to dest with a non-temporal store\r
+// (dest must be 16-byte aligned).\r
+template<class T>\r
+void base_x<T>::stream(const T& source, void* dest)\r
+{\r
+       __m128i* const target = static_cast<__m128i*>(dest);\r
+       _mm_stream_si128(target, source.value_);\r
+}\r
+\r
+// Writes the 16 bytes of source to dest with a regular aligned store\r
+// (dest must be 16-byte aligned).\r
+template<class T>\r
+void base_x<T>::store(const T& source, void* dest)\r
+{\r
+       __m128i* const target = static_cast<__m128i*>(dest);\r
+       _mm_store_si128(target, source.value_);\r
+}\r
+\r
+// s32_x\r
+\r
+// Out-of-class member definitions for s32_x. They are marked inline because this\r
+// is a header: a non-inline definition would be emitted by every translation unit\r
+// that includes it and the duplicates would collide at link time.\r
+\r
+// Default constructor: value_ is left uninitialised.\r
+inline s32_x::s32_x()\r
+{\r
+}\r
+\r
+// The next three constructors copy the raw 128 bits of another lane type\r
+// unchanged; no per-lane widening or narrowing takes place.\r
+inline s32_x::s32_x(const s16_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+inline s32_x::s32_x(const s8_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+inline s32_x::s32_x(const u8_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+// Wraps a raw register.\r
+inline s32_x::s32_x(const __m128i& value)\r
+       : value_(value)\r
+{\r
+}\r
+\r
+// Shifts every 32-bit lane right by count bits. The shift is logical: zeros are\r
+// shifted in and the sign bit is not replicated.\r
+inline s32_x& s32_x::operator>>=(int count)\r
+{\r
+       value_ = _mm_srli_epi32(value_, count);\r
+       return *this;\r
+}\r
+\r
+// Shifts every 32-bit lane left by count bits.\r
+inline s32_x& s32_x::operator<<=(int count)\r
+{\r
+       value_ = _mm_slli_epi32(value_, count);\r
+       return *this;\r
+}\r
+\r
+// Bitwise OR over the whole register.\r
+inline s32_x& s32_x::operator|=(const s32_x& other)\r
+{\r
+       value_ = _mm_or_si128(value_, other.value_);\r
+       return *this;\r
+}\r
+\r
+// Bitwise AND over the whole register.\r
+inline s32_x& s32_x::operator&=(const s32_x& other)\r
+{\r
+       value_ = _mm_and_si128(value_, other.value_);\r
+       return *this;\r
+}\r
+\r
+// Lane read through the MSVC-specific m128i_i32 view; index is not range-checked.\r
+inline int32_t s32_x::operator[](int index) const\r
+{\r
+       return value_.m128i_i32[index];\r
+}\r
+\r
+// Mutable lane access through the same view; index is not range-checked.\r
+inline int32_t& s32_x::operator[](int index)\r
+{\r
+       return value_.m128i_i32[index];\r
+}\r
+\r
+// Returns a copy of lhs with every 32-bit lane shifted right by count bits (via operator>>=).\r
+inline s32_x operator>>(const s32_x& lhs, int count)\r
+{\r
+       s32_x shifted(lhs);\r
+       shifted >>= count;\r
+       return shifted;\r
+}\r
+\r
+// Returns a copy of lhs with every 32-bit lane shifted left by count bits (via operator<<=).\r
+inline s32_x operator<<(const s32_x& lhs, int count)\r
+{\r
+       s32_x shifted(lhs);\r
+       shifted <<= count;\r
+       return shifted;\r
+}\r
+\r
+// Returns the bitwise OR of lhs and rhs (via operator|=).\r
+inline s32_x operator|(const s32_x& lhs, const s32_x& rhs)\r
+{\r
+       s32_x combined(lhs);\r
+       combined |= rhs;\r
+       return combined;\r
+}\r
+\r
+// Returns the bitwise AND of lhs and rhs (via operator&=).\r
+inline s32_x operator&(const s32_x& lhs, const s32_x& rhs)\r
+{\r
+       s32_x masked(lhs);\r
+       masked &= rhs;\r
+       return masked;\r
+}\r
+\r
+// s16_x\r
+\r
+// Out-of-class member definitions for s16_x. They are marked inline because this\r
+// is a header: a non-inline definition would be emitted by every translation unit\r
+// that includes it and the duplicates would collide at link time.\r
+\r
+// Default constructor: value_ is left uninitialised.\r
+inline s16_x::s16_x()\r
+{\r
+}\r
+\r
+// The next three constructors copy the raw 128 bits of another lane type\r
+// unchanged; no per-lane widening or narrowing takes place.\r
+inline s16_x::s16_x(const s32_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+inline s16_x::s16_x(const s8_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+inline s16_x::s16_x(const u8_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+// Wraps a raw register.\r
+inline s16_x::s16_x(const __m128i& value)\r
+       : value_(value)\r
+{\r
+}\r
+\r
+// Broadcasts value to all eight 16-bit lanes.\r
+inline s16_x::s16_x(short value)\r
+       : value_(_mm_set1_epi16(value))\r
+{\r
+}\r
+\r
+// Lane-wise wrapping addition.\r
+inline s16_x& s16_x::operator+=(const s16_x& other)\r
+{\r
+       value_ = _mm_add_epi16(value_, other.value_);\r
+       return *this;\r
+}\r
+\r
+// Lane-wise wrapping subtraction.\r
+inline s16_x& s16_x::operator-=(const s16_x& other)\r
+{\r
+       value_ = _mm_sub_epi16(value_, other.value_);\r
+       return *this;\r
+}\r
+\r
+// Shifts every 16-bit lane right by count bits. The shift is logical: zeros are\r
+// shifted in and the sign bit is not replicated.\r
+inline s16_x& s16_x::operator>>=(int count)\r
+{\r
+       value_ = _mm_srli_epi16(value_, count);\r
+       return *this;\r
+}\r
+\r
+// Shifts every 16-bit lane left by count bits.\r
+inline s16_x& s16_x::operator<<=(int count)\r
+{\r
+       value_ = _mm_slli_epi16(value_, count);\r
+       return *this;\r
+}\r
+\r
+// Bitwise OR over the whole register.\r
+inline s16_x& s16_x::operator|=(const s16_x& other)\r
+{\r
+       value_ = _mm_or_si128(value_, other.value_);\r
+       return *this;\r
+}\r
+\r
+// Bitwise AND over the whole register.\r
+inline s16_x& s16_x::operator&=(const s16_x& other)\r
+{\r
+       value_ = _mm_and_si128(value_, other.value_);\r
+       return *this;\r
+}\r
+\r
+// Lane read through the MSVC-specific m128i_i16 view; index is not range-checked.\r
+inline int16_t s16_x::operator[](int index) const\r
+{\r
+       return value_.m128i_i16[index];\r
+}\r
+\r
+// Mutable lane access through the same view; index is not range-checked.\r
+inline int16_t& s16_x::operator[](int index)\r
+{\r
+       return value_.m128i_i16[index];\r
+}\r
+\r
+// Interleaves the low eight bytes of the operands. Note the argument order passed\r
+// to the intrinsic: rhs supplies the low byte and lhs the high byte of each 16-bit lane.\r
+inline s16_x s16_x::unpack_low(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+       return _mm_unpacklo_epi8(rhs.value_, lhs.value_);\r
+}\r
+\r
+// Same as the byte unpack_low, but for the high eight bytes of the operands.\r
+inline s16_x s16_x::unpack_high(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+       return _mm_unpackhi_epi8(rhs.value_, lhs.value_);\r
+}\r
+\r
+// Adds adjacent pairs of 16-bit lanes, producing four 32-bit sums.\r
+inline s32_x s16_x::horizontal_add(const s16_x& lhs)\r
+{\r
+       #ifdef SSIM_XOP\r
+                       // Static member: there is no this, so the operand must come from lhs.\r
+                       return _mm_haddd_epi16(lhs.value_);\r
+       #else\r
+                       // Multiplying by a vector of ones lets madd sum neighbouring lanes.\r
+                       return _mm_madd_epi16(lhs.value_, _mm_set1_epi16(1));\r
+       #endif\r
+}\r
+\r
+// Low 16 bits of each lane-wise product.\r
+inline s16_x s16_x::multiply_low(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       return _mm_mullo_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// High 16 bits of each lane-wise product, treating the lanes as signed.\r
+inline s16_x s16_x::multiply_high(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       return _mm_mulhi_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Low 16 bits of each lane-wise product. The low half of a 16x16 product is the\r
+// same for signed and unsigned operands, so this uses the same intrinsic as multiply_low.\r
+inline s16_x s16_x::umultiply_low(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       return _mm_mullo_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// High 16 bits of each lane-wise product, treating the lanes as unsigned.\r
+inline s16_x s16_x::umultiply_high(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       return _mm_mulhi_epu16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Interleaves the low four 16-bit lanes: lhs0, rhs0, lhs1, rhs1, ...\r
+inline s16_x s16_x::unpack_low(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       return _mm_unpacklo_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Interleaves the high four 16-bit lanes of lhs and rhs.\r
+inline s16_x s16_x::unpack_high(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       return _mm_unpackhi_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Computes (~lhs) & rhs over the whole register.\r
+inline s16_x s16_x::and_not(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       return _mm_andnot_si128(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Lane-wise signed maximum.\r
+inline s16_x s16_x::max(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       return _mm_max_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Lane-wise signed minimum.\r
+inline s16_x s16_x::min(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       return _mm_min_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Returns the lane-wise sum of lhs and rhs (via operator+=).\r
+inline s16_x operator+(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       s16_x sum(lhs);\r
+       sum += rhs;\r
+       return sum;\r
+}\r
+\r
+// Returns the lane-wise difference lhs - rhs (via operator-=).\r
+inline s16_x operator-(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       s16_x difference(lhs);\r
+       difference -= rhs;\r
+       return difference;\r
+}\r
+\r
+// Returns a copy of lhs with every 16-bit lane shifted right by count bits (via operator>>=).\r
+inline s16_x operator>>(const s16_x& lhs, int count)\r
+{\r
+       s16_x shifted(lhs);\r
+       shifted >>= count;\r
+       return shifted;\r
+}\r
+\r
+// Returns a copy of lhs with every 16-bit lane shifted left by count bits (via operator<<=).\r
+inline s16_x operator<<(const s16_x& lhs, int count)\r
+{\r
+       s16_x shifted(lhs);\r
+       shifted <<= count;\r
+       return shifted;\r
+}\r
+\r
+// Returns the bitwise OR of lhs and rhs (via operator|=).\r
+inline s16_x operator|(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       s16_x combined(lhs);\r
+       combined |= rhs;\r
+       return combined;\r
+}\r
+\r
+// Returns the bitwise AND of lhs and rhs (via operator&=).\r
+inline s16_x operator&(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       s16_x masked(lhs);\r
+       masked &= rhs;\r
+       return masked;\r
+}\r
+\r
+// s8_x\r
+\r
+// Out-of-class member definitions for s8_x. They are marked inline because this\r
+// is a header: a non-inline definition would be emitted by every translation unit\r
+// that includes it and the duplicates would collide at link time.\r
+\r
+// Default constructor: value_ is left uninitialised.\r
+inline s8_x::s8_x()\r
+{\r
+}\r
+\r
+// The next three constructors copy the raw 128 bits of another lane type\r
+// unchanged; no per-lane widening or narrowing takes place.\r
+inline s8_x::s8_x(const s32_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+inline s8_x::s8_x(const s16_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+inline s8_x::s8_x(const u8_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+// Wraps a raw register.\r
+inline s8_x::s8_x(const __m128i& value)\r
+       : value_(value)\r
+{\r
+}\r
+\r
+// Broadcasts b to all sixteen byte lanes.\r
+inline s8_x::s8_x(char b)\r
+       : value_(_mm_set1_epi8(b))\r
+{\r
+}\r
+\r
+// Repeats the four-byte pattern b3..b0 four times. _mm_set_epi8 takes its\r
+// arguments from the highest byte down, so b0 lands in the lowest byte of each group.\r
+inline s8_x::s8_x(char b3,  char b2,  char b1,  char b0)\r
+       : value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))\r
+{\r
+}\r
+\r
+// Sets every byte lane explicitly; b15 is the highest byte and b0 the lowest.\r
+inline s8_x::s8_x(char b15, char b14, char b13, char b12, \r
+                       char b11, char b10, char b9,  char b8,  \r
+                       char b7,  char b6,  char b5,  char b4,  \r
+                       char b3,  char b2,  char b1,  char b0)\r
+       : value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))\r
+{\r
+}\r
+\r
+// Lane-wise wrapping addition.\r
+inline s8_x& s8_x::operator+=(const s8_x& other)\r
+{\r
+       value_ = _mm_add_epi8(value_, other.value_);\r
+       return *this;\r
+}\r
+\r
+// Lane-wise wrapping subtraction.\r
+inline s8_x& s8_x::operator-=(const s8_x& other)\r
+{\r
+       value_ = _mm_sub_epi8(value_, other.value_);\r
+       return *this;\r
+}\r
+\r
+// Lane read through the MSVC-specific m128i_i8 view; index is not range-checked.\r
+inline char s8_x::operator[](int index) const\r
+{\r
+       return value_.m128i_i8[index];\r
+}\r
+\r
+// Mutable lane access through the same view; index is not range-checked.\r
+inline char& s8_x::operator[](int index)\r
+{\r
+       return value_.m128i_i8[index];\r
+}\r
+\r
+// Packs the signed 16-bit lanes of lhs (low eight bytes of the result) and rhs\r
+// (high eight bytes) into bytes, saturating each value to the unsigned range 0..255.\r
+inline s8_x s8_x::upack(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+       return _mm_packus_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Multiplies each byte of lhs (read as unsigned) by the matching byte of rhs\r
+// (read as signed) and adds adjacent products into saturated signed 16-bit lanes.\r
+inline s16_x s8_x::multiply_add(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+       return _mm_maddubs_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Rearranges the bytes of lhs according to the control bytes in rhs.\r
+inline s8_x s8_x::shuffle(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+       return _mm_shuffle_epi8(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Lane-wise signed maximum.\r
+inline s8_x s8_x::max(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+       return _mm_max_epi8(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Lane-wise signed minimum.\r
+inline s8_x s8_x::min(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+       return _mm_min_epi8(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Per-byte select: takes the rhs byte where the top bit of the mask byte is set,\r
+// otherwise the lhs byte.\r
+inline s8_x s8_x::blend(const s8_x& lhs, const s8_x& rhs, const s8_x& mask)\r
+{\r
+       return _mm_blendv_epi8(lhs.value_, rhs.value_, mask.value_);\r
+}\r
+\r
+// Returns the lane-wise sum of lhs and rhs (via operator+=).\r
+inline s8_x operator+(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+       s8_x sum(lhs);\r
+       sum += rhs;\r
+       return sum;\r
+}\r
+\r
+// Returns the lane-wise difference lhs - rhs (via operator-=).\r
+inline s8_x operator-(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+       s8_x difference(lhs);\r
+       difference -= rhs;\r
+       return difference;\r
+}\r
+\r
+// u8_x\r
+\r
+// Out-of-class member definitions for u8_x. They are marked inline because this\r
+// is a header: a non-inline definition would be emitted by every translation unit\r
+// that includes it and the duplicates would collide at link time.\r
+\r
+// Default constructor: value_ is left uninitialised.\r
+inline u8_x::u8_x()\r
+{\r
+}\r
+\r
+// The next three constructors copy the raw 128 bits of another lane type\r
+// unchanged; no per-lane widening or narrowing takes place.\r
+inline u8_x::u8_x(const s32_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+inline u8_x::u8_x(const s16_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+inline u8_x::u8_x(const s8_x& other)\r
+       : value_(other.value_)\r
+{\r
+}\r
+\r
+// Wraps a raw register.\r
+inline u8_x::u8_x(const __m128i& value)\r
+       : value_(value)\r
+{\r
+}\r
+\r
+// Broadcasts b to all sixteen byte lanes.\r
+inline u8_x::u8_x(char b)\r
+       : value_(_mm_set1_epi8(b))\r
+{\r
+}\r
+\r
+// Repeats the four-byte pattern b3..b0 four times. _mm_set_epi8 takes its\r
+// arguments from the highest byte down, so b0 lands in the lowest byte of each group.\r
+inline u8_x::u8_x(char b3,  char b2,  char b1,  char b0)\r
+       : value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))\r
+{\r
+}\r
+\r
+// Sets every byte lane explicitly; b15 is the highest byte and b0 the lowest.\r
+inline u8_x::u8_x(char b15, char b14, char b13, char b12, \r
+                       char b11, char b10, char b9,  char b8,  \r
+                       char b7,  char b6,  char b5,  char b4,  \r
+                       char b3,  char b2,  char b1,  char b0)\r
+       : value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))\r
+{\r
+}\r
+\r
+// Lane read through the MSVC-specific m128i_i8 view, so the byte is returned\r
+// as char; index is not range-checked.\r
+inline char u8_x::operator[](int index) const\r
+{\r
+       return value_.m128i_i8[index];\r
+}\r
+\r
+// Mutable lane access through the same view; index is not range-checked.\r
+inline char& u8_x::operator[](int index)\r
+{\r
+       return value_.m128i_i8[index];\r
+}\r
+\r
+// Lane-wise unsigned maximum.\r
+inline u8_x u8_x::max(const u8_x& lhs, const u8_x& rhs)\r
+{\r
+       return _mm_max_epu8(lhs.value_, rhs.value_);\r
+}\r
+\r
+// Lane-wise unsigned minimum.\r
+inline u8_x u8_x::min(const u8_x& lhs, const u8_x& rhs)\r
+{\r
+       return _mm_min_epu8(lhs.value_, rhs.value_);\r
+}\r
+\r
+\r
+// xmm_cast\r
+\r
+//template<typename T>\r
+//struct xmm_cast_impl\r
+//{            \r
+//     template<typename U>\r
+//     T operator()(const U& other)\r
+//     {\r
+//             return typename T::xmm_epi_tag(other.value_);\r
+//     }\r
+//};\r
+//\r
+//template<>\r
+//struct xmm_cast_impl<xmm_ps>\r
+//{\r
+//     xmm_ps operator()(const s32_x& other)\r
+//     {\r
+//             return _mm_cvtepi32_ps(other.value_);\r
+//     }\r
+//};\r
+//\r
+//template<typename T, typename U> \r
+//T xmm_cast(const U& other)\r
+//{\r
+//     return xmm_cast_impl<T>()(other);\r
+//}\r
+\r
+}}}}
\ No newline at end of file