<ItemGroup>\r
<ClInclude Include="cpu\factory.h" />\r
<ClInclude Include="cpu\image\image_mixer.h" />\r
- <ClInclude Include="cpu\util\simd.h" />\r
+ <ClInclude Include="cpu\util\xmm.h" />\r
<ClInclude Include="cpu\util\write_frame.h" />\r
<ClInclude Include="factory.h" />\r
<ClInclude Include="ogl\factory.h" />\r
</ItemGroup>\r
<ItemGroup>\r
<ClInclude Include="StdAfx.h" />\r
- <ClInclude Include="cpu\util\simd.h">\r
- <Filter>source\cpu\util</Filter>\r
- </ClInclude>\r
<ClInclude Include="ogl\image\image_mixer.h">\r
<Filter>source\ogl\image</Filter>\r
</ClInclude>\r
<ClInclude Include="ogl\factory.h">\r
<Filter>source\ogl</Filter>\r
</ClInclude>\r
+ <ClInclude Include="cpu\util\xmm.h">\r
+ <Filter>source\cpu\util</Filter>\r
+ </ClInclude>\r
</ItemGroup>\r
<ItemGroup>\r
<ClCompile Include="StdAfx.cpp" />\r
#include "image_mixer.h"\r
\r
#include "../util/write_frame.h"\r
-#include "../util/simd.h"\r
+#include "../util/xmm.h"\r
\r
#include <common/assert.h>\r
#include <common/gl/gl_check.h>\r
return !(lhs == rhs);\r
}\r
\r
-inline xmm_epi8 blend(xmm_epi8 dest, xmm_epi8 source)\r
+inline xmm::s8_x blend(xmm::s8_x dest, xmm::s8_x source)\r
{ \r
- auto s = xmm_cast<xmm_epi16>(source);\r
+ using namespace xmm;\r
+\r
+ auto s = s16_x(source);\r
auto d = dest;\r
\r
- const xmm_epi16 round = 128;\r
- const xmm_epi16 lomask = 0x00FF;\r
+ const s16_x round = 128;\r
+ const s16_x lomask = 0x00FF;\r
\r
// T(S, D) = S * D[A] + 0x80\r
- auto aaaa = xmm_epi8::shuffle(d, xmm_epi8(15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3));\r
- d = xmm_epi8::umin(d, aaaa); // overflow guard\r
+ auto aaaa = s8_x::shuffle(d, s8_x(15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3));\r
+ d = s8_x(u8_x::min(u8_x(d), u8_x(aaaa))); // overflow guard\r
\r
- auto xaxa = xmm_cast<xmm_epi16>(aaaa) & lomask; \r
+ auto xaxa = s16_x(aaaa) & lomask; \r
\r
auto xrxb = s & lomask;\r
- auto t1 = xmm_epi16::multiply_low(xrxb, xaxa) + round; \r
+ auto t1 = s16_x::multiply_low(xrxb, xaxa) + round; \r
\r
auto xaxg = s >> 8;\r
- auto t2 = xmm_epi16::multiply_low(xaxg, xaxa) + round;\r
+ auto t2 = s16_x::multiply_low(xaxg, xaxa) + round;\r
\r
// C(S, D) = S + D - (((T >> 8) + T) >> 8);\r
- auto rxbx = xmm_cast<xmm_epi8>(((t1 >> 8) + t1) >> 8); \r
- auto axgx = xmm_cast<xmm_epi8>((t2 >> 8) + t2); \r
- auto argb = xmm_epi8::blend(rxbx, axgx, xmm_epi8(-1, 0, -1, 0));\r
+ auto rxbx = s8_x(((t1 >> 8) + t1) >> 8); \r
+ auto axgx = s8_x((t2 >> 8) + t2); \r
+ auto argb = s8_x::blend(rxbx, axgx, s8_x(-1, 0, -1, 0));\r
\r
- return xmm_cast<xmm_epi8>(s) + (d - argb);\r
+ return s8_x(s) + (d - argb);\r
}\r
\r
-template<typename write_op>\r
+template<typename write_tag>\r
static void kernel(uint8_t* dest, const uint8_t* source, size_t count, const core::frame_transform& transform)\r
-{ \r
+{ \r
+ using namespace xmm;\r
+\r
for(auto n = 0; n < count; n += 32) \r
{\r
- auto s0 = xmm_epi8::load(dest+n+0);\r
- auto s1 = xmm_epi8::load(dest+n+16);\r
+ auto s0 = s8_x::load(dest+n+0);\r
+ auto s1 = s8_x::load(dest+n+16);\r
\r
- auto d0 = xmm_epi8::load(source+n+0);\r
- auto d1 = xmm_epi8::load(source+n+16);\r
+ auto d0 = s8_x::load(source+n+0);\r
+ auto d1 = s8_x::load(source+n+16);\r
\r
auto argb0 = blend(d0, s0);\r
auto argb1 = blend(d1, s1);\r
\r
- xmm_epi8::write<write_op>(argb0, dest+n+0);\r
- xmm_epi8::write<write_op>(argb1, dest+n+16);\r
+ s8_x::write(argb0, dest+n+0 , write_tag());\r
+ s8_x::write(argb1, dest+n+16, write_tag());\r
} \r
}\r
\r
\r
auto it = items.begin();\r
for(; it != items.end()-1; ++it) \r
- kernel<store_write>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
+ kernel<xmm::store_tag>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
\r
- kernel<stream_write>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
+ kernel<xmm::stream_tag>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
}\r
});\r
}\r
+++ /dev/null
-#pragma once\r
-\r
-#include <intrin.h>\r
-\r
-#include <type_traits>\r
-#include <vector>\r
-#include <tbb/cache_aligned_allocator.h>\r
-\r
-namespace caspar { namespace accelerator { namespace cpu {\r
-\r
-typedef std::vector<float, tbb::cache_aligned_allocator<float>> vector_ps;\r
-\r
-struct stream_write\r
-{\r
- static const int value = 0x01;\r
-};\r
-struct store_write\r
-{\r
- static const int value = 0x02;\r
-};\r
-\r
-class xmm_ps\r
-{\r
- __m128 value_;\r
-public:\r
- xmm_ps()\r
- {\r
- }\r
- \r
- xmm_ps(float value_)\r
- : value_(_mm_set1_ps(value_))\r
- {\r
- }\r
- \r
- xmm_ps(__m128 value_)\r
- : value_(value_)\r
- {\r
- }\r
- \r
- xmm_ps& operator+=(const xmm_ps& other)\r
- {\r
- value_ = _mm_add_ps(value_, other.value_);\r
- return *this;\r
- }\r
-\r
- xmm_ps& operator-=(const xmm_ps& other)\r
- {\r
- value_ = _mm_sub_ps(value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_ps& operator*=(const xmm_ps& other)\r
- {\r
- value_ = _mm_mul_ps(value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_ps& operator/=(const xmm_ps& other)\r
- {\r
- value_ = _mm_div_ps(value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_ps& horizontal_add(const xmm_ps& other)\r
- {\r
- value_ = _mm_hadd_ps(value_, other.value_);\r
- return *this;\r
- }\r
-\r
- xmm_ps& horizontal_sub(const xmm_ps& other)\r
- {\r
- value_ = _mm_hsub_ps(value_, other.value_);\r
- return *this;\r
- }\r
-\r
- xmm_ps unpack_low(const xmm_ps& other)\r
- { \r
- value_ = _mm_unpacklo_ps(value_, other.value_);\r
- return *this;\r
- }\r
-\r
- xmm_ps unpack_high(const xmm_ps& other)\r
- { \r
- value_ = _mm_unpackhi_ps(value_, other.value_);\r
- return *this;\r
- }\r
- \r
- float operator[](int index) const\r
- {\r
- return value_.m128_f32[index];\r
- }\r
-\r
- float& operator[](int index)\r
- {\r
- return value_.m128_f32[index];\r
- }\r
-\r
- static xmm_ps zero()\r
- {\r
- return _mm_setzero_ps();\r
- }\r
-\r
- static xmm_ps load(const float* ptr)\r
- {\r
- return _mm_load_ps(ptr);\r
- }\r
- \r
- static xmm_ps loadu(const float* ptr)\r
- {\r
- return _mm_loadu_ps(ptr);\r
- }\r
-\r
- static void stream(const xmm_ps& source, float* dest)\r
- {\r
- _mm_stream_ps(dest, source.value_);\r
- }\r
- \r
- static xmm_ps horizontal_add(const xmm_ps& lhs, const xmm_ps& rhs)\r
- {\r
- return xmm_ps(lhs).horizontal_add(rhs);\r
- }\r
-\r
- static xmm_ps horizontal_sub(const xmm_ps& lhs, const xmm_ps& rhs)\r
- {\r
- return xmm_ps(lhs).horizontal_sub(rhs);\r
- }\r
-\r
- static xmm_ps unpack_low(const xmm_ps& lhs, const xmm_ps& rhs)\r
- {\r
- return xmm_ps(lhs).unpack_low(rhs);\r
- }\r
-\r
- static xmm_ps unpack_high(const xmm_ps& lhs, const xmm_ps& rhs)\r
- {\r
- return xmm_ps(lhs).unpack_high(rhs);\r
- }\r
-};\r
- \r
-inline xmm_ps operator+(const xmm_ps& lhs, const xmm_ps& rhs)\r
-{ \r
- return xmm_ps(lhs) += rhs;\r
-}\r
-\r
-inline xmm_ps operator-(const xmm_ps& lhs, const xmm_ps& rhs)\r
-{ \r
- return xmm_ps(lhs) -= rhs;\r
-}\r
-\r
-inline xmm_ps operator*(const xmm_ps& lhs, const xmm_ps& rhs)\r
-{ \r
- return xmm_ps(lhs) *= rhs;\r
-}\r
-\r
-inline xmm_ps operator/(const xmm_ps& lhs, const xmm_ps& rhs)\r
-{ \r
- return xmm_ps(lhs) /= rhs;\r
-}\r
-\r
-class xmm_epi32\r
-{\r
- __m128i value_;\r
- template<typename> friend struct xmm_cast_impl;\r
-public:\r
- typedef xmm_epi32 xmm_epi_tag;\r
-\r
- xmm_epi32()\r
- {\r
- }\r
-\r
- xmm_epi32(__m128i value)\r
- : value_(value)\r
- {\r
- }\r
- \r
- xmm_epi32& operator>>=(int count)\r
- {\r
- value_ = _mm_srli_epi32(value_, count);\r
- return *this;\r
- }\r
- \r
- xmm_epi32& operator<<=(int count)\r
- {\r
- value_ = _mm_slli_epi32(value_, count);\r
- return *this;\r
- }\r
- \r
- xmm_epi32& operator|=(const xmm_epi32& other)\r
- {\r
- value_ = _mm_or_si128(value_, other.value_);\r
- return *this;\r
- } \r
- \r
- xmm_epi32& operator&=(const xmm_epi32& other)\r
- {\r
- value_ = _mm_and_si128(value_, other.value_);\r
- return *this;\r
- } \r
-\r
- static xmm_epi32 load(const void* source)\r
- {\r
- return _mm_load_si128(reinterpret_cast<const __m128i*>(source));\r
- }\r
- \r
- static xmm_epi32 loadu(const void* source)\r
- {\r
- return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));\r
- }\r
- \r
- int32_t operator[](int index) const\r
- {\r
- return value_.m128i_i32[index];\r
- }\r
-\r
- int32_t& operator[](int index)\r
- {\r
- return value_.m128i_i32[index];\r
- }\r
-\r
- static xmm_epi32 zero()\r
- {\r
- return _mm_setzero_si128();\r
- }\r
-};\r
-\r
-inline xmm_epi32 operator>>(const xmm_epi32& lhs, int count)\r
-{ \r
- return xmm_epi32(lhs) >>= count;\r
-}\r
-\r
-inline xmm_epi32 operator<<(const xmm_epi32& lhs, int count)\r
-{ \r
- return xmm_epi32(lhs) <<= count;\r
-}\r
-\r
-inline xmm_epi32 operator|(const xmm_epi32& lhs, const xmm_epi32& rhs)\r
-{ \r
- return xmm_epi32(lhs) |= rhs;\r
-}\r
-\r
-inline xmm_epi32 operator&(const xmm_epi32& lhs, const xmm_epi32& rhs)\r
-{ \r
- return xmm_epi32(lhs) &= rhs;\r
-}\r
-\r
-class xmm_epi16\r
-{\r
- __m128i value_;\r
- template<typename> friend struct xmm_cast_impl;\r
- friend xmm_epi32 horizontal_add(const xmm_epi16&);\r
- friend class xmm_epi8;\r
-public:\r
- typedef xmm_epi16 xmm_epi_tag;\r
-\r
- xmm_epi16()\r
- {\r
- }\r
-\r
- xmm_epi16(__m128i value)\r
- : value_(value)\r
- {\r
- }\r
-\r
- xmm_epi16(short value)\r
- : value_(_mm_set1_epi16(value))\r
- {\r
- }\r
-\r
- xmm_epi16& operator+=(const xmm_epi16& other)\r
- {\r
- value_ = _mm_add_epi16(value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_epi16& operator-=(const xmm_epi16& other)\r
- {\r
- value_ = _mm_sub_epi16(value_, other.value_);\r
- return *this;\r
- }\r
-\r
- xmm_epi16& operator>>=(int count)\r
- {\r
- value_ = _mm_srli_epi16(value_, count);\r
- return *this;\r
- }\r
- \r
- xmm_epi16& operator<<=(int count)\r
- {\r
- value_ = _mm_slli_epi16(value_, count);\r
- return *this;\r
- }\r
-\r
- xmm_epi16& operator|=(const xmm_epi16& other)\r
- {\r
- value_ = _mm_or_si128(value_, other.value_);\r
- return *this;\r
- } \r
- \r
- xmm_epi16& operator&=(const xmm_epi16& other)\r
- {\r
- value_ = _mm_and_si128(value_, other.value_);\r
- return *this;\r
- } \r
- \r
- xmm_epi16 multiply_low(const xmm_epi16& other)\r
- { \r
- value_ = _mm_mullo_epi16(value_, other.value_);\r
- return *this;\r
- }\r
-\r
- xmm_epi16 multiply_high(const xmm_epi16& other)\r
- { \r
- value_ = _mm_mulhi_epi16(value_, other.value_);\r
- return *this;\r
- }\r
-\r
- xmm_epi16 umultiply_low(const xmm_epi16& other)\r
- { \r
- value_ = _mm_mullo_epi16(value_, other.value_);\r
- return *this;\r
- }\r
-\r
- xmm_epi16 umultiply_high(const xmm_epi16& other)\r
- { \r
- value_ = _mm_mulhi_epi16(value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_epi16 and_not(const xmm_epi16& other)\r
- { \r
- value_ = _mm_andnot_si128(other.value_, value_);\r
- return *this;\r
- }\r
-\r
- xmm_epi16 unpack_low(const xmm_epi16& other)\r
- { \r
- value_ = _mm_unpacklo_epi16 (value_, other.value_);\r
- return *this;\r
- }\r
-\r
- xmm_epi16 unpack_high(const xmm_epi16& other)\r
- { \r
- value_ = _mm_unpackhi_epi16 (value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_epi16 max(const xmm_epi16& other)\r
- { \r
- value_ = _mm_max_epi16(value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_epi16 min(const xmm_epi16& other)\r
- { \r
- value_ = _mm_min_epi16(value_, other.value_);\r
- return *this;\r
- }\r
-\r
- int16_t operator[](int index) const\r
- {\r
- return value_.m128i_i16[index];\r
- }\r
-\r
- int16_t& operator[](int index)\r
- {\r
- return value_.m128i_i16[index];\r
- }\r
- \r
- static xmm_epi16 load(const void* source)\r
- {\r
- return _mm_load_si128(reinterpret_cast<const __m128i*>(source));\r
- }\r
- \r
- static xmm_epi16 loadu(const void* source)\r
- {\r
- return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));\r
- }\r
- \r
- static xmm_epi32 horizontal_add(const xmm_epi16& lhs)\r
- {\r
- #ifdef SSIM_XOP\r
- return _mm_haddd_epi16(value_);\r
- #else\r
- return _mm_madd_epi16(lhs.value_, _mm_set1_epi16(1));\r
- #endif\r
- }\r
-\r
- static xmm_epi16 multiply_low(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
- {\r
- return xmm_epi16(lhs).multiply_low(rhs);\r
- }\r
-\r
- static xmm_epi16 multiply_high(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
- {\r
- return xmm_epi16(lhs).multiply_high(rhs);\r
- }\r
-\r
- static xmm_epi16 umultiply_low(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
- {\r
- return xmm_epi16(lhs).umultiply_low(rhs);\r
- }\r
-\r
- static xmm_epi16 umultiply_high(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
- {\r
- return xmm_epi16(lhs).umultiply_high(rhs);\r
- }\r
-\r
- static xmm_epi16 unpack_low(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
- {\r
- return xmm_epi16(lhs).unpack_low(rhs);\r
- }\r
-\r
- static xmm_epi16 unpack_high(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
- {\r
- return xmm_epi16(lhs).unpack_high(rhs);\r
- }\r
- \r
- static xmm_epi16 and_not(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
- {\r
- return xmm_epi16(lhs).and_not(rhs);\r
- }\r
- \r
- static xmm_epi16 max(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
- {\r
- return xmm_epi16(lhs).max(rhs);\r
- }\r
- \r
- static xmm_epi16 min(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
- {\r
- return xmm_epi16(lhs).min(rhs);\r
- }\r
-\r
- static xmm_epi16 zero()\r
- {\r
- return _mm_setzero_si128();\r
- }\r
-};\r
-\r
-inline xmm_epi16 operator+(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-{\r
- return xmm_epi16(lhs) += rhs;\r
-}\r
-\r
-inline xmm_epi16 operator-(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-{\r
- return xmm_epi16(lhs) -= rhs;\r
-}\r
-\r
-inline xmm_epi16 operator>>(const xmm_epi16& lhs, int count)\r
-{ \r
- return xmm_epi16(lhs) >>= count;\r
-}\r
-\r
-inline xmm_epi16 operator<<(const xmm_epi16& lhs, int count)\r
-{ \r
- return xmm_epi16(lhs) <<= count;\r
-}\r
-\r
-inline xmm_epi16 operator|(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-{ \r
- return xmm_epi16(lhs) |= rhs;\r
-}\r
-\r
-inline xmm_epi16 operator&(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
-{ \r
- return xmm_epi16(lhs) &= rhs;\r
-}\r
-\r
-class xmm_epi8\r
-{\r
- __m128i value_;\r
- template<typename> friend struct xmm_cast_impl;\r
- friend xmm_epi16 multiply_add(const xmm_epi8&, const xmm_epi8&);\r
-public:\r
- typedef xmm_epi8 xmm_epi_tag;\r
-\r
- xmm_epi8()\r
- {\r
- }\r
-\r
- xmm_epi8(__m128i value)\r
- : value_(value)\r
- {\r
- }\r
- \r
- xmm_epi8(char b)\r
- : value_(_mm_set1_epi8(b))\r
- {\r
- }\r
-\r
- xmm_epi8(char b3, char b2, char b1, char b0)\r
- : value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))\r
- {\r
- }\r
-\r
- xmm_epi8(char b15, char b14, char b13, char b12, \r
- char b11, char b10, char b9, char b8, \r
- char b7, char b6, char b5, char b4, \r
- char b3, char b2, char b1, char b0)\r
- : value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))\r
- {\r
- }\r
- \r
- xmm_epi8& operator+=(const xmm_epi8& other)\r
- {\r
- value_ = _mm_add_epi8(value_, other.value_);\r
- return *this;\r
- }\r
-\r
- xmm_epi8& operator-=(const xmm_epi8& other)\r
- {\r
- value_ = _mm_sub_epi8(value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_epi8& shuffle(const xmm_epi8& other)\r
- { \r
- value_ = _mm_shuffle_epi8 (value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_epi8& max(const xmm_epi8& other)\r
- { \r
- value_ = _mm_max_epi8 (value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_epi8& min(const xmm_epi8& other)\r
- { \r
- value_ = _mm_min_epi8 (value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_epi8& umax(const xmm_epi8& other)\r
- { \r
- value_ = _mm_max_epu8(value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_epi8& umin(const xmm_epi8& other)\r
- { \r
- value_ = _mm_min_epu8(value_, other.value_);\r
- return *this;\r
- }\r
- \r
- xmm_epi8& blend(const xmm_epi8& other, const xmm_epi8& mask)\r
- { \r
- value_ = _mm_blendv_epi8(value_, other.value_, mask.value_);\r
- return *this;\r
- }\r
- \r
- const xmm_epi8& stream(void* dest) const\r
- {\r
- _mm_stream_si128(reinterpret_cast<__m128i*>(dest), value_);\r
- return *this;\r
- }\r
-\r
- const xmm_epi8& store(void* dest) const\r
- {\r
- _mm_store_si128(reinterpret_cast<__m128i*>(dest), value_);\r
- return *this;\r
- }\r
-\r
- template<typename write_op>\r
- const xmm_epi8& write(void* dest) const\r
- {\r
- if(write_op::value == stream_write::value)\r
- return stream(dest);\r
- else\r
- return store(dest);\r
- }\r
- \r
- char operator[](int index) const\r
- {\r
- return value_.m128i_i8[index];\r
- }\r
-\r
- char& operator[](int index)\r
- {\r
- return value_.m128i_i8[index];\r
- }\r
- \r
- static xmm_epi16 unpack_low(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
- {\r
- return _mm_unpacklo_epi8(rhs.value_, lhs.value_);\r
- }\r
- \r
- static xmm_epi16 unpack_high(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
- {\r
- return _mm_unpackhi_epi8(rhs.value_, lhs.value_);\r
- }\r
- \r
- static xmm_epi8 upack(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
- {\r
- return _mm_packus_epi16(lhs.value_, rhs.value_);\r
- }\r
-\r
- static const xmm_epi8& stream(const xmm_epi8& source, void* dest)\r
- {\r
- return source.stream(dest);\r
- }\r
- \r
- static const xmm_epi8& store(const xmm_epi8& source, void* dest)\r
- {\r
- return source.store(dest);\r
- }\r
- \r
- template<typename write_op>\r
- static const xmm_epi8& write(const xmm_epi8& source, void* dest)\r
- {\r
- return source.write<write_op>(dest);\r
- }\r
-\r
- static xmm_epi8 load(const void* source)\r
- {\r
- return _mm_load_si128(reinterpret_cast<const __m128i*>(source));\r
- }\r
- \r
- static xmm_epi8 loadu(const void* source)\r
- {\r
- return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));\r
- }\r
-\r
- static xmm_epi16 multiply_add(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
- { \r
- return xmm_epi16(_mm_maddubs_epi16(lhs.value_, rhs.value_));\r
- }\r
-\r
- static xmm_epi8& shuffle(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
- { \r
- return xmm_epi8(lhs).shuffle(rhs);\r
- }\r
- \r
- static xmm_epi8& max(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
- { \r
- return xmm_epi8(lhs).max(rhs);\r
- }\r
- \r
- static xmm_epi8& min(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
- { \r
- return xmm_epi8(lhs).min(rhs);\r
- }\r
- \r
- static xmm_epi8& umax(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
- { \r
- return xmm_epi8(lhs).umax(rhs);\r
- }\r
- \r
- static xmm_epi8& umin(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
- { \r
- return xmm_epi8(lhs).umin(rhs);\r
- }\r
-\r
- static xmm_epi8& blend(const xmm_epi8& lhs, const xmm_epi8& rhs, const xmm_epi8& mask)\r
- { \r
- return xmm_epi8(lhs).blend(rhs, mask);\r
- }\r
-\r
- static xmm_epi8 zero()\r
- {\r
- return _mm_setzero_si128();\r
- }\r
-};\r
-\r
-inline xmm_epi8 operator+(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-{\r
- return xmm_epi8(lhs) += rhs;\r
-}\r
-\r
-inline xmm_epi8 operator-(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
-{\r
- return xmm_epi8(lhs) -= rhs;\r
-}\r
-\r
-// xmm_cast\r
-\r
-template<typename T>\r
-struct xmm_cast_impl\r
-{ \r
- template<typename U>\r
- T operator()(const U& other)\r
- {\r
- return typename T::xmm_epi_tag(other.value_);\r
- }\r
-};\r
-\r
-template<>\r
-struct xmm_cast_impl<xmm_ps>\r
-{\r
- xmm_ps operator()(const xmm_epi32& other)\r
- {\r
- return _mm_cvtepi32_ps(other.value_);\r
- }\r
-};\r
-\r
-template<typename T, typename U> \r
-T xmm_cast(const U& other)\r
-{\r
- return xmm_cast_impl<T>()(other);\r
-}\r
-\r
-}}}
\ No newline at end of file
--- /dev/null
+#pragma once\r
+\r
+#include <intrin.h>\r
+\r
+#include <type_traits>\r
+#include <vector>\r
+#include <tbb/cache_aligned_allocator.h>\r
+\r
+namespace caspar { namespace accelerator { namespace cpu { namespace xmm {\r
+\r
+typedef std::vector<float, tbb::cache_aligned_allocator<float>> vector_ps;\r
+\r
+struct stream_tag\r
+{\r
+ static const int value = 0x01;\r
+};\r
+struct store_tag\r
+{\r
+ static const int value = 0x02;\r
+};\r
+\r
+class s32_x;\r
+class s16_x;\r
+class s8_x;\r
+class u8_x;\r
+\r
+template<typename T>\r
+class base_x\r
+{\r
+public:\r
+ static T load(const void* source);\r
+ static T loadu(const void* source);\r
+ static T zero();\r
+ \r
+ static void write(const T& source, void* dest, stream_tag);\r
+ static void write(const T& source, void* dest, store_tag);\r
+ static void stream(const T& source, void* dest);\r
+ static void store(const T& source, void* dest);\r
+};\r
+\r
+class s32_x : public base_x<s32_x>\r
+{\r
+ __m128i value_;\r
+ template<typename> friend class base_x;\r
+ friend class s16_x;\r
+ friend class s8_x;\r
+ friend class u8_x;\r
+public:\r
+ typedef s32_x xmm_epi_tag;\r
+\r
+ s32_x();\r
+ s32_x(const s16_x& other);\r
+ s32_x(const s8_x& other);\r
+ s32_x(const u8_x& other);\r
+ s32_x(const __m128i& value);\r
+\r
+ s32_x& operator>>=(int count);\r
+ s32_x& operator<<=(int count);\r
+ s32_x& operator|=(const s32_x& other);\r
+ s32_x& operator&=(const s32_x& other);\r
+ int32_t operator[](int index) const;\r
+ int32_t& operator[](int index);\r
+};\r
+\r
+class s16_x : public base_x<s16_x>\r
+{\r
+ __m128i value_;\r
+\r
+private:\r
+ template<typename> friend class base_x;\r
+ friend class s32_x;\r
+ friend class s8_x;\r
+ friend class u8_x;\r
+public:\r
+ typedef s16_x xmm_epi_tag;\r
+\r
+ s16_x();\r
+ s16_x(const s32_x& other);\r
+ s16_x(const s8_x& other);\r
+ s16_x(const u8_x& other);\r
+ s16_x(const __m128i& value);\r
+ s16_x(short value);\r
+\r
+ s16_x& operator+=(const s16_x& other); \r
+ s16_x& operator-=(const s16_x& other);\r
+ s16_x& operator>>=(int count);\r
+ s16_x& operator<<=(int count);\r
+ s16_x& operator|=(const s16_x& other);\r
+ s16_x& operator&=(const s16_x& other); \r
+ int16_t operator[](int index) const;\r
+ int16_t& operator[](int index);\r
+ \r
+ static s16_x unpack_low(const s8_x& lhs, const s8_x& rhs);\r
+ static s16_x unpack_high(const s8_x& lhs, const s8_x& rhs);\r
+ static s32_x horizontal_add(const s16_x& lhs);\r
+ static s16_x multiply_low(const s16_x& lhs, const s16_x& rhs);\r
+ static s16_x multiply_high(const s16_x& lhs, const s16_x& rhs);\r
+ static s16_x umultiply_low(const s16_x& lhs, const s16_x& rhs);\r
+ static s16_x umultiply_high(const s16_x& lhs, const s16_x& rhs); \r
+ static s16_x unpack_low(const s16_x& lhs, const s16_x& rhs);\r
+ static s16_x unpack_high(const s16_x& lhs, const s16_x& rhs);\r
+ static s16_x and_not(const s16_x& lhs, const s16_x& rhs); \r
+ static s16_x max(const s16_x& lhs, const s16_x& rhs);\r
+ static s16_x min(const s16_x& lhs, const s16_x& rhs);\r
+};\r
+\r
+class s8_x : public base_x<s8_x>\r
+{\r
+ __m128i value_;\r
+private:\r
+ template<typename> friend class base_x;\r
+ friend class s32_x;\r
+ friend class s16_x;\r
+ friend class u8_x;\r
+public:\r
+ typedef s8_x xmm_epi_tag;\r
+\r
+ s8_x();\r
+ s8_x(const s32_x& other);\r
+ s8_x(const s16_x& other);\r
+ s8_x(const u8_x& other);\r
+ s8_x(const __m128i& value); \r
+ s8_x(char b);\r
+ s8_x(char b3, char b2, char b1, char b0);\r
+ s8_x(char b15, char b14, char b13, char b12, \r
+ char b11, char b10, char b9, char b8, \r
+ char b7, char b6, char b5, char b4, \r
+ char b3, char b2, char b1, char b0);\r
+\r
+ s8_x& operator+=(const s8_x& other);\r
+ s8_x& operator-=(const s8_x& other); \r
+ char operator[](int index) const;\r
+ char& operator[](int index);\r
+ \r
+ static s8_x upack(const s16_x& lhs, const s16_x& rhs);\r
+\r
+ static s16_x multiply_add(const s8_x& lhs, const s8_x& rhs);\r
+ static s8_x shuffle(const s8_x& lhs, const s8_x& rhs);\r
+ static s8_x max(const s8_x& lhs, const s8_x& rhs);\r
+ static s8_x min(const s8_x& lhs, const s8_x& rhs);\r
+ static s8_x blend(const s8_x& lhs, const s8_x& rhs, const s8_x& mask);\r
+ static s8_x zero();\r
+};\r
+\r
+class u8_x : public base_x<u8_x>\r
+{\r
+ __m128i value_;\r
+private:\r
+ template<typename> friend class base_x;\r
+ friend class s32_x;\r
+ friend class s16_x;\r
+ friend class s8_x;\r
+public:\r
+ typedef u8_x xmm_epu_tag;\r
+\r
+ u8_x();\r
+ u8_x(const s32_x& other);\r
+ u8_x(const s16_x& other);\r
+ u8_x(const s8_x& other);\r
+ u8_x(const __m128i& value); \r
+ u8_x(char b);\r
+ u8_x(char b3, char b2, char b1, char b0);\r
+ u8_x(char b15, char b14, char b13, char b12, \r
+ char b11, char b10, char b9, char b8, \r
+ char b7, char b6, char b5, char b4, \r
+ char b3, char b2, char b1, char b0);\r
+ \r
+ char operator[](int index) const;\r
+ char& operator[](int index);\r
+ \r
+ static u8_x max(const u8_x& lhs, const u8_x& rhs);\r
+ static u8_x min(const u8_x& lhs, const u8_x& rhs);\r
+};\r
+\r
+// base_x\r
+\r
+template<typename T>\r
+T base_x<T>::load(const void* source)\r
+{\r
+ return _mm_load_si128(reinterpret_cast<const __m128i*>(source));\r
+}\r
+ \r
+template<typename T>\r
+T base_x<T>::loadu(const void* source)\r
+{\r
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));\r
+}\r
+\r
+template<typename T>\r
+T base_x<T>::zero()\r
+{\r
+ return _mm_setzero_si128();\r
+}\r
+\r
+template<typename T>\r
+void base_x<T>::write(const T& source, void* dest, store_tag)\r
+{\r
+ base_x<T>::store(source, dest);\r
+}\r
+\r
+template<typename T>\r
+void base_x<T>::write(const T& source, void* dest, stream_tag)\r
+{\r
+ base_x<T>::stream(source, dest);\r
+}\r
+\r
+template<typename T>\r
+void base_x<T>::stream(const T& source, void* dest)\r
+{\r
+ _mm_stream_si128(reinterpret_cast<__m128i*>(dest), source.value_);\r
+}\r
+\r
+template<typename T>\r
+void base_x<T>::store(const T& source, void* dest)\r
+{ \r
+ _mm_store_si128(reinterpret_cast<__m128i*>(dest), source.value_);\r
+}\r
+\r
+// s32_x\r
+\r
+// NOTE(review): all out-of-class definitions in this header must be `inline`;
+// without it, including this header from two translation units violates the
+// ODR and fails at link time (the old simd.h defined everything in-class).
+inline s32_x::s32_x()\r
+{\r
+}\r
+\r
+inline s32_x::s32_x(const s16_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline s32_x::s32_x(const s8_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline s32_x::s32_x(const u8_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline s32_x::s32_x(const __m128i& value)\r
+	: value_(value)\r
+{\r
+}\r
+\r
+inline s32_x& s32_x::operator>>=(int count)\r
+{\r
+	value_ = _mm_srli_epi32(value_, count);\r
+	return *this;\r
+}\r
+\r
+inline s32_x& s32_x::operator<<=(int count)\r
+{\r
+	value_ = _mm_slli_epi32(value_, count);\r
+	return *this;\r
+}\r
+\r
+inline s32_x& s32_x::operator|=(const s32_x& other)\r
+{\r
+	value_ = _mm_or_si128(value_, other.value_);\r
+	return *this;\r
+}\r
+\r
+inline s32_x& s32_x::operator&=(const s32_x& other)\r
+{\r
+	value_ = _mm_and_si128(value_, other.value_);\r
+	return *this;\r
+}\r
+\r
+inline int32_t s32_x::operator[](int index) const\r
+{\r
+	return value_.m128i_i32[index];\r
+}\r
+\r
+inline int32_t& s32_x::operator[](int index)\r
+{\r
+	return value_.m128i_i32[index];\r
+}\r
+\r
+inline s32_x operator>>(const s32_x& lhs, int count)\r
+{ \r
+ return s32_x(lhs) >>= count;\r
+}\r
+\r
+inline s32_x operator<<(const s32_x& lhs, int count)\r
+{ \r
+ return s32_x(lhs) <<= count;\r
+}\r
+\r
+inline s32_x operator|(const s32_x& lhs, const s32_x& rhs)\r
+{ \r
+ return s32_x(lhs) |= rhs;\r
+}\r
+\r
+inline s32_x operator&(const s32_x& lhs, const s32_x& rhs)\r
+{ \r
+ return s32_x(lhs) &= rhs;\r
+}\r
+\r
+// s16_x\r
+\r
+inline s16_x::s16_x()\r
+{\r
+}\r
+\r
+inline s16_x::s16_x(const s32_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline s16_x::s16_x(const s8_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline s16_x::s16_x(const u8_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline s16_x::s16_x(const __m128i& value)\r
+	: value_(value)\r
+{\r
+}\r
+\r
+inline s16_x::s16_x(short value)\r
+	: value_(_mm_set1_epi16(value))\r
+{\r
+}\r
+\r
+inline s16_x& s16_x::operator+=(const s16_x& other)\r
+{\r
+	value_ = _mm_add_epi16(value_, other.value_);\r
+	return *this;\r
+}\r
+\r
+inline s16_x& s16_x::operator-=(const s16_x& other)\r
+{\r
+	value_ = _mm_sub_epi16(value_, other.value_);\r
+	return *this;\r
+}\r
+\r
+inline s16_x& s16_x::operator>>=(int count)\r
+{\r
+	value_ = _mm_srli_epi16(value_, count);\r
+	return *this;\r
+}\r
+\r
+inline s16_x& s16_x::operator<<=(int count)\r
+{\r
+	value_ = _mm_slli_epi16(value_, count);\r
+	return *this;\r
+}\r
+\r
+inline s16_x& s16_x::operator|=(const s16_x& other)\r
+{\r
+	value_ = _mm_or_si128(value_, other.value_);\r
+	return *this;\r
+}\r
+\r
+inline s16_x& s16_x::operator&=(const s16_x& other)\r
+{\r
+	value_ = _mm_and_si128(value_, other.value_);\r
+	return *this;\r
+}\r
+\r
+inline int16_t s16_x::operator[](int index) const\r
+{\r
+	return value_.m128i_i16[index];\r
+}\r
+\r
+inline int16_t& s16_x::operator[](int index)\r
+{\r
+	return value_.m128i_i16[index];\r
+}\r
+\r
+inline s16_x s16_x::unpack_low(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+	return _mm_unpacklo_epi8(rhs.value_, lhs.value_);\r
+}\r
+\r
+inline s16_x s16_x::unpack_high(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+	return _mm_unpackhi_epi8(rhs.value_, lhs.value_);\r
+}\r
+\r
+// FIX: the SSIM_XOP branch used `value_`, which does not exist in a static\r
+// member function; it must read the argument `lhs.value_`.\r
+inline s32_x s16_x::horizontal_add(const s16_x& lhs)\r
+{\r
+	#ifdef SSIM_XOP\r
+	return _mm_haddd_epi16(lhs.value_);\r
+	#else\r
+	return _mm_madd_epi16(lhs.value_, _mm_set1_epi16(1));\r
+	#endif\r
+}\r
+\r
+inline s16_x s16_x::multiply_low(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+	return _mm_mullo_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline s16_x s16_x::multiply_high(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+	return _mm_mulhi_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// FIX: umultiply_low/umultiply_high are declared in the class but had no\r
+// definition, which would fail at link time for any caller. Low 16 bits of\r
+// the product are sign-agnostic; the unsigned high half uses _mm_mulhi_epu16.\r
+inline s16_x s16_x::umultiply_low(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+	return _mm_mullo_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline s16_x s16_x::umultiply_high(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+	return _mm_mulhi_epu16(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline s16_x s16_x::unpack_low(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+	return _mm_unpacklo_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline s16_x s16_x::unpack_high(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+	return _mm_unpackhi_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+// FIX: operand order restored to match the old xmm_epi16::and_not(lhs, rhs),\r
+// which computed lhs & ~rhs (_mm_andnot_si128 negates its FIRST argument);\r
+// the rewrite had silently flipped it to rhs & ~lhs.\r
+inline s16_x s16_x::and_not(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+	return _mm_andnot_si128(rhs.value_, lhs.value_);\r
+}\r
+\r
+inline s16_x s16_x::max(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+	return _mm_max_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline s16_x s16_x::min(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+	return _mm_min_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline s16_x operator+(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+ return s16_x(lhs) += rhs;\r
+}\r
+\r
+inline s16_x operator-(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+ return s16_x(lhs) -= rhs;\r
+}\r
+\r
+inline s16_x operator>>(const s16_x& lhs, int count)\r
+{ \r
+ return s16_x(lhs) >>= count;\r
+}\r
+\r
+inline s16_x operator<<(const s16_x& lhs, int count)\r
+{ \r
+ return s16_x(lhs) <<= count;\r
+}\r
+\r
+inline s16_x operator|(const s16_x& lhs, const s16_x& rhs)\r
+{ \r
+ return s16_x(lhs) |= rhs;\r
+}\r
+\r
+inline s16_x operator&(const s16_x& lhs, const s16_x& rhs)\r
+{ \r
+ return s16_x(lhs) &= rhs;\r
+}\r
+\r
+// s8_x\r
+\r
+inline s8_x::s8_x()\r
+{\r
+}\r
+\r
+inline s8_x::s8_x(const s32_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline s8_x::s8_x(const s16_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline s8_x::s8_x(const u8_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline s8_x::s8_x(const __m128i& value)\r
+	: value_(value)\r
+{\r
+}\r
+\r
+inline s8_x::s8_x(char b)\r
+	: value_(_mm_set1_epi8(b))\r
+{\r
+}\r
+\r
+inline s8_x::s8_x(char b3, char b2, char b1, char b0)\r
+	: value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))\r
+{\r
+}\r
+\r
+inline s8_x::s8_x(char b15, char b14, char b13, char b12, \r
+	   char b11, char b10, char b9,  char b8, \r
+	   char b7,  char b6,  char b5,  char b4, \r
+	   char b3,  char b2,  char b1,  char b0)\r
+	: value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))\r
+{\r
+}\r
+\r
+inline s8_x& s8_x::operator+=(const s8_x& other)\r
+{\r
+	value_ = _mm_add_epi8(value_, other.value_);\r
+	return *this;\r
+}\r
+\r
+inline s8_x& s8_x::operator-=(const s8_x& other)\r
+{\r
+	value_ = _mm_sub_epi8(value_, other.value_);\r
+	return *this;\r
+}\r
+\r
+inline char s8_x::operator[](int index) const\r
+{\r
+	return value_.m128i_i8[index];\r
+}\r
+\r
+inline char& s8_x::operator[](int index)\r
+{\r
+	return value_.m128i_i8[index];\r
+}\r
+\r
+inline s8_x s8_x::upack(const s16_x& lhs, const s16_x& rhs)\r
+{\r
+	return _mm_packus_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline s16_x s8_x::multiply_add(const s8_x& lhs, const s8_x& rhs)\r
+{ \r
+	return _mm_maddubs_epi16(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline s8_x s8_x::shuffle(const s8_x& lhs, const s8_x& rhs)\r
+{ \r
+	return _mm_shuffle_epi8(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline s8_x s8_x::max(const s8_x& lhs, const s8_x& rhs)\r
+{ \r
+	return _mm_max_epi8(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline s8_x s8_x::min(const s8_x& lhs, const s8_x& rhs)\r
+{ \r
+	return _mm_min_epi8(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline s8_x s8_x::blend(const s8_x& lhs, const s8_x& rhs, const s8_x& mask)\r
+{ \r
+	return _mm_blendv_epi8(lhs.value_, rhs.value_, mask.value_);\r
+}\r
+\r
+// FIX: s8_x declares `static s8_x zero();`, which hides base_x<s8_x>::zero()\r
+// but had no definition — any call would fail at link time.\r
+inline s8_x s8_x::zero()\r
+{\r
+	return _mm_setzero_si128();\r
+}\r
+\r
+inline s8_x operator+(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+ return s8_x(lhs) += rhs;\r
+}\r
+\r
+inline s8_x operator-(const s8_x& lhs, const s8_x& rhs)\r
+{\r
+ return s8_x(lhs) -= rhs;\r
+}\r
+\r
+// u8_x\r
+\r
+inline u8_x::u8_x()\r
+{\r
+}\r
+\r
+inline u8_x::u8_x(const s32_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline u8_x::u8_x(const s16_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline u8_x::u8_x(const s8_x& other)\r
+	: value_(other.value_)\r
+{\r
+}\r
+\r
+inline u8_x::u8_x(const __m128i& value)\r
+	: value_(value)\r
+{\r
+}\r
+\r
+inline u8_x::u8_x(char b)\r
+	: value_(_mm_set1_epi8(b))\r
+{\r
+}\r
+\r
+inline u8_x::u8_x(char b3, char b2, char b1, char b0)\r
+	: value_(_mm_set_epi8(b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0, b3, b2, b1, b0))\r
+{\r
+}\r
+\r
+inline u8_x::u8_x(char b15, char b14, char b13, char b12, \r
+	   char b11, char b10, char b9,  char b8, \r
+	   char b7,  char b6,  char b5,  char b4, \r
+	   char b3,  char b2,  char b1,  char b0)\r
+	: value_(_mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0))\r
+{\r
+}\r
+\r
+inline char u8_x::operator[](int index) const\r
+{\r
+	return value_.m128i_i8[index];\r
+}\r
+\r
+inline char& u8_x::operator[](int index)\r
+{\r
+	return value_.m128i_i8[index];\r
+}\r
+\r
+inline u8_x u8_x::max(const u8_x& lhs, const u8_x& rhs)\r
+{ \r
+	return _mm_max_epu8(lhs.value_, rhs.value_);\r
+}\r
+\r
+inline u8_x u8_x::min(const u8_x& lhs, const u8_x& rhs)\r
+{ \r
+	return _mm_min_epu8(lhs.value_, rhs.value_);\r
+}\r
+\r
+\r
+// xmm_cast\r
+\r
+//template<typename T>\r
+//struct xmm_cast_impl\r
+//{ \r
+// template<typename U>\r
+// T operator()(const U& other)\r
+// {\r
+// return typename T::xmm_epi_tag(other.value_);\r
+// }\r
+//};\r
+//\r
+//template<>\r
+//struct xmm_cast_impl<xmm_ps>\r
+//{\r
+// xmm_ps operator()(const s32_x& other)\r
+// {\r
+// return _mm_cvtepi32_ps(other.value_);\r
+// }\r
+//};\r
+//\r
+//template<typename T, typename U> \r
+//T xmm_cast(const U& other)\r
+//{\r
+// return xmm_cast_impl<T>()(other);\r
+//}\r
+\r
+}}}}
\ No newline at end of file