#include "image_mixer.h"\r
\r
#include "../util/write_frame.h"\r
-#include "../util/blend.h"\r
+#include "../util/simd.h"\r
\r
#include <common/assert.h>\r
#include <common/gl/gl_check.h>\r
#include <boost/range/algorithm_ext/erase.hpp>\r
#include <boost/thread/future.hpp>\r
\r
+#include <intrin.h>\r
+#include <stdint.h>\r
#include <algorithm>\r
#include <vector>\r
\r
{\r
return !(lhs == rhs);\r
}\r
+ \r
+// Premultiplied-alpha blend of 16 bytes (4 BGRA pixels).\r
+// Computes C = d + s*(1 - d.a) per channel, i.e. the "dest" argument is\r
+// composited OVER the "source" argument.\r
+// NOTE(review): the parameter names are swapped relative to their roles --\r
+// kernel() passes the new item's pixels as "dest" and the accumulation\r
+// buffer as "source"; confirm before renaming.\r
+// Based on http://www.alvyray.com/Memos/CG/Microsoft/4_comp.pdf.\r
+inline xmm_epi8 blend(xmm_epi8 dest, xmm_epi8 source)\r
+{ \r
+	auto s = xmm_cast<xmm_epi16>(source);\r
+	auto d = dest;\r
+\r
+	const xmm_epi16 round  = 128;\r
+	const xmm_epi16 lomask = 0x00FF;\r
+\r
+	// T(S, D) = S * D[A] + 0x80\r
+	// Replicate each pixel's alpha byte into all four of its channel slots.\r
+	auto aaaa   = xmm_epi8::shuffle(d, xmm_epi8(15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3));\r
+	d			= xmm_epi8::umin(d, aaaa); // overflow guard: clamp channels to alpha so data stays premultiplied\r
+\r
+	// Widen alpha into the low byte of each 16-bit lane.\r
+	auto xaxa	= xmm_cast<xmm_epi16>(aaaa) & lomask;		\r
+			\r
+	// Low bytes of each 16-bit lane (two of the four channels).\r
+	auto xrxb	= s & lomask;\r
+	auto t1		= xmm_epi16::multiply_low(xrxb, xaxa) + round;	 \r
+	\r
+	// High bytes of each 16-bit lane (the other two channels).\r
+	auto xaxg	= s >> 8;\r
+	auto t2		= xmm_epi16::multiply_low(xaxg, xaxa) + round;\r
+	\r
+	// C(S, D) = S + D - (((T >> 8) + T) >> 8);\r
+	auto rxbx	= xmm_cast<xmm_epi8>(((t1 >> 8) + t1) >> 8);	\r
+	auto axgx	= xmm_cast<xmm_epi8>((t2 >> 8) + t2);    // high-byte lanes: final >> 8 is implicit in byte selection\r
+	auto argb	= xmm_epi8::blend(rxbx, axgx, xmm_epi8(-1, 0, -1, 0));\r
+\r
+	return xmm_cast<xmm_epi8>(s) + (d - argb);\r
+}\r
+ \r
+// Blends one run of |count| bytes (BGRA) of |source| over |dest|, 32 bytes\r
+// (8 pixels) per iteration. |write_op| (stream_write/store_write) selects how\r
+// results are written back. |transform| is accepted for future use (opacity,\r
+// transitions -- see TODOs in draw()) and is currently unused.\r
+template<typename write_op>\r
+static void kernel(uint8_t* dest, const uint8_t* source, size_t count, const core::frame_transform& transform)\r
+{	\r
+	CASPAR_VERIFY(count % 32 == 0); // loop reads/writes in whole 32-byte steps\r
+\r
+	for(size_t n = 0; n < count; n += 32)    \r
+	{\r
+		auto d0 = xmm_epi8::load(dest+n+0);\r
+		auto d1 = xmm_epi8::load(dest+n+16);\r
+\r
+		auto s0 = xmm_epi8::load(source+n+0);\r
+		auto s1 = xmm_epi8::load(source+n+16);\r
+		\r
+		// blend()'s first argument is the upper layer (the new source pixels).\r
+		auto argb0 = blend(s0, d0);\r
+		auto argb1 = blend(s1, d1);\r
+\r
+		xmm_epi8::write<write_op>(argb0, dest+n+0);\r
+		xmm_epi8::write<write_op>(argb1, dest+n+16);\r
+	} \r
+}\r
\r
class image_renderer\r
{\r
\r
auto result = spl::make_shared<host_buffer>(format_desc.size, 0);\r
if(format_desc.field_mode != core::field_mode::progressive)\r
- {\r
- auto upper = items;\r
- auto lower = items;\r
-\r
- BOOST_FOREACH(auto& item, upper)\r
- item.transform.field_mode &= core::field_mode::upper;\r
- \r
- BOOST_FOREACH(auto& item, lower)\r
- item.transform.field_mode &= core::field_mode::lower;\r
- \r
- draw(upper, result->data(), format_desc.width, format_desc.height);\r
- draw(lower, result->data(), format_desc.width, format_desc.height);\r
+ { \r
+ draw(items, result->data(), format_desc.width, format_desc.height, core::field_mode::upper);\r
+ draw(items, result->data(), format_desc.width, format_desc.height, core::field_mode::lower);\r
}\r
else\r
{\r
- draw(items, result->data(), format_desc.width, format_desc.height);\r
+ draw(items, result->data(), format_desc.width, format_desc.height, core::field_mode::progressive);\r
}\r
\r
return async(launch_policy::deferred, [=]\r
}); \r
}\r
\r
-	void draw(std::vector<item>& items, uint8_t* dest, int width, int height)\r
-	{\r
+	// Composites every item onto |dest| (a width*height BGRA frame), restricted\r
+	// to |field_mode| (upper/lower for one interlaced field, progressive for\r
+	// whole frames). |items| is taken by value because the transforms are\r
+	// masked locally before drawing.\r
+	void draw(std::vector<item> items, uint8_t* dest, int width, int height, core::field_mode field_mode)\r
+	{		\r
		BOOST_FOREACH(auto& item, items)\r
-		{\r
-			auto field_mode = item.transform.field_mode; \r
+			item.transform.field_mode &= field_mode;\r
\r
-			if(field_mode == core::field_mode::empty)\r
-				continue;\r
+		boost::remove_erase_if(items, [](item& item){return item.transform.field_mode == core::field_mode::empty;});\r
\r
-			auto start = field_mode == core::field_mode::lower ? 1 : 0;\r
-			auto step  = field_mode == core::field_mode::progressive ? 1 : 2;\r
+		if(items.empty())\r
+			return;\r
\r
-			auto source = item.buffers.at(0)->data();\r
+		// Target working-set size (bytes) per parallel task.\r
+		static const int CACHE_SIZE = 8192;\r
\r
-			// TODO: Blend using divide and conquer instead of accumulation.\r
-			// TODO: Add support for fill translations.\r
-			// TODO: Add support for mask translations.\r
-			// TODO: Add support for opacity.\r
-			// TODO: Add support for mix transition.\r
-			// TODO: Add support for push transition.\r
-			// TODO: Add support for wipe transition.\r
-			// TODO: Add support for slide transition.\r
-			tbb::parallel_for(start, height, step, [&](int y)\r
+		auto start = field_mode == core::field_mode::lower ? 1 : 0;\r
+		auto step  = field_mode == core::field_mode::progressive ? 1 : 2;\r
+		\r
+		// TODO: Add support for fill translations.\r
+		// TODO: Add support for mask rect.\r
+		// TODO: Add support for opacity.\r
+		// TODO: Add support for mix transition.\r
+		// TODO: Add support for push transition.\r
+		// TODO: Add support for wipe transition.\r
+		// TODO: Add support for slide transition.\r
+		// Grain-size keeps each task's rows within ~CACHE_SIZE bytes, clamped to\r
+		// at least 1: tbb::blocked_range requires a positive grainsize, and\r
+		// width*4 > CACHE_SIZE (e.g. 4K frames) would otherwise yield 0.\r
+		tbb::parallel_for(tbb::blocked_range<int>(0, height/step, std::max(1, CACHE_SIZE/(width*4))), [&](const tbb::blocked_range<int>& r)\r
+		{\r
+			for(auto n = r.begin(); n != r.end(); ++n)\r
			{\r
-				cpu::blend(dest + y*width*4, source + y*width*4, width*4);\r
-			});\r
-		}\r
+				auto y = n*step+start;\r
+\r
+				// All but the last item use cached stores since the row is\r
+				// re-read by the next blend pass; the final pass streams.\r
+				auto it = items.begin();\r
+				for(; it != items.end()-1; ++it)	\r
+					kernel<store_write>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
+\r
+				kernel<stream_write>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
+			}\r
+		});\r
	}\r
\r
void convert(std::vector<item>& items, int width, int height)\r
\r
sws_scale(sws_context.get(), input_av_frame->data, input_av_frame->linesize, 0, input_av_frame->height, av_frame->data, av_frame->linesize); \r
\r
- clamp_alpha_overflow(av_frame->data[0], av_frame->data[0], width*height*4);\r
-\r
item.buffers.clear();\r
item.buffers.push_back(dest);\r
item.pix_desc = core::pixel_format_desc(core::pixel_format::bgra);\r
pool.push(sws_context);\r
});\r
}\r
-\r
- void clamp_alpha_overflow(uint8_t* dest, const uint8_t* source, size_t count)\r
- { \r
- CASPAR_VERIFY(count % 64 == 0);\r
-\r
- auto alpha_shuffle = xmm_epi8(15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3);\r
-\r
- for(auto n = 0; n < count; n += 64) \r
- {\r
- auto x0 = xmm_epi8::load(source+n+0);\r
- auto x1 = xmm_epi8::load(source+n+16);\r
- auto x2 = xmm_epi8::load(source+n+32);\r
- auto x3 = xmm_epi8::load(source+n+48);\r
-\r
- auto aaaa0 = xmm_epi8::shuffle(x0, alpha_shuffle);\r
- auto aaaa1 = xmm_epi8::shuffle(x1, alpha_shuffle);\r
- auto aaaa2 = xmm_epi8::shuffle(x2, alpha_shuffle);\r
- auto aaaa3 = xmm_epi8::shuffle(x3, alpha_shuffle);\r
-\r
- x0 = xmm_epi8::umin(x0, aaaa0);\r
- x1 = xmm_epi8::umin(x1, aaaa1);\r
- x2 = xmm_epi8::umin(x2, aaaa2);\r
- x3 = xmm_epi8::umin(x3, aaaa3);\r
- \r
- xmm_epi8::stream(x0, dest+n+0);\r
- xmm_epi8::stream(x1, dest+n+16);\r
- xmm_epi8::stream(x2, dest+n+32);\r
- xmm_epi8::stream(x3, dest+n+48);\r
- } \r
- }\r
};\r
\r
struct image_mixer::impl : boost::noncopyable\r
+++ /dev/null
-//\r
-// Copyright (c) 2012 Robert Nagy\r
-//\r
-// Premultiplied Alpha Blending\r
-//\r
-\r
-#pragma once\r
-\r
-#include "simd.h"\r
-\r
-#include <intrin.h>\r
-#include <stdint.h>\r
-\r
-namespace caspar { namespace accelerator { namespace cpu {\r
-\r
-/*\r
- Function: blend\r
-\r
- Description:\r
- \r
- Premultiplied Alpha Blending.\r
- Based on http://www.alvyray.com/Memos/CG/Microsoft/4_comp.pdf.\r
- \r
- Parameters:\r
-\r
- dest - (rgb)a destination image.\r
- source1 - (rgb)a lower source image.\r
- source2 - (rgb)a upper source image.\r
- count - Size in bytes of source1 and source2. \r
-*/\r
-static void blend(uint8_t* dest, const uint8_t* source1, const uint8_t* source2, size_t count)\r
-{\r
- CASPAR_VERIFY(count % 16 == 0);\r
-\r
- const xmm_epi16 round = 128;\r
- const xmm_epi16 lomask = 0x00FF;\r
- \r
- for(auto n = 0; n < count; n += 16) \r
- {\r
- auto s = xmm_epi16::load(source1+n);\r
- auto d = xmm_epi8::load(source2+n);\r
-\r
- // T(S, D) = S * D[A] + 0x80\r
- auto aaaa = xmm_epi8::shuffle(d, xmm_epi8(-1, 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3));\r
- auto xaxa = xmm_cast<xmm_epi16>(aaaa);\r
- \r
- auto xrxb = s & lomask;\r
- auto t1 = xmm_epi16::multiply_low(xrxb, xaxa) + round; \r
- \r
- auto xaxg = s >> 8;\r
- auto t2 = xmm_epi16::multiply_low(xaxg, xaxa) + round;\r
- \r
- // C(S, D) = S + D - (((T >> 8) + T) >> 8);\r
- auto rxbx = (t1 >> 8) + t1; \r
- auto axgx = (t2 >> 8) + t2; \r
- auto argb = xmm_cast<xmm_epi8>((rxbx >> 8) | xmm_epi16::and_not(axgx, lomask));\r
- \r
- xmm_epi8::stream(xmm_cast<xmm_epi8>(s) + (d - argb), dest+n);\r
- } \r
-}\r
-\r
-static void blend(uint8_t* dest, const uint8_t* source, size_t count)\r
-{\r
- return blend(dest, dest, source, count);\r
-}\r
-\r
-}}}
\ No newline at end of file
\r
typedef std::vector<float, tbb::cache_aligned_allocator<float>> vector_ps;\r
\r
+// Write-policy tags dispatched by xmm_epi8::write<T>():\r
+// stream_write selects _mm_stream_si128 (non-temporal store, bypasses cache);\r
+// store_write selects _mm_store_si128 (ordinary cached store).\r
+struct stream_write\r
+{\r
+	static const int value = 0x01;\r
+};\r
+// Ordinary cached store policy; see stream_write above.\r
+struct store_write\r
+{\r
+	static const int value = 0x02;\r
+};\r
+\r
class xmm_ps\r
{\r
__m128 value_;\r
__m128i value_;\r
template<typename> friend struct xmm_cast_impl;\r
friend xmm_epi32 horizontal_add(const xmm_epi16&);\r
+ friend class xmm_epi8;\r
public:\r
typedef xmm_epi16 xmm_epi_tag;\r
\r
value_ = _mm_unpackhi_epi16 (value_, other.value_);\r
return *this;\r
}\r
-\r
- xmm_epi16 pack(const xmm_epi16& other)\r
- { \r
- value_ = _mm_packs_epi16(value_, other.value_);\r
- return *this;\r
- }\r
- \r
+ \r
xmm_epi16 max(const xmm_epi16& other)\r
{ \r
value_ = _mm_max_epi16(value_, other.value_);\r
{\r
return xmm_epi16(lhs).unpack_high(rhs);\r
}\r
-\r
- static xmm_epi16 pack(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
- {\r
- return xmm_epi16(lhs).pack(rhs);\r
- }\r
-\r
+ \r
static xmm_epi16 and_not(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
{\r
return xmm_epi16(lhs).and_not(rhs);\r
value_ = _mm_blendv_epi8(value_, other.value_, mask.value_);\r
return *this;\r
}\r
-\r
+ \r
const xmm_epi8& stream(void* dest) const\r
{\r
_mm_stream_si128(reinterpret_cast<__m128i*>(dest), value_);\r
return *this;\r
}\r
+\r
+	// Ordinary (cached) 16-byte store of this register to |dest|.\r
+	// NOTE(review): _mm_store_si128 requires |dest| to be 16-byte aligned.\r
+	const xmm_epi8& store(void* dest) const\r
+	{\r
+		_mm_store_si128(reinterpret_cast<__m128i*>(dest), value_);\r
+		return *this;\r
+	}\r
+\r
+	// Writes via stream() or store() depending on the write_op tag type.\r
+	// The condition is a compile-time constant, so the branch folds away.\r
+	template<typename write_op>\r
+	const xmm_epi8& write(void* dest) const\r
+	{\r
+		if(write_op::value == stream_write::value)\r
+			return stream(dest);\r
+		else\r
+			return store(dest);\r
+	}\r
\r
char operator[](int index) const\r
{\r
{\r
return value_.m128i_i8[index];\r
}\r
-\r
+ \r
+	// Interleaves the low 8 bytes of |rhs| and |lhs| into 16-bit lanes.\r
+	// NOTE(review): arguments are passed to the intrinsic rhs-first, so |lhs|\r
+	// supplies the high byte of each lane -- confirm callers expect this order.\r
+	static xmm_epi16 unpack_low(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
+	{\r
+		return _mm_unpacklo_epi8(rhs.value_, lhs.value_);\r
+	}\r
+	\r
+	// Interleaves the high 8 bytes of |rhs| and |lhs| into 16-bit lanes;\r
+	// same rhs-first ordering as unpack_low.\r
+	static xmm_epi16 unpack_high(const xmm_epi8& lhs, const xmm_epi8& rhs)\r
+	{\r
+		return _mm_unpackhi_epi8(rhs.value_, lhs.value_);\r
+	}
+	\r
+	// Packs two 16-bit-lane registers into one byte register with unsigned\r
+	// saturation (_mm_packus_epi16).\r
+	// NOTE(review): "upack" looks like a typo for "pack"; renaming would break\r
+	// any callers outside this view, so it is left as-is.\r
+	static xmm_epi8 upack(const xmm_epi16& lhs, const xmm_epi16& rhs)\r
+	{\r
+		return _mm_packus_epi16(lhs.value_, rhs.value_);\r
+	}
+
	// Static convenience wrapper over the member stream() (non-temporal store).\r
	static const xmm_epi8& stream(const xmm_epi8& source, void* dest)\r
	{\r
-		source.stream(dest);\r
-		return source;\r
+		return source.stream(dest);\r
+	}\r
+	\r
+	// Static convenience wrapper over the member store() (cached store).\r
+	static const xmm_epi8& store(const xmm_epi8& source, void* dest)\r
+	{\r
+		return source.store(dest);\r
+	}\r
+	\r
+	// Static convenience wrapper over the member write<write_op>(); dispatches\r
+	// to stream() or store() via the stream_write/store_write tag.\r
+	template<typename write_op>\r
+	static const xmm_epi8& write(const xmm_epi8& source, void* dest)\r
+	{\r
+		return source.write<write_op>(dest);\r
	}\r
\r
static xmm_epi8 load(const void* source)\r