git.sesse.net Git - casparcg/commitdiff
2.0.0.2: SSE optimized key_only.
author ronag <ronag@362d55ac-95cf-4e76-9f9a-cbaa9c17b72d>
Sun, 15 May 2011 21:08:44 +0000 (21:08 +0000)
committer ronag <ronag@362d55ac-95cf-4e76-9f9a-cbaa9c17b72d>
Sun, 15 May 2011 21:08:44 +0000 (21:08 +0000)
git-svn-id: https://casparcg.svn.sourceforge.net/svnroot/casparcg/server/branches/2.0.0.2@756 362d55ac-95cf-4e76-9f9a-cbaa9c17b72d

modules/decklink/consumer/decklink_consumer.cpp

index 01b222a76b2f4e3b0a9e3d64a348f3c12340c17b..a7dfa9001cc6bda6f0df25ac2d9cdf5c393190dd 100644 (file)
@@ -58,7 +58,6 @@ enum latency
 enum output_pixels\r
 {\r
        fill_and_key,\r
-       fill_only,\r
        key_only\r
 };\r
 \r
@@ -110,39 +109,31 @@ public:
     STDMETHOD(GetAncillaryData(IDeckLinkVideoFrameAncillary** ancillary))                {return S_FALSE;}\r
 };\r
 \r
-std::shared_ptr<IDeckLinkVideoFrame> make_alpha_only_frame(const CComQIPtr<IDeckLinkOutput>& decklink, const safe_ptr<const core::read_frame>& frame, const core::video_format_desc& format_desc)\r
-{\r
-       IDeckLinkMutableVideoFrame* result;\r
+void make_alpha(void* dest, const void* source, size_t count)\r
+{      \r
+       __m128i*           dest128 = reinterpret_cast<__m128i*>(dest);  \r
+       const __m128i* source128 = reinterpret_cast<const __m128i*>(source);\r
 \r
-       if(FAILED(decklink->CreateVideoFrame(format_desc.width, format_desc.height, format_desc.size/format_desc.height, bmdFormat8BitBGRA, bmdFrameFlagDefault, &result)))\r
-               BOOST_THROW_EXCEPTION(caspar_exception());\r
+       count /= 16; // 128 bit\r
 \r
-       void* bytes = nullptr;\r
-       if(FAILED(result->GetBytes(&bytes)))\r
-               BOOST_THROW_EXCEPTION(caspar_exception());\r
-               \r
-       unsigned char* data = reinterpret_cast<unsigned char*>(bytes);\r
+       __m128i xmm0, xmm1, xmm2, xmm3;\r
 \r
-       if(static_cast<size_t>(frame->image_data().size()) == format_desc.size)\r
+       const __m128i mask128 = _mm_set_epi8(3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15);\r
+       for(size_t n = 0; n < count/4; ++n)\r
        {\r
-               tbb::parallel_for(tbb::blocked_range<int>(0, frame->image_data().size()/4), [&](const tbb::blocked_range<int>& r)\r
-               {\r
-                       for(int n = r.begin(); n != r.end(); ++n)\r
-                       {\r
-                               data[n*4+0] = frame->image_data()[n*4+3];\r
-                               data[n*4+1] = frame->image_data()[n*4+3];\r
-                               data[n*4+2] = frame->image_data()[n*4+3];\r
-                               data[n*4+3] = 255;\r
-                       }\r
-               });\r
+               xmm0 = _mm_load_si128(source128++);     \r
+               xmm1 = _mm_load_si128(source128++);     \r
+               xmm2 = _mm_load_si128(source128++);     \r
+               xmm3 = _mm_load_si128(source128++);     \r
+\r
+               _mm_stream_si128(dest128++, _mm_shuffle_epi8(xmm0, mask128));\r
+               _mm_stream_si128(dest128++, _mm_shuffle_epi8(xmm1, mask128));\r
+               _mm_stream_si128(dest128++, _mm_shuffle_epi8(xmm2, mask128));\r
+               _mm_stream_si128(dest128++, _mm_shuffle_epi8(xmm3, mask128));\r
        }\r
-       else\r
-               memset(data, 0, format_desc.size);\r
-\r
-       return std::shared_ptr<IDeckLinkVideoFrame>(result, [](IDeckLinkMutableVideoFrame* p) {p->Release();});\r
 }\r
 \r
-std::shared_ptr<IDeckLinkVideoFrame> make_fill_only_frame(const CComQIPtr<IDeckLinkOutput>& decklink, const safe_ptr<const core::read_frame>& frame, const core::video_format_desc& format_desc)\r
+std::shared_ptr<IDeckLinkVideoFrame> make_alpha_only_frame(const CComQIPtr<IDeckLinkOutput>& decklink, const safe_ptr<const core::read_frame>& frame, const core::video_format_desc& format_desc)\r
 {\r
        IDeckLinkMutableVideoFrame* result;\r
 \r
@@ -157,16 +148,12 @@ std::shared_ptr<IDeckLinkVideoFrame> make_fill_only_frame(const CComQIPtr<IDeckL
 \r
        if(static_cast<size_t>(frame->image_data().size()) == format_desc.size)\r
        {\r
-               tbb::parallel_for(tbb::blocked_range<int>(0, frame->image_data().size()/4), [&](const tbb::blocked_range<int>& r)\r
-               {\r
-                       for(int n = r.begin(); n != r.end(); ++n)\r
-                       {\r
-                               data[n*4+0] = frame->image_data()[n*4+0];\r
-                               data[n*4+1] = frame->image_data()[n*4+1];\r
-                               data[n*4+2] = frame->image_data()[n*4+2];\r
-                               data[n*4+3] = 255;\r
-                       }\r
-               });\r
+               size_t count = frame->image_data().size();\r
+               tbb::affinity_partitioner ap;\r
+               tbb::parallel_for(tbb::blocked_range<size_t>(0, count/128), [&](const tbb::blocked_range<size_t>& r)\r
+               {       \r
+                       make_alpha(reinterpret_cast<char*>(data) + r.begin()*128, reinterpret_cast<const char*>(frame->image_data().begin()) + r.begin()*128, r.size()*128);   \r
+               }, ap);\r
        }\r
        else\r
                memset(data, 0, format_desc.size);\r
@@ -415,8 +402,6 @@ public:
                std::shared_ptr<IDeckLinkVideoFrame> deck_frame;\r
                if(config_.output == key_only)\r
                        deck_frame = make_alpha_only_frame(output_, frame, format_desc_);\r
-               else if(config_.output == fill_only)\r
-                       deck_frame = make_fill_only_frame(output_, frame, format_desc_);\r
                else \r
                        deck_frame = std::make_shared<decklink_frame_adapter>(frame, format_desc_);\r
 \r
@@ -516,9 +501,7 @@ safe_ptr<core::frame_consumer> create_decklink_consumer(const boost::property_tr
                config.latency = low_latency;\r
 \r
        auto output_str = ptree.get("output", "fill_and_key");\r
-       if(output_str == "fill_only")\r
-               config.output = fill_only;\r
-       else if(output_str == "key_only")\r
+       if(output_str == "key_only")\r
                config.output = key_only;\r
 \r
        config.device_index = ptree.get("device", 0);\r