]> git.sesse.net Git - casparcg/blobdiff - accelerator/cpu/image/image_mixer.cpp
2.1.0: -data_frame: use unique_ptr instead of shared_ptr.
[casparcg] / accelerator / cpu / image / image_mixer.cpp
index 8a90ef1a6b4795ee3c4420934e3803d965befbe6..e72d041646e594d22a6e1252910843d98dafe2d9 100644 (file)
 \r
 #include "image_mixer.h"\r
 \r
-#include "../util/write_frame.h"\r
-#include "../util/simd.h"\r
+#include "../util/data_frame.h"\r
+#include "../util/xmm.h"\r
 \r
 #include <common/assert.h>\r
 #include <common/gl/gl_check.h>\r
 #include <common/concurrency/async.h>\r
 #include <common/memory/memcpy.h>\r
 \r
-#include <core/frame/write_frame.h>\r
+#include <core/frame/data_frame.h>\r
 #include <core/frame/frame_transform.h>\r
 #include <core/frame/pixel_format.h>\r
 #include <core/video_format.h>\r
@@ -51,9 +51,8 @@
 #include <boost/range/algorithm_ext/erase.hpp>\r
 #include <boost/thread/future.hpp>\r
 \r
-#include <intrin.h>\r
-#include <stdint.h>\r
 #include <algorithm>\r
+#include <stdint.h>\r
 #include <vector>\r
 \r
 #if defined(_MSC_VER)\r
@@ -76,7 +75,7 @@ struct item
 {\r
        core::pixel_format_desc                                         pix_desc;\r
        std::vector<spl::shared_ptr<host_buffer>>       buffers;\r
-       core::frame_transform                                           transform;\r
+       core::image_transform                                           transform;\r
 \r
        item()\r
                : pix_desc(core::pixel_format::invalid)\r
@@ -94,53 +93,59 @@ bool operator!=(const item& lhs, const item& rhs)
        return !(lhs == rhs);\r
 }\r
        \r
-inline xmm_epi8 blend(xmm_epi8 dest, xmm_epi8 source)\r
+inline xmm::s8_x blend(xmm::s8_x d, xmm::s8_x s)\r
 {      \r
-       auto s = xmm_cast<xmm_epi16>(source);\r
-       auto d = dest;\r
-\r
-       const xmm_epi16 round   = 128;\r
-       const xmm_epi16 lomask  = 0x00FF;\r
-\r
+       using namespace xmm;\r
+               \r
        // T(S, D) = S * D[A] + 0x80\r
-       auto aaaa   = xmm_epi8::shuffle(d, xmm_epi8(15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3));\r
-       d                       = xmm_epi8::umin(d, aaaa); // overflow guard\r
+       auto aaaa   = s8_x::shuffle(d, s8_x(15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3));\r
+       d                       = s8_x(u8_x::min(u8_x(d), u8_x(aaaa))); // overflow guard\r
 \r
-       auto xaxa       = xmm_cast<xmm_epi16>(aaaa) & lomask;           \r
+       auto xaxa       = s16_x(aaaa) >> 8;             \r
                              \r
-       auto xrxb       = s & lomask;\r
-       auto t1         = xmm_epi16::multiply_low(xrxb, xaxa) + round;    \r
-                       \r
-       auto xaxg       = s >> 8;\r
-       auto t2         = xmm_epi16::multiply_low(xaxg, xaxa) + round;\r
+       auto t1         = s16_x::multiply_low(s16_x(s) & 0x00FF, xaxa) + 0x80;    \r
+       auto t2         = s16_x::multiply_low(s16_x(s) >> 8    , xaxa) + 0x80;\r
                \r
        // C(S, D) = S + D - (((T >> 8) + T) >> 8);\r
-       auto rxbx       = xmm_cast<xmm_epi8>(((t1 >> 8) + t1) >> 8);      \r
-       auto axgx       = xmm_cast<xmm_epi8>((t2 >> 8) + t2);    \r
-       auto argb   = xmm_epi8::blend(rxbx, axgx, xmm_epi8(-1, 0, -1, 0));\r
+       auto xyxy       = s8_x(((t1 >> 8) + t1) >> 8);      \r
+       auto yxyx       = s8_x((t2 >> 8) + t2);    \r
+       auto argb   = s8_x::blend(xyxy, yxyx, s8_x(-1, 0, -1, 0));\r
 \r
-       return xmm_cast<xmm_epi8>(s) + (d - argb);\r
+       return s8_x(s) + (d - argb);\r
 }\r
        \r
-template<typename write_op>\r
-static void kernel(uint8_t* dest, const uint8_t* source, size_t count, const core::frame_transform& transform)\r
-{                              \r
+template<typename temporal, typename alignment>\r
+static void kernel(uint8_t* dest, const uint8_t* source, size_t count)\r
+{                      \r
+       using namespace xmm;\r
+\r
        for(auto n = 0; n < count; n += 32)    \r
        {\r
-               auto s0 = xmm_epi8::load(dest+n+0);\r
-               auto s1 = xmm_epi8::load(dest+n+16);\r
+               auto s0 = s8_x::load<temporal_tag, alignment>(dest+n+0);\r
+               auto s1 = s8_x::load<temporal_tag, alignment>(dest+n+16);\r
 \r
-               auto d0 = xmm_epi8::load(source+n+0);\r
-               auto d1 = xmm_epi8::load(source+n+16);\r
+               auto d0 = s8_x::load<temporal_tag, alignment>(source+n+0);\r
+               auto d1 = s8_x::load<temporal_tag, alignment>(source+n+16);\r
                \r
                auto argb0 = blend(d0, s0);\r
                auto argb1 = blend(d1, s1);\r
 \r
-               xmm_epi8::write<write_op>(argb0, dest+n+0);\r
-               xmm_epi8::write<write_op>(argb1, dest+n+16);\r
+               s8_x::store<temporal, alignment>(argb0, dest+n+0 );\r
+               s8_x::store<temporal, alignment>(argb1, dest+n+16);\r
        } \r
 }\r
 \r
+template<typename temporal>\r
+static void kernel(uint8_t* dest, const uint8_t* source, size_t count)\r
+{                      \r
+       using namespace xmm;\r
+\r
+       if(reinterpret_cast<int>(dest) % 16 != 0 || reinterpret_cast<int>(source) % 16 != 0)\r
+               kernel<temporal_tag, unaligned_tag>(dest, source, count);\r
+       else\r
+               kernel<temporal_tag, aligned_tag>(dest, source, count);\r
+}\r
+\r
 class image_renderer\r
 {\r
        std::pair<std::vector<item>, boost::shared_future<boost::iterator_range<const uint8_t*>>>               last_image_;\r
@@ -182,14 +187,22 @@ private:
        {               \r
                BOOST_FOREACH(auto& item, items)\r
                        item.transform.field_mode &= field_mode;\r
-\r
-               boost::remove_erase_if(items, [](item& item){return item.transform.field_mode == core::field_mode::empty;});\r
+               \r
+               // Remove empty items.\r
+               boost::range::remove_erase_if(items, [&](const item& item)\r
+               {\r
+                       return item.transform.field_mode == core::field_mode::empty;\r
+               });\r
+               \r
+               // Remove first field stills.\r
+               boost::range::remove_erase_if(items, [&](const item& item)\r
+               {\r
+                       return item.transform.is_still && item.transform.field_mode == field_mode; // only us last field for stills.\r
+               });\r
 \r
                if(items.empty())\r
                        return;\r
-\r
-               static const int CACHE_SIZE = 8192;\r
-\r
+               \r
                auto start = field_mode == core::field_mode::lower ? 1 : 0;\r
                auto step  = field_mode == core::field_mode::progressive ? 1 : 2;\r
                \r
@@ -200,32 +213,42 @@ private:
                // TODO: Add support for push transition.\r
                // TODO: Add support for wipe transition.\r
                // TODO: Add support for slide transition.\r
-               tbb::parallel_for(tbb::blocked_range<int>(0, height/step, CACHE_SIZE/(width*4)), [&](const tbb::blocked_range<int>& r)\r
+               tbb::parallel_for(tbb::blocked_range<int>(0, height/step), [&](const tbb::blocked_range<int>& r)\r
                {\r
-                       for(auto n = r.begin(); n != r.end(); ++n)\r
+                       for(auto i = r.begin(); i != r.end(); ++i)\r
                        {\r
-                               auto y = n*step+start;\r
-\r
-                               auto it = items.begin();\r
-                               for(; it != items.end()-1; ++it)                        \r
-                                       kernel<store_write>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
+                               auto y = i*step+start;\r
 \r
-                               kernel<stream_write>(dest + y*width*4, it->buffers.at(0)->data() + y*width*4, width*4, it->transform);\r
+                               for(std::size_t n = 0; n < items.size()-1; ++n)\r
+                                       kernel<xmm::temporal_tag>(dest + y*width*4, items[n].buffers.at(0)->data() + y*width*4, width*4);\r
+                               \r
+                               std::size_t n = items.size()-1;                         \r
+                               kernel<xmm::nontemporal_tag>(dest + y*width*4, items[n].buffers.at(0)->data() + y*width*4, width*4);\r
                        }\r
+\r
+                       _mm_mfence();\r
                });\r
        }\r
-       \r
-       void convert(std::vector<item>& items, int width, int height)\r
+               \r
+       void convert(std::vector<item>& source_items, int width, int height)\r
        {\r
-               // TODO: Don't convert buffers multiple times just because they are in different items due to e.g. interlacing.\r
-               tbb::parallel_for_each(items.begin(), items.end(), [&](item& item)\r
-               {\r
-                       if(item.pix_desc.format == core::pixel_format::bgra && \r
-                          item.pix_desc.planes.at(0).width == width &&\r
-                          item.pix_desc.planes.at(0).height == height)\r
+               std::set<std::vector<spl::shared_ptr<host_buffer>>> buffers;\r
+\r
+               BOOST_FOREACH(auto& item, source_items)\r
+                       buffers.insert(item.buffers);\r
+               \r
+               auto dest_items = source_items;\r
+\r
+               tbb::parallel_for_each(buffers.begin(), buffers.end(), [&](const std::vector<spl::shared_ptr<host_buffer>>& buffers)\r
+               {                       \r
+                       auto pix_desc = std::find_if(source_items.begin(), source_items.end(), [&](const item& item){return item.buffers == buffers;})->pix_desc;\r
+\r
+                       if(pix_desc.format == core::pixel_format::bgra && \r
+                               pix_desc.planes.at(0).width == width &&\r
+                               pix_desc.planes.at(0).height == height)\r
                                return;\r
 \r
-                       auto input_av_frame = ffmpeg::make_av_frame(item.buffers, item.pix_desc);\r
+                       auto input_av_frame = ffmpeg::make_av_frame(buffers, pix_desc);\r
                                                                \r
                        int key = ((input_av_frame->width << 22) & 0xFFC00000) | ((input_av_frame->height << 6) & 0x003FC000) | ((input_av_frame->format << 7) & 0x00007F00);\r
                                                \r
@@ -241,66 +264,73 @@ private:
                        if(!sws_context)                                \r
                                BOOST_THROW_EXCEPTION(operation_failed() << msg_info("Could not create software scaling context.") << boost::errinfo_api_function("sws_getContext"));                           \r
                \r
-                       auto dest = spl::make_shared<host_buffer>(width*height*4);\r
+                       auto dest_frame = spl::make_shared<host_buffer>(width*height*4);\r
 \r
-                       spl::shared_ptr<AVFrame> av_frame(avcodec_alloc_frame(), av_free);      \r
-                       avcodec_get_frame_defaults(av_frame.get());                     \r
-                       avpicture_fill(reinterpret_cast<AVPicture*>(av_frame.get()), dest->data(), PIX_FMT_BGRA, width, height);\r
+                       {\r
+                               spl::shared_ptr<AVFrame> dest_av_frame(avcodec_alloc_frame(), av_free); \r
+                               avcodec_get_frame_defaults(dest_av_frame.get());                        \r
+                               avpicture_fill(reinterpret_cast<AVPicture*>(dest_av_frame.get()), dest_frame->data(), PIX_FMT_BGRA, width, height);\r
                                \r
-                       sws_scale(sws_context.get(), input_av_frame->data, input_av_frame->linesize, 0, input_av_frame->height, av_frame->data, av_frame->linesize);    \r
-                       \r
-                       item.buffers.clear();\r
-                       item.buffers.push_back(dest);\r
-                       item.pix_desc = core::pixel_format_desc(core::pixel_format::bgra);\r
-                       item.pix_desc.planes.clear();\r
-                       item.pix_desc.planes.push_back(core::pixel_format_desc::plane(width, height, 4));\r
+                               sws_scale(sws_context.get(), input_av_frame->data, input_av_frame->linesize, 0, input_av_frame->height, dest_av_frame->data, dest_av_frame->linesize);                          \r
+                               pool.push(sws_context);\r
+                       }\r
+               \r
+                       for(std::size_t n = 0; n < source_items.size(); ++n)\r
+                       {\r
+                               if(source_items[n].buffers == buffers)\r
+                               {\r
+                                       dest_items[n].buffers                   = boost::assign::list_of(dest_frame);\r
+                                       dest_items[n].pix_desc                  = core::pixel_format_desc(core::pixel_format::bgra);\r
+                                       dest_items[n].pix_desc.planes   = boost::assign::list_of(core::pixel_format_desc::plane(width, height, 4));\r
+                                       dest_items[n].transform                 = source_items[n].transform;\r
+                               }\r
+                       }\r
+               });     \r
 \r
-                       pool.push(sws_context);\r
-               });\r
+               source_items = std::move(dest_items);\r
        }\r
 };\r
                \r
 struct image_mixer::impl : boost::noncopyable\r
 {      \r
        image_renderer                                          renderer_;\r
-       std::vector<core::frame_transform>      transform_stack_;\r
+       std::vector<core::image_transform>      transform_stack_;\r
        std::vector<item>                                       items_; // layer/stream/items\r
 public:\r
        impl() \r
                : transform_stack_(1)   \r
        {\r
-               CASPAR_LOG(info) << L"Initialized CPU Accelerated Image Mixer";\r
+               CASPAR_LOG(info) << L"Initialized Streaming SIMD Extensions Accelerated CPU Image Mixer";\r
        }\r
 \r
        void begin_layer(core::blend_mode blend_mode)\r
        {\r
        }\r
                \r
-       void push(core::frame_transform& transform)\r
+       void push(const core::frame_transform& transform)\r
        {\r
-               transform_stack_.push_back(transform_stack_.back()*transform);\r
+               transform_stack_.push_back(transform_stack_.back()*transform.image_transform);\r
        }\r
                \r
-       void visit(core::data_frame& frame2)\r
+       void visit(const core::data_frame& frame2)\r
        {                       \r
-               write_frame* frame = dynamic_cast<write_frame*>(&frame2);\r
+               auto frame = dynamic_cast<const cpu::data_frame*>(&frame2);\r
                if(frame == nullptr)\r
                        return;\r
 \r
-               if(frame->get_pixel_format_desc().format == core::pixel_format::invalid)\r
+               if(frame->pixel_format_desc().format == core::pixel_format::invalid)\r
                        return;\r
 \r
-               if(frame->get_buffers().empty())\r
+               if(frame->buffers().empty())\r
                        return;\r
 \r
                if(transform_stack_.back().field_mode == core::field_mode::empty)\r
                        return;\r
 \r
                item item;\r
-               item.pix_desc                   = frame->get_pixel_format_desc();\r
-               item.buffers                    = frame->get_buffers();                         \r
+               item.pix_desc                   = frame->pixel_format_desc();\r
+               item.buffers                    = frame->buffers();                             \r
                item.transform                  = transform_stack_.back();\r
-               item.transform.volume   = core::frame_transform().volume; // Set volume to default since we don't care about it here.\r
 \r
                items_.push_back(item);\r
        }\r
@@ -319,19 +349,19 @@ public:
                return renderer_(std::move(items_), format_desc);\r
        }\r
        \r
-       virtual spl::shared_ptr<cpu::write_frame> create_frame(const void* tag, const core::pixel_format_desc& desc)\r
+       virtual spl::unique_ptr<core::data_frame> create_frame(const void* tag, const core::pixel_format_desc& desc, double frame_rate, core::field_mode field_mode)\r
        {\r
-               return spl::make_shared<cpu::write_frame>(tag, desc);\r
+               return spl::make_unique<cpu::data_frame>(tag, desc, frame_rate, field_mode);\r
        }\r
 };\r
 \r
 image_mixer::image_mixer() : impl_(new impl()){}\r
-void image_mixer::push(core::frame_transform& transform){impl_->push(transform);}\r
-void image_mixer::visit(core::data_frame& frame){impl_->visit(frame);}\r
+void image_mixer::push(const core::frame_transform& transform){impl_->push(transform);}\r
+void image_mixer::visit(const core::data_frame& frame){impl_->visit(frame);}\r
 void image_mixer::pop(){impl_->pop();}\r
 boost::shared_future<boost::iterator_range<const uint8_t*>> image_mixer::operator()(const core::video_format_desc& format_desc){return impl_->render(format_desc);}\r
 void image_mixer::begin_layer(core::blend_mode blend_mode){impl_->begin_layer(blend_mode);}\r
 void image_mixer::end_layer(){impl_->end_layer();}\r
-spl::shared_ptr<core::write_frame> image_mixer::create_frame(const void* tag, const core::pixel_format_desc& desc) {return impl_->create_frame(tag, desc);}\r
+spl::unique_ptr<core::data_frame> image_mixer::create_frame(const void* tag, const core::pixel_format_desc& desc, double frame_rate, core::field_mode field_mode) {return impl_->create_frame(tag, desc, frame_rate, field_mode);}\r
 \r
 }}}
\ No newline at end of file