X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=accelerator%2Fcpu%2Fimage%2Fimage_mixer.cpp;h=315d3f9b7b27ab116902e453c6c6e7109624512c;hb=e0e1a2b1fdcab66e8da72e3eaa2db5564f8f9d72;hp=eb08afc728df941e93b163a12866225964948456;hpb=9de14e806e98c49b336d0a11987721ae051690d2;p=casparcg diff --git a/accelerator/cpu/image/image_mixer.cpp b/accelerator/cpu/image/image_mixer.cpp index eb08afc72..315d3f9b7 100644 --- a/accelerator/cpu/image/image_mixer.cpp +++ b/accelerator/cpu/image/image_mixer.cpp @@ -1,358 +1,378 @@ -/* -* Copyright (c) 2011 Sveriges Television AB -* -* This file is part of CasparCG (www.casparcg.com). -* -* CasparCG is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. -* -* CasparCG is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. -* -* You should have received a copy of the GNU General Public License -* along with CasparCG. If not, see . -* -* Author: Robert Nagy, ronag89@gmail.com -*/ - -#include "../../stdafx.h" - -#include "image_mixer.h" - -#include "../util/write_frame.h" -#include "../util/xmm.h" - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning (push) -#pragma warning (disable : 4244) -#endif -extern "C" -{ - #include - #include - #include -} -#if defined(_MSC_VER) -#pragma warning (pop) -#endif - -namespace caspar { namespace accelerator { namespace cpu { - -struct item -{ - core::pixel_format_desc pix_desc; - std::vector> buffers; - core::frame_transform transform; - - item() - : pix_desc(core::pixel_format::invalid) - { - } -}; - -bool operator==(const item& lhs, const item& rhs) -{ - return lhs.buffers == rhs.buffers && lhs.transform == rhs.transform; -} - -bool operator!=(const item& lhs, const item& rhs) -{ - return !(lhs == rhs); -} - -inline xmm::s8_x blend(xmm::s8_x d, xmm::s8_x s) -{ - using namespace xmm; - - // T(S, D) = S * D[A] + 0x80 - auto aaaa = s8_x::shuffle(d, s8_x(15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3)); - d = s8_x(u8_x::min(u8_x(d), u8_x(aaaa))); // overflow guard - - auto xaxa = s16_x(aaaa) >> 8; - - auto t1 = s16_x::multiply_low(s16_x(s) & 0x00FF, xaxa) + 0x80; - auto t2 = s16_x::multiply_low(s16_x(s) >> 8 , xaxa) + 0x80; - - // C(S, D) = S + D - (((T >> 8) + T) >> 8); - auto xyxy = s8_x(((t1 >> 8) + t1) >> 8); - auto yxyx = s8_x((t2 >> 8) + t2); - auto argb = s8_x::blend(xyxy, yxyx, s8_x(-1, 0, -1, 0)); - - return s8_x(s) + (d - argb); -} - -template -static void kernel(uint8_t* dest, const uint8_t* source, size_t count) -{ - using namespace xmm; - - for(auto n = 0; n < count; n += 32) - { - auto s0 = s8_x::load(dest+n+0); - auto s1 = s8_x::load(dest+n+16); - - auto d0 = s8_x::load(source+n+0); - auto d1 = s8_x::load(source+n+16); - - auto argb0 = blend(d0, s0); - auto argb1 = blend(d1, s1); - - s8_x::store(argb0, dest+n+0 ); - s8_x::store(argb1, dest+n+16); - } -} - -template -static void kernel(uint8_t* dest, const uint8_t* source, size_t count) -{ - using namespace xmm; - - if(reinterpret_cast(dest) % 16 != 0 || reinterpret_cast(source) % 16 != 0) - kernel(dest, source, count); - else - kernel(dest, source, count); -} - -class image_renderer -{ - std::pair, boost::shared_future>> last_image_; - tbb::concurrent_unordered_map>> sws_contexts_; -public: - boost::shared_future> operator()(std::vector items, const core::video_format_desc& format_desc) - { - if(last_image_.first == items && last_image_.second.has_value()) - return last_image_.second; - - auto image = render(items, format_desc); - last_image_ = std::make_pair(std::move(items), image); - return image; - } - -private: - boost::shared_future> render(std::vector items, const core::video_format_desc& format_desc) - { - convert(items, format_desc.width, format_desc.height); - - auto result = spl::make_shared(format_desc.size, 0); - if(format_desc.field_mode != core::field_mode::progressive) - { - draw(items, result->data(), format_desc.width, format_desc.height, core::field_mode::upper); - draw(items, result->data(), format_desc.width, format_desc.height, core::field_mode::lower); - } - else - { - draw(items, result->data(), format_desc.width, format_desc.height, core::field_mode::progressive); - } - - return async(launch_policy::deferred, [=] - { - return boost::iterator_range(result->data(), result->data() + format_desc.size); - }); - } - - void draw(std::vector items, uint8_t* dest, int width, int height, core::field_mode field_mode) - { - BOOST_FOREACH(auto& item, items) - item.transform.field_mode &= field_mode; - - boost::remove_erase_if(items, [](item& item){return item.transform.field_mode == core::field_mode::empty;}); - - if(items.empty()) - return; - - auto start = field_mode == core::field_mode::lower ? 1 : 0; - auto step = field_mode == core::field_mode::progressive ? 1 : 2; - - // TODO: Add support for fill translations. - // TODO: Add support for mask rect. - // TODO: Add support for opacity. - // TODO: Add support for mix transition. - // TODO: Add support for push transition. - // TODO: Add support for wipe transition. - // TODO: Add support for slide transition. - tbb::parallel_for(tbb::blocked_range(0, height/step), [&](const tbb::blocked_range& r) - { - for(auto i = r.begin(); i != r.end(); ++i) - { - auto y = i*step+start; - - for(std::size_t n = 0; n < items.size()-1; ++n) - kernel(dest + y*width*4, items[n].buffers.at(0)->data() + y*width*4, width*4); - - std::size_t n = items.size()-1; - kernel(dest + y*width*4, items[n].buffers.at(0)->data() + y*width*4, width*4); - } - - _mm_mfence(); - }); - } - - void convert(std::vector& source_items, int width, int height) - { - std::set>> buffers; - - BOOST_FOREACH(auto& item, source_items) - buffers.insert(item.buffers); - - auto dest_items = source_items; - - tbb::parallel_for_each(buffers.begin(), buffers.end(), [&](const std::vector>& buffers) - { - auto pix_desc = std::find_if(source_items.begin(), source_items.end(), [&](const item& item){return item.buffers == buffers;})->pix_desc; - - if(pix_desc.format == core::pixel_format::bgra && - pix_desc.planes.at(0).width == width && - pix_desc.planes.at(0).height == height) - return; - - auto input_av_frame = ffmpeg::make_av_frame(buffers, pix_desc); - - int key = ((input_av_frame->width << 22) & 0xFFC00000) | ((input_av_frame->height << 6) & 0x003FC000) | ((input_av_frame->format << 7) & 0x00007F00); - - auto& pool = sws_contexts_[key]; - - std::shared_ptr sws_context; - if(!pool.try_pop(sws_context)) - { - double param; - sws_context.reset(sws_getContext(input_av_frame->width, input_av_frame->height, static_cast(input_av_frame->format), width, height, PIX_FMT_BGRA, SWS_BILINEAR, nullptr, nullptr, ¶m), sws_freeContext); - } - - if(!sws_context) - BOOST_THROW_EXCEPTION(operation_failed() << msg_info("Could not create software scaling context.") << boost::errinfo_api_function("sws_getContext")); - - auto dest_frame = spl::make_shared(width*height*4); - - { - spl::shared_ptr dest_av_frame(avcodec_alloc_frame(), av_free); - avcodec_get_frame_defaults(dest_av_frame.get()); - avpicture_fill(reinterpret_cast(dest_av_frame.get()), dest_frame->data(), PIX_FMT_BGRA, width, height); - - sws_scale(sws_context.get(), input_av_frame->data, input_av_frame->linesize, 0, input_av_frame->height, dest_av_frame->data, dest_av_frame->linesize); - pool.push(sws_context); - } - - for(std::size_t n = 0; n < source_items.size(); ++n) - { - if(source_items[n].buffers == buffers) - { - dest_items[n].buffers = boost::assign::list_of(dest_frame); - dest_items[n].pix_desc = core::pixel_format_desc(core::pixel_format::bgra); - dest_items[n].pix_desc.planes = boost::assign::list_of(core::pixel_format_desc::plane(width, height, 4)); - dest_items[n].transform = source_items[n].transform; - } - } - }); - - source_items = std::move(dest_items); - } -}; - -struct image_mixer::impl : boost::noncopyable -{ - image_renderer renderer_; - std::vector transform_stack_; - std::vector items_; // layer/stream/items -public: - impl() - : transform_stack_(1) - { - CASPAR_LOG(info) << L"Initialized Streaming SIMD Extensions Accelerated CPU Image Mixer"; - } - - void begin_layer(core::blend_mode blend_mode) - { - } - - void push(core::frame_transform& transform) - { - transform_stack_.push_back(transform_stack_.back()*transform); - } - - void visit(core::data_frame& frame2) - { - write_frame* frame = dynamic_cast(&frame2); - if(frame == nullptr) - return; - - if(frame->get_pixel_format_desc().format == core::pixel_format::invalid) - return; - - if(frame->get_buffers().empty()) - return; - - if(transform_stack_.back().field_mode == core::field_mode::empty) - return; - - item item; - item.pix_desc = frame->get_pixel_format_desc(); - item.buffers = frame->get_buffers(); - item.transform = transform_stack_.back(); - item.transform.volume = core::frame_transform().volume; // Set volume to default since we don't care about it here. - - items_.push_back(item); - } - - void pop() - { - transform_stack_.pop_back(); - } - - void end_layer() - { - } - - boost::shared_future> render(const core::video_format_desc& format_desc) - { - return renderer_(std::move(items_), format_desc); - } - - virtual spl::shared_ptr create_frame(const void* tag, const core::pixel_format_desc& desc) - { - return spl::make_shared(tag, desc); - } -}; - -image_mixer::image_mixer() : impl_(new impl()){} -void image_mixer::push(core::frame_transform& transform){impl_->push(transform);} -void image_mixer::visit(core::data_frame& frame){impl_->visit(frame);} -void image_mixer::pop(){impl_->pop();} -boost::shared_future> image_mixer::operator()(const core::video_format_desc& format_desc){return impl_->render(format_desc);} -void image_mixer::begin_layer(core::blend_mode blend_mode){impl_->begin_layer(blend_mode);} -void image_mixer::end_layer(){impl_->end_layer();} -spl::shared_ptr image_mixer::create_frame(const void* tag, const core::pixel_format_desc& desc) {return impl_->create_frame(tag, desc);} - -}}} \ No newline at end of file +/* +* Copyright (c) 2011 Sveriges Television AB +* +* This file is part of CasparCG (www.casparcg.com). +* +* CasparCG is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* CasparCG is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with CasparCG. If not, see . +* +* Author: Robert Nagy, ronag89@gmail.com +*/ + +#include "../../StdAfx.h" + +#include "image_mixer.h" + +#include "../util/xmm.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning (push) +#pragma warning (disable : 4244) +#endif +extern "C" +{ + #include + #include + #include +} +#if defined(_MSC_VER) +#pragma warning (pop) +#endif + +namespace caspar { namespace accelerator { namespace cpu { + +struct item +{ + core::pixel_format_desc pix_desc = core::pixel_format::invalid; + std::array data; + core::image_transform transform; + + item() + { + data.fill(0); + } +}; + +bool operator==(const item& lhs, const item& rhs) +{ + return lhs.data == rhs.data && lhs.transform == rhs.transform; +} + +bool operator!=(const item& lhs, const item& rhs) +{ + return !(lhs == rhs); +} + +// 100% accurate blending with correct rounding. +inline xmm::s8_x blend(xmm::s8_x d, xmm::s8_x s) +{ + using namespace xmm; + + // C(S, D) = S + D - (((T >> 8) + T) >> 8); + // T(S, D) = S * D[A] + 0x80 + + auto aaaa = s8_x::shuffle(d, s8_x(15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3)); + d = s8_x(u8_x::min(u8_x(d), u8_x(aaaa))); // Overflow guard. Some source files have color values which incorrectly exceed pre-multiplied alpha values, e.g. red(255) > alpha(254). + + auto xaxa = s16_x(aaaa) >> 8; + + auto t1 = s16_x::multiply_low(s16_x(s) & 0x00FF, xaxa) + 0x80; + auto t2 = s16_x::multiply_low(s16_x(s) >> 8 , xaxa) + 0x80; + + auto xyxy = s8_x(((t1 >> 8) + t1) >> 8); + auto yxyx = s8_x((t2 >> 8) + t2); + auto argb = s8_x::blend(xyxy, yxyx, s8_x(-1, 0, -1, 0)); + + return s8_x(s) + (d - argb); +} + +template +static void kernel(uint8_t* dest, const uint8_t* source, size_t count) +{ + using namespace xmm; + + for(auto n = 0; n < count; n += 32) + { + auto s0 = s8_x::load(dest+n+0); + auto s1 = s8_x::load(dest+n+16); + + auto d0 = s8_x::load(source+n+0); + auto d1 = s8_x::load(source+n+16); + + auto argb0 = blend(d0, s0); + auto argb1 = blend(d1, s1); + + s8_x::store(argb0, dest+n+0 ); + s8_x::store(argb1, dest+n+16); + } +} + +template +static void kernel(uint8_t* dest, const uint8_t* source, size_t count) +{ + using namespace xmm; + + if(reinterpret_cast(dest) % 16 != 0 || reinterpret_cast(source) % 16 != 0) + kernel(dest, source, count); + else + kernel(dest, source, count); +} + +class image_renderer +{ + tbb::concurrent_unordered_map>> sws_devices_; + tbb::concurrent_bounded_queue> temp_buffers_; + core::video_format_desc format_desc_; +public: + std::future> operator()(std::vector items, const core::video_format_desc& format_desc) + { + if (format_desc != format_desc_) + { + format_desc_ = format_desc; + sws_devices_.clear(); + } + + convert(items, format_desc.width, format_desc.height); + + // Remove first field stills. + boost::range::remove_erase_if(items, [&](const item& item) + { + return item.transform.is_still && item.transform.field_mode == format_desc.field_mode; // only us last field for stills. + }); + + // Stills are progressive + for (auto& item : items) + { + if(item.transform.is_still) + item.transform.field_mode = core::field_mode::progressive; + } + + auto result = spl::make_shared(format_desc.size, 0); + if(format_desc.field_mode != core::field_mode::progressive) + { + draw(items, result->data(), format_desc.width, format_desc.height, core::field_mode::upper); + draw(items, result->data(), format_desc.width, format_desc.height, core::field_mode::lower); + } + else + { + draw(items, result->data(), format_desc.width, format_desc.height, core::field_mode::progressive); + } + + temp_buffers_.clear(); + + return make_ready_future(array(result->data(), format_desc.size, true, result)); + } + +private: + + void draw(std::vector items, uint8_t* dest, std::size_t width, std::size_t height, core::field_mode field_mode) + { + for (auto& item : items) + item.transform.field_mode &= field_mode; + + // Remove empty items. + boost::range::remove_erase_if(items, [&](const item& item) + { + return item.transform.field_mode == core::field_mode::empty; + }); + + if(items.empty()) + return; + + auto start = field_mode == core::field_mode::lower ? 1 : 0; + auto step = field_mode == core::field_mode::progressive ? 1 : 2; + + // TODO: Add support for fill translations. + // TODO: Add support for mask rect. + // TODO: Add support for opacity. + // TODO: Add support for mix transition. + // TODO: Add support for push transition. + // TODO: Add support for wipe transition. + // TODO: Add support for slide transition. + tbb::parallel_for(tbb::blocked_range(0, height/step), [&](const tbb::blocked_range& r) + { + for(auto i = r.begin(); i != r.end(); ++i) + { + auto y = i*step+start; + + for(std::size_t n = 0; n < items.size()-1; ++n) + kernel(dest + y*width*4, items[n].data.at(0) + y*width*4, width*4); + + std::size_t n = items.size()-1; + kernel(dest + y*width*4, items[n].data.at(0) + y*width*4, width*4); + } + + _mm_mfence(); + }); + } + + void convert(std::vector& source_items, int width, int height) + { + std::set> buffers; + + for (auto& item : source_items) + buffers.insert(item.data); + + auto dest_items = source_items; + + tbb::parallel_for_each(buffers.begin(), buffers.end(), [&](const std::array& data) + { + auto pix_desc = std::find_if(source_items.begin(), source_items.end(), [&](const item& item){return item.data == data;})->pix_desc; + + if(pix_desc.format == core::pixel_format::bgra && + pix_desc.planes.at(0).width == width && + pix_desc.planes.at(0).height == height) + return; + + std::array data2 = {}; + for(std::size_t n = 0; n < data.size(); ++n) + data2.at(n) = const_cast(data[n]); + + auto input_av_frame = ffmpeg::make_av_frame(data2, pix_desc); + + + int64_t key = ((static_cast(input_av_frame->width) << 32) & 0xFFFF00000000) | + ((static_cast(input_av_frame->height) << 16) & 0xFFFF0000) | + ((static_cast(input_av_frame->format) << 8) & 0xFF00); + + auto& pool = sws_devices_[key]; + + std::shared_ptr sws_device; + if(!pool.try_pop(sws_device)) + { + double param; + sws_device.reset(sws_getContext(input_av_frame->width, input_av_frame->height, static_cast(input_av_frame->format), width, height, AVPixelFormat::AV_PIX_FMT_BGRA, SWS_BILINEAR, nullptr, nullptr, ¶m), sws_freeContext); + } + + if(!sws_device) + CASPAR_THROW_EXCEPTION(operation_failed() << msg_info("Could not create software scaling device.") << boost::errinfo_api_function("sws_getContext")); + + auto dest_frame = spl::make_shared(width*height*4); + temp_buffers_.push(dest_frame); + + { + auto dest_av_frame = ffmpeg::create_frame(); + avpicture_fill(reinterpret_cast(dest_av_frame.get()), dest_frame->data(), AVPixelFormat::AV_PIX_FMT_BGRA, width, height); + + sws_scale(sws_device.get(), input_av_frame->data, input_av_frame->linesize, 0, input_av_frame->height, dest_av_frame->data, dest_av_frame->linesize); + pool.push(sws_device); + } + + for(std::size_t n = 0; n < source_items.size(); ++n) + { + if(source_items[n].data == data) + { + dest_items[n].data.fill(0); + dest_items[n].data[0] = dest_frame->data(); + dest_items[n].pix_desc = core::pixel_format_desc(core::pixel_format::bgra); + dest_items[n].pix_desc.planes = { core::pixel_format_desc::plane(width, height, 4) }; + dest_items[n].transform = source_items[n].transform; + } + } + }); + + source_items = std::move(dest_items); + } +}; + +struct image_mixer::impl : boost::noncopyable +{ + image_renderer renderer_; + std::vector transform_stack_; + std::vector items_; // layer/stream/items +public: + impl(int channel_id) + : transform_stack_(1) + { + CASPAR_LOG(info) << L"Initialized Streaming SIMD Extensions Accelerated CPU Image Mixer for channel " << channel_id; + } + + void push(const core::frame_transform& transform) + { + transform_stack_.push_back(transform_stack_.back()*transform.image_transform); + } + + void visit(const core::const_frame& frame) + { + if(frame.pixel_format_desc().format == core::pixel_format::invalid) + return; + + if(frame.pixel_format_desc().planes.empty()) + return; + + if(frame.pixel_format_desc().planes.at(0).size < 16) + return; + + if(transform_stack_.back().field_mode == core::field_mode::empty) + return; + + item item; + item.pix_desc = frame.pixel_format_desc(); + item.transform = transform_stack_.back(); + for(int n = 0; n < item.pix_desc.planes.size(); ++n) + item.data.at(n) = frame.image_data(n).begin(); + + items_.push_back(item); + } + + void pop() + { + transform_stack_.pop_back(); + } + + std::future> render(const core::video_format_desc& format_desc) + { + return renderer_(std::move(items_), format_desc); + } + + core::mutable_frame create_frame(const void* tag, const core::pixel_format_desc& desc, const core::audio_channel_layout& channel_layout) + { + std::vector> buffers; + for (auto& plane : desc.planes) + { + auto buf = spl::make_shared(plane.size); + buffers.push_back(array(buf->data(), plane.size, true, buf)); + } + return core::mutable_frame(std::move(buffers), core::mutable_audio_buffer(), tag, desc, channel_layout); + } +}; + +image_mixer::image_mixer(int channel_id) : impl_(new impl(channel_id)){} +image_mixer::~image_mixer(){} +void image_mixer::push(const core::frame_transform& transform){impl_->push(transform);} +void image_mixer::visit(const core::const_frame& frame){impl_->visit(frame);} +void image_mixer::pop(){impl_->pop();} +int image_mixer::get_max_frame_size() { return std::numeric_limits::max(); } +std::future> image_mixer::operator()(const core::video_format_desc& format_desc, bool /* straighten_alpha */){return impl_->render(format_desc);} +core::mutable_frame image_mixer::create_frame(const void* tag, const core::pixel_format_desc& desc, const core::audio_channel_layout& channel_layout) {return impl_->create_frame(tag, desc, channel_layout);} + +}}}