git.sesse.net Git - casparcg/commitdiff
2.1.0: image_mixer: Added GPU bypass for special cases.
author ronag <ronag@362d55ac-95cf-4e76-9f9a-cbaa9c17b72d>
Sun, 5 Feb 2012 11:38:36 +0000 (11:38 +0000)
committer ronag <ronag@362d55ac-95cf-4e76-9f9a-cbaa9c17b72d>
Sun, 5 Feb 2012 11:38:36 +0000 (11:38 +0000)
git-svn-id: https://casparcg.svn.sourceforge.net/svnroot/casparcg/server/branches/2.1.0@2253 362d55ac-95cf-4e76-9f9a-cbaa9c17b72d

accelerator/accelerator.vcxproj
accelerator/image/image_mixer.cpp
common/common.vcxproj
common/common.vcxproj.filters
common/memory/memcpy.h [new file with mode: 0644]
core/frame/frame_transform.cpp
core/frame/frame_transform.h
modules/screen/consumer/screen_consumer.cpp

index b8fb210e4177a6d74ab63655706837db5597b785..430681218ca57c5d410141e1572639e904eb1553 100644 (file)
@@ -44,8 +44,8 @@
     <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>\r
     <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectDir)tmp\$(Configuration)\</IntDir>\r
     <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)tmp\$(Configuration)\</IntDir>\r
-    <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\;..\dependencies64\boost\;..\dependencies64\tbb\include\;..\dependencies64\sfml\include\;..\dependencies64\glew\include\;..\dependencies64\asmlib\;$(IncludePath)</IncludePath>\r
-    <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\;..\dependencies64\boost\;..\dependencies64\tbb\include\;..\dependencies64\sfml\include\;..\dependencies64\glew\include\;..\dependencies64\asmlib\;$(IncludePath)</IncludePath>\r
+    <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\;..\dependencies64\boost\;..\dependencies64\tbb\include\;..\dependencies64\sfml\include\;..\dependencies64\glew\include\;..\dependencies64\asmlib\;..\dependencies64\ffmpeg\include\;$(IncludePath)</IncludePath>\r
+    <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\;..\dependencies64\boost\;..\dependencies64\tbb\include\;..\dependencies64\sfml\include\;..\dependencies64\glew\include\;..\dependencies64\asmlib\;..\dependencies64\ffmpeg\include\;$(IncludePath)</IncludePath>\r
     <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectDir)bin\$(Configuration)\</OutDir>\r
     <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)bin\$(Configuration)\</OutDir>\r
     <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectName)</TargetName>\r
index 0435327999410aa41a180bb9143f0e81435062f8..61ffd37a8781ba8cc88bcb7cb273f39719587b99 100644 (file)
@@ -32,6 +32,7 @@
 \r
 #include <common/gl/gl_check.h>\r
 #include <common/concurrency/async.h>\r
+#include <common/memory/memcpy.h>\r
 \r
 #include <core/frame/write_frame.h>\r
 #include <core/frame/frame_transform.h>\r
@@ -72,6 +73,7 @@ class image_renderer
 {\r
        spl::shared_ptr<context>        ogl_;\r
        image_kernel                            kernel_;\r
+       bool                                            has_uswc_copy_;\r
 public:\r
        image_renderer(const spl::shared_ptr<context>& ogl)\r
                : ogl_(ogl)\r
@@ -87,69 +89,99 @@ public:
                        return layer.second.empty();\r
                });\r
 \r
-               // Start host->device transfers.\r
-               std::map<host_buffer*, boost::shared_future<spl::shared_ptr<device_buffer>>> buffer_map;\r
-               BOOST_FOREACH(auto& layer, layers)\r
+               if(layers.empty())\r
                {\r
-                       BOOST_FOREACH(auto& item, layer.second)\r
+                       auto buffer = std::make_shared<std::vector<uint8_t, tbb::cache_aligned_allocator<uint8_t>>>(format_desc.size, 0);\r
+                       return async(launch_policy::deferred, [=]() mutable -> boost::iterator_range<const uint8_t*>\r
+                       {\r
+                               auto ptr = reinterpret_cast<const uint8_t*>(buffer->data());\r
+                               return boost::iterator_range<const uint8_t*>(ptr, ptr + buffer->size());\r
+                       });\r
+               }\r
+               else if(support_uswc_memcpy() &&\r
+                               layers.size() == 1 &&\r
+                          (kernel_.has_blend_modes() && layers.at(0).first != core::blend_mode::normal) == false &&\r
+                           layers.at(0).second.size() == 1 &&\r
+                           layers.at(0).second.at(0).pix_desc.format == core::pixel_format::bgra &&\r
+                           layers.at(0).second.at(0).buffers.at(0)->size() == format_desc.size &&\r
+                           layers.at(0).second.at(0).transform == core::frame_transform())\r
+               {\r
+                       auto source_buffer = layers.at(0).second.at(0).buffers.at(0);\r
+                       auto buffer = std::make_shared<std::vector<uint8_t, tbb::cache_aligned_allocator<uint8_t>>>(source_buffer->size());\r
+\r
+                       uswc_memcpy(buffer->data(), source_buffer->data(), source_buffer->size());\r
+                       return async(launch_policy::deferred, [=]() mutable -> boost::iterator_range<const uint8_t*>\r
+                       {\r
+                               auto ptr = reinterpret_cast<const uint8_t*>(buffer->data());\r
+                               return boost::iterator_range<const uint8_t*>(ptr, ptr + buffer->size());\r
+                       });\r
+               }\r
+               else\r
+               {\r
+                       // Start host->device transfers.\r
+                       std::map<host_buffer*, boost::shared_future<spl::shared_ptr<device_buffer>>> buffer_map;\r
+                       BOOST_FOREACH(auto& layer, layers)\r
                        {\r
-                               for(size_t n = 0; n < item.pix_desc.planes.size(); ++n) \r
+                               BOOST_FOREACH(auto& item, layer.second)\r
                                {\r
-                                       auto host_buffer = item.buffers.at(n);\r
-                                       auto it                  = buffer_map.find(host_buffer.get());\r
-                                       if(it == buffer_map.end())\r
+                                       for(size_t n = 0; n < item.pix_desc.planes.size(); ++n) \r
                                        {\r
-                                               auto plane                                      = item.pix_desc.planes[n];\r
-                                               auto future_device_buffer       = ogl_->copy_async(host_buffer, plane.width, plane.height, plane.channels);\r
-                                               it = buffer_map.insert(std::make_pair(host_buffer.get(), std::move(future_device_buffer))).first;\r
-                                       }\r
-                                       item.textures.push_back(it->second);\r
-                               }       \r
-                               item.buffers.clear();\r
-                       }\r
-               }               \r
-\r
-               // Draw\r
-               boost::shared_future<spl::shared_ptr<host_buffer>> buffer = ogl_->begin_invoke([=]() mutable -> spl::shared_ptr<host_buffer>\r
-               {\r
-                       auto draw_buffer = create_mixer_buffer(4, format_desc);\r
+                                               auto host_buffer = item.buffers.at(n);\r
+                                               auto it                  = buffer_map.find(host_buffer.get());\r
+                                               if(it == buffer_map.end())\r
+                                               {\r
+                                                       auto plane                                      = item.pix_desc.planes[n];\r
+                                                       auto future_device_buffer       = ogl_->copy_async(host_buffer, plane.width, plane.height, plane.channels);\r
+                                                       it = buffer_map.insert(std::make_pair(host_buffer.get(), std::move(future_device_buffer))).first;\r
+                                               }\r
+                                               item.textures.push_back(it->second);\r
+                                       }       \r
+                                       item.buffers.clear();\r
+                               }\r
+                       }               \r
 \r
-                       if(format_desc.field_mode != core::field_mode::progressive)\r
+                       // Draw\r
+                       boost::shared_future<spl::shared_ptr<host_buffer>> buffer = ogl_->begin_invoke([=]() mutable -> spl::shared_ptr<host_buffer>\r
                        {\r
-                               auto upper = layers;\r
-                               auto lower = std::move(layers);\r
+                               auto draw_buffer = create_mixer_buffer(4, format_desc);\r
 \r
-                               BOOST_FOREACH(auto& layer, upper)\r
+                               if(format_desc.field_mode != core::field_mode::progressive)\r
                                {\r
-                                       BOOST_FOREACH(auto& item, layer.second)\r
-                                               item.transform.field_mode = static_cast<core::field_mode>(item.transform.field_mode & core::field_mode::upper);\r
-                               }\r
+                                       auto upper = layers;\r
+                                       auto lower = std::move(layers);\r
 \r
-                               BOOST_FOREACH(auto& layer, lower)\r
+                                       BOOST_FOREACH(auto& layer, upper)\r
+                                       {\r
+                                               BOOST_FOREACH(auto& item, layer.second)\r
+                                                       item.transform.field_mode = static_cast<core::field_mode>(item.transform.field_mode & core::field_mode::upper);\r
+                                       }\r
+\r
+                                       BOOST_FOREACH(auto& layer, lower)\r
+                                       {\r
+                                               BOOST_FOREACH(auto& item, layer.second)\r
+                                                       item.transform.field_mode = static_cast<core::field_mode>(item.transform.field_mode & core::field_mode::lower);\r
+                                       }\r
+\r
+                                       draw(std::move(upper), draw_buffer, format_desc);\r
+                                       draw(std::move(lower), draw_buffer, format_desc);\r
+                               }\r
+                               else\r
                                {\r
-                                       BOOST_FOREACH(auto& item, layer.second)\r
-                                               item.transform.field_mode = static_cast<core::field_mode>(item.transform.field_mode & core::field_mode::lower);\r
+                                       draw(std::move(layers), draw_buffer, format_desc);\r
                                }\r
-\r
-                               draw(std::move(upper), draw_buffer, format_desc);\r
-                               draw(std::move(lower), draw_buffer, format_desc);\r
-                       }\r
-                       else\r
-                       {\r
-                               draw(std::move(layers), draw_buffer, format_desc);\r
-                       }\r
                        \r
-                       auto result = ogl_->create_host_buffer(static_cast<int>(format_desc.size), host_buffer::usage::read_only); \r
-                       draw_buffer->copy_to(result);                                                   \r
-                       return result;\r
-               });\r
+                               auto result = ogl_->create_host_buffer(static_cast<int>(format_desc.size), host_buffer::usage::read_only); \r
+                               draw_buffer->copy_to(result);                                                   \r
+                               return result;\r
+                       });\r
                \r
-               // Defer memory mapping.\r
-               return async(launch_policy::deferred, [=]() mutable -> boost::iterator_range<const uint8_t*>\r
-               {\r
-                       auto ptr = reinterpret_cast<const uint8_t*>(buffer.get()->data()); // .get() and ->data() can block calling thread, ->data() can also block OpenGL thread, defer it as long as possible.\r
-                       return boost::iterator_range<const uint8_t*>(ptr, ptr + buffer.get()->size());\r
-               });\r
+                       // Defer memory mapping.\r
+                       return async(launch_policy::deferred, [=]() mutable -> boost::iterator_range<const uint8_t*>\r
+                       {\r
+                               auto ptr = reinterpret_cast<const uint8_t*>(buffer.get()->data()); // .get() and ->data() can block calling thread, ->data() can also block OpenGL thread, defer it as long as possible.\r
+                               return boost::iterator_range<const uint8_t*>(ptr, ptr + buffer.get()->size());\r
+                       });\r
+               }\r
        }\r
 \r
 private:\r
@@ -311,9 +343,10 @@ public:
                        return;\r
 \r
                item item;\r
-               item.pix_desc   = frame->get_pixel_format_desc();\r
-               item.buffers    = frame->get_buffers();                         \r
-               item.transform  = transform_stack_.back();\r
+               item.pix_desc                   = frame->get_pixel_format_desc();\r
+               item.buffers                    = frame->get_buffers();                         \r
+               item.transform                  = transform_stack_.back();\r
+               item.transform.volume   = core::frame_transform().volume; // Set volume to default since we don't care about it here.\r
 \r
                layers_.back().second.push_back(item);\r
        }\r
index c32b218ec7762bac8e31c1480ddb1cb3f4317faa..b8aa6103308c0b6d8e899bd6a7afda6409411a3d 100644 (file)
   <PropertyGroup>\r
     <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectDir)tmp\$(Configuration)\</IntDir>\r
     <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)tmp\$(Configuration)\</IntDir>\r
-    <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\dependencies64\bluefish\include\;..\dependencies64\boost\;..\dependencies64\tbb\include\;..\dependencies64\sfml\include\;..\dependencies64\glew\include\;$(IncludePath)</IncludePath>\r
-    <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\dependencies64\bluefish\include\;..\dependencies64\boost\;..\dependencies64\tbb\include\;..\dependencies64\sfml\include\;..\dependencies64\glew\include\;$(IncludePath)</IncludePath>\r
+    <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\dependencies64\bluefish\include\;..\dependencies64\boost\;..\dependencies64\tbb\include\;..\dependencies64\sfml\include\;..\dependencies64\glew\include\;$(IncludePath);..\..\dependencies64\ffmpeg\include\;..\dependencies64\asmlib\</IncludePath>\r
+    <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\dependencies64\bluefish\include\;..\dependencies64\boost\;..\dependencies64\tbb\include\;..\dependencies64\sfml\include\;..\dependencies64\glew\include\;$(IncludePath);..\..\dependencies64\ffmpeg\include\;..\dependencies64\asmlib\</IncludePath>\r
     <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectDir)bin\$(Configuration)\</OutDir>\r
     <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)bin\$(Configuration)\</OutDir>\r
     <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectName)</TargetName>\r
     <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectName)</TargetName>\r
   </PropertyGroup>\r
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Develop|x64'">\r
+    <IncludePath>$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSdkDir)include;$(FrameworkSDKDir)\include;;..\..\dependencies64\ffmpeg\include\;..\dependencies64\asmlib\</IncludePath>\r
+  </PropertyGroup>\r
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Profile|x64'">\r
+    <IncludePath>$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSdkDir)include;$(FrameworkSDKDir)\include;;..\..\dependencies64\ffmpeg\include\;..\dependencies64\asmlib\</IncludePath>\r
+  </PropertyGroup>\r
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">\r
     <ClCompile>\r
       <PrecompiledHeader>Use</PrecompiledHeader>\r
     <ClInclude Include="forward.h" />\r
     <ClInclude Include="gl\gl_check.h" />\r
     <ClInclude Include="log.h" />\r
+    <ClInclude Include="memory\memcpy.h" />\r
     <ClInclude Include="memory\memshfl.h" />\r
     <ClInclude Include="memory\page_locked_allocator.h" />\r
     <ClInclude Include="env.h" />\r
index d5d5d0dac22d715a0068d3d1b2787ae115272ebe..20c4b9e25412ceaa62e9ea356f2d015b15e42e8e 100644 (file)
     <ClInclude Include="spl\memory.h">\r
       <Filter>source\spl</Filter>\r
     </ClInclude>\r
+    <ClInclude Include="memory\memcpy.h">\r
+      <Filter>source\memory</Filter>\r
+    </ClInclude>\r
   </ItemGroup>\r
 </Project>
\ No newline at end of file
diff --git a/common/memory/memcpy.h b/common/memory/memcpy.h
new file mode 100644 (file)
index 0000000..5c6712c
--- /dev/null
@@ -0,0 +1,118 @@
+#pragma once\r
+\r
+#include <intrin.h>\r
+#include <stdint.h>\r
+\r
+#include <array>\r
+\r
+#include <tbb/parallel_for.h>\r
+\r
+#include <asmlib.h>\r
+\r
+#if defined(_MSC_VER)\r
+#pragma warning (push)\r
+#pragma warning (disable : 4244)\r
+#endif\r
+extern "C" \r
+{\r
+       #include <libavutil/cpu.h>\r
+}\r
+#if defined(_MSC_VER)\r
+#pragma warning (pop)\r
+#endif\r
+\r
+namespace caspar {\r
+\r
+// Returns true when the CPU reports SSE4.1, the instruction set that provides the\r
+// streaming load (_mm_stream_load_si128) used by uswc_memcpy. Check this before\r
+// calling uswc_memcpy. The flag is queried once and the result is reused.\r
+inline bool support_uswc_memcpy()\r
+{\r
+       // Use the public libavutil query; the ff_-prefixed per-architecture functions\r
+       // such as ff_get_cpu_flags_x86 are libavutil internals.\r
+       static const bool value = (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) != 0;\r
+       return value;\r
+}\r
+\r
+// Copies count bytes from source to dest, staging the data through a small cached\r
+// buffer with streaming loads and streaming stores as described in:\r
+// http://software.intel.com/en-us/articles/copying-accelerated-video-decode-frame-buffers/\r
+//\r
+// dest and source must be 16-byte aligned (required by the SSE streaming loads and\r
+// stores used below) and the CPU must support SSE4.1, see support_uswc_memcpy().\r
+// Whole 4096-byte blocks are copied in parallel; the remaining bytes are copied with\r
+// A_memcpy so that a count which is not a multiple of 4096 is still copied in full.\r
+inline void uswc_memcpy(void* dest, void* source, int count)\r
+{\r
+       static const int CACHED_BUFFER_SIZE = 4096;\r
+\r
+       if(count <= 0)\r
+               return;\r
+\r
+       const int block_count = count/CACHED_BUFFER_SIZE;\r
+\r
+       tbb::parallel_for(tbb::blocked_range<int>(0, block_count), [&](const tbb::blocked_range<int>& r)\r
+       {\r
+               // Per-task staging buffer, reused for every block in this sub-range.\r
+               __declspec(align(64)) std::array<uint8_t, CACHED_BUFFER_SIZE> cache_block;\r
+\r
+               auto load  = reinterpret_cast<__m128i*>(source)+r.begin()*CACHED_BUFFER_SIZE/sizeof(__m128i);\r
+               auto store = reinterpret_cast<__m128i*>(dest)+r.begin()*CACHED_BUFFER_SIZE/sizeof(__m128i);\r
+\r
+               for(int b = r.begin(); b != r.end(); ++b)\r
+               {\r
+                       {\r
+                               // Pass 1: streaming loads from source into the staging buffer, 64 bytes per iteration.\r
+                               _mm_mfence();\r
+                               auto cache = reinterpret_cast<__m128i*>(cache_block.data());\r
+\r
+                               for(int n = 0; n < CACHED_BUFFER_SIZE; n += 64)\r
+                               {\r
+                                       auto x0 = _mm_stream_load_si128(load+0);\r
+                                       auto x1 = _mm_stream_load_si128(load+1);\r
+                                       auto x2 = _mm_stream_load_si128(load+2);\r
+                                       auto x3 = _mm_stream_load_si128(load+3);\r
+\r
+                                       _mm_store_si128(cache+0, x0);\r
+                                       _mm_store_si128(cache+1, x1);\r
+                                       _mm_store_si128(cache+2, x2);\r
+                                       _mm_store_si128(cache+3, x3);\r
+\r
+                                       cache += 4;\r
+                                       load  += 4;\r
+                               }\r
+                       }\r
+                       {\r
+                               // Pass 2: streaming stores from the staging buffer to dest, 64 bytes per iteration.\r
+                               _mm_mfence();\r
+                               auto cache = reinterpret_cast<__m128i*>(cache_block.data());\r
+\r
+                               for(int n = 0; n < CACHED_BUFFER_SIZE; n += 64)\r
+                               {\r
+                                       auto x0 = _mm_load_si128(cache+0);\r
+                                       auto x1 = _mm_load_si128(cache+1);\r
+                                       auto x2 = _mm_load_si128(cache+2);\r
+                                       auto x3 = _mm_load_si128(cache+3);\r
+\r
+                                       _mm_stream_si128(store+0, x0);\r
+                                       _mm_stream_si128(store+1, x1);\r
+                                       _mm_stream_si128(store+2, x2);\r
+                                       _mm_stream_si128(store+3, x3);\r
+\r
+                                       cache += 4;\r
+                                       store += 4;\r
+                               }\r
+                       }\r
+               }\r
+\r
+               // Streaming stores are weakly ordered; fence so the last block is globally visible when the task ends.\r
+               _mm_sfence();\r
+       });\r
+\r
+       // Copy the tail that does not fill a whole block (count is not required to be a multiple of 4096).\r
+       const int copied = block_count*CACHED_BUFFER_SIZE;\r
+       if(copied < count)\r
+               A_memcpy(static_cast<uint8_t*>(dest)+copied, static_cast<const uint8_t*>(source)+copied, count-copied);\r
+}\r
+\r
+}
\ No newline at end of file
index 8d4842d19b0e45b0f330052712bd46634b4ab06b..19aa2f03620b224a2acc619b3a00c90c51f9eed6 100644 (file)
@@ -23,7 +23,8 @@
 \r
 #include "frame_transform.h"\r
 \r
-#include <algorithm>\r
+#include <boost/range/algorithm/equal.hpp>\r
+#include <boost/range/algorithm/fill.hpp>\r
 \r
 namespace caspar { namespace core {\r
                \r
@@ -37,10 +38,10 @@ frame_transform::frame_transform()
        , is_key(false)\r
        , is_mix(false)\r
 {\r
-       std::fill(fill_translation.begin(), fill_translation.end(), 0.0);\r
-       std::fill(fill_scale.begin(), fill_scale.end(), 1.0);\r
-       std::fill(clip_translation.begin(), clip_translation.end(), 0.0);\r
-       std::fill(clip_scale.begin(), clip_scale.end(), 1.0);\r
+       boost::range::fill(fill_translation, 0.0);\r
+       boost::range::fill(fill_scale, 1.0);\r
+       boost::range::fill(clip_translation, 0.0);\r
+       boost::range::fill(clip_scale, 1.0);\r
 }\r
 \r
 frame_transform& frame_transform::operator*=(const frame_transform &other)\r
@@ -107,14 +108,26 @@ frame_transform frame_transform::tween(double time, const frame_transform& sourc
        return result;\r
 }\r
 \r
-bool operator<(const frame_transform& lhs, const frame_transform& rhs)\r
-{\r
-       return memcmp(&lhs, &rhs, sizeof(frame_transform)) < 0;\r
-}\r
-\r
 bool operator==(const frame_transform& lhs, const frame_transform& rhs)\r
 {\r
-       return memcmp(&lhs, &rhs, sizeof(frame_transform)) == 0;\r
+       auto eq = [](double lhs, double rhs)\r
+       {\r
+               return std::abs(lhs - rhs) < 5e-8;\r
+       };\r
+\r
+       return \r
+               eq(lhs.volume, rhs.volume) &&\r
+               eq(lhs.opacity, rhs.opacity) &&\r
+               eq(lhs.contrast, rhs.contrast) &&\r
+               eq(lhs.brightness, rhs.brightness) &&\r
+               eq(lhs.saturation, rhs.saturation) &&\r
+               boost::range::equal(lhs.fill_translation, rhs.fill_translation, eq) &&\r
+               boost::range::equal(lhs.fill_scale, rhs.fill_scale, eq) &&\r
+               boost::range::equal(lhs.clip_translation, rhs.clip_translation, eq) &&\r
+               boost::range::equal(lhs.clip_scale, rhs.clip_scale, eq) &&\r
+               lhs.field_mode == rhs.field_mode &&\r
+               lhs.is_key == rhs.is_key &&\r
+               lhs.is_mix == rhs.is_mix;\r
 }\r
 \r
 bool operator!=(const frame_transform& lhs, const frame_transform& rhs)\r
index c6973be6335c0deb592bf8fd465c6f4e721ed931..70666474fe824dad284b3c294bc6a4cd0bca448a 100644 (file)
@@ -72,7 +72,6 @@ public:
        static frame_transform tween(double time, const frame_transform& source, const frame_transform& dest, double duration, const tweener& tween);\r
 };\r
 \r
-bool operator<(const frame_transform& lhs, const frame_transform& rhs);\r
 bool operator==(const frame_transform& lhs, const frame_transform& rhs);\r
 bool operator!=(const frame_transform& lhs, const frame_transform& rhs);\r
 \r
index 2ba4f49edebd85a8c444523d02485782f113dc9b..1e7f78147a550cc9ca53e6dbf364b716677ae72f 100644 (file)
@@ -45,6 +45,7 @@
 \r
 #include <tbb/atomic.h>\r
 #include <tbb/concurrent_queue.h>\r
+#include <tbb/parallel_for.h>\r
 \r
 #include <boost/assign.hpp>\r
 \r
@@ -339,44 +340,39 @@ public:
                        return;\r
 \r
                av_frame = frames[0];\r
+               \r
+               GL(glBindTexture(GL_TEXTURE_2D, texture_));\r
 \r
-               if(av_frame->linesize[0] != static_cast<int>(format_desc_.width*4))\r
-               {\r
-                       const uint8_t *src_data[4] = {0};\r
-                       A_memcpy(const_cast<uint8_t**>(&src_data[0]), av_frame->data, 4);\r
-                       const int src_linesizes[4] = {0};\r
-                       A_memcpy(const_cast<int*>(&src_linesizes[0]), av_frame->linesize, 4);\r
-\r
-                       auto av_frame2 = get_av_frame();\r
-                       av_image_alloc(av_frame2->data, av_frame2->linesize, av_frame2->width, av_frame2->height, PIX_FMT_BGRA, 16);\r
-                       av_frame = spl::shared_ptr<AVFrame>(av_frame2.get(), [=](AVFrame*)\r
-                       {\r
-                               av_freep(&av_frame2->data[0]);\r
-                       });\r
-\r
-                       av_image_copy(av_frame2->data, av_frame2->linesize, src_data, src_linesizes, PIX_FMT_BGRA, av_frame2->width, av_frame2->height);\r
-               }\r
-\r
-               glBindTexture(GL_TEXTURE_2D, texture_);\r
-\r
-               glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbos_[0]);\r
-               glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, format_desc_.width, format_desc_.height, GL_BGRA, GL_UNSIGNED_BYTE, 0);\r
+               GL(glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbos_[0]));\r
+               GL(glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, format_desc_.width, format_desc_.height, GL_BGRA, GL_UNSIGNED_BYTE, 0));\r
 \r
-               glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbos_[1]);\r
-               glBufferData(GL_PIXEL_UNPACK_BUFFER, format_desc_.size, 0, GL_STREAM_DRAW);\r
+               GL(glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbos_[1]));\r
+               GL(glBufferData(GL_PIXEL_UNPACK_BUFFER, format_desc_.size, 0, GL_STREAM_DRAW));\r
 \r
-               auto ptr = glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY);\r
+               auto ptr = reinterpret_cast<char*>(GL2(glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY)));\r
                if(ptr)\r
                {\r
                        if(config_.key_only)\r
-                               aligned_memshfl(reinterpret_cast<char*>(ptr), av_frame->data[0], frame->image_data().size(), 0x0F0F0F0F, 0x0B0B0B0B, 0x07070707, 0x03030303);\r
+                       {\r
+                               tbb::parallel_for(tbb::blocked_range<int>(0, format_desc_.height), [&](const tbb::blocked_range<int>& r)\r
+                               {\r
+                                       for(int n = r.begin(); n != r.end(); ++n)\r
+                                               aligned_memshfl(ptr+n*format_desc_.width*4, av_frame->data[0]+n*av_frame->linesize[0], format_desc_.width*4, 0x0F0F0F0F, 0x0B0B0B0B, 0x07070707, 0x03030303);\r
+                               });\r
+                       }\r
                        else\r
-                               A_memcpy(reinterpret_cast<char*>(ptr), av_frame->data[0], frame->image_data().size());\r
+                       {       \r
+                               tbb::parallel_for(tbb::blocked_range<int>(0, format_desc_.height), [&](const tbb::blocked_range<int>& r)\r
+                               {\r
+                                       for(int n = r.begin(); n != r.end(); ++n)\r
+                                               A_memcpy(ptr+n*format_desc_.width*4, av_frame->data[0]+n*av_frame->linesize[0], format_desc_.width*4);\r
+                               });\r
+                       }\r
                        \r
-                       glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); // release the mapped buffer\r
+                       GL(glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER)); // release the mapped buffer\r
                }\r
 \r
-               glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);\r
+               GL(glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0));\r
                                \r
                GL(glClear(GL_COLOR_BUFFER_BIT));                       \r
                glBegin(GL_QUADS);\r
@@ -386,7 +382,7 @@ public:
                                glTexCoord2f(0.0f,        0.0f);        glVertex2f(-width_,  height_);\r
                glEnd();\r
                \r
-               glBindTexture(GL_TEXTURE_2D, 0);\r
+               GL(glBindTexture(GL_TEXTURE_2D, 0));\r
 \r
                std::rotate(pbos_.begin(), pbos_.begin() + 1, pbos_.end());\r
        }\r