+#pragma once\r
+\r
+#include <assert.h>
+#include <string.h>
+
+#include <tbb/parallel_for.h>
+\r
+namespace caspar {\r
+\r
+namespace internal {\r
+\r
+/// Copies `count` bytes from `source` to `dest` in 128-byte steps, using SSE2
+/// aligned loads (movdqa) and non-temporal stores (movntdq).
+///
+/// The copy loop is MSVC-style inline assembly using 32-bit x86 registers.
+///
+/// Preconditions (checked with assert, i.e. only when NDEBUG is not defined):
+///  - `count` is a multiple of 128,
+///  - `dest` and `source` are non-null,
+///  - `dest` and `source` are 16-byte aligned (movdqa/movntdq fault otherwise).
+///
+/// A `count` of zero is a no-op.
+///
+/// @param dest   Destination buffer.
+/// @param source Source buffer.
+/// @param count  Number of bytes to copy.
+/// @return `dest`.
+static void* fast_memcpy(void* dest, const void* source, size_t count)
+{
+    assert(count % (16*8) == 0);
+    assert(dest != nullptr);
+    assert(source != nullptr);
+    assert(reinterpret_cast<size_t>(dest) % 16 == 0);
+    assert(reinterpret_cast<size_t>(source) % 16 == 0);
+
+    // The loop below tests its counter at the bottom, so it always copies at
+    // least one 128-byte block. With a zero count it would copy that block and
+    // then wrap ebx around to 0xFFFFFFFF, running far past both buffers.
+    if(count == 0)
+        return dest;
+
+    __asm
+    {
+        mov esi, source;
+        mov edi, dest;
+        mov ebx, count;
+        shr ebx, 7;                 // number of 128-byte blocks
+
+    cpy:
+        // First 64 bytes of the block.
+        movdqa xmm0, [esi+00h];
+        movdqa xmm1, [esi+10h];
+        movdqa xmm2, [esi+20h];
+        movdqa xmm3, [esi+30h];
+
+        movntdq [edi+00h], xmm0;
+        movntdq [edi+10h], xmm1;
+        movntdq [edi+20h], xmm2;
+        movntdq [edi+30h], xmm3;
+
+        // Second 64 bytes of the block.
+        movdqa xmm4, [esi+40h];
+        movdqa xmm5, [esi+50h];
+        movdqa xmm6, [esi+60h];
+        movdqa xmm7, [esi+70h];
+
+        movntdq [edi+40h], xmm4;
+        movntdq [edi+50h], xmm5;
+        movntdq [edi+60h], xmm6;
+        movntdq [edi+70h], xmm7;
+
+        // Advance both pointers by one block; lea leaves the flags untouched.
+        lea edi, [edi+80h];
+        lea esi, [esi+80h];
+
+        dec ebx;
+        jnz cpy;
+
+        // Non-temporal stores are weakly ordered: fence them so the copied
+        // data is globally visible before the caller publishes the buffer.
+        sfence;
+    }
+    return dest;
+}
+\r
+}\r
+\r
+/// Copies `num` bytes from `source` to `dest`, handing the whole 128-byte
+/// blocks to internal::fast_memcpy through tbb::parallel_for.
+///
+/// internal::fast_memcpy only handles whole 128-byte blocks and uses
+/// instructions that require 16-byte aligned addresses, so:
+///  - if either buffer is not 16-byte aligned, the whole copy is done with
+///    memcpy instead;
+///  - the trailing `num % 128` bytes are copied with memcpy after the
+///    parallel part.
+///
+/// @param dest   Destination buffer.
+/// @param source Source buffer.
+/// @param num    Number of bytes to copy.
+/// @return `dest`.
+static void* fast_memcpy(void* dest, const void* source, size_t num)
+{
+    const size_t block_size = 128;
+
+    char* const       dest_bytes   = static_cast<char*>(dest);
+    const char* const source_bytes = static_cast<const char*>(source);
+
+    const bool aligned = reinterpret_cast<size_t>(dest) % 16 == 0 &&
+                         reinterpret_cast<size_t>(source) % 16 == 0;
+    if(!aligned)
+        return memcpy(dest, source, num);
+
+    const size_t block_count = num / block_size;
+
+    // NOTE(review): the partitioner is created fresh on every call, so it
+    // carries no history between calls - confirm whether affinity_partitioner
+    // gives any benefit over the default partitioner here.
+    tbb::affinity_partitioner partitioner;
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, block_count), [&](const tbb::blocked_range<size_t>& r)
+    {
+        // Ranges are expressed in blocks; convert to byte offsets/sizes.
+        const size_t offset = r.begin() * block_size;
+        internal::fast_memcpy(dest_bytes + offset, source_bytes + offset, r.size() * block_size);
+    }, partitioner);
+
+    // num / 128 truncates: copy whatever is left past the last whole block.
+    const size_t copied = block_count * block_size;
+    if(copied < num)
+        memcpy(dest_bytes + copied, source_bytes + copied, num - copied);
+
+    return dest;
+}
+\r
+\r
+}
\ No newline at end of file