\r
namespace caspar {\r
\r
-namespace internal {\r
+namespace detail {\r
\r
-static void* fast_memcpy(void* dest, const void* source, size_t count)\r
// Low-level copy helper for the aligned path (this patch renames it from fast_memcpy).\r
// Asserts that dest and source are non-null and returns dest.\r
+static void* fast_memcpy_aligned_impl(void* dest, const void* source, size_t count)\r
{\r
CASPAR_ASSERT(dest != nullptr);\r
CASPAR_ASSERT(source != nullptr);\r
// NOTE(review): nothing between the asserts and the return is visible in this excerpt;\r
// presumably a hunk boundary hides the actual copy loop - confirm against the full file,\r
// including how count == 0 is handled.\r
return dest;\r
}\r
\r
-static void* fast_memcpy_small(void* dest, const void* source, size_t count)\r
+\r
// Copies count bytes from source to dest in 128-byte steps using unaligned 16-byte\r
// SSE moves (movdqu), so neither pointer needs any particular alignment. Returns dest.\r
// The loop executes count >> 7 times; any remainder of count modulo 128 is NOT copied.\r
// NOTE(review): for 0 < count < 128 the counter starts at 0 and `dec ebx` wraps around\r
// before `jnz` is evaluated, so callers must pass 0 or a non-zero multiple of 128.\r
// NOTE(review): uses 32-bit registers (esi/edi/ebx) in an `__asm { }` block, i.e. this\r
// looks like x86-only MSVC-style inline assembly - confirm the supported targets.\r
+static void* fast_memcpy_unaligned_impl(void* dest, const void* source, size_t count)\r
+{\r
+ CASPAR_ASSERT(dest != nullptr);\r
+ CASPAR_ASSERT(source != nullptr);\r
+\r
// Without this early-out the do-while style loop below would still run for count == 0.\r
+ if(count == 0)\r
+ return dest;\r
+\r
+ __asm \r
+ { \r
+ mov esi, source; \r
+ mov edi, dest; \r
+ mov ebx, count; \r
// ebx = number of 128-byte chunks to copy.\r
+ shr ebx, 7;\r
+\r
// Each iteration moves 8 x 16 bytes: load four registers, store them, then the next four.\r
+ cpy: \r
+ movdqu xmm0, [esi+00h]; \r
+ movdqu xmm1, [esi+10h]; \r
+ movdqu xmm2, [esi+20h]; \r
+ movdqu xmm3, [esi+30h]; \r
+\r
+ movdqu [edi+00h], xmm0;\r
+ movdqu [edi+10h], xmm1;\r
+ movdqu [edi+20h], xmm2; \r
+ movdqu [edi+30h], xmm3;\r
+\r
+ movdqu xmm4, [esi+40h];\r
+ movdqu xmm5, [esi+50h];\r
+ movdqu xmm6, [esi+60h];\r
+ movdqu xmm7, [esi+70h]; \r
+\r
+ movdqu [edi+40h], xmm4; \r
+ movdqu [edi+50h], xmm5; \r
+ movdqu [edi+60h], xmm6; \r
+ movdqu [edi+70h], xmm7; \r
+\r
// Advance both pointers by 128 bytes (80h).\r
+ lea edi, [edi+80h]; \r
+ lea esi, [esi+80h]; \r
+\r
// Loop until the chunk counter reaches zero.\r
+ dec ebx; \r
+ jnz cpy; \r
+ } \r
+ return dest;\r
+}\r
+\r
// Copies count bytes from source8 to dest8: the largest multiple of 128 bytes goes through\r
// fast_memcpy_aligned_impl, the remaining (count & 127) bytes through memcpy.\r
// Returns the result of that memcpy, i.e. a pointer to the start of the tail\r
// (dest8 + rounded-down count), not dest8 itself.\r
// NOTE(review): the rounded-down count can be 0; whether fast_memcpy_aligned_impl tolerates\r
// count == 0 is not visible in this excerpt - confirm.\r
+static void* fast_memcpy_small_aligned(char* dest8, const char* source8, size_t count)\r
{ \r
size_t rest = count & 127;\r
count &= ~127;\r
\r
- internal::fast_memcpy(reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(source), count); \r
- return memcpy(reinterpret_cast<char*>(dest)+count, reinterpret_cast<const char*>(source)+count, rest);\r
+ fast_memcpy_aligned_impl(dest8, source8, count); \r
+\r
// Tail of fewer than 128 bytes.\r
+ return memcpy(dest8+count, source8+count, rest);\r
}\r
\r
// Copies count bytes from source8 to dest8 without alignment requirements: the largest\r
// multiple of 128 bytes goes through fast_memcpy_unaligned_impl (which returns early for 0),\r
// the remaining (count & 127) bytes through memcpy.\r
// Returns the result of that memcpy, i.e. a pointer to the start of the tail\r
// (dest8 + rounded-down count), not dest8 itself.\r
+static void* fast_memcpy_small_unaligned(char* dest8, const char* source8, size_t count)\r
+{ \r
+ size_t rest = count & 127;\r
+ count &= ~127;\r
+\r
+ fast_memcpy_unaligned_impl(dest8, source8, count); \r
+\r
// Tail of fewer than 128 bytes.\r
+ return memcpy(dest8+count, source8+count, rest);\r
+}\r
+\r
// Copies count bytes from source to dest for the aligned case.\r
// The largest multiple of 2048 bytes is split into 2048-byte chunks that are dispatched\r
// through Concurrency::parallel_for, each copied by fast_memcpy_aligned_impl; the remaining\r
// (count & 2047) bytes are handed to fast_memcpy_small_aligned.\r
// NOTE(review): the return value is whatever fast_memcpy_small_aligned returns, which is a\r
// pointer into the tail of the destination rather than dest - confirm callers do not rely\r
// on memcpy-style \"returns dest\" semantics.\r
+static void* fast_memcpy_aligned(void* dest, const void* source, size_t count)\r
+{ \r
+ auto dest8 = reinterpret_cast<char*>(dest);\r
+ auto source8 = reinterpret_cast<const char*>(source);\r
+ \r
+ size_t rest = count & 2047;\r
+ count &= ~2047;\r
+\r
// Chunk n covers bytes [n*2048, (n+1)*2048); chunks do not overlap.\r
+ Concurrency::parallel_for<size_t>(0, count / 2048, [&](size_t n)\r
+ { \r
+ detail::fast_memcpy_aligned_impl(dest8 + n*2048, source8 + n*2048, 2048); \r
+ });\r
+\r
+ return detail::fast_memcpy_small_aligned(dest8+count, source8+count, rest);\r
}\r
\r
-static void* fast_memcpy(void* dest, const void* source, size_t count)\r
// Copies count bytes from source to dest without alignment requirements.\r
// The largest multiple of 2048 bytes is split into 2048-byte chunks that are dispatched\r
// through Concurrency::parallel_for, each copied by fast_memcpy_unaligned_impl; the remaining\r
// (count & 2047) bytes are handed to fast_memcpy_small_unaligned.\r
// NOTE(review): the return value is whatever fast_memcpy_small_unaligned returns, which is a\r
// pointer into the tail of the destination rather than dest - confirm callers do not rely\r
// on memcpy-style \"returns dest\" semantics.\r
+static void* fast_memcpy_unaligned(void* dest, const void* source, size_t count)\r
{ \r
- if((reinterpret_cast<int>(source) & 15) || (reinterpret_cast<int>(dest) & 15) || count < 128)\r
- return memcpy(reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(source), count);\r
- \r
- size_t rest = count & 511;\r
- count &= ~511;\r
+ auto dest8 = reinterpret_cast<char*>(dest);\r
+ auto source8 = reinterpret_cast<const char*>(source);\r
+ \r
+ size_t rest = count & 2047;\r
+ count &= ~2047;\r
\r
- Concurrency::parallel_for<int>(0, count / 512, [&](size_t n)\r
// Chunk n covers bytes [n*2048, (n+1)*2048); chunks do not overlap.\r
+ Concurrency::parallel_for<size_t>(0, count / 2048, [&](size_t n)\r
{ \r
- internal::fast_memcpy(reinterpret_cast<char*>(dest) + n*512, reinterpret_cast<const char*>(source) + n*512, 512); \r
+ detail::fast_memcpy_unaligned_impl(dest8 + n*2048, source8 + n*2048, 2048); \r
});\r
\r
- return internal::fast_memcpy_small(reinterpret_cast<char*>(dest)+count, reinterpret_cast<const char*>(source)+count, rest);\r
+ return detail::fast_memcpy_small_unaligned(dest8+count, source8+count, rest);\r
+}\r
+\r
}\r
\r
// Public entry point: copies count bytes from source to dest.\r
// Uses detail::fast_memcpy_aligned when the low four address bits of both pointers are\r
// zero (16-byte aligned) and detail::fast_memcpy_unaligned otherwise.\r
// NOTE(review): reinterpret_cast<int> on a pointer only preserves the address where int is\r
// as wide as a pointer - confirm this header is built for 32-bit targets only.\r
// NOTE(review): the detail functions return the result of their trailing memcpy, so the\r
// pointer returned here is dest + (count & ~127) rather than dest - confirm that callers\r
// do not expect memcpy-style semantics.\r
template<typename T>\r
-static safe_ptr<T> fast_memdup(const T* source, size_t count)\r
+T* fast_memcpy(T* dest, const void* source, size_t count)\r
+{ \r
+ if((reinterpret_cast<int>(source) & 15) || (reinterpret_cast<int>(dest) & 15))\r
+ return reinterpret_cast<T*>(detail::fast_memcpy_unaligned(dest, source, count));\r
+ else\r
+ return reinterpret_cast<T*>(detail::fast_memcpy_aligned(dest, source, count));\r
+}\r
+\r
// Duplicates count bytes starting at source into a newly allocated buffer and returns a\r
// safe_ptr<T> whose deleter releases that buffer with scalable_free.\r
// To keep the copy on the aligned path, the read starts source_align bytes before source\r
// (source rounded down to a 16-byte boundary) and the returned pointer is offset by the\r
// same amount into the allocation.\r
// NOTE(review): this reads up to 15 bytes located before source - confirm that is always\r
// readable memory for every caller.\r
// NOTE(review): the result of scalable_aligned_malloc is not checked before it is written to.\r
// NOTE(review): the removed try/catch used to free the buffer on failure; with the new code\r
// the buffer is not released if the copy or the safe_ptr construction throws - confirm\r
// neither can throw.\r
+template<typename T>\r
+safe_ptr<T> fast_memdup(const T* source, size_t count)\r
{ \r
- auto dest = reinterpret_cast<T*>(scalable_aligned_malloc(count + 16, 32));\r
- auto dest8 = reinterpret_cast<char*>(dest);\r
- auto source8 = reinterpret_cast<const char*>(source); \r
// Offset of source from the previous 16-byte boundary (0..15).\r
// NOTE(review): reinterpret_cast<int> on a pointer assumes int is pointer-sized.\r
auto source_align = reinterpret_cast<int>(source) & 15;\r
- \r
- try\r
- {\r
- fast_memcpy(dest8, source8-source_align, count+source_align);\r
- }\r
- catch(...)\r
- {\r
- scalable_free(dest);\r
- throw;\r
- }\r
-\r
- return safe_ptr<T>(reinterpret_cast<T*>(dest8+source_align), [dest](T*){scalable_free(dest);});\r
// Second argument (32) is presumably the requested alignment - confirm against the allocator docs.\r
+ auto dest8 = reinterpret_cast<char*>(scalable_aligned_malloc(count + source_align, 32));\r
+ auto source8 = reinterpret_cast<const char*>(source); \r
+ detail::fast_memcpy_aligned(dest8, source8-source_align, count+source_align);\r
// The deleter captures the allocation base (dest8), not the offset pointer handed to the caller.\r
+ return safe_ptr<T>(reinterpret_cast<T*>(dest8+source_align), [dest8](T*){scalable_free(dest8);});\r
}\r
\r
\r