]> git.sesse.net Git - casparcg/blobdiff - common/memory/memcpy.h
git-svn-id: https://casparcg.svn.sourceforge.net/svnroot/casparcg/server/branches...
[casparcg] / common / memory / memcpy.h
index 57ab081e0b1920f1ae973855323e9ea43feac922..82040c02c5f3b0df5fb5b7c6e7c7c7625f99fa18 100644 (file)
 */\r
 #pragma once\r
 \r
+#include "../utility/assert.h"\r
+#include "../memory/safe_ptr.h"\r
+\r
 #include <assert.h>\r
 \r
 #include <tbb/parallel_for.h>\r
 \r
 namespace caspar {\r
 \r
-namespace internal {\r
+namespace detail {\r
 \r
-static void* fast_memcpy(void* dest, const void* source, size_t count)\r
+static void* fast_memcpy_aligned_impl(void* dest, const void* source, size_t count)\r
 {\r
-       assert(dest != nullptr);\r
-       assert(source != nullptr);\r
+       CASPAR_ASSERT(dest != nullptr);\r
+       CASPAR_ASSERT(source != nullptr);\r
+       CASPAR_ASSERT(reinterpret_cast<int>(dest) % 16 == 0);\r
+       CASPAR_ASSERT(reinterpret_cast<int>(source) % 16 == 0);\r
+\r
+       if(count == 0)\r
+               return dest;\r
 \r
        __asm   \r
        {      \r
@@ -69,27 +77,112 @@ static void* fast_memcpy(void* dest, const void* source, size_t count)
        return dest;\r
 }\r
 \r
+// Copies `count` bytes from `source` to `dest` in 128-byte chunks using\r
+// unaligned SSE2 moves (movdqu), so neither pointer needs 16-byte alignment.\r
+//\r
+// dest, source: must be non-null (checked with CASPAR_ASSERT).\r
+// count:        number of bytes to copy. Only count >> 7 whole 128-byte chunks\r
+//               are copied; any remainder (count & 127) is NOT copied here.\r
+// Returns `dest`.\r
+//\r
+// NOTE(review): the asm loop decrements and tests its counter only after the\r
+// first chunk has been copied. For 0 < count < 128 the counter starts at 0,\r
+// one full chunk is copied anyway and the decrement wraps around, so the loop\r
+// would keep running. The callers visible in this file pass either 0 or a\r
+// multiple of 128 (presumably always >= 128 from the TBB ranges -- confirm);\r
+// treat "count is 0 or a multiple of 128 that is >= 128" as a precondition.\r
+static void* fast_memcpy_unaligned_impl(void* dest, const void* source, size_t count)\r
+{\r
+       CASPAR_ASSERT(dest != nullptr);\r
+       CASPAR_ASSERT(source != nullptr);\r
+\r
+       // Must bail out before the asm block: its loop body runs at least once.\r
+       if(count == 0)\r
+               return dest;\r
+\r
+       // esi = read cursor, edi = write cursor, ebx = number of 128-byte chunks.\r
+       // Each iteration moves 128 bytes as two groups of four 16-byte loads\r
+       // followed by four 16-byte stores (xmm0-3, then xmm4-7), then advances\r
+       // both cursors by 80h.\r
+       __asm   \r
+       {      \r
+               mov esi, source;          \r
+               mov edi, dest;    \r
+               mov ebx, count;     \r
+               shr ebx, 7;\r
+\r
+               cpy:             \r
+                       movdqu xmm0, [esi+00h];       \r
+                       movdqu xmm1, [esi+10h];      \r
+                       movdqu xmm2, [esi+20h];         \r
+                       movdqu xmm3, [esi+30h];   \r
+\r
+                       movdqu [edi+00h], xmm0;\r
+                       movdqu [edi+10h], xmm1;\r
+                       movdqu [edi+20h], xmm2;    \r
+                       movdqu [edi+30h], xmm3;\r
+\r
+                       movdqu xmm4, [esi+40h];\r
+                       movdqu xmm5, [esi+50h];\r
+                       movdqu xmm6, [esi+60h];\r
+                       movdqu xmm7, [esi+70h];  \r
+\r
+                       movdqu [edi+40h], xmm4; \r
+                       movdqu [edi+50h], xmm5;      \r
+                       movdqu [edi+60h], xmm6;    \r
+                       movdqu [edi+70h], xmm7;    \r
+\r
+                       lea edi, [edi+80h];       \r
+                       lea esi, [esi+80h];      \r
+\r
+                       dec ebx;      \r
+               jnz cpy;  \r
+       }   \r
+       return dest;\r
 }\r
 \r
-static void* fast_memcpy(void* dest, const void* source, size_t count)\r
+// Single-threaded copy of `count` bytes for buffers that are 16-byte aligned\r
+// (the alignment is asserted inside fast_memcpy_aligned_impl).\r
+//\r
+// The largest multiple of 128 bytes goes through fast_memcpy_aligned_impl;\r
+// the remaining 0..127 bytes are copied with plain memcpy.\r
+//\r
+// dest8, source8: start of the destination / source buffers.\r
+// count:          number of bytes to copy.\r
+// Returns `dest8`, following the memcpy convention. (Previously the pointer\r
+// to the start of the memcpy'd tail, dest8 + (count & ~127), was returned.)\r
+static void* fast_memcpy_small_aligned(char* dest8, const char* source8, size_t count)\r
 {   \r
-       if((reinterpret_cast<int>(source) & 15) || (reinterpret_cast<int>(dest) & 15))\r
-               return memcpy(dest, source, count);\r
+       // Split into a bulk part (multiple of 128) and a tail (< 128 bytes).\r
+       const size_t rest = count & 127;\r
+       count &= ~static_cast<size_t>(127);\r
+\r
+       // A bulk size of 0 is fine: the impl returns early for count == 0.\r
+       fast_memcpy_aligned_impl(dest8, source8, count);\r
+       memcpy(dest8+count, source8+count, rest);\r
+\r
+       return dest8;\r
+}\r
 \r
-       if(count < 2048)\r
-               return memcpy(dest, source, count);\r
+// Single-threaded copy of `count` bytes for buffers with no alignment\r
+// guarantee.\r
+//\r
+// The largest multiple of 128 bytes goes through fast_memcpy_unaligned_impl;\r
+// the remaining 0..127 bytes are copied with plain memcpy.\r
+//\r
+// dest8, source8: start of the destination / source buffers.\r
+// count:          number of bytes to copy.\r
+// Returns `dest8`, following the memcpy convention. (Previously the pointer\r
+// to the start of the memcpy'd tail, dest8 + (count & ~127), was returned.)\r
+static void* fast_memcpy_small_unaligned(char* dest8, const char* source8, size_t count)\r
+{   \r
+       // Split into a bulk part (multiple of 128) and a tail (< 128 bytes).\r
+       const size_t rest = count & 127;\r
+       count &= ~static_cast<size_t>(127);\r
+\r
+       // A bulk size of 0 is fine: the impl returns early for count == 0.\r
+       fast_memcpy_unaligned_impl(dest8, source8, count);\r
 \r
-       size_t rest = count % 128;\r
-       count -= rest;\r
+       memcpy(dest8+count, source8+count, rest);\r
+\r
+       return dest8;\r
+}\r
 \r
-       tbb::affinity_partitioner ap;\r
+// Parallel copy of `count` bytes between 16-byte aligned buffers.\r
+//\r
+// The largest multiple of 2048 bytes is divided into 128-byte units that\r
+// tbb::parallel_for distributes over fast_memcpy_aligned_impl; the remaining\r
+// 0..2047 bytes are copied by fast_memcpy_small_aligned on the calling thread.\r
+//\r
+// dest, source: start of the destination / source buffers (16-byte alignment\r
+//               is asserted inside fast_memcpy_aligned_impl).\r
+// count:        number of bytes to copy.\r
+// Returns `dest`, following the memcpy convention. (Previously a pointer into\r
+// the tail of the destination, dest + (count & ~127), was returned.)\r
+static void* fast_memcpy_aligned(void* dest, const void* source, size_t count)\r
+{   \r
+       auto dest8                      = reinterpret_cast<char*>(dest);\r
+       auto source8            = reinterpret_cast<const char*>(source);\r
+               \r
+       // Bulk part is a multiple of 2048, so the tail start stays 16-byte aligned.\r
+       const size_t rest = count & 2047;\r
+       count &= ~static_cast<size_t>(2047);\r
+               \r
        tbb::parallel_for(tbb::blocked_range<size_t>(0, count/128), [&](const tbb::blocked_range<size_t>& r)\r
        {       \r
-               internal::fast_memcpy(reinterpret_cast<char*>(dest) + r.begin()*128, reinterpret_cast<const char*>(source) + r.begin()*128, r.size()*128);   \r
-       }, ap);\r
+               // The range is expressed in 128-byte units; convert back to bytes.\r
+               fast_memcpy_aligned_impl(dest8 + r.begin()*128, source8 + r.begin()*128, r.size()*128);\r
+       });\r
+       \r
+       fast_memcpy_small_aligned(dest8+count, source8+count, rest);\r
+\r
+       return dest;\r
+}\r
 \r
-       return memcpy(reinterpret_cast<char*>(dest)+count,  reinterpret_cast<const char*>(source)+count, rest);\r
+// Parallel copy of `count` bytes between buffers with no alignment guarantee.\r
+//\r
+// The largest multiple of 2048 bytes is divided into 128-byte units that\r
+// tbb::parallel_for distributes over fast_memcpy_unaligned_impl; the remaining\r
+// 0..2047 bytes are copied by fast_memcpy_small_unaligned on the calling\r
+// thread.\r
+//\r
+// dest, source: start of the destination / source buffers.\r
+// count:        number of bytes to copy.\r
+// Returns `dest`, following the memcpy convention. (Previously a pointer into\r
+// the tail of the destination, dest + (count & ~127), was returned.)\r
+static void* fast_memcpy_unaligned(void* dest, const void* source, size_t count)\r
+{   \r
+       auto dest8                      = reinterpret_cast<char*>(dest);\r
+       auto source8            = reinterpret_cast<const char*>(source);\r
+               \r
+       // Split into a parallel bulk part (multiple of 2048) and a serial tail.\r
+       const size_t rest = count & 2047;\r
+       count &= ~static_cast<size_t>(2047);\r
+               \r
+       tbb::parallel_for(tbb::blocked_range<size_t>(0, count/128), [&](const tbb::blocked_range<size_t>& r)\r
+       {       \r
+               // The range is expressed in 128-byte units; convert back to bytes.\r
+               fast_memcpy_unaligned_impl(dest8 + r.begin()*128, source8 + r.begin()*128, r.size()*128);\r
+       });\r
+       \r
+       fast_memcpy_small_unaligned(dest8+count, source8+count, rest);\r
+\r
+       return dest;\r
 }\r
 \r
+}\r
 \r
-}
\ No newline at end of file
+// Public entry point: copies `count` bytes from `source` to `dest`, choosing\r
+// the aligned implementation when both pointers are 16-byte aligned and the\r
+// unaligned implementation otherwise.\r
+//\r
+// dest:   destination buffer; the typed pointer is handed back to the caller.\r
+// source: source buffer.\r
+// count:  number of BYTES to copy (not a number of T elements).\r
+// Returns `dest`, following the memcpy convention.\r
+template<typename T>\r
+T* fast_memcpy(T* dest, const void* source, size_t count)\r
+{   \r
+       // Test the addresses as size_t: casting a pointer to int drops the upper\r
+       // bits (or fails to compile) where pointers are wider than int.\r
+       const bool aligned = ((reinterpret_cast<size_t>(source) | reinterpret_cast<size_t>(dest)) & 15) == 0;\r
+\r
+       if(aligned)\r
+               detail::fast_memcpy_aligned(dest, source, count);\r
+       else\r
+               detail::fast_memcpy_unaligned(dest, source, count);\r
+\r
+       // Return the destination itself rather than whatever the helper returned.\r
+       return dest;\r
+}\r
+\r
+}\r