X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fmemory%2Fmemcpy.h;h=82040c02c5f3b0df5fb5b7c6e7c7c7625f99fa18;hb=01d8a61541548cf535791be39d233799eefc4187;hp=57ab081e0b1920f1ae973855323e9ea43feac922;hpb=bb950f407ae396399427f3d35af388dfc666d7b5;p=casparcg diff --git a/common/memory/memcpy.h b/common/memory/memcpy.h index 57ab081e0..82040c02c 100644 --- a/common/memory/memcpy.h +++ b/common/memory/memcpy.h @@ -19,18 +19,26 @@ */ #pragma once +#include "../utility/assert.h" +#include "../memory/safe_ptr.h" + #include #include namespace caspar { -namespace internal { +namespace detail { -static void* fast_memcpy(void* dest, const void* source, size_t count) +static void* fast_memcpy_aligned_impl(void* dest, const void* source, size_t count) { - assert(dest != nullptr); - assert(source != nullptr); + CASPAR_ASSERT(dest != nullptr); + CASPAR_ASSERT(source != nullptr); + CASPAR_ASSERT(reinterpret_cast(dest) % 16 == 0); + CASPAR_ASSERT(reinterpret_cast(source) % 16 == 0); + + if(count == 0) + return dest; __asm { @@ -69,27 +77,112 @@ static void* fast_memcpy(void* dest, const void* source, size_t count) return dest; } +static void* fast_memcpy_unaligned_impl(void* dest, const void* source, size_t count) +{ + CASPAR_ASSERT(dest != nullptr); + CASPAR_ASSERT(source != nullptr); + + if(count == 0) + return dest; + + __asm + { + mov esi, source; + mov edi, dest; + mov ebx, count; + shr ebx, 7; + + cpy: + movdqu xmm0, [esi+00h]; + movdqu xmm1, [esi+10h]; + movdqu xmm2, [esi+20h]; + movdqu xmm3, [esi+30h]; + + movdqu [edi+00h], xmm0; + movdqu [edi+10h], xmm1; + movdqu [edi+20h], xmm2; + movdqu [edi+30h], xmm3; + + movdqu xmm4, [esi+40h]; + movdqu xmm5, [esi+50h]; + movdqu xmm6, [esi+60h]; + movdqu xmm7, [esi+70h]; + + movdqu [edi+40h], xmm4; + movdqu [edi+50h], xmm5; + movdqu [edi+60h], xmm6; + movdqu [edi+70h], xmm7; + + lea edi, [edi+80h]; + lea esi, [esi+80h]; + + dec ebx; + jnz cpy; + } + return dest; } -static void* fast_memcpy(void* dest, const void* source, size_t count) +static void* fast_memcpy_small_aligned(char* dest8, const char* source8, size_t count) { - if((reinterpret_cast(source) & 15) || (reinterpret_cast(dest) & 15)) - return memcpy(dest, source, count); + size_t rest = count & 127; + count &= ~127; + + fast_memcpy_aligned_impl(dest8, source8, count); + + return memcpy(dest8+count, source8+count, rest); +} - if(count < 2048) - return memcpy(dest, source, count); +static void* fast_memcpy_small_unaligned(char* dest8, const char* source8, size_t count) +{ + size_t rest = count & 127; + count &= ~127; + + fast_memcpy_unaligned_impl(dest8, source8, count); - size_t rest = count % 128; - count -= rest; + return memcpy(dest8+count, source8+count, rest); +} - tbb::affinity_partitioner ap; +static void* fast_memcpy_aligned(void* dest, const void* source, size_t count) +{ + auto dest8 = reinterpret_cast(dest); + auto source8 = reinterpret_cast(source); + + size_t rest = count & 2047; + count &= ~2047; + tbb::parallel_for(tbb::blocked_range(0, count/128), [&](const tbb::blocked_range& r) { - internal::fast_memcpy(reinterpret_cast(dest) + r.begin()*128, reinterpret_cast(source) + r.begin()*128, r.size()*128); - }, ap); + fast_memcpy_aligned_impl(reinterpret_cast(dest) + r.begin()*128, reinterpret_cast(source) + r.begin()*128, r.size()*128); + }); + + return fast_memcpy_small_aligned(dest8+count, source8+count, rest); +} - return memcpy(reinterpret_cast(dest)+count, reinterpret_cast(source)+count, rest); +static void* fast_memcpy_unaligned(void* dest, const void* source, size_t count) +{ + auto dest8 = reinterpret_cast(dest); + auto source8 = reinterpret_cast(source); + + size_t rest = count & 2047; + count &= ~2047; + + tbb::parallel_for(tbb::blocked_range(0, count/128), [&](const tbb::blocked_range& r) + { + fast_memcpy_unaligned_impl(reinterpret_cast(dest) + r.begin()*128, reinterpret_cast(source) + r.begin()*128, r.size()*128); + }); + + return fast_memcpy_small_unaligned(dest8+count, source8+count, rest); } +} -} \ No newline at end of file +template +T* fast_memcpy(T* dest, const void* source, size_t count) +{ + if((reinterpret_cast(source) & 15) || (reinterpret_cast(dest) & 15)) + return reinterpret_cast(detail::fast_memcpy_unaligned(dest, source, count)); + else + return reinterpret_cast(detail::fast_memcpy_aligned(dest, source, count)); +} + +}