1 #include "../stdafx.h"
\r
7 #include <tbb/parallel_for.h>
\r
8 #include <tbb/blocked_range.h>
\r
10 namespace caspar { namespace common {
\r
// Copies memory from `source` to `dest` using SSE2 aligned loads (movdqa)
// and non-temporal stores (movntdq).
//
// The instruction lines below use Intel syntax with the esi/edi registers and
// look like the body of an inline-assembly block. The block opener, the
// register setup, the loop control and the return statement are not visible
// in this excerpt.
// NOTE(review): presumably esi = source and edi = dest -- confirm.
//
// Preconditions (enforced only through assert):
//   - dest and source are non-null and are not the same pointer
//   - num is a multiple of 256
// movdqa and movntdq fault on memory operands that are not 16-byte aligned,
// so every address formed from esi and edi below must be 16-byte aligned.
// NOTE(review): the visible instructions move 0x80 (128) bytes, while the
// assert requires a multiple of 256 -- check against the loop control.
12 void* memcpy_SSE2(void* dest, const void* source, size_t num)
\r
14 assert(dest != nullptr);
\r
15 assert(source != nullptr);
\r
16 assert(dest != source);
\r
17 assert(num % 256 == 0);
\r
// Prefetch hint (non-temporal) for the data 0x80 (128) bytes past esi.
19 prefetchnta [esi+80h];
\r
// Load bytes 0x00-0x3F relative to esi into xmm0-xmm3 (16 bytes each).
29 movdqa xmm0, [esi+00h];
\r
30 movdqa xmm1, [esi+10h];
\r
31 movdqa xmm2, [esi+20h];
\r
32 movdqa xmm3, [esi+30h];
\r
// Store them at the same offsets relative to edi with non-temporal stores.
34 movntdq [edi+00h], xmm0;
\r
35 movntdq [edi+10h], xmm1;
\r
36 movntdq [edi+20h], xmm2;
\r
37 movntdq [edi+30h], xmm3;
\r
// Prefetch hint (non-temporal) for the data 0xC0 (192) bytes past esi.
39 prefetchnta [esi+0C0h];
\r
// Second half: load bytes 0x40-0x7F relative to esi into xmm4-xmm7.
41 movdqa xmm4, [esi+40h];
\r
42 movdqa xmm5, [esi+50h];
\r
43 movdqa xmm6, [esi+60h];
\r
44 movdqa xmm7, [esi+70h];
\r
// Store the second half at offsets 0x40-0x7F relative to edi.
46 movntdq [edi+40h], xmm4;
\r
47 movntdq [edi+50h], xmm5;
\r
48 movntdq [edi+60h], xmm6;
\r
49 movntdq [edi+70h], xmm7;
\r
// Copies memory from `source` to `dest` by splitting the work with
// tbb::parallel_for and handing each sub-range to memcpy_SSE2.
//
// The iteration space is [0, num/128), so one index stands for one 128-byte
// unit. Because num/128 is an integer division, bytes past the last whole
// 128-byte unit are outside the range.
// The return statement is not visible in this excerpt.
60 void* copy(void* dest, const void* source, size_t num)
\r
62 tbb::parallel_for(tbb::blocked_range<size_t>(0, num/128), [&](const tbb::blocked_range<size_t>& r)
\r
// Convert the sub-range from 128-byte units back to a byte offset
// (r.begin()*128) and a byte count (r.size()*128).
// NOTE(review): r.size()*128 is a multiple of 256 only when r.size() is even,
// but memcpy_SSE2 asserts num % 256 == 0 -- confirm odd-sized sub-ranges
// cannot occur or relax the assert.
64 memcpy_SSE2(reinterpret_cast<char*>(dest) + r.begin()*128, reinterpret_cast<const char*>(source) + r.begin()*128, r.size()*128);
\r
// NOTE(review): a new affinity_partitioner is constructed on every call;
// check the TBB documentation on reusing one partitioner object across loops.
65 }, tbb::affinity_partitioner());
\r
70 void* clear(void* dest, size_t size)
\r
72 tbb::parallel_for(tbb::blocked_range<size_t>(0, size/16), [&](const tbb::blocked_range<size_t>& r)
\r
74 __m128i val = _mm_setzero_si128();
\r
75 __m128i* ptr = reinterpret_cast<__m128i*>(dest)+r.begin();
\r
76 __m128i* end = ptr + r.size();
\r
79 _mm_stream_si128(ptr++, val);
\r