1 #include "../stdafx.h"
\r
7 #include <tbb/parallel_for.h>
\r
8 #include <tbb/blocked_range.h>
\r
10 namespace caspar { namespace common {
\r
// Copies memory using SSE2 instructions (x86 assembly, Intel syntax).
// The instruction sequence visible below moves 128 bytes per pass: eight
// 16-byte movdqa loads from [esi+00h..70h] into xmm0-xmm7 and eight 16-byte
// movntdq stores to [edi+00h..70h], with prefetchnta hints issued ahead of
// the reads.
// movdqa and movntdq require 16-byte aligned memory operands, so the
// addresses held in esi and edi must be 16-byte aligned.
// NOTE(review): register setup, loop control and the return statement are
// not visible in this excerpt. Presumably esi = source, edi = dest and num
// is expected to be a multiple of 128 -- confirm against the full file.
12 void* memcpy_SSE2(void* dest, const void* source, size_t num)
\r
// Hint the cache line 128 bytes ahead of the current read position.
23 prefetchnta [esi+80h];
\r
// Load the first 64 bytes (aligned loads).
25 movdqa xmm0, [esi+00h];
\r
26 movdqa xmm1, [esi+10h];
\r
27 movdqa xmm2, [esi+20h];
\r
28 movdqa xmm3, [esi+30h];
\r
// Store the first 64 bytes with non-temporal (streaming) stores.
30 movntdq [edi+00h], xmm0;
\r
31 movntdq [edi+10h], xmm1;
\r
32 movntdq [edi+20h], xmm2;
\r
33 movntdq [edi+30h], xmm3;
\r
// Hint the cache line 192 bytes ahead of the current read position.
35 prefetchnta [esi+0C0h];
\r
// Load the second 64 bytes (aligned loads).
37 movdqa xmm4, [esi+40h];
\r
38 movdqa xmm5, [esi+50h];
\r
39 movdqa xmm6, [esi+60h];
\r
40 movdqa xmm7, [esi+70h];
\r
// Store the second 64 bytes with non-temporal (streaming) stores.
42 movntdq [edi+40h], xmm4;
\r
43 movntdq [edi+50h], xmm5;
\r
44 movntdq [edi+60h], xmm6;
\r
45 movntdq [edi+70h], xmm7;
\r
// Copies num bytes from source to dest, either through plain memcpy or by
// splitting the buffer into 128-byte units that are copied in parallel with
// memcpy_SSE2 via tbb::parallel_for.
// NOTE(review): the condition selecting the memcpy fallback and the final
// return statement are not visible in this excerpt -- presumably the fallback
// is taken for small sizes; confirm against the full file.
56 void* aligned_memcpy(void* dest, const void* source, size_t num)
\r
// Fallback path: delegate to the standard memcpy and return its result.
59 return memcpy(dest, source, num);
\r
// The range counts 128-byte units (num/128, integer division).
// NOTE(review): as written, a tail of num % 128 bytes is not covered by this
// range; verify that callers always pass a multiple of 128 or that the tail
// is handled on a line not shown here.
61 tbb::parallel_for(tbb::blocked_range<size_t>(0, num/128), [&](const tbb::blocked_range<size_t>& r)
\r
// Copy this sub-range: byte offset r.begin()*128, length r.size()*128.
63 memcpy_SSE2(reinterpret_cast<char*>(dest) + r.begin()*128, reinterpret_cast<const char*>(source) + r.begin()*128, r.size()*128);
\r
// NOTE(review): a temporary affinity_partitioner is passed here. TBB
// documents that the partitioner is taken by non-const reference and only
// pays off when the same object is reused across loop executions -- confirm
// this is intended.
64 }, tbb::affinity_partitioner());
\r
// Zero-fills the buffer at dest in parallel, in units of 16 bytes
// (one __m128i), using streaming stores.
// _mm_stream_si128 requires a 16-byte aligned address, so dest must be
// 16-byte aligned.
// NOTE(review): the loop header around the store and the return statement
// are not visible in this excerpt -- confirm against the full file.
69 void* clear(void* dest, size_t size)
\r
// The range counts 16-byte units (size/16, integer division).
// NOTE(review): as written, a tail of size % 16 bytes is not covered by this
// range; verify callers always pass a multiple of 16.
71 tbb::parallel_for(tbb::blocked_range<size_t>(0, size/16), [&](const tbb::blocked_range<size_t>& r)
\r
// All-zero 128-bit value that gets written to every slot.
73 __m128i val = _mm_setzero_si128();
\r
// First 16-byte slot of this sub-range.
74 __m128i* ptr = reinterpret_cast<__m128i*>(dest)+r.begin();
\r
// One past the last 16-byte slot of this sub-range.
75 __m128i* end = ptr + r.size();
\r
// Non-temporal store of zero, then advance to the next slot
// (presumably repeated until ptr reaches end -- loop header not shown).
78 _mm_stream_si128(ptr++, val);
\r