]> git.sesse.net Git - casparcg/blob - common/utility/memory.cpp
2.0.0.2:
[casparcg] / common / utility / memory.cpp
1 #include "../stdafx.h"\r
2 \r
3 #include "memory.h"\r
4 \r
5 #include <intrin.h>\r
6 \r
7 #include <tbb/parallel_for.h>\r
8 #include <tbb/blocked_range.h>\r
9 \r
10 namespace caspar { namespace common {\r
11         \r
12 void* memcpy_SSE2(void* dest, const void* source, size_t num)\r
13 {       \r
14         assert(dest != nullptr);\r
15         assert(source != nullptr);\r
16         assert(dest != source);\r
17         assert(num % 256 == 0);\r
18         __asm\r
19         {\r
20                 mov esi, source;    \r
21                 mov edi, dest;   \r
22  \r
23                 mov ebx, num; \r
24                 shr ebx, 7;      \r
25   \r
26                 cpy:\r
27                         prefetchnta [esi+80h];\r
28  \r
29                         movdqa xmm0, [esi+00h];\r
30                         movdqa xmm1, [esi+10h];\r
31                         movdqa xmm2, [esi+20h];\r
32                         movdqa xmm3, [esi+30h];\r
33           \r
34                         movntdq [edi+00h], xmm0;\r
35                         movntdq [edi+10h], xmm1;\r
36                         movntdq [edi+20h], xmm2;\r
37                         movntdq [edi+30h], xmm3;\r
38                         \r
39                         prefetchnta [esi+0C0h];\r
40                         \r
41                         movdqa xmm4, [esi+40h];\r
42                         movdqa xmm5, [esi+50h];\r
43                         movdqa xmm6, [esi+60h];\r
44                         movdqa xmm7, [esi+70h];\r
45           \r
46                         movntdq [edi+40h], xmm4;\r
47                         movntdq [edi+50h], xmm5;\r
48                         movntdq [edi+60h], xmm6;\r
49                         movntdq [edi+70h], xmm7;\r
50  \r
51                         lea edi, [edi+80h];\r
52                         lea esi, [esi+80h];\r
53                         dec ebx;\r
54  \r
55                 jnz cpy;\r
56         }\r
57         return dest;\r
58 }\r
59 \r
60 void* copy(void* dest, const void* source, size_t num)\r
61 {       \r
62         tbb::parallel_for(tbb::blocked_range<size_t>(0, num/128), [&](const tbb::blocked_range<size_t>& r)\r
63         {\r
64                 memcpy_SSE2(reinterpret_cast<char*>(dest) + r.begin()*128, reinterpret_cast<const char*>(source) + r.begin()*128, r.size()*128);\r
65         }, tbb::affinity_partitioner());\r
66 \r
67         return dest;\r
68 }\r
69 \r
70 void* clear(void* dest, size_t size)\r
71 {\r
72         tbb::parallel_for(tbb::blocked_range<size_t>(0, size/16), [&](const tbb::blocked_range<size_t>& r)\r
73         {\r
74                 __m128i val = _mm_setzero_si128();\r
75                 __m128i* ptr = reinterpret_cast<__m128i*>(dest)+r.begin();\r
76                 __m128i* end = ptr + r.size();\r
77 \r
78                 while(ptr != end)       \r
79                         _mm_stream_si128(ptr++, val);\r
80         });     \r
81         return dest;\r
82 }\r
83 \r
84 }}