]> git.sesse.net Git - casparcg/blob - common/utility/memory.cpp
2.0.0.2:
[casparcg] / common / utility / memory.cpp
1 #include "../stdafx.h"\r
2 \r
3 #include "memory.h"\r
4 \r
5 #include <intrin.h>\r
6 \r
7 #include <tbb/parallel_for.h>\r
8 #include <tbb/blocked_range.h>\r
9 \r
10 namespace caspar { namespace common {\r
11         \r
/// Copies `num` bytes from `source` to `dest` using SSE2 streaming stores.
///
/// The bulk of the data is moved in 128-byte blocks: eight aligned 16-byte
/// loads followed by eight non-temporal 16-byte stores per block, with
/// prefetch hints issued for the source bytes that follow the current block.
/// Bytes left over after the last whole block are copied with memcpy, so
/// any value of `num` (including 0 and values below 128) is handled.
///
/// @param dest    Destination buffer. Must be 16-byte aligned whenever
///                num >= 128, because the block loop uses aligned stores.
/// @param source  Source buffer. Must be 16-byte aligned whenever
///                num >= 128, because the block loop uses aligned loads.
/// @param num     Number of bytes to copy.
/// @return `dest`.
void* memcpy_SSE2(void* dest, const void* source, size_t num)
{
	const size_t block_size = 128;

	const char* src = static_cast<const char*>(source);
	char*       dst = static_cast<char*>(dest);

	// The block count is tested before the first iteration, so a request
	// shorter than one block skips the loop instead of entering it with a
	// zero counter that would wrap around on the first decrement.
	for(size_t blocks_left = num / block_size; blocks_left != 0; --blocks_left)
	{
		const __m128i* in  = reinterpret_cast<const __m128i*>(src);
		__m128i*       out = reinterpret_cast<__m128i*>(dst);

		// Hint the first 64 bytes after this block; prefetch is only a hint.
		_mm_prefetch(src + 0x80, _MM_HINT_NTA);

		const __m128i x0 = _mm_load_si128(in + 0);
		const __m128i x1 = _mm_load_si128(in + 1);
		const __m128i x2 = _mm_load_si128(in + 2);
		const __m128i x3 = _mm_load_si128(in + 3);

		_mm_stream_si128(out + 0, x0);
		_mm_stream_si128(out + 1, x1);
		_mm_stream_si128(out + 2, x2);
		_mm_stream_si128(out + 3, x3);

		// Hint the following 64 bytes as well.
		_mm_prefetch(src + 0xC0, _MM_HINT_NTA);

		const __m128i x4 = _mm_load_si128(in + 4);
		const __m128i x5 = _mm_load_si128(in + 5);
		const __m128i x6 = _mm_load_si128(in + 6);
		const __m128i x7 = _mm_load_si128(in + 7);

		_mm_stream_si128(out + 4, x4);
		_mm_stream_si128(out + 5, x5);
		_mm_stream_si128(out + 6, x6);
		_mm_stream_si128(out + 7, x7);

		src += block_size;
		dst += block_size;
	}

	// Non-temporal stores are weakly ordered; the fence orders them ahead of
	// any stores this thread issues after returning.
	_mm_sfence();

	// Copy whatever did not fill a whole 128-byte block.
	const size_t tail = num % block_size;
	if(tail != 0)
		memcpy(dst, src, tail);

	return dest;
}
55 \r
56 void* aligned_memcpy(void* dest, const void* source, size_t num)\r
57 {       \r
58         if(num < 128)\r
59                 return memcpy(dest, source, num);\r
60 \r
61         tbb::parallel_for(tbb::blocked_range<size_t>(0, num/128), [&](const tbb::blocked_range<size_t>& r)\r
62         {\r
63                 memcpy_SSE2(reinterpret_cast<char*>(dest) + r.begin()*128, reinterpret_cast<const char*>(source) + r.begin()*128, r.size()*128);\r
64         }, tbb::affinity_partitioner());\r
65 \r
66         return dest;\r
67 }\r
68 \r
69 void* clear(void* dest, size_t size)\r
70 {\r
71         tbb::parallel_for(tbb::blocked_range<size_t>(0, size/16), [&](const tbb::blocked_range<size_t>& r)\r
72         {\r
73                 __m128i val = _mm_setzero_si128();\r
74                 __m128i* ptr = reinterpret_cast<__m128i*>(dest)+r.begin();\r
75                 __m128i* end = ptr + r.size();\r
76 \r
77                 while(ptr != end)       \r
78                         _mm_stream_si128(ptr++, val);\r
79         });     \r
80         return dest;\r
81 }\r
82 \r
83 }}