/*
* Copyright (c) 2010 Sveriges Television AB <info@casparcg.com>
*
*  This file is part of CasparCG.
*
*    CasparCG is free software: you can redistribute it and/or modify
*    it under the terms of the GNU General Public License as published by
*    the Free Software Foundation, either version 3 of the License, or
*    (at your option) any later version.
*
*    CasparCG is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*    GNU General Public License for more details.
*
*    You should have received a copy of the GNU General Public License
*    along with CasparCG.  If not, see <http://www.gnu.org/licenses/>.
*
*/
#pragma once

#include "../utility/assert.h"
#include "../memory/safe_ptr.h"

#include <assert.h>
#include <stdint.h>  // uintptr_t
#include <string.h>  // memcpy

#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>
namespace caspar {

namespace detail {
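
// Copies count bytes from source to dest using 16-byte aligned SSE2 loads
// (movdqa) and non-temporal stores (movntdq), 128 bytes per loop iteration.
// Preconditions: both pointers are 16-byte aligned and count is a multiple
// of 128; the sub-128-byte remainder is handled by the callers below.
// 32-bit MSVC only, as it relies on __asm and 32-bit registers.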
static void* fast_memcpy_aligned_impl(void* dest, const void* source, size_t count)
{
        CASPAR_ASSERT(dest != nullptr);
        CASPAR_ASSERT(source != nullptr);
        CASPAR_ASSERT(reinterpret_cast<uintptr_t>(dest) % 16 == 0);
        CASPAR_ASSERT(reinterpret_cast<uintptr_t>(source) % 16 == 0);

        if(count == 0)
                return dest;

        __asm
        {
                mov esi, source;
                mov edi, dest;
                mov ebx, count;
                shr ebx, 7;

                cpy:
                        movdqa xmm0, [esi+00h];
                        movdqa xmm1, [esi+10h];
                        movdqa xmm2, [esi+20h];
                        movdqa xmm3, [esi+30h];

                        movntdq [edi+00h], xmm0;
                        movntdq [edi+10h], xmm1;
                        movntdq [edi+20h], xmm2;
                        movntdq [edi+30h], xmm3;

                        movdqa xmm4, [esi+40h];
                        movdqa xmm5, [esi+50h];
                        movdqa xmm6, [esi+60h];
                        movdqa xmm7, [esi+70h];

                        movntdq [edi+40h], xmm4;
                        movntdq [edi+50h], xmm5;
                        movntdq [edi+60h], xmm6;
                        movntdq [edi+70h], xmm7;

                        lea edi, [edi+80h];
                        lea esi, [esi+80h];

                        dec ebx;
                jnz cpy;
        }
        return dest;
}
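
// Same 128-bytes-per-iteration loop, but with unaligned loads and stores
// (movdqu) so any source/dest alignment is accepted. Regular cached stores
// are used here because movntdq requires a 16-byte aligned destination.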
static void* fast_memcpy_unaligned_impl(void* dest, const void* source, size_t count)
{
        CASPAR_ASSERT(dest != nullptr);
        CASPAR_ASSERT(source != nullptr);

        if(count == 0)
                return dest;

        __asm
        {
                mov esi, source;
                mov edi, dest;
                mov ebx, count;
                shr ebx, 7;

                cpy:
                        movdqu xmm0, [esi+00h];
                        movdqu xmm1, [esi+10h];
                        movdqu xmm2, [esi+20h];
                        movdqu xmm3, [esi+30h];

                        movdqu [edi+00h], xmm0;
                        movdqu [edi+10h], xmm1;
                        movdqu [edi+20h], xmm2;
                        movdqu [edi+30h], xmm3;

                        movdqu xmm4, [esi+40h];
                        movdqu xmm5, [esi+50h];
                        movdqu xmm6, [esi+60h];
                        movdqu xmm7, [esi+70h];

                        movdqu [edi+40h], xmm4;
                        movdqu [edi+50h], xmm5;
                        movdqu [edi+60h], xmm6;
                        movdqu [edi+70h], xmm7;

                        lea edi, [edi+80h];
                        lea esi, [esi+80h];

                        dec ebx;
                jnz cpy;
        }
        return dest;
}
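
// Single-threaded copy for small blocks: bulk-copies the largest multiple
// of 128 bytes with the SSE2 loop above, then lets the runtime memcpy
// finish the 0-127 byte tail.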
static void* fast_memcpy_small_aligned(char* dest8, const char* source8, size_t count)
{
        size_t rest = count & 127;
        count &= ~127;

        fast_memcpy_aligned_impl(dest8, source8, count);
        memcpy(dest8 + count, source8 + count, rest);

        return dest8;
}
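
// Unaligned counterpart of fast_memcpy_small_aligned.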
static void* fast_memcpy_small_unaligned(char* dest8, const char* source8, size_t count)
{
        size_t rest = count & 127;
        count &= ~127;

        fast_memcpy_unaligned_impl(dest8, source8, count);
        memcpy(dest8 + count, source8 + count, rest);

        return dest8;
}
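
// Parallel copy: splits the largest 2048-byte multiple of the block into
// 128-byte chunks and distributes them across the TBB worker threads; the
// remaining tail is copied on the calling thread.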
static void* fast_memcpy_aligned(void* dest, const void* source, size_t count)
{
        auto dest8   = reinterpret_cast<char*>(dest);
        auto source8 = reinterpret_cast<const char*>(source);

        size_t rest = count & 2047;
        count &= ~2047;

        tbb::parallel_for(tbb::blocked_range<size_t>(0, count/128), [&](const tbb::blocked_range<size_t>& r)
        {
                fast_memcpy_aligned_impl(dest8 + r.begin()*128, source8 + r.begin()*128, r.size()*128);
        });

        fast_memcpy_small_aligned(dest8 + count, source8 + count, rest);

        return dest;
}
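
// Unaligned counterpart of fast_memcpy_aligned.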
static void* fast_memcpy_unaligned(void* dest, const void* source, size_t count)
{
        auto dest8   = reinterpret_cast<char*>(dest);
        auto source8 = reinterpret_cast<const char*>(source);

        size_t rest = count & 2047;
        count &= ~2047;

        tbb::parallel_for(tbb::blocked_range<size_t>(0, count/128), [&](const tbb::blocked_range<size_t>& r)
        {
                fast_memcpy_unaligned_impl(dest8 + r.begin()*128, source8 + r.begin()*128, r.size()*128);
        });

        fast_memcpy_small_unaligned(dest8 + count, source8 + count, rest);

        return dest;
}

} // namespace detail
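
// Drop-in memcpy replacement: dispatches to the aligned variant when both
// pointers are 16-byte aligned, and to the unaligned variant otherwise.
// Like memcpy, returns dest.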
template<typename T>
T* fast_memcpy(T* dest, const void* source, size_t count)
{
        if((reinterpret_cast<uintptr_t>(source) & 15) || (reinterpret_cast<uintptr_t>(dest) & 15))
                return reinterpret_cast<T*>(detail::fast_memcpy_unaligned(dest, source, count));
        else
                return reinterpret_cast<T*>(detail::fast_memcpy_aligned(dest, source, count));
}
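
// A minimal usage sketch (buffer sizes are hypothetical; any POD buffers
// work; note that default-allocated vectors are not guaranteed to be
// 16-byte aligned, in which case the unaligned path is taken):
//
//   std::vector<uint8_t> frame(1920 * 1080 * 4); // e.g. one BGRA frame
//   std::vector<uint8_t> copy(frame.size());
//   fast_memcpy(copy.data(), frame.data(), frame.size());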

} // namespace caspar