/*
 * Copyright (c) 2010 Sveriges Television AB <info@casparcg.com>
 *
 * This file is part of CasparCG.
 *
 * CasparCG is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CasparCG is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CasparCG. If not, see <http://www.gnu.org/licenses/>.
 */
#include "../utility/assert.h"
#include "../memory/safe_ptr.h"

#include <cstdint>
#include <cstring>

#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
33 static void* fast_memcpy_aligned_impl(void* dest, const void* source, size_t count)
\r
35 CASPAR_ASSERT(dest != nullptr);
\r
36 CASPAR_ASSERT(source != nullptr);
\r
37 CASPAR_ASSERT(reinterpret_cast<int>(dest) % 16 == 0);
\r
38 CASPAR_ASSERT(reinterpret_cast<int>(source) % 16 == 0);
\r
51 movdqa xmm0, [esi+00h];
\r
52 movdqa xmm1, [esi+10h];
\r
53 movdqa xmm2, [esi+20h];
\r
54 movdqa xmm3, [esi+30h];
\r
56 movntdq [edi+00h], xmm0;
\r
57 movntdq [edi+10h], xmm1;
\r
58 movntdq [edi+20h], xmm2;
\r
59 movntdq [edi+30h], xmm3;
\r
61 movdqa xmm4, [esi+40h];
\r
62 movdqa xmm5, [esi+50h];
\r
63 movdqa xmm6, [esi+60h];
\r
64 movdqa xmm7, [esi+70h];
\r
66 movntdq [edi+40h], xmm4;
\r
67 movntdq [edi+50h], xmm5;
\r
68 movntdq [edi+60h], xmm6;
\r
69 movntdq [edi+70h], xmm7;
\r
71 lea edi, [edi+80h];
\r
72 lea esi, [esi+80h];
\r
80 static void* fast_memcpy_unaligned_impl(void* dest, const void* source, size_t count)
\r
82 CASPAR_ASSERT(dest != nullptr);
\r
83 CASPAR_ASSERT(source != nullptr);
\r
96 movdqu xmm0, [esi+00h];
\r
97 movdqu xmm1, [esi+10h];
\r
98 movdqu xmm2, [esi+20h];
\r
99 movdqu xmm3, [esi+30h];
\r
101 movdqu [edi+00h], xmm0;
\r
102 movdqu [edi+10h], xmm1;
\r
103 movdqu [edi+20h], xmm2;
\r
104 movdqu [edi+30h], xmm3;
\r
106 movdqu xmm4, [esi+40h];
\r
107 movdqu xmm5, [esi+50h];
\r
108 movdqu xmm6, [esi+60h];
\r
109 movdqu xmm7, [esi+70h];
\r
111 movdqu [edi+40h], xmm4;
\r
112 movdqu [edi+50h], xmm5;
\r
113 movdqu [edi+60h], xmm6;
\r
114 movdqu [edi+70h], xmm7;
\r
116 lea edi, [edi+80h];
\r
117 lea esi, [esi+80h];
\r
125 static void* fast_memcpy_small_aligned(char* dest8, const char* source8, size_t count)
\r
127 size_t rest = count & 127;
\r
130 fast_memcpy_aligned_impl(dest8, source8, count);
\r
132 return memcpy(dest8+count, source8+count, rest);
\r
135 static void* fast_memcpy_small_unaligned(char* dest8, const char* source8, size_t count)
\r
137 size_t rest = count & 127;
\r
140 fast_memcpy_unaligned_impl(dest8, source8, count);
\r
142 return memcpy(dest8+count, source8+count, rest);
\r
145 static void* fast_memcpy_aligned(void* dest, const void* source, size_t count)
\r
147 auto dest8 = reinterpret_cast<char*>(dest);
\r
148 auto source8 = reinterpret_cast<const char*>(source);
\r
150 size_t rest = count & 2047;
\r
153 tbb::parallel_for(tbb::blocked_range<size_t>(0, count/128), [&](const tbb::blocked_range<size_t>& r)
\r
155 fast_memcpy_aligned_impl(reinterpret_cast<char*>(dest) + r.begin()*128, reinterpret_cast<const char*>(source) + r.begin()*128, r.size()*128);
\r
158 return fast_memcpy_small_aligned(dest8+count, source8+count, rest);
\r
161 static void* fast_memcpy_unaligned(void* dest, const void* source, size_t count)
\r
163 auto dest8 = reinterpret_cast<char*>(dest);
\r
164 auto source8 = reinterpret_cast<const char*>(source);
\r
166 size_t rest = count & 2047;
\r
169 tbb::parallel_for(tbb::blocked_range<size_t>(0, count/128), [&](const tbb::blocked_range<size_t>& r)
\r
171 fast_memcpy_unaligned_impl(reinterpret_cast<char*>(dest) + r.begin()*128, reinterpret_cast<const char*>(source) + r.begin()*128, r.size()*128);
\r
174 return fast_memcpy_small_unaligned(dest8+count, source8+count, rest);
\r
179 template<typename T>
\r
180 T* fast_memcpy(T* dest, const void* source, size_t count)
\r
182 if((reinterpret_cast<int>(source) & 15) || (reinterpret_cast<int>(dest) & 15))
\r
183 return reinterpret_cast<T*>(detail::fast_memcpy_unaligned(dest, source, count));
\r
185 return reinterpret_cast<T*>(detail::fast_memcpy_aligned(dest, source, count));
\r