3 #include <functional>
\r
10 #include <tbb/scalable_allocator.h>
\r
11 #include <tbb/parallel_for.h>
\r
// Number of timed calls that test() makes to the copy routine being measured.
14 const unsigned int TEST_COUNT = 1000;
\r
// Bytes copied per call: 1920*1080*4 = 8,294,400, which is a multiple of 128.
// NOTE(review): presumably one 1920x1080 frame at 4 bytes per pixel - confirm.
15 const unsigned int TEST_SIZE = 1920*1080*4;
\r
// Benchmarks one memcpy-compatible routine.
//
// Allocates two TEST_SIZE-byte buffers with 16-byte alignment, calls func(dest, src, size)
// TEST_COUNT times while accumulating timer.time(), checks every copy with memcmp, and finally
// prints the average throughput followed by the text " gb/s" (no trailing newline).
//
// func: routine with the signature of memcpy (destination, source, byte count).
//
// NOTE(review): the declaration of timer and any per-iteration reset are not visible in this
// excerpt; the throughput formula below presumably expects timer.time() in seconds - confirm.
17 void test(const std::function<void*(void*, const void*, size_t)>& func)
\r
19 size_t size = TEST_SIZE;
\r
// NOTE(review): neither allocation is checked for a null result before the buffers are used.
// NOTE(review): src is not written before the first call to func, so the first copy moves
// whatever bytes the allocator returned.
20 void* src = scalable_aligned_malloc(size, 16);
\r
21 void* dest = scalable_aligned_malloc(size, 16);
\r
24 double total_time = 0.0;
\r
// NOTE(review): i is a signed int while TEST_COUNT is unsigned, so this is a mixed-sign comparison.
25 for(int i = 0 ;i < TEST_COUNT; ++i)
\r
28 func(dest, src, size);
\r
29 total_time += timer.time();
\r
// Verify the copy; a mismatch only prints a marker and the loop keeps running.
30 if(memcmp(dest, src, size) != 0)
\r
31 std::cout << "ERROR";
\r
// Refill the source with a new byte value (memset uses the low byte of rand()), so the next
// iteration copies data that differs from what dest currently holds.
32 memset(src, rand(), size); // flush
\r
35 scalable_aligned_free(dest);
\r
36 scalable_aligned_free(src);
\r
// Average time per call.
38 double unit_time = total_time/static_cast<double>(TEST_COUNT);
\r
// Bytes per call divided by time per call, scaled by 1e9 (decimal giga).
// NOTE(review): a zero total_time would make this a division by zero.
39 std::cout << 1.0/unit_time*static_cast<double>(TEST_SIZE)/1000000000.0 << " gb/s";
\r
/// Copies `size` bytes from `source` to `dest` using SSE2 aligned loads and
/// non-temporal (streaming) stores.
///
/// The bulk of the data is moved in 128-byte blocks (eight XMM registers per
/// iteration). Whatever is left after the last whole block is copied with
/// single 16-byte vectors and finally byte by byte, so any `size` is valid.
///
/// @param dest    Destination buffer. Must be 16-byte aligned
///                (_mm_stream_si128 faults on unaligned addresses).
/// @param source  Source buffer. Must be 16-byte aligned (_mm_load_si128) and
///                must not overlap `dest`.
/// @param size    Number of bytes to copy; may be zero.
/// @return `dest`, mirroring the contract of memcpy.
void* memcpy_SSE2_1(void* dest, const void* source, size_t size)
{
    __m128i* dest_128 = reinterpret_cast<__m128i*>(dest);
    const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);

    // Only whole 128-byte blocks go through the unrolled loop. The former bound
    // (n < size/16 with n += 8) executed one extra full block whenever size was
    // not a multiple of 128, reading and writing past the end of both buffers.
    for(size_t blocks = size/128; blocks != 0; --blocks)
    {
        // Hint the two 64-byte halves of the next block (128 and 192 bytes
        // ahead). Prefetch hints never fault, so running past the end of the
        // source on the last iteration is harmless.
        _mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);

        __m128i xmm0 = _mm_load_si128(source_128++);
        __m128i xmm1 = _mm_load_si128(source_128++);
        __m128i xmm2 = _mm_load_si128(source_128++);
        __m128i xmm3 = _mm_load_si128(source_128++);

        _mm_stream_si128(dest_128++, xmm0);
        _mm_stream_si128(dest_128++, xmm1);
        _mm_stream_si128(dest_128++, xmm2);
        _mm_stream_si128(dest_128++, xmm3);

        __m128i xmm4 = _mm_load_si128(source_128++);
        __m128i xmm5 = _mm_load_si128(source_128++);
        __m128i xmm6 = _mm_load_si128(source_128++);
        __m128i xmm7 = _mm_load_si128(source_128++);

        _mm_stream_si128(dest_128++, xmm4);
        _mm_stream_si128(dest_128++, xmm5);
        _mm_stream_si128(dest_128++, xmm6);
        _mm_stream_si128(dest_128++, xmm7);
    }

    // Remaining whole 16-byte vectors (at most seven).
    for(size_t vectors = (size%128)/16; vectors != 0; --vectors)
        _mm_stream_si128(dest_128++, _mm_load_si128(source_128++));

    // Remaining bytes (at most fifteen).
    unsigned char* dest_8 = reinterpret_cast<unsigned char*>(dest_128);
    const unsigned char* source_8 = reinterpret_cast<const unsigned char*>(source_128);
    for(size_t bytes = size%16; bytes != 0; --bytes)
        *dest_8++ = *source_8++;

    // Streaming stores are weakly ordered; fence so the copied data is globally
    // visible by the time the caller (possibly on behalf of another thread)
    // continues.
    _mm_sfence();

    return dest;
}
\r
// Copy routine with the signature of memcpy, written as x86 inline assembly using SSE2.
// The visible body moves one 128-byte block from [esi] to [edi] through xmm0-xmm7.
//
// NOTE(review): the lines that load esi/edi, set up a block counter, advance the pointers and
// close the loop are not visible in this excerpt; esi is presumably the source pointer and edi
// the destination pointer - confirm.
// movdqa and movntdq fault on addresses that are not 16-byte aligned, so both buffers must be
// 16-byte aligned.
75 void* memcpy_SSE2_2(void* dest, const void* source, size_t num)
\r
// Non-temporal prefetch hints for the 128 bytes that follow the current block, one hint every
// 32 bytes (offsets 0x80, 0xA0, 0xC0, 0xE0).
86 prefetchnta [esi+80h];
\r
87 prefetchnta [esi+0A0h];
\r
88 prefetchnta [esi+0C0h];
\r
89 prefetchnta [esi+0E0h];
\r
// Aligned loads of bytes 0x00-0x3F of the block into xmm0-xmm3.
91 movdqa xmm0, [esi+00h];
\r
92 movdqa xmm1, [esi+10h];
\r
93 movdqa xmm2, [esi+20h];
\r
94 movdqa xmm3, [esi+30h];
\r
// Non-temporal stores of those 64 bytes to the same offsets from edi.
96 movntdq [edi+00h], xmm0;
\r
97 movntdq [edi+10h], xmm1;
\r
98 movntdq [edi+20h], xmm2;
\r
99 movntdq [edi+30h], xmm3;
\r
// Aligned loads of bytes 0x40-0x7F of the block into xmm4-xmm7.
101 movdqa xmm4, [esi+40h];
\r
102 movdqa xmm5, [esi+50h];
\r
103 movdqa xmm6, [esi+60h];
\r
104 movdqa xmm7, [esi+70h];
\r
// Non-temporal stores of the second 64 bytes.
106 movntdq [edi+40h], xmm4;
\r
107 movntdq [edi+50h], xmm5;
\r
108 movntdq [edi+60h], xmm6;
\r
109 movntdq [edi+70h], xmm7;
\r
// Copy routine with the signature of memcpy, written as x86 inline assembly using SSE2.
// The visible body moves one 128-byte block from [esi] to [edi] through xmm0-xmm7 and then
// advances both pointers by 128 bytes. Compared with memcpy_SSE2_2 it issues two prefetch
// hints per block instead of four.
//
// NOTE(review): the lines that load esi/edi, set up a block counter and close the loop are not
// visible in this excerpt; esi is presumably the source pointer and edi the destination
// pointer - confirm.
// movdqa and movntdq fault on addresses that are not 16-byte aligned, so both buffers must be
// 16-byte aligned.
120 void* memcpy_SSE2_3(void* dest, const void* source, size_t num)
\r
// Non-temporal prefetch hints for the 128 bytes that follow the current block, one hint every
// 64 bytes (offsets 0x80 and 0xC0).
131 prefetchnta [esi+80h];
\r
132 prefetchnta [esi+0C0h];
\r
// Aligned loads of bytes 0x00-0x3F of the block into xmm0-xmm3.
134 movdqa xmm0, [esi+00h];
\r
135 movdqa xmm1, [esi+10h];
\r
136 movdqa xmm2, [esi+20h];
\r
137 movdqa xmm3, [esi+30h];
\r
// Non-temporal stores of those 64 bytes to the same offsets from edi.
139 movntdq [edi+00h], xmm0;
\r
140 movntdq [edi+10h], xmm1;
\r
141 movntdq [edi+20h], xmm2;
\r
142 movntdq [edi+30h], xmm3;
\r
// Aligned loads of bytes 0x40-0x7F of the block into xmm4-xmm7.
144 movdqa xmm4, [esi+40h];
\r
145 movdqa xmm5, [esi+50h];
\r
146 movdqa xmm6, [esi+60h];
\r
147 movdqa xmm7, [esi+70h];
\r
// Non-temporal stores of the second 64 bytes.
149 movntdq [edi+40h], xmm4;
\r
150 movntdq [edi+50h], xmm5;
\r
151 movntdq [edi+60h], xmm6;
\r
152 movntdq [edi+70h], xmm7;
\r
// Advance destination and source by one 128-byte block. lea performs the addition without
// modifying the flags register.
154 lea edi, [edi+80h];
\r
155 lea esi, [esi+80h];
\r
162 void* memcpy_SSE2_3_tbb(void* dest, const void* source, size_t num)
\r
164 tbb::parallel_for(tbb::blocked_range<size_t>(0, num/128), [&](const tbb::blocked_range<size_t>& r)
\r
166 memcpy_SSE2_3(reinterpret_cast<char*>(dest) + r.begin()*128, reinterpret_cast<const char*>(source) + r.begin()*128, r.size()*128);
\r
167 }, tbb::affinity_partitioner());
\r
// Copy routine with the signature of memcpy, written as x86 inline assembly using SSE2.
// It loads the source pointer into esi, the destination pointer into edi and the number of
// 128-byte blocks (byte count shifted right by 7) into ebx, then moves one block per pass
// through xmm0-xmm7 with aligned loads and non-temporal stores.
//
// NOTE(review): the third parameter is named size_t, which hides the standard type name inside
// this function.
// NOTE(review): the loop label and the lines that advance esi/edi and update ebx are not
// visible in this excerpt, so the behavior for byte counts below 128 (ebx == 0) cannot be
// confirmed from here.
// movdqa and movntdq fault on addresses that are not 16-byte aligned, so both buffers must be
// 16-byte aligned.
172 void* X_aligned_memcpy_sse2(void* dest, const void* src, size_t size_t)
\r
176 mov esi, src; //src pointer
\r
177 mov edi, dest; //dest pointer
\r
179 mov ebx, size_t; //ebx is our counter
\r
180 shr ebx, 7; //divide by 128 (8 * 128bit registers)
\r
// Non-temporal prefetch hints for the 128 bytes that follow the current block, one hint every
// 32 bytes.
184 prefetchnta 128[ESI]; //SSE2 prefetch
\r
185 prefetchnta 160[ESI];
\r
186 prefetchnta 192[ESI];
\r
187 prefetchnta 224[ESI];
\r
// All eight aligned loads are issued before any store (memcpy_SSE2_2 and memcpy_SSE2_3
// interleave four loads with four stores instead).
189 movdqa xmm0, 0[ESI]; //move data from src to registers
\r
190 movdqa xmm1, 16[ESI];
\r
191 movdqa xmm2, 32[ESI];
\r
192 movdqa xmm3, 48[ESI];
\r
193 movdqa xmm4, 64[ESI];
\r
194 movdqa xmm5, 80[ESI];
\r
195 movdqa xmm6, 96[ESI];
\r
196 movdqa xmm7, 112[ESI];
\r
// Non-temporal stores of the 128-byte block to the destination.
198 movntdq 0[EDI], xmm0; //move data from registers to dest
\r
199 movntdq 16[EDI], xmm1;
\r
200 movntdq 32[EDI], xmm2;
\r
201 movntdq 48[EDI], xmm3;
\r
202 movntdq 64[EDI], xmm4;
\r
203 movntdq 80[EDI], xmm5;
\r
204 movntdq 96[EDI], xmm6;
\r
205 movntdq 112[EDI], xmm7;
\r
// Jump back to loop_copy while the zero flag is clear; the instruction that sets the flag is
// not visible in this excerpt.
211 jnz loop_copy; //loop please
\r
// Benchmark driver: raises the priority of the current thread, runs test() on each copy
// routine and prints the routine name after each result, then waits for ENTER.
// test() prints the throughput without a newline, so every output line reads as the figure
// followed by the routine name.
//
// NOTE(review): wchar_t* argv[] is not a standard parameter list for main; the wide-character
// entry point of MSVC is wmain. Neither parameter is used in the visible code.
216 int main(int argc, wchar_t* argv[])
\r
// Reduce scheduling noise while measuring.
218 SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
\r
// NOTE(review): the call that produces the figure for this label is not visible in this
// excerpt.
220 std::cout << " memcpy" << std::endl;
\r
221 test(memcpy_SSE2_1);
\r
222 std::cout << " memcpy_SSE2_1" << std::endl;
\r
223 test(memcpy_SSE2_2);
\r
224 std::cout << " memcpy_SSE2_2" << std::endl;
\r
225 test(memcpy_SSE2_3);
\r
226 std::cout << " memcpy_SSE2_3" << std::endl;
\r
227 test(memcpy_SSE2_3_tbb);
\r
228 std::cout << " memcpy_SSE2_3_tbb" << std::endl;
\r
229 test(X_aligned_memcpy_sse2);
\r
230 std::cout << " X_aligned_memcpy_sse2" << std::endl;
\r
// Keep the console window open until the user presses ENTER.
232 std::cout << "Press ENTER to continue... " << std::endl;
\r
233 std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
\r