/*
* copyright (c) 2010 Sveriges Television AB <info@casparcg.com>
*
*  This file is part of CasparCG.
*
*    CasparCG is free software: you can redistribute it and/or modify
*    it under the terms of the GNU General Public License as published by
*    the Free Software Foundation, either version 3 of the License, or
*    (at your option) any later version.
*
*    CasparCG is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*    GNU General Public License for more details.
*
*    You should have received a copy of the GNU General Public License
*    along with CasparCG.  If not, see <http://www.gnu.org/licenses/>.
*
*/
\r
26 #include <tbb/parallel_for.h>
\r
namespace internal {

/// Copies bytes from `source` to `dest`, permuting the bytes inside every
/// 16-byte lane with the SSSE3 byte shuffle (`_mm_shuffle_epi8`).
///
/// The shuffle control is built as `_mm_set_epi32(m1, m2, m3, m4)`: `m4`
/// holds the selectors for output bytes 0-3 of each lane, `m3` for bytes 4-7,
/// `m2` for bytes 8-11 and `m1` for bytes 12-15.
///
/// Requirements imposed by the intrinsics used:
///  - `source` and `dest` must both be 16-byte aligned (`_mm_load_si128`,
///    `_mm_stream_si128`).
///  - `dest` is written with non-temporal stores (`_mm_stream_si128`).
///
/// Only whole 16-byte lanes are processed; the trailing `count % 16` bytes are
/// neither read nor written.
///
/// @param dest    destination buffer, at least `count` bytes.
/// @param source  source buffer, at least `count` bytes.
/// @param count   number of bytes to process.
/// @param m1..m4  shuffle control words, see above.
/// @return `dest`.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) && !defined(__SSSE3__)
// Allow GCC/Clang to compile the SSSE3 shuffle in this function even when the
// translation unit itself is not built with -mssse3.
__attribute__((target("ssse3")))
#endif
static void* fast_memshfl(void* dest, const void* source, size_t count, int m1, int m2, int m3, int m4)
{
	__m128i*       dest128   = reinterpret_cast<__m128i*>(dest);
	const __m128i* source128 = reinterpret_cast<const __m128i*>(source);

	const __m128i mask128 = _mm_set_epi32(m1, m2, m3, m4);

	const size_t lanes = count / 16; // whole 128-bit lanes

	// Main loop: four lanes (64 bytes) per iteration, all loads issued before
	// the stores.
	for(size_t n = 0; n < lanes/4; ++n)
	{
		const __m128i xmm0 = _mm_load_si128(source128++);
		const __m128i xmm1 = _mm_load_si128(source128++);
		const __m128i xmm2 = _mm_load_si128(source128++);
		const __m128i xmm3 = _mm_load_si128(source128++);

		_mm_stream_si128(dest128++, _mm_shuffle_epi8(xmm0, mask128));
		_mm_stream_si128(dest128++, _mm_shuffle_epi8(xmm1, mask128));
		_mm_stream_si128(dest128++, _mm_shuffle_epi8(xmm2, mask128));
		_mm_stream_si128(dest128++, _mm_shuffle_epi8(xmm3, mask128));
	}

	// Tail: the up-to-three lanes that do not fill a 64-byte group. Without
	// this they would be silently left unshuffled.
	for(size_t n = 0; n < lanes%4; ++n)
		_mm_stream_si128(dest128++, _mm_shuffle_epi8(_mm_load_si128(source128++), mask128));

	return dest;
}

}
\r
59 static void* fast_memshfl(void* dest, const void* source, size_t count, int m1, int m2, int m3, int m4)
\r
61 tbb::affinity_partitioner ap;
\r
62 tbb::parallel_for(tbb::blocked_range<size_t>(0, count/128), [&](const tbb::blocked_range<size_t>& r)
\r
64 internal::fast_memshfl(reinterpret_cast<char*>(dest) + r.begin()*128, reinterpret_cast<const char*>(source) + r.begin()*128, r.size()*128, m1, m2, m3, m4);
\r