2 * copyright (c) 2010 Sveriges Television AB <info@casparcg.com>
\r
4 * This file is part of CasparCG.
\r
6 * CasparCG is free software: you can redistribute it and/or modify
\r
7 * it under the terms of the GNU General Public License as published by
\r
8 * the Free Software Foundation, either version 3 of the License, or
\r
9 * (at your option) any later version.
\r
11 * CasparCG is distributed in the hope that it will be useful,
\r
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
14 * GNU General Public License for more details.
\r
16 * You should have received a copy of the GNU General Public License
\r
17 * along with CasparCG. If not, see <http://www.gnu.org/licenses/>.
\r
21 #include "../stdafx.h"
\r
26 #include <functional>
\r
28 #include "../utility/types.h"
\r
30 #include "tbb/parallel_for.h"
\r
31 #include "tbb/blocked_range.h"
\r
33 using namespace std::tr1::placeholders;
\r
39 static const size_t STRIDE = sizeof(__m128i)*4;
\r
41 void DocopyParallel(const tbb::blocked_range<size_t>& r, const std::tr1::function<void(void*, const void*, size_t)>& func, void* dest, const void* source)
\r
43 size_t offset = r.begin()*STRIDE;
\r
44 size_t size = r.size()*STRIDE;
\r
45 func(reinterpret_cast<s8*>(dest) + offset, reinterpret_cast<const s8*>(source) + offset, size);
\r
48 void copyParallel(const std::tr1::function<void(void*, const void*, size_t)>& func, void* dest, const void* source, size_t size)
\r
50 tbb::parallel_for(tbb::blocked_range<size_t>(0, size/STRIDE), std::bind(&DocopyParallel, std::placeholders::_1, func, dest, source));
\r
// Returns the copy routine to use for the given SIMD level: either
// copyParallel_SSE2 or copyParallel_REF.
// NOTE(review): the condition that selects between the two returns is not
// visible in this extract (the lines between the signature and the returns are
// missing). Presumably copyParallel_SSE2 is returned when `simd` indicates
// SSE2 support and copyParallel_REF otherwise - confirm against the full file.
53 copy_fun get_copy_fun(SIMD simd)
\r
56 return copyParallel_SSE2;
\r
58 return copyParallel_REF;
\r
61 // TODO: (R.N) optimize => prefetch and cacheline loop unroll
\r
// Copies from `source` to `dest` in 16-byte units: each iteration loads one
// __m128i with _mm_load_si128 and writes it with _mm_stream_si128.
// Per the Intel intrinsics documentation both intrinsics require 16-byte
// aligned addresses, so `dest` and `source` are assumed to be 16-byte aligned.
// NOTE(review): the statements that advance pD/pS and the loop braces are not
// visible in this extract - confirm against the full file.
62 void copy_SSE2(void* dest, const void* source, size_t size)
\r
64 __m128i val = _mm_setzero_si128();
\r
65 __m128i* pD = reinterpret_cast<__m128i*>(dest);
\r
66 const __m128i* pS = reinterpret_cast<const __m128i*>(source);
\r
// Number of 16-byte units; a remainder of size % 16 bytes is not covered by
// the visible loop.
// NOTE(review): size_t is narrowed to int here - confirm sizes stay in range.
68 int times = size / 16;
\r
69 for(int i=0; i < times; ++i)
\r
71 val = _mm_load_si128(pS);
\r
// Non-temporal store (see the Intel documentation for _mm_stream_si128).
72 _mm_stream_si128(pD, val);
\r
77 _mm_mfence(); //ensure last WC buffers get flushed to memory
\r
80 void copyParallel_SSE2(void* dest, const void* source, size_t size)
\r
82 copyParallel(©_SSE2, dest, source, size);
\r
85 void copy_REF(void* dest, const void* source, size_t size)
\r
87 __movsd(reinterpret_cast<unsigned long*>(dest), reinterpret_cast<const unsigned long*>(source), size/4);
\r
90 void copyParallel_REF(void* dest, const void* source, size_t size)
\r
92 copyParallel(©_REF, dest, source, size);
\r