/*
 * copyright (c) 2010 Sveriges Television AB <info@casparcg.com>
 *
 * This file is part of CasparCG.
 *
 * CasparCG is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CasparCG is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CasparCG. If not, see <http://www.gnu.org/licenses/>.
 */
\r
21 #include "..\..\stdafx.h"
\r
23 #include "CopyField.hpp"
\r
27 #include <functional>
\r
29 #include "../Types.hpp"
\r
31 #include "tbb/parallel_for.h"
\r
32 #include "tbb/blocked_range.h"
\r
34 using namespace std::tr1::placeholders;
\r
40 void DoCopyFieldParallel(size_t index, const std::tr1::function<void(void*, const void*, size_t)>& func, void* dest, const void* source, size_t width4)
\r
42 size_t offset = index*width4;
\r
43 size_t size = width4;
\r
44 func(reinterpret_cast<s8*>(dest) + offset, reinterpret_cast<const s8*>(source) + offset, size);
\r
47 void CopyFieldParallel(const std::tr1::function<void(void*, const void*, size_t)>& func, void* dest, const void* source, size_t fieldIndex, size_t width, size_t height)
\r
49 tbb::parallel_for(fieldIndex, height, static_cast<size_t>(2), std::tr1::bind(&DoCopyFieldParallel, _1, func, dest, source, width*4)); // copy for each row
\r
52 CopyFieldFun GetCopyFieldFun(SIMD simd)
\r
55 // return CopyFieldParallel_SSE2;
\r
57 return CopyFieldParallel_REF; // REF is faster
\r
// TODO: (R.N) optimize => prefetch and cacheline loop unroll
//
// Copies every second row of a frame, starting with row fieldIndex, from pSrc
// to pDest using aligned SSE2 loads and non-temporal (streaming) stores.
//
// pDest      - destination frame; consecutive rows are width*4 bytes apart
// pSrc       - source frame with the same layout as pDest
// fieldIndex - first row to copy; rows fieldIndex, fieldIndex+2, ... are copied
// width      - row length in 4-byte units (presumably pixels - confirm with caller)
// height     - number of rows in the frame
//
// Only width/4 whole 16-byte blocks are copied per row, so up to three
// trailing 4-byte units of a row are left untouched when width is not a
// multiple of 4. _mm_load_si128 and _mm_stream_si128 require 16-byte aligned
// addresses, so both buffers must be 16-byte aligned.
void CopyField_SSE2(unsigned char* pDest, unsigned char* pSrc, size_t fieldIndex, size_t width, size_t height)
{
	const size_t rowBytes = width*4;
	const size_t blocksPerRow = width/4; // one __m128i holds four 4-byte units

	// size_t throughout: avoids the signed/unsigned comparison against height
	// and keeps the byte offset from being narrowed to int.
	for(size_t rowIndex = fieldIndex; rowIndex < height; rowIndex += 2)
	{
		const size_t offset = rowBytes*rowIndex;

		__m128i* pD = reinterpret_cast<__m128i*>(&(pDest[offset]));
		const __m128i* pS = reinterpret_cast<const __m128i*>(&(pSrc[offset]));

		// Index by i so that every 16-byte block of the row is copied, not
		// just the first one.
		for(size_t i = 0; i < blocksPerRow; ++i)
		{
			const __m128i val = _mm_load_si128(pS + i);
			_mm_stream_si128(pD + i, val);
		}
	}
	_mm_mfence();	//ensure last WC buffers get flushed to memory
}
\r
84 void CopyFieldParallel_SSE2(unsigned char* dest, unsigned char* source, size_t fieldIndex, size_t width, size_t height)
\r
86 CopyFieldParallel(&Copy_SSE2, dest, source, fieldIndex, width, height);
\r
89 void CopyField_REF(unsigned char* pDest, unsigned char* pSrc, size_t fieldIndex, size_t width, size_t height)
\r
91 for(int rowIndex=fieldIndex; rowIndex < height; rowIndex+=2)
\r
93 int offset = width*4*rowIndex;
\r
94 __movsd(reinterpret_cast<unsigned long*>(&(pDest[offset])), reinterpret_cast<const unsigned long*>(&(pSrc[offset])), width);
\r
98 void CopyFieldParallel_REF(unsigned char* dest, unsigned char* source, size_t fieldIndex, size_t width, size_t height)
\r
100 CopyFieldParallel(&Copy_REF, dest, source, fieldIndex, width, height);
\r