/*
* copyright (c) 2010 Sveriges Television AB <info@casparcg.com>
*
* This file is part of CasparCG.
*
* CasparCG is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* CasparCG is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with CasparCG. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "..\..\stdafx.h"

#include <intrin.h>	// SSE2 intrinsics (_mm_load_si128, _mm_prefetch, ...) on MSVC
#include <cassert>
#include <functional>

#include "../Types.hpp"

#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
using namespace std::tr1::placeholders;

namespace caspar {
namespace utils {
namespace image {
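// Granularity of the parallel split: four 16-byte SSE registers, i.e. 64 bytes
// (16 four-byte pixels) per work unit.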
static const size_t STRIDE = sizeof(__m128i)*4;
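// Runs the supplied lerp kernel over one TBB sub-range; the range is expressed
// in STRIDE-sized blocks, so it is converted back to a byte offset and length here.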
void DoLerpParallel(const tbb::blocked_range<size_t>& r, const std::tr1::function<void(void*, const void*, const void*, float, size_t)>& func, void* dest, const void* source1, const void* source2, float alpha)
{
	size_t offset = r.begin()*STRIDE;
	size_t size = r.size()*STRIDE;
	func(reinterpret_cast<s8*>(dest) + offset, reinterpret_cast<const s8*>(source1) + offset, reinterpret_cast<const s8*>(source2) + offset, alpha, size);
}
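// Splits the buffers into size/STRIDE blocks and runs the kernel over them in
// parallel via TBB. The caller is expected to pass a size that is a multiple of
// STRIDE; any remainder smaller than STRIDE is left untouched.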
void LerpParallel(const std::tr1::function<void(void*, const void*, const void*, float, size_t)>& func, void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
	tbb::parallel_for(tbb::blocked_range<size_t>(0, size/STRIDE), std::tr1::bind(&DoLerpParallel, _1, func, dest, source1, source2, alpha));
}
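// Forward declarations so GetLerpFun can refer to the wrappers defined further
// down; in the original build these are presumably declared in the module's header.
void LerpParallel_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size);
void LerpParallel_REF(void* dest, const void* source1, const void* source2, float alpha, size_t size);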
LerpFun GetLerpFun(SIMD simd)
{
	// pick the SSE2 path when the requested SIMD level allows it,
	// otherwise fall back to the reference implementation
	if(simd >= SSE2)
		return LerpParallel_SSE2;

	return LerpParallel_REF;
}
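// Usage sketch (hypothetical caller, not part of this file); the buffers must be
// 16-byte aligned and size a multiple of STRIDE:
//
//   LerpFun lerp = GetLerpFun(SSE2);
//   lerp(dest, source1, source2, 0.5f, width*height*4);

// SSE2 kernel: blends r = d + ((s-d)*a)/256 in 8.8 fixed point. Each pixel is
// split into two 16-bit-lane halves (red/blue via the low-byte mask, green/alpha
// via an 8-bit shift), multiplied by the quantized alpha, rounded with +128,
// recombined and written with non-temporal stores.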
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
	static const u32 PSD = 64;

	static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
	static const __m128i round = _mm_set1_epi16(128);

	assert(source1 != NULL && source2 != NULL && dest != NULL);
	assert(size % STRIDE == 0);
	assert(alpha >= 0.0 && alpha <= 1.0);
	const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
	const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
	__m128i* dest128 = reinterpret_cast<__m128i*>(dest);

	__m128i s = _mm_setzero_si128();
	__m128i d = _mm_setzero_si128();
	// alpha quantized to 0..255 fixed point (note: the u8 cast cannot hold 256, so alpha == 1.0f does not map to full weight)
	const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));

	__m128i drb, dga, srb, sga;
	for (size_t k = 0, length = size/STRIDE; k < length; ++k)
	{
		// prefetch PSD registers (1 KiB) ahead into non-temporal cache
		_mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);
		_mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);

		// TODO: assembly optimization: use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
		for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
		{
			// r = d + (s-d)*alpha/256
			s = _mm_load_si128(source128_1);	// AABBGGRR
			d = _mm_load_si128(source128_2);	// AABBGGRR
			srb = _mm_and_si128(lomask, s);		// 00BB00RR		// unpack
			sga = _mm_srli_epi16(s, 8);		// AA00GG00		// unpack

			drb = _mm_and_si128(lomask, d);		// 00BB00RR		// unpack
			dga = _mm_srli_epi16(d, 8);		// AA00GG00		// unpack

			srb = _mm_sub_epi16(srb, drb);		// BBBBRRRR		// sub
			srb = _mm_mullo_epi16(srb, a);		// BBBBRRRR		// mul
			srb = _mm_add_epi16(srb, round);

			sga = _mm_sub_epi16(sga, dga);		// AAAAGGGG		// sub
			sga = _mm_mullo_epi16(sga, a);		// AAAAGGGG		// mul
			sga = _mm_add_epi16(sga, round);

			srb = _mm_srli_epi16(srb, 8);		// 00BB00RR		// prepack and div
			sga = _mm_andnot_si128(lomask, sga);	// AA00GG00		// prepack and div

			srb = _mm_or_si128(srb, sga);		// AABBGGRR		// pack

			srb = _mm_add_epi8(srb, d);		// AABBGGRR		// add, there is no overflow (R.N)

			_mm_stream_si128(dest128, srb);		// non-temporal store, bypasses the cache
		}
	}
	_mm_mfence();	// ensure the last write-combining buffers get flushed to memory
}
void LerpParallel_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
	LerpParallel(&Lerp_SSE2, dest, source1, source2, alpha, size);
}
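// Scalar reference kernel: the same blend as Lerp_SSE2, computed one four-byte
// pixel per iteration with the rounding done in floating point.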
void Lerp_REF(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
	assert(source1 != NULL && source2 != NULL && dest != NULL);
	assert(size % 4 == 0);
	assert(alpha >= 0.0f && alpha <= 1.0f);

	const u8* source8_1 = reinterpret_cast<const u8*>(source1);
	const u8* source8_2 = reinterpret_cast<const u8*>(source2);
	u8* dest8 = reinterpret_cast<u8*>(dest);

	u8 a = static_cast<u8>(alpha*256.0f);	// alpha quantized as in Lerp_SSE2 (truncated here, no +0.5f rounding)
	for(size_t n = 0; n < size; n+=4)
	{
		u32 sr = source8_1[n+0];
		u32 sg = source8_1[n+1];
		u32 sb = source8_1[n+2];
		u32 sa = source8_1[n+3];

		u32 dr = source8_2[n+0];
		u32 dg = source8_2[n+1];
		u32 db = source8_2[n+2];
		u32 da = source8_2[n+3];

		//dest8[n+0] = dr + ((sr-dr)*a)/256;
		//dest8[n+1] = dg + ((sg-dg)*a)/256;
		//dest8[n+2] = db + ((sb-db)*a)/256;
		//dest8[n+3] = da + ((sa-da)*a)/256;

		dest8[n+0] = dr + int(float((sr-dr)*a)/256.0f+0.5f);
		dest8[n+1] = dg + int(float((sg-dg)*a)/256.0f+0.5f);
		dest8[n+2] = db + int(float((sb-db)*a)/256.0f+0.5f);
		dest8[n+3] = da + int(float((sa-da)*a)/256.0f+0.5f);
	}
}
void LerpParallel_REF(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
	LerpParallel(&Lerp_REF, dest, source1, source2, alpha, size);
}
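// Earlier SSE variant, kept for reference and not selected by GetLerpFun: it
// widens pairs of pixels to 16-bit words with unpacklo/unpackhi, blends two
// pixels at a time, then packs back to bytes with saturation.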
// Author: Niclas P Andersson
void Lerp_OLD(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
	__m128i ps1, ps2, pd1, pd2, m0, m1, pr1, pr2;

	__m128i* pSource = (__m128i*)source1;
	__m128i* pDest = (__m128i*)source2;
	__m128i* pResult = (__m128i*)dest;

	__m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));
	m0 = _mm_setzero_si128();

	int count = size/4;
	for ( int i = 0; i < count; i+=4 )
	{
		ps1 = _mm_load_si128(pSource);		//load 4 pixels from source
		pd1 = _mm_load_si128(pDest);		//load 4 pixels from dest
		ps2 = _mm_unpackhi_epi64(ps1, m0);	//move the 2 high pixels from source
		pd2 = _mm_unpackhi_epi64(pd1, m0);	//move the 2 high pixels from dest

		//compute the 2 "lower" pixels
		ps1 = _mm_unpacklo_epi8(ps1, m0);	//unpack the 2 low pixels from source (bytes -> words)
		pd1 = _mm_unpacklo_epi8(pd1, m0);	//unpack the 2 low pixels from dest (bytes -> words)

		pr1 = _mm_sub_epi16(ps1, pd1);		//x = src - dest
		pr1 = _mm_mullo_epi16(pr1, a);		//y = x*alpha
		pr1 = _mm_srli_epi16(pr1, 8);		//w = y/256
		pr1 = _mm_add_epi8(pr1, pd1);		//z = w + dest

		//same thing for the 2 "high" pixels
		ps2 = _mm_unpacklo_epi8(ps2, m0);
		pd2 = _mm_unpacklo_epi8(pd2, m0);

		pr2 = _mm_sub_epi16(ps2, pd2);		//x = src - dest
		pr2 = _mm_mullo_epi16(pr2, a);		//y = x*alpha
		pr2 = _mm_srli_epi16(pr2, 8);		//w = y/256
		pr2 = _mm_add_epi8(pr2, pd2);		//z = w + dest

		m1 = _mm_packus_epi16(pr1, pr2);	//pack all 4 together again (words -> bytes)
		_mm_store_si128(pResult, m1);

		//advance to the next 4 pixels
		++pSource;
		++pDest;
		++pResult;
	}
}
} // namespace image
} // namespace utils
} // namespace caspar