* Copyright (c) 2010 Sveriges Television AB <info@casparcg.com>
*
* This file is part of CasparCG.
*
* CasparCG is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* CasparCG is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with CasparCG. If not, see <http://www.gnu.org/licenses/>.
*/
#include "../../stdafx.h"

#include "Premultiply.hpp"

#include <functional>

#include "../Types.hpp"

#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
using namespace std::tr1::placeholders;

namespace caspar {
namespace utils {
namespace image {

static const size_t STRIDE = sizeof(__m128i)*4;
void DoPreMultiplyParallel(const tbb::blocked_range<size_t>& r, const std::tr1::function<void(void*, const void*, size_t)>& func, void* dest, const void* source)
{
    size_t offset = r.begin()*STRIDE;
    size_t size = r.size()*STRIDE;

    func(reinterpret_cast<s8*>(dest) + offset, reinterpret_cast<const s8*>(source) + offset, size);
}
void PreMultiplyParallel(const std::tr1::function<void(void*, const void*, size_t)>& func, void* dest, const void* source, size_t size)
{
    tbb::parallel_for(tbb::blocked_range<size_t>(0, size/STRIDE), std::tr1::bind(&DoPreMultiplyParallel, _1, func, dest, source));
}
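
// The range handed to TBB is expressed in STRIDE-sized chunks (64 bytes = 16 BGRA pixels);
// DoPreMultiplyParallel converts each sub-range back into byte offsets. For example
// (illustrative arithmetic, not from the original source): a 1920x1080 8-bit BGRA frame
// is 1920*1080*4 = 8294400 bytes, i.e. 129600 chunks distributed across worker threads.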

PremultiplyFun GetPremultiplyFun(SIMD simd)
{
    if(simd >= SSE2)
        return PremultiplyParallel_SSE2;

    return PremultiplyParallel_REF;
}
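
// Illustrative usage (a sketch, not part of the original file; buffer names are
// hypothetical, and it assumes the SIMD enum exposes an SSE2 value): the caller
// selects an implementation once and applies it to buffers whose byte size is a
// multiple of STRIDE.
//
//   PremultiplyFun premultiply = GetPremultiplyFun(SSE2);
//   premultiply(dest_pixels, source_pixels, width*height*4);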

// This function performs precise calculations.
void Premultiply_SSE2(void* dest, const void* source, size_t size)
{
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
    static const __m128i amask = _mm_set1_epi32(0xFF000000);
    static const __m128i round = _mm_set1_epi16(128);
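
    // For a 16-bit product x = channel*alpha (max 255*255 = 65025), the loop below
    // computes the exactly rounded division by 255 as
    //     x/255 ~= (x + 128 + ((x + 128) >> 8)) >> 8
    // which is exact for all such x (no 16-bit overflow: x + 128 + ((x + 128) >> 8) <= 65407).
    // The 'round' constant supplies the +128 bias per 16-bit lane.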

    assert(source != NULL && dest != NULL);
    assert(size % STRIDE == 0);

    const __m128i* source128 = reinterpret_cast<const __m128i*>(source);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i s, rb, ag, a, t;

    for(size_t k = 0, length = size/STRIDE; k != length; ++k)
    {
        // TODO: put prefetch between calculations? (R.N)
        _mm_prefetch(reinterpret_cast<const s8*>(source128 + PSD), _MM_HINT_NTA);
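        // Note: PSD is in units of __m128i, so this prefetches data sizeof(__m128i)*PSD
        // = 1024 bytes (16 outer iterations) ahead of the current read position.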

        // The prefetch fetches an entire cache line (512 bits). Work on the whole cache
        // line before the next prefetch: 512/128 = 4, so unroll four times = 16 pixels.
        // TODO: assembly optimization: use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
        for(int n = 0; n < 4; ++n, ++dest128, ++source128)
        {
            s = _mm_load_si128(source128);        // AABBGGRR

            // set alpha to lo16 from source
            rb = _mm_srli_epi32(s, 24);           // 000000AA
            a = _mm_slli_epi32(rb, 16);           // 00AA0000
            a = _mm_or_si128(rb, a);              // 00AA00AA

            rb = _mm_and_si128(lomask, s);        // 00BB00RR
            rb = _mm_mullo_epi16(rb, a);          // BBBBRRRR
            rb = _mm_add_epi16(rb, round);        // BBBBRRRR
            t = _mm_srli_epi16(rb, 8);            // 00BB00RR
            t = _mm_add_epi16(t, rb);
            rb = _mm_srli_epi16(t, 8);

            ag = _mm_srli_epi16(s, 8);            // 00AA00GG
            ag = _mm_mullo_epi16(ag, a);          // AAAAGGGG
            ag = _mm_add_epi16(ag, round);
            t = _mm_srli_epi16(ag, 8);
            t = _mm_add_epi16(t, ag);
            ag = _mm_andnot_si128(lomask, t);     // AA00GG00

            a = _mm_or_si128(rb, ag);             // XXBBGGRR
            a = _mm_andnot_si128(amask, a);       // 00BBGGRR

            s = _mm_and_si128(amask, s);          // AA000000

            s = _mm_or_si128(a, s);               // AABBGGRR pack

            // TODO: store an entire cache line at the same time (write-combining => burst)?
            // Are there enough registers? 32-bit mode (special compile for 64-bit?) (R.N)
            _mm_stream_si128(dest128, s);
        }
    }

    _mm_mfence(); // ensure the last WC buffers get flushed to memory
}

void PremultiplyParallel_SSE2(void* dest, const void* source1, size_t size)
{
    PreMultiplyParallel(&Premultiply_SSE2, dest, source1, size);
}

void Premultiply_FastSSE2(void* dest, const void* source, size_t size)
{
    static const size_t stride = sizeof(__m128i)*4;
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
    static const __m128i amask = _mm_set1_epi32(0xFF000000);

    assert(source != NULL && dest != NULL);
    assert(size % stride == 0);

    const __m128i* source128 = reinterpret_cast<const __m128i*>(source);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i s = _mm_setzero_si128();
    __m128i rb = _mm_setzero_si128();
    __m128i ag = _mm_setzero_si128();
    __m128i a = _mm_setzero_si128();

    for(size_t k = 0, length = size/stride; k != length; ++k)
    {
        // TODO: put prefetch between calculations? (R.N)
        _mm_prefetch(reinterpret_cast<const s8*>(source128 + PSD), _MM_HINT_NTA);

        // work on the entire cache line before the next prefetch

        // TODO: assembly optimization: use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
        for(int n = 0; n < 4; ++n, ++dest128, ++source128)
        {
            s = _mm_load_si128(source128);        // AABBGGRR

            // set alpha to lo16 from source
            rb = _mm_srli_epi32(s, 24);           // 000000AA
            a = _mm_slli_epi32(rb, 16);           // 00AA0000
            a = _mm_or_si128(rb, a);              // 00AA00AA

            // fix alpha: a = a > 127 ? a+1 : a
            rb = _mm_srli_epi16(a, 7);
            a = _mm_add_epi16(a, rb);
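            // Rationale: biasing alpha so that 255 maps to 256 lets the cheaper ">> 8"
            // (divide by 256) below stand in for the exact divide by 255; full opacity
            // then leaves channels unchanged, and the result differs from the exact
            // (channel*alpha)/255 by at most about one.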

            rb = _mm_and_si128(lomask, s);        // 00BB00RR unpack
            rb = _mm_mullo_epi16(rb, a);          // BBBBRRRR mul (D[A]*S)
            rb = _mm_srli_epi16(rb, 8);           // 00BB00RR prepack and div [(D[A]*S)]/255

            ag = _mm_srli_epi16(s, 8);            // 00AA00GG unpack
            ag = _mm_mullo_epi16(ag, a);          // XXXXGGGG mul (D[A]*S)
            ag = _mm_andnot_si128(lomask, ag);    // XX00GG00 prepack and div [(D[A]*S)]/255

            a = _mm_or_si128(rb, ag);             // XXBBGGRR
            a = _mm_andnot_si128(amask, a);       // 00BBGGRR

            s = _mm_and_si128(amask, s);          // AA000000

            s = _mm_or_si128(a, s);               // AABBGGRR pack

            // TODO: store an entire cache line at the same time (write-combining => burst)?
            // Are there enough registers? 32-bit mode (special compile for 64-bit?) (R.N)
            _mm_store_si128(dest128, s);
        }
    }
}

void Premultiply_REF(void* dest, const void* source, size_t size)
{
    assert(source != NULL && dest != NULL);
    assert(size % 4 == 0);

    const u8* source8 = reinterpret_cast<const u8*>(source);
    u8* dest8 = reinterpret_cast<u8*>(dest);

    for(size_t n = 0; n < size; n+=4)
    {
        u32 r = source8[n+0];
        u32 g = source8[n+1];
        u32 b = source8[n+2];
        u32 a = source8[n+3];

        dest8[n+0] = (r*a)/255;
        dest8[n+1] = (g*a)/255;
        dest8[n+2] = (b*a)/255;
        dest8[n+3] = a;
    }
}
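
// Reference behaviour example (illustrative): a half-transparent pure-red source pixel
// (r,g,b,a) = (255, 0, 0, 128) premultiplies to (128, 0, 0, 128), since 255*128/255 = 128
// and the alpha channel is passed through unchanged.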

void PremultiplyParallel_REF(void* dest, const void* source1, size_t size)
{
    PreMultiplyParallel(&Premultiply_REF, dest, source1, size);
}

// Commented-out reference for the reverse operation (premultiplied -> straight alpha):
//void StraightTransform_REF(const void* source, void* dest, size_t size)
//{
//    assert(source != NULL && dest != NULL);
//    assert((size % 4) == 0);
//
//    const u8* source8 = reinterpret_cast<const u8*>(source);
//    u8* dest8 = reinterpret_cast<u8*>(dest);
//
//    for(int n = 0; n < size; n+=4)
//    {
//        u32 r = source8[n+0];
//        u32 g = source8[n+1];
//        u32 b = source8[n+2];
//        u32 a = source8[n+3];
//
//        dest8[n+0] = (r*255)/a;
//        dest8[n+1] = (g*255)/a;
//        dest8[n+2] = (b*255)/a;
//    }
//}

} // namespace image
} // namespace utils
} // namespace caspar