]> git.sesse.net Git - casparcg/blob - server/utils/image/Lerp.cpp
8d4d4fb2166e44ffa80c4636142d720eb8717fd5
[casparcg] / server / utils / image / Lerp.cpp
1 /*\r
2 * copyright (c) 2010 Sveriges Television AB <info@casparcg.com>\r
3 *\r
4 *  This file is part of CasparCG.\r
5 *\r
6 *    CasparCG is free software: you can redistribute it and/or modify\r
7 *    it under the terms of the GNU General Public License as published by\r
8 *    the Free Software Foundation, either version 3 of the License, or\r
9 *    (at your option) any later version.\r
10 *\r
11 *    CasparCG is distributed in the hope that it will be useful,\r
12 *    but WITHOUT ANY WARRANTY; without even the implied warranty of\r
13 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
14 *    GNU General Public License for more details.\r
15 \r
16 *    You should have received a copy of the GNU General Public License\r
17 *    along with CasparCG.  If not, see <http://www.gnu.org/licenses/>.\r
18 *\r
19 */\r
20  \r
21 #include "..\..\stdafx.h"\r
22 \r
23 #include "Lerp.hpp"\r
24 \r
25 #include <intrin.h>\r
26 #include <functional>\r
27 \r
28 #include "../Types.hpp"\r
29 \r
30 #include "tbb/parallel_for.h"\r
31 #include "tbb/blocked_range.h"\r
32 \r
33 using namespace std::tr1::placeholders;\r
34 \r
35 namespace caspar{\r
36 namespace utils{\r
37 namespace image{\r
38 \r
39 static const size_t STRIDE = sizeof(__m128i)*4;\r
40 \r
41 void DoLerpParallel(const tbb::blocked_range<size_t>& r, const std::tr1::function<void(void*, const void*, const void*, float, size_t)>& func, void* dest, const void* source1, const void* source2, float alpha)\r
42 {\r
43         size_t offset = r.begin()*STRIDE;\r
44         size_t size = r.size()*STRIDE;\r
45         func(reinterpret_cast<s8*>(dest) + offset, reinterpret_cast<const s8*>(source1) + offset, reinterpret_cast<const s8*>(source2) + offset, alpha, size);\r
46 }\r
47 \r
48 void LerpParallel(const std::tr1::function<void(void*, const void*, const void*, float, size_t)>& func, void* dest, const void* source1, const void* source2, float alpha, size_t size)\r
49 {\r
50         tbb::parallel_for(tbb::blocked_range<size_t>(0, size/STRIDE), std::tr1::bind(&DoLerpParallel, _1, func, dest, source1, source2, alpha));        \r
51 }\r
52 \r
53 LerpFun GetLerpFun(SIMD simd)\r
54 {\r
55         if(simd >= SSE2)\r
56                 return LerpParallel_SSE2;\r
57         else\r
58                 return LerpParallel_REF;\r
59 }\r
60 \r
61 void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)\r
62 {\r
63         static const u32 PSD = 64;\r
64         \r
65         static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);\r
66         static const __m128i round = _mm_set1_epi16(128);\r
67 \r
68         assert(source1 != NULL && source2 != NULL && dest != NULL);\r
69         assert(size % STRIDE == 0);\r
70         assert(alpha >= 0.0 && alpha <= 1.0);\r
71 \r
72         const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);\r
73         const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);\r
74         __m128i* dest128 = reinterpret_cast<__m128i*>(dest);\r
75 \r
76         __m128i s = _mm_setzero_si128();\r
77         __m128i d = _mm_setzero_si128();\r
78         const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));\r
79         \r
80         __m128i drb, dga, srb, sga;\r
81         \r
82         for (size_t k = 0, length = size/STRIDE; k < length; ++k)\r
83         {               \r
84                 _mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);   \r
85                 _mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);\r
86                 // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/\r
87 \r
88                 for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)\r
89                 {\r
90                         // r = d + (s-d)*alpha/256\r
91                         s = _mm_load_si128(source128_1);        // AABBGGRR\r
92                         d = _mm_load_si128(source128_2);        // AABBGGRR\r
93 \r
94                         srb = _mm_and_si128(lomask, s);         // 00BB00RR             // unpack\r
95                         sga = _mm_srli_epi16(s, 8);                     // AA00GG00             // unpack\r
96                         \r
97                         drb = _mm_and_si128(lomask, d);         // 00BB00RR             // unpack\r
98                         dga = _mm_srli_epi16(d, 8);                     // AA00GG00             // unpack\r
99 \r
100                         srb = _mm_sub_epi16(srb, drb);          // BBBBRRRR             // sub\r
101                         srb = _mm_mullo_epi16(srb, a);          // BBBBRRRR             // mul\r
102                         srb = _mm_add_epi16(srb, round);\r
103                         \r
104                         sga = _mm_sub_epi16(sga, dga);          // AAAAGGGG             // sub\r
105                         sga = _mm_mullo_epi16(sga, a);          // AAAAGGGG             // mul\r
106                         sga = _mm_add_epi16(sga, round);\r
107 \r
108                         srb = _mm_srli_epi16(srb, 8);           // 00BB00RR             // prepack and div\r
109                         sga = _mm_andnot_si128(lomask, sga);// AA00GG00         // prepack and div\r
110 \r
111                         srb = _mm_or_si128(srb, sga);           // AABBGGRR             // pack\r
112 \r
113                         srb = _mm_add_epi8(srb, d);                     // AABBGGRR             // add          there is no overflow(R.N)\r
114 \r
115                         _mm_stream_si128(dest128, srb);\r
116                 }\r
117         }\r
118         _mm_mfence();   //ensure last WC buffers get flushed to memory\r
119 }\r
120 \r
121 void LerpParallel_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)\r
122 {\r
123         LerpParallel(&Lerp_SSE2, dest, source1, source2, alpha, size);\r
124 }\r
125 \r
126 void Lerp_REF(void* dest, const void* source1, const void* source2, float alpha, size_t size)\r
127 {\r
128         assert(source1 != NULL && source2 != NULL && dest != NULL);\r
129         assert(size % 4 == 0);\r
130         assert(alpha >= 0.0f && alpha <= 1.0f);\r
131 \r
132         const u8* source8_1 = reinterpret_cast<const u8*>(source1);\r
133         const u8* source8_2 = reinterpret_cast<const u8*>(source2);\r
134         u8* dest8 = reinterpret_cast<u8*>(dest);\r
135 \r
136         u8 a = static_cast<u8>(alpha*256.0f);\r
137         for(size_t n = 0; n < size; n+=4)\r
138         {\r
139                 // s\r
140                 u32 sr = source8_1[n+0];\r
141                 u32 sg = source8_1[n+1];\r
142                 u32 sb = source8_1[n+2];\r
143                 u32 sa = source8_1[n+3];\r
144 \r
145                 // d\r
146                 u32 dr = source8_2[n+0];\r
147                 u32 dg = source8_2[n+1];\r
148                 u32 db = source8_2[n+2];\r
149                 u32 da = source8_2[n+3];\r
150 \r
151                 //dest8[n+0] = dr + ((sr-dr)*a)/256;\r
152                 //dest8[n+1] = dg + ((sg-dg)*a)/256;\r
153                 //dest8[n+2] = db + ((sb-db)*a)/256;\r
154                 //dest8[n+3] = da + ((sa-da)*a)/256;\r
155 \r
156                 dest8[n+0] = dr + int(float((sr-dr)*a)/256.0f+0.5f);\r
157                 dest8[n+1] = dg + int(float((sg-dg)*a)/256.0f+0.5f);\r
158                 dest8[n+2] = db + int(float((sb-db)*a)/256.0f+0.5f);\r
159                 dest8[n+3] = da + int(float((sa-da)*a)/256.0f+0.5f);\r
160 \r
161         }\r
162 }\r
163 \r
164 void LerpParallel_REF(void* dest, const void* source1, const void* source2, float alpha, size_t size)\r
165 {\r
166         LerpParallel(&Lerp_REF, dest, source1, source2, alpha, size);\r
167 }\r
168 \r
// Author: Niclas P Andersson
// Legacy SSE2 blend kept for reference; not wired into GetLerpFun (Lerp_SSE2
// above is the live path).
// NOTE(review): casts alpha to u8, so alpha == 1.0 wraps 256 -> 0 -- confirm
// before ever reusing this path.
void Lerp_OLD(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
	__m128i ps1, ps2, pd1, pd2, m0, m1, pr1, pr2;

	// NOTE(review): naming is misleading -- 'pSource' reads source1,
	// 'pDest' reads source2 (the blend base), and 'pResult' is the output.
	__m128i* pSource = (__m128i*)source1;
	__m128i* pDest = (__m128i*)source2;
	__m128i* pResult = (__m128i*)dest;

	// Fixed-point alpha in 8.8 (see wrap caveat in the header comment).
	__m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));
	m0 = _mm_setzero_si128();

	// size is in bytes; i advances 4 per iteration with count = size/4, so the
	// loop runs size/16 times -- one __m128i (4 pixels) per iteration.
	int count = size/4;
	for ( int i = 0; i < count; i+=4 )
	{
		ps1 = _mm_load_si128(pSource);		//load 4 pixels from source
		pd1 = _mm_load_si128(pDest);		//load 4 pixels from dest
		ps2 = _mm_unpackhi_epi64(ps1, m0);	//move the 2 high pixels from source
		pd2 = _mm_unpackhi_epi64(pd1, m0);	//move the 2 high pixels from dest

		//compute the 2 "lower" pixels
		ps1 = _mm_unpacklo_epi8(ps1, m0);	//unpack the 2 low pixels from source (bytes -> words)
		pd1 = _mm_unpacklo_epi8(pd1, m0);	//unpack the 2 low pixels from dest (bytes -> words)

		pr1 = _mm_sub_epi16(ps1, pd1);		//x = src - dest
		pr1 = _mm_mullo_epi16(pr1, a);		//y = x*alpha
		// NOTE(review): logical shift of a (possibly negative) word; relies on
		// modular wraparound cancelling in the add below.
		pr1 = _mm_srli_epi16(pr1, 8);       //w = y/256	    
		pr1 = _mm_add_epi8(pr1, pd1);		//z = w + dest

		//same thing for the 2 "high" pixels
		ps2 = _mm_unpacklo_epi8(ps2, m0);
		pd2 = _mm_unpacklo_epi8(pd2, m0);

		pr2 = _mm_sub_epi16(ps2, pd2);		//x = src - dest
		pr2 = _mm_mullo_epi16(pr2, a);		//y = x*alpha
		pr2 = _mm_srli_epi16(pr2, 8);       //w = y/256	    
		pr2 = _mm_add_epi8(pr2, pd2);		//z = w + dest

		m1 = _mm_packus_epi16(pr1, pr2);	//pack all 4 together again (words -> bytes)
		_mm_store_si128(pResult, m1);

		pSource++;
		pDest++;
		pResult++;
	}
}
215 \r
216 } // namespace image\r
217 } // namespace utils\r
218 } // namespace caspar\r
219 \r
220 \r