2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #elif defined ( HAVE_MMX2 )
37 #define PREFETCH "prefetchnta"
38 #define PREFETCHW "prefetcht0"
40 #define PREFETCH "/nop"
41 #define PREFETCHW "/nop"
45 #define SFENCE "sfence"
51 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
52 #elif defined (HAVE_3DNOW)
53 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
57 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
59 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
63 #define YSCALEYUV2YV12X(x) \
64 "xorl %%eax, %%eax \n\t"\
65 "pxor %%mm3, %%mm3 \n\t"\
66 "pxor %%mm4, %%mm4 \n\t"\
67 "movl %0, %%edx \n\t"\
68 ".balign 16 \n\t" /* FIXME Unroll? */\
70 "movl (%1, %%edx, 4), %%esi \n\t"\
71 "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
72 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
73 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
74 "pmulhw %%mm0, %%mm2 \n\t"\
75 "pmulhw %%mm0, %%mm5 \n\t"\
76 "paddw %%mm2, %%mm3 \n\t"\
77 "paddw %%mm5, %%mm4 \n\t"\
78 "addl $1, %%edx \n\t"\
80 "psraw $3, %%mm3 \n\t"\
81 "psraw $3, %%mm4 \n\t"\
82 "packuswb %%mm4, %%mm3 \n\t"\
83 MOVNTQ(%%mm3, (%3, %%eax))\
84 "addl $8, %%eax \n\t"\
85 "cmpl %4, %%eax \n\t"\
86 "pxor %%mm3, %%mm3 \n\t"\
87 "pxor %%mm4, %%mm4 \n\t"\
88 "movl %0, %%edx \n\t"\
91 #define YSCALEYUV2YV121 \
92 "movl %2, %%eax \n\t"\
93 ".balign 16 \n\t" /* FIXME Unroll? */\
95 "movq (%0, %%eax, 2), %%mm0 \n\t"\
96 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
97 "psraw $7, %%mm0 \n\t"\
98 "psraw $7, %%mm1 \n\t"\
99 "packuswb %%mm1, %%mm0 \n\t"\
100 MOVNTQ(%%mm0, (%1, %%eax))\
101 "addl $8, %%eax \n\t"\
105 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
106 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
107 "r" (dest), "m" (dstW),
108 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
109 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
111 #define YSCALEYUV2RGBX \
112 "xorl %%eax, %%eax \n\t"\
115 "movl %1, %%edx \n\t" /* -chrFilterSize */\
116 "movl %3, %%ebx \n\t" /* chrMmxFilter+lumFilterSize */\
117 "movl %7, %%ecx \n\t" /* chrSrc+lumFilterSize */\
118 "pxor %%mm3, %%mm3 \n\t"\
119 "pxor %%mm4, %%mm4 \n\t"\
121 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
122 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
123 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
124 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
125 "pmulhw %%mm0, %%mm2 \n\t"\
126 "pmulhw %%mm0, %%mm5 \n\t"\
127 "paddw %%mm2, %%mm3 \n\t"\
128 "paddw %%mm5, %%mm4 \n\t"\
129 "addl $1, %%edx \n\t"\
132 "movl %0, %%edx \n\t" /* -lumFilterSize */\
133 "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
134 "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
135 "pxor %%mm1, %%mm1 \n\t"\
136 "pxor %%mm7, %%mm7 \n\t"\
138 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
139 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
140 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
141 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
142 "pmulhw %%mm0, %%mm2 \n\t"\
143 "pmulhw %%mm0, %%mm5 \n\t"\
144 "paddw %%mm2, %%mm1 \n\t"\
145 "paddw %%mm5, %%mm7 \n\t"\
146 "addl $1, %%edx \n\t"\
149 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
150 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
151 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
152 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
153 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
154 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
155 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
156 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
157 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
158 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
159 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
160 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
161 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
162 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
163 "paddw %%mm3, %%mm4 \n\t"\
164 "movq %%mm2, %%mm0 \n\t"\
165 "movq %%mm5, %%mm6 \n\t"\
166 "movq %%mm4, %%mm3 \n\t"\
167 "punpcklwd %%mm2, %%mm2 \n\t"\
168 "punpcklwd %%mm5, %%mm5 \n\t"\
169 "punpcklwd %%mm4, %%mm4 \n\t"\
170 "paddw %%mm1, %%mm2 \n\t"\
171 "paddw %%mm1, %%mm5 \n\t"\
172 "paddw %%mm1, %%mm4 \n\t"\
173 "punpckhwd %%mm0, %%mm0 \n\t"\
174 "punpckhwd %%mm6, %%mm6 \n\t"\
175 "punpckhwd %%mm3, %%mm3 \n\t"\
176 "paddw %%mm7, %%mm0 \n\t"\
177 "paddw %%mm7, %%mm6 \n\t"\
178 "paddw %%mm7, %%mm3 \n\t"\
179 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
180 "packuswb %%mm0, %%mm2 \n\t"\
181 "packuswb %%mm6, %%mm5 \n\t"\
182 "packuswb %%mm3, %%mm4 \n\t"\
183 "pxor %%mm7, %%mm7 \n\t"
185 #define FULL_YSCALEYUV2RGB \
186 "pxor %%mm7, %%mm7 \n\t"\
187 "movd %6, %%mm6 \n\t" /*yalpha1*/\
188 "punpcklwd %%mm6, %%mm6 \n\t"\
189 "punpcklwd %%mm6, %%mm6 \n\t"\
190 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
191 "punpcklwd %%mm5, %%mm5 \n\t"\
192 "punpcklwd %%mm5, %%mm5 \n\t"\
193 "xorl %%eax, %%eax \n\t"\
196 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
197 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
198 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
199 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
200 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
201 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
202 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
203 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
204 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
205 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
206 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
207 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
208 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
209 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
210 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
211 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
212 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
213 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
216 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
217 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
218 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
219 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
220 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
221 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
222 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
225 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
226 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
227 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
228 "paddw %%mm1, %%mm3 \n\t" /* B*/\
229 "paddw %%mm1, %%mm0 \n\t" /* R*/\
230 "packuswb %%mm3, %%mm3 \n\t"\
232 "packuswb %%mm0, %%mm0 \n\t"\
233 "paddw %%mm4, %%mm2 \n\t"\
234 "paddw %%mm2, %%mm1 \n\t" /* G*/\
236 "packuswb %%mm1, %%mm1 \n\t"
238 #define YSCALEYUV2RGB \
239 "movd %6, %%mm6 \n\t" /*yalpha1*/\
240 "punpcklwd %%mm6, %%mm6 \n\t"\
241 "punpcklwd %%mm6, %%mm6 \n\t"\
242 "movq %%mm6, "MANGLE(asm_yalpha1)"\n\t"\
243 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
244 "punpcklwd %%mm5, %%mm5 \n\t"\
245 "punpcklwd %%mm5, %%mm5 \n\t"\
246 "movq %%mm5, "MANGLE(asm_uvalpha1)"\n\t"\
247 "xorl %%eax, %%eax \n\t"\
250 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
251 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
252 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
253 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
254 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
255 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
256 "movq "MANGLE(asm_uvalpha1)", %%mm0\n\t"\
257 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
258 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
259 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
260 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
261 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
262 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
263 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
264 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
265 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
266 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
267 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
268 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
269 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
270 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
271 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
272 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
273 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
274 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
275 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
276 "pmulhw "MANGLE(asm_yalpha1)", %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
277 "pmulhw "MANGLE(asm_yalpha1)", %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
278 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
279 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
280 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
281 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
282 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
283 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
284 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
285 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
286 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
287 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
288 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
289 "paddw %%mm3, %%mm4 \n\t"\
290 "movq %%mm2, %%mm0 \n\t"\
291 "movq %%mm5, %%mm6 \n\t"\
292 "movq %%mm4, %%mm3 \n\t"\
293 "punpcklwd %%mm2, %%mm2 \n\t"\
294 "punpcklwd %%mm5, %%mm5 \n\t"\
295 "punpcklwd %%mm4, %%mm4 \n\t"\
296 "paddw %%mm1, %%mm2 \n\t"\
297 "paddw %%mm1, %%mm5 \n\t"\
298 "paddw %%mm1, %%mm4 \n\t"\
299 "punpckhwd %%mm0, %%mm0 \n\t"\
300 "punpckhwd %%mm6, %%mm6 \n\t"\
301 "punpckhwd %%mm3, %%mm3 \n\t"\
302 "paddw %%mm7, %%mm0 \n\t"\
303 "paddw %%mm7, %%mm6 \n\t"\
304 "paddw %%mm7, %%mm3 \n\t"\
305 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
306 "packuswb %%mm0, %%mm2 \n\t"\
307 "packuswb %%mm6, %%mm5 \n\t"\
308 "packuswb %%mm3, %%mm4 \n\t"\
309 "pxor %%mm7, %%mm7 \n\t"
311 #define YSCALEYUV2RGB1 \
312 "xorl %%eax, %%eax \n\t"\
315 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
316 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
317 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
318 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
319 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
320 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
321 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
322 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
323 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
324 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
325 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
326 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
327 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
328 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
329 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
330 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
331 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
332 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
333 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
334 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
335 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
336 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
337 "paddw %%mm3, %%mm4 \n\t"\
338 "movq %%mm2, %%mm0 \n\t"\
339 "movq %%mm5, %%mm6 \n\t"\
340 "movq %%mm4, %%mm3 \n\t"\
341 "punpcklwd %%mm2, %%mm2 \n\t"\
342 "punpcklwd %%mm5, %%mm5 \n\t"\
343 "punpcklwd %%mm4, %%mm4 \n\t"\
344 "paddw %%mm1, %%mm2 \n\t"\
345 "paddw %%mm1, %%mm5 \n\t"\
346 "paddw %%mm1, %%mm4 \n\t"\
347 "punpckhwd %%mm0, %%mm0 \n\t"\
348 "punpckhwd %%mm6, %%mm6 \n\t"\
349 "punpckhwd %%mm3, %%mm3 \n\t"\
350 "paddw %%mm7, %%mm0 \n\t"\
351 "paddw %%mm7, %%mm6 \n\t"\
352 "paddw %%mm7, %%mm3 \n\t"\
353 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
354 "packuswb %%mm0, %%mm2 \n\t"\
355 "packuswb %%mm6, %%mm5 \n\t"\
356 "packuswb %%mm3, %%mm4 \n\t"\
357 "pxor %%mm7, %%mm7 \n\t"
359 // do vertical chrominance interpolation
360 #define YSCALEYUV2RGB1b \
361 "xorl %%eax, %%eax \n\t"\
364 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
365 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
366 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
367 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
368 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
369 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
370 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
371 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
372 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
373 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
374 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
375 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
376 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
377 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
378 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
379 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
380 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
381 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
382 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
383 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
384 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
385 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
386 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
387 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
388 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
389 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
390 "paddw %%mm3, %%mm4 \n\t"\
391 "movq %%mm2, %%mm0 \n\t"\
392 "movq %%mm5, %%mm6 \n\t"\
393 "movq %%mm4, %%mm3 \n\t"\
394 "punpcklwd %%mm2, %%mm2 \n\t"\
395 "punpcklwd %%mm5, %%mm5 \n\t"\
396 "punpcklwd %%mm4, %%mm4 \n\t"\
397 "paddw %%mm1, %%mm2 \n\t"\
398 "paddw %%mm1, %%mm5 \n\t"\
399 "paddw %%mm1, %%mm4 \n\t"\
400 "punpckhwd %%mm0, %%mm0 \n\t"\
401 "punpckhwd %%mm6, %%mm6 \n\t"\
402 "punpckhwd %%mm3, %%mm3 \n\t"\
403 "paddw %%mm7, %%mm0 \n\t"\
404 "paddw %%mm7, %%mm6 \n\t"\
405 "paddw %%mm7, %%mm3 \n\t"\
406 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
407 "packuswb %%mm0, %%mm2 \n\t"\
408 "packuswb %%mm6, %%mm5 \n\t"\
409 "packuswb %%mm3, %%mm4 \n\t"\
410 "pxor %%mm7, %%mm7 \n\t"
413 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
414 "movq %%mm2, %%mm1 \n\t" /* B */\
415 "movq %%mm5, %%mm6 \n\t" /* R */\
416 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
417 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
418 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
419 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
420 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
421 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
422 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
423 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
424 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
425 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
427 MOVNTQ(%%mm0, (%4, %%eax, 4))\
428 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
429 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
430 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
432 "addl $8, %%eax \n\t"\
433 "cmpl %5, %%eax \n\t"\
437 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
438 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
439 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
440 "psrlq $3, %%mm2 \n\t"\
442 "movq %%mm2, %%mm1 \n\t"\
443 "movq %%mm4, %%mm3 \n\t"\
445 "punpcklbw %%mm7, %%mm3 \n\t"\
446 "punpcklbw %%mm5, %%mm2 \n\t"\
447 "punpckhbw %%mm7, %%mm4 \n\t"\
448 "punpckhbw %%mm5, %%mm1 \n\t"\
450 "psllq $3, %%mm3 \n\t"\
451 "psllq $3, %%mm4 \n\t"\
453 "por %%mm3, %%mm2 \n\t"\
454 "por %%mm4, %%mm1 \n\t"\
456 MOVNTQ(%%mm2, (%4, %%eax, 2))\
457 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
459 "addl $8, %%eax \n\t"\
460 "cmpl %5, %%eax \n\t"\
464 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
465 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
466 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
467 "psrlq $3, %%mm2 \n\t"\
468 "psrlq $1, %%mm5 \n\t"\
470 "movq %%mm2, %%mm1 \n\t"\
471 "movq %%mm4, %%mm3 \n\t"\
473 "punpcklbw %%mm7, %%mm3 \n\t"\
474 "punpcklbw %%mm5, %%mm2 \n\t"\
475 "punpckhbw %%mm7, %%mm4 \n\t"\
476 "punpckhbw %%mm5, %%mm1 \n\t"\
478 "psllq $2, %%mm3 \n\t"\
479 "psllq $2, %%mm4 \n\t"\
481 "por %%mm3, %%mm2 \n\t"\
482 "por %%mm4, %%mm1 \n\t"\
484 MOVNTQ(%%mm2, (%4, %%eax, 2))\
485 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
487 "addl $8, %%eax \n\t"\
488 "cmpl %5, %%eax \n\t"\
491 #define WRITEBGR24OLD \
492 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
493 "movq %%mm2, %%mm1 \n\t" /* B */\
494 "movq %%mm5, %%mm6 \n\t" /* R */\
495 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
496 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
497 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
498 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
499 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
500 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
501 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
502 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
503 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
504 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
506 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
507 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
508 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
509 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
510 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
511 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
512 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
513 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
515 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
516 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
517 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
518 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
519 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
520 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
521 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
522 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
523 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
524 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
525 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
526 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
527 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
529 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
530 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
531 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
532 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
533 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
534 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
535 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
536 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
538 MOVNTQ(%%mm0, (%%ebx))\
539 MOVNTQ(%%mm2, 8(%%ebx))\
540 MOVNTQ(%%mm3, 16(%%ebx))\
541 "addl $24, %%ebx \n\t"\
543 "addl $8, %%eax \n\t"\
544 "cmpl %5, %%eax \n\t"\
547 #define WRITEBGR24MMX \
548 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
549 "movq %%mm2, %%mm1 \n\t" /* B */\
550 "movq %%mm5, %%mm6 \n\t" /* R */\
551 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
552 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
553 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
554 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
555 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
556 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
557 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
558 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
559 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
560 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
562 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
563 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
564 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
565 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
567 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
568 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
569 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
570 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
572 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
573 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
574 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
575 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
577 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
578 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
579 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
580 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
581 MOVNTQ(%%mm0, (%%ebx))\
583 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
584 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
585 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
586 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
587 MOVNTQ(%%mm6, 8(%%ebx))\
589 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
590 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
591 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
592 MOVNTQ(%%mm5, 16(%%ebx))\
594 "addl $24, %%ebx \n\t"\
596 "addl $8, %%eax \n\t"\
597 "cmpl %5, %%eax \n\t"\
600 #define WRITEBGR24MMX2 \
601 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
602 "movq "MANGLE(M24A)", %%mm0 \n\t"\
603 "movq "MANGLE(M24C)", %%mm7 \n\t"\
604 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
605 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
606 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
608 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
609 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
610 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
612 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
613 "por %%mm1, %%mm6 \n\t"\
614 "por %%mm3, %%mm6 \n\t"\
615 MOVNTQ(%%mm6, (%%ebx))\
617 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
618 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
619 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
620 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
622 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
623 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
624 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
626 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
627 "por %%mm3, %%mm6 \n\t"\
628 MOVNTQ(%%mm6, 8(%%ebx))\
630 		"pshufw $0xFF, %%mm2, %%mm1	\n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
631 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
632 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
634 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
635 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
636 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
638 "por %%mm1, %%mm3 \n\t"\
639 "por %%mm3, %%mm6 \n\t"\
640 MOVNTQ(%%mm6, 16(%%ebx))\
642 "addl $24, %%ebx \n\t"\
644 "addl $8, %%eax \n\t"\
645 "cmpl %5, %%eax \n\t"\
650 #define WRITEBGR24 WRITEBGR24MMX2
653 #define WRITEBGR24 WRITEBGR24MMX
656 static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
657 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
658 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
659 int16_t * lumMmxFilter, int16_t * chrMmxFilter)
666 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
667 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
668 : "%eax", "%edx", "%esi"
672 YSCALEYUV2YV12X(4096)
673 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
674 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
675 : "%eax", "%edx", "%esi"
681 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
682 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
683 : "%eax", "%edx", "%esi"
686 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
687 chrFilter, chrSrc, chrFilterSize,
688 dest, uDest, vDest, dstW);
692 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
693 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
700 :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
707 :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
715 :: "r" (lumSrc + dstW), "r" (dest + dstW),
720 //FIXME Optimize (just quickly written, not optimized..)
721 //FIXME replace MINMAX with LUTs
723 for(i=0; i<dstW; i++)
725 int val= lumSrc[i]>>7;
727 dest[i]= MIN(MAX(val>>19, 0), 255);
731 for(i=0; i<(dstW>>1); i++)
734 int v=chrSrc[i + 2048]>>7;
736 uDest[i]= MIN(MAX(u>>19, 0), 255);
737 vDest[i]= MIN(MAX(v>>19, 0), 255);
744 * vertical scale YV12 to RGB
746 static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
747 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
748 uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
750 /* if(flags&SWS_FULL_UV_IPOL)
757 if(dstFormat == IMGFMT_BGR32) //FIXME untested
763 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
764 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
765 "r" (dest), "m" (dstW),
766 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
767 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
770 else if(dstFormat == IMGFMT_BGR24) //FIXME untested
774 "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
775 "addl %4, %%ebx \n\t"
778 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
779 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
780 "r" (dest), "m" (dstW),
781 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
782 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
785 else if(dstFormat==IMGFMT_BGR15)
789 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
791 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
792 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
793 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
798 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
799 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
800 "r" (dest), "m" (dstW),
801 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
802 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
805 else if(dstFormat==IMGFMT_BGR16)
809 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
811 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
812 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
813 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
818 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
819 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
820 "r" (dest), "m" (dstW),
821 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
822 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
826 yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
827 chrFilter, chrSrc, chrFilterSize,
828 dest, dstW, dstFormat);
836 * vertical bilinear scale YV12 to RGB
838 static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
839 uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags)
841 int yalpha1=yalpha^4095;
842 int uvalpha1=uvalpha^4095;
844 if(flags&SWS_FULL_CHR_H_INT)
848 if(dstFormat==IMGFMT_BGR32)
854 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
855 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
857 "movq %%mm3, %%mm1 \n\t"
858 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
859 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
861 MOVNTQ(%%mm3, (%4, %%eax, 4))
862 MOVNTQ(%%mm1, 8(%4, %%eax, 4))
864 "addl $4, %%eax \n\t"
865 "cmpl %5, %%eax \n\t"
869 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
870 "m" (yalpha1), "m" (uvalpha1)
874 else if(dstFormat==IMGFMT_BGR24)
881 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
882 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
884 "movq %%mm3, %%mm1 \n\t"
885 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
886 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
888 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
889 "psrlq $8, %%mm3 \n\t" // GR0BGR00
890 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
891 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
892 "por %%mm2, %%mm3 \n\t" // BGRBGR00
893 "movq %%mm1, %%mm2 \n\t"
894 "psllq $48, %%mm1 \n\t" // 000000BG
895 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
897 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
898 "psrld $16, %%mm2 \n\t" // R000R000
899 "psrlq $24, %%mm1 \n\t" // 0BGR0000
900 "por %%mm2, %%mm1 \n\t" // RBGRR000
902 "movl %4, %%ebx \n\t"
903 "addl %%eax, %%ebx \n\t"
907 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
908 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
910 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
911 "psrlq $32, %%mm3 \n\t"
912 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
913 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
915 "addl $4, %%eax \n\t"
916 "cmpl %5, %%eax \n\t"
919 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
920 "m" (yalpha1), "m" (uvalpha1)
924 else if(dstFormat==IMGFMT_BGR15)
930 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
931 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
932 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
934 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
935 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
936 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
938 "psrlw $3, %%mm3 \n\t"
939 "psllw $2, %%mm1 \n\t"
940 "psllw $7, %%mm0 \n\t"
941 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
942 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
944 "por %%mm3, %%mm1 \n\t"
945 "por %%mm1, %%mm0 \n\t"
947 MOVNTQ(%%mm0, (%4, %%eax, 2))
949 "addl $4, %%eax \n\t"
950 "cmpl %5, %%eax \n\t"
953 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
954 "m" (yalpha1), "m" (uvalpha1)
958 else if(dstFormat==IMGFMT_BGR16)
964 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
965 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
966 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
968 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
969 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
970 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
972 "psrlw $3, %%mm3 \n\t"
973 "psllw $3, %%mm1 \n\t"
974 "psllw $8, %%mm0 \n\t"
975 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
976 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
978 "por %%mm3, %%mm1 \n\t"
979 "por %%mm1, %%mm0 \n\t"
981 MOVNTQ(%%mm0, (%4, %%eax, 2))
983 "addl $4, %%eax \n\t"
984 "cmpl %5, %%eax \n\t"
987 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
988 "m" (yalpha1), "m" (uvalpha1)
993 if(dstFormat==IMGFMT_BGR32)
997 // vertical linear interpolation && yuv2rgb in a single step:
998 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
999 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1000 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1001 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1002 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1003 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1007 else if(dstFormat==IMGFMT_BGR24)
1010 for(i=0;i<dstW;i++){
1011 // vertical linear interpolation && yuv2rgb in a single step:
1012 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1013 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1014 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1015 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1016 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1017 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1021 else if(dstFormat==IMGFMT_BGR16)
1024 for(i=0;i<dstW;i++){
1025 // vertical linear interpolation && yuv2rgb in a single step:
1026 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1027 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1028 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1030 ((uint16_t*)dest)[i] =
1031 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1032 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1033 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1036 else if(dstFormat==IMGFMT_BGR15)
1039 for(i=0;i<dstW;i++){
1040 // vertical linear interpolation && yuv2rgb in a single step:
1041 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1042 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1043 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1045 ((uint16_t*)dest)[i] =
1046 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1047 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1048 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1056 if(dstFormat==IMGFMT_BGR32)
1062 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1063 "m" (yalpha1), "m" (uvalpha1)
1067 else if(dstFormat==IMGFMT_BGR24)
1070 "movl %4, %%ebx \n\t"
1074 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1075 "m" (yalpha1), "m" (uvalpha1)
1079 else if(dstFormat==IMGFMT_BGR15)
1083 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1085 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1086 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1087 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1092 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1093 "m" (yalpha1), "m" (uvalpha1)
1097 else if(dstFormat==IMGFMT_BGR16)
1101 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1103 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1104 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1105 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1110 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1111 "m" (yalpha1), "m" (uvalpha1)
1116 if(dstFormat==IMGFMT_BGR32)
1119 for(i=0; i<dstW-1; i+=2){
1120 // vertical linear interpolation && yuv2rgb in a single step:
1121 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1122 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1123 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1124 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1126 int Cb= yuvtab_40cf[U];
1127 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1128 int Cr= yuvtab_3343[V];
1130 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1131 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1132 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1134 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1135 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1136 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1139 else if(dstFormat==IMGFMT_BGR24)
1142 for(i=0; i<dstW-1; i+=2){
1143 // vertical linear interpolation && yuv2rgb in a single step:
1144 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1145 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1146 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1147 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1149 int Cb= yuvtab_40cf[U];
1150 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1151 int Cr= yuvtab_3343[V];
1153 dest[0]=clip_table[((Y1 + Cb) >>13)];
1154 dest[1]=clip_table[((Y1 + Cg) >>13)];
1155 dest[2]=clip_table[((Y1 + Cr) >>13)];
1157 dest[3]=clip_table[((Y2 + Cb) >>13)];
1158 dest[4]=clip_table[((Y2 + Cg) >>13)];
1159 dest[5]=clip_table[((Y2 + Cr) >>13)];
1163 else if(dstFormat==IMGFMT_BGR16)
1167 static int ditherb1=1<<14;
1168 static int ditherg1=1<<13;
1169 static int ditherr1=2<<14;
1170 static int ditherb2=3<<14;
1171 static int ditherg2=3<<13;
1172 static int ditherr2=0<<14;
1174 ditherb1 ^= (1^2)<<14;
1175 ditherg1 ^= (1^2)<<13;
1176 ditherr1 ^= (1^2)<<14;
1177 ditherb2 ^= (3^0)<<14;
1178 ditherg2 ^= (3^0)<<13;
1179 ditherr2 ^= (3^0)<<14;
1181 const int ditherb1=0;
1182 const int ditherg1=0;
1183 const int ditherr1=0;
1184 const int ditherb2=0;
1185 const int ditherg2=0;
1186 const int ditherr2=0;
1188 for(i=0; i<dstW-1; i+=2){
1189 // vertical linear interpolation && yuv2rgb in a single step:
1190 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1191 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1192 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1193 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1195 int Cb= yuvtab_40cf[U];
1196 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1197 int Cr= yuvtab_3343[V];
1199 ((uint16_t*)dest)[i] =
1200 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1201 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1202 clip_table16r[(Y1 + Cr + ditherr1) >>13];
1204 ((uint16_t*)dest)[i+1] =
1205 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1206 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1207 clip_table16r[(Y2 + Cr + ditherr2) >>13];
1210 else if(dstFormat==IMGFMT_BGR15)
1214 static int ditherb1=1<<14;
1215 static int ditherg1=1<<14;
1216 static int ditherr1=2<<14;
1217 static int ditherb2=3<<14;
1218 static int ditherg2=3<<14;
1219 static int ditherr2=0<<14;
1221 ditherb1 ^= (1^2)<<14;
1222 ditherg1 ^= (1^2)<<14;
1223 ditherr1 ^= (1^2)<<14;
1224 ditherb2 ^= (3^0)<<14;
1225 ditherg2 ^= (3^0)<<14;
1226 ditherr2 ^= (3^0)<<14;
1228 const int ditherb1=0;
1229 const int ditherg1=0;
1230 const int ditherr1=0;
1231 const int ditherb2=0;
1232 const int ditherg2=0;
1233 const int ditherr2=0;
1235 for(i=0; i<dstW-1; i+=2){
1236 // vertical linear interpolation && yuv2rgb in a single step:
1237 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1238 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1239 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1240 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1242 int Cb= yuvtab_40cf[U];
1243 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1244 int Cr= yuvtab_3343[V];
1246 ((uint16_t*)dest)[i] =
1247 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1248 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1249 clip_table15r[(Y1 + Cr + ditherr1) >>13];
1251 ((uint16_t*)dest)[i+1] =
1252 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1253 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1254 clip_table15r[(Y2 + Cr + ditherr2) >>13];
1262 * YV12 to RGB without scaling or interpolating
/*
 * yuv2rgb1: YV12 -> BGR conversion for the case of an unscaled luma line
 * (yalpha1 is forced to 0); chroma may still be blended between uvbuf0 and
 * uvbuf1 via uvalpha/uvalpha1.
 * NOTE(review): many interior lines of this listing are elided (the embedded
 * original line numbers jump), so only the visible structure is documented.
 */
1264 static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1265 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags)
1267 int uvalpha1=uvalpha^4095;
1268 const int yalpha1=0;
/* full-horizontal-resolution chroma requested: delegate to the 2-line blender */
1270 if(flags&SWS_FULL_CHR_H_INT)
1272 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags);
1277 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
/* MMX paths, one per destination format; buf0 is passed twice as both
 * luma inputs since no vertical luma interpolation is needed here */
1279 if(dstFormat==IMGFMT_BGR32)
1284 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1285 "m" (yalpha1), "m" (uvalpha1)
1289 else if(dstFormat==IMGFMT_BGR24)
1292 "movl %4, %%ebx \n\t"
1295 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1296 "m" (yalpha1), "m" (uvalpha1)
1300 else if(dstFormat==IMGFMT_BGR15)
1304 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1306 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1307 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1308 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1311 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1312 "m" (yalpha1), "m" (uvalpha1)
1316 else if(dstFormat==IMGFMT_BGR16)
1320 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1322 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1323 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1324 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1328 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1329 "m" (yalpha1), "m" (uvalpha1)
/* uvalpha >= 2048: same four MMX format paths but with the chroma blend
 * weights as passed in (presumably closer to uvbuf1 — confirm in full source) */
1336 if(dstFormat==IMGFMT_BGR32)
1341 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1342 "m" (yalpha1), "m" (uvalpha1)
1346 else if(dstFormat==IMGFMT_BGR24)
1349 "movl %4, %%ebx \n\t"
1352 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1353 "m" (yalpha1), "m" (uvalpha1)
1357 else if(dstFormat==IMGFMT_BGR15)
1361 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1363 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1364 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1365 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1368 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1369 "m" (yalpha1), "m" (uvalpha1)
1373 else if(dstFormat==IMGFMT_BGR16)
1377 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1379 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1380 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1381 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1385 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1386 "m" (yalpha1), "m" (uvalpha1)
/* ---- plain C fallback paths below ---- */
1392 //FIXME write 2 versions (for even & odd lines)
1394 if(dstFormat==IMGFMT_BGR32)
/* two luma pixels share one chroma sample (4:2:0), hence i+=2 and i>>1 */
1397 for(i=0; i<dstW-1; i+=2){
1398 // vertical linear interpolation && yuv2rgb in a single step:
1399 int Y1=yuvtab_2568[buf0[i]>>7];
1400 int Y2=yuvtab_2568[buf0[i+1]>>7];
1401 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1402 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1404 int Cb= yuvtab_40cf[U];
1405 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1406 int Cr= yuvtab_3343[V];
1408 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1409 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1410 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1412 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1413 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1414 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1417 else if(dstFormat==IMGFMT_BGR24)
1420 for(i=0; i<dstW-1; i+=2){
1421 // vertical linear interpolation && yuv2rgb in a single step:
1422 int Y1=yuvtab_2568[buf0[i]>>7];
1423 int Y2=yuvtab_2568[buf0[i+1]>>7];
1424 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1425 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1427 int Cb= yuvtab_40cf[U];
1428 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1429 int Cr= yuvtab_3343[V];
1431 dest[0]=clip_table[((Y1 + Cb) >>13)];
1432 dest[1]=clip_table[((Y1 + Cg) >>13)];
1433 dest[2]=clip_table[((Y1 + Cr) >>13)];
1435 dest[3]=clip_table[((Y2 + Cb) >>13)];
1436 dest[4]=clip_table[((Y2 + Cg) >>13)];
1437 dest[5]=clip_table[((Y2 + Cr) >>13)];
1441 else if(dstFormat==IMGFMT_BGR16)
/* 2x2 ordered-dither offsets for 5-6-5; the XOR toggle (under an elided
 * #ifdef, presumably a DITHER flag) flips the pattern between calls */
1445 static int ditherb1=1<<14;
1446 static int ditherg1=1<<13;
1447 static int ditherr1=2<<14;
1448 static int ditherb2=3<<14;
1449 static int ditherg2=3<<13;
1450 static int ditherr2=0<<14;
1452 ditherb1 ^= (1^2)<<14;
1453 ditherg1 ^= (1^2)<<13;
1454 ditherr1 ^= (1^2)<<14;
1455 ditherb2 ^= (3^0)<<14;
1456 ditherg2 ^= (3^0)<<13;
1457 ditherr2 ^= (3^0)<<14;
1459 const int ditherb1=0;
1460 const int ditherg1=0;
1461 const int ditherr1=0;
1462 const int ditherb2=0;
1463 const int ditherg2=0;
1464 const int ditherr2=0;
1466 for(i=0; i<dstW-1; i+=2){
1467 // vertical linear interpolation && yuv2rgb in a single step:
1468 int Y1=yuvtab_2568[buf0[i]>>7];
1469 int Y2=yuvtab_2568[buf0[i+1]>>7];
1470 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1471 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1473 int Cb= yuvtab_40cf[U];
1474 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1475 int Cr= yuvtab_3343[V];
1477 ((uint16_t*)dest)[i] =
1478 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1479 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1480 clip_table16r[(Y1 + Cr + ditherr1) >>13];
1482 ((uint16_t*)dest)[i+1] =
1483 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1484 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1485 clip_table16r[(Y2 + Cr + ditherr2) >>13];
1488 else if(dstFormat==IMGFMT_BGR15)
/* same as BGR16 but all channels have 5 bits, so green uses <<14 too */
1492 static int ditherb1=1<<14;
1493 static int ditherg1=1<<14;
1494 static int ditherr1=2<<14;
1495 static int ditherb2=3<<14;
1496 static int ditherg2=3<<14;
1497 static int ditherr2=0<<14;
1499 ditherb1 ^= (1^2)<<14;
1500 ditherg1 ^= (1^2)<<14;
1501 ditherr1 ^= (1^2)<<14;
1502 ditherb2 ^= (3^0)<<14;
1503 ditherg2 ^= (3^0)<<14;
1504 ditherr2 ^= (3^0)<<14;
1506 const int ditherb1=0;
1507 const int ditherg1=0;
1508 const int ditherr1=0;
1509 const int ditherb2=0;
1510 const int ditherg2=0;
1511 const int ditherr2=0;
1513 for(i=0; i<dstW-1; i+=2){
1514 // vertical linear interpolation && yuv2rgb in a single step:
1515 int Y1=yuvtab_2568[buf0[i]>>7];
1516 int Y2=yuvtab_2568[buf0[i+1]>>7];
1517 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1518 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1520 int Cb= yuvtab_40cf[U];
1521 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1522 int Cr= yuvtab_3343[V];
1524 ((uint16_t*)dest)[i] =
1525 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1526 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1527 clip_table15r[(Y1 + Cr + ditherr1) >>13];
1529 ((uint16_t*)dest)[i+1] =
1530 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1531 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1532 clip_table15r[(Y2 + Cr + ditherr2) >>13];
1538 //FIXME yuy2* can read up to 7 samples too many
/*
 * yuy2ToY: extract the luma bytes (the even bytes, per the bm01010101 mask)
 * from a packed YUY2 line into dst. MMX path processes 16 input bytes
 * (8 luma samples) per iteration; a plain C loop (body elided in this
 * listing) handles the non-MMX build.
 */
1540 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1544 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1545 "movl %0, %%eax \n\t"
1547 "movq (%1, %%eax,2), %%mm0 \n\t"
1548 "movq 8(%1, %%eax,2), %%mm1 \n\t"
1549 "pand %%mm2, %%mm0 \n\t"
1550 "pand %%mm2, %%mm1 \n\t"
1551 "packuswb %%mm1, %%mm0 \n\t"
1552 "movq %%mm0, (%2, %%eax) \n\t"
1553 "addl $8, %%eax \n\t"
/* counting eax up from -width to 0 lets the pointers stay fixed at the ends */
1555 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1560 for(i=0; i<width; i++)
/*
 * yuy2ToUV: extract U and V from two YUY2 lines (src1/src2) into separate
 * planes. In YUY2 each 4-byte group is Y0 U Y1 V, so U sits at offset 1 and
 * V at offset 3 — see the C fallback indexing. The C path averages the two
 * lines ((a+b)>>1); the MMX2/3DNow asm path is shown only partially here.
 */
1565 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1567 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1569 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1570 "movl %0, %%eax \n\t"
1572 "movq (%1, %%eax,4), %%mm0 \n\t"
1573 "movq 8(%1, %%eax,4), %%mm1 \n\t"
1574 "movq (%2, %%eax,4), %%mm2 \n\t"
1575 "movq 8(%2, %%eax,4), %%mm3 \n\t"
/* drop the luma bytes, then split the interleaved U/V into two registers */
1578 "psrlw $8, %%mm0 \n\t"
1579 "psrlw $8, %%mm1 \n\t"
1580 "packuswb %%mm1, %%mm0 \n\t"
1581 "movq %%mm0, %%mm1 \n\t"
1582 "psrlw $8, %%mm0 \n\t"
1583 "pand %%mm4, %%mm1 \n\t"
1584 "packuswb %%mm0, %%mm0 \n\t"
1585 "packuswb %%mm1, %%mm1 \n\t"
1586 "movd %%mm0, (%4, %%eax) \n\t"
1587 "movd %%mm1, (%3, %%eax) \n\t"
1588 "addl $4, %%eax \n\t"
1590 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1595 for(i=0; i<width; i++)
1597 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1598 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
/*
 * bgr32ToY: BGR32 -> 8-bit luma using the RY/GY/BY weights, biased by +16
 * (limited-range Y). MMX version is disabled (HAVE_MMXFIXME is never
 * defined); the b/g/r byte loads are elided in this listing.
 */
1603 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1605 #ifdef HAVE_MMXFIXME
1608 for(i=0; i<width; i++)
1614 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/*
 * bgr32ToUV: BGR32 -> subsampled U/V. Each chroma sample averages a 2x2
 * pixel block (two adjacent pixels from each of two lines, 4 bytes/pixel),
 * hence the extra >>2 folded into RGB2YUV_SHIFT+2; +128 centers the result.
 */
1619 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1621 #ifdef HAVE_MMXFIXME
1624 for(i=0; i<width; i++)
1626 int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1627 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1628 int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1630 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1631 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * bgr24ToY: BGR24 -> 8-bit luma, same weights/bias as bgr32ToY but with
 * 3 bytes per pixel; the b/g/r loads are elided in this listing.
 */
1636 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1638 #ifdef HAVE_MMXFIXME
1641 for(i=0; i<width; i++)
1647 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/*
 * bgr24ToUV: BGR24 -> subsampled U/V, averaging a 2x2 pixel block
 * (3 bytes/pixel, hence the 6*i stride); see bgr32ToUV for the math.
 */
1652 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1654 #ifdef HAVE_MMXFIXME
1657 for(i=0; i<width; i++)
1659 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1660 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1661 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1663 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1664 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * rgb32ToY: RGB32 -> 8-bit luma; identical math to bgr32ToY, the r/b byte
 * order swap happens in the (elided) channel loads.
 */
1669 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1672 for(i=0; i<width; i++)
1678 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/*
 * rgb32ToUV: RGB32 -> subsampled U/V over a 2x2 block; same as bgr32ToUV
 * except r is at byte offset 0 and b at offset 2 (RGB order).
 */
1682 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1685 for(i=0; i<width; i++)
1687 int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1688 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1689 int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1691 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1692 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * rgb24ToY: RGB24 -> 8-bit luma; channel loads (3 bytes/pixel, RGB order)
 * are elided in this listing.
 */
1696 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
1699 for(i=0; i<width; i++)
1705 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/*
 * rgb24ToUV: RGB24 -> subsampled U/V over a 2x2 block (6*i stride =
 * 3 bytes/pixel, two pixels); r at offset 0, b at offset 2 (RGB order).
 */
1709 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1712 for(i=0; i<width; i++)
1714 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1715 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1716 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1718 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1719 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1724 // Bilinear / Bicubic scaling
/*
 * hScale: generic horizontal FIR scaler. For each of dstW output samples,
 * multiply-accumulate filterSize input bytes starting at filterPos[i] with
 * 16-bit coefficients from filter[], then narrow to a 15-bit-clamped int16
 * in dst. MMX fast paths exist for filterSize 4 and 8; a loop over
 * arbitrary filterSize follows, then the plain C fallback.
 */
1725 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1726 int16_t *filter, int16_t *filterPos, int filterSize)
1729 if(filterSize==4) // always true for upscaling, sometimes for down too
1731 int counter= -2*dstW;
/* bias filterPos so indexing with the negative counter lands at element 0 */
1733 filterPos-= counter/2;
1736 "pxor %%mm7, %%mm7 \n\t"
1737 "movq "MANGLE(w02)", %%mm6 \n\t"
1738 "pushl %%ebp \n\t" // we use 7 regs here ...
1739 "movl %%eax, %%ebp \n\t"
1742 "movzwl (%2, %%ebp), %%eax \n\t"
1743 "movzwl 2(%2, %%ebp), %%ebx \n\t"
1744 "movq (%1, %%ebp, 4), %%mm1 \n\t"
1745 "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
1746 "movd (%3, %%eax), %%mm0 \n\t"
1747 "movd (%3, %%ebx), %%mm2 \n\t"
1748 "punpcklbw %%mm7, %%mm0 \n\t"
1749 "punpcklbw %%mm7, %%mm2 \n\t"
1750 "pmaddwd %%mm1, %%mm0 \n\t"
1751 "pmaddwd %%mm2, %%mm3 \n\t"
1752 "psrad $8, %%mm0 \n\t"
1753 "psrad $8, %%mm3 \n\t"
1754 "packssdw %%mm3, %%mm0 \n\t"
1755 "pmaddwd %%mm6, %%mm0 \n\t"
1756 "packssdw %%mm0, %%mm0 \n\t"
1757 "movd %%mm0, (%4, %%ebp) \n\t"
1758 "addl $4, %%ebp \n\t"
1763 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1767 else if(filterSize==8)
1769 int counter= -2*dstW;
1771 filterPos-= counter/2;
1774 "pxor %%mm7, %%mm7 \n\t"
1775 "movq "MANGLE(w02)", %%mm6 \n\t"
1776 "pushl %%ebp \n\t" // we use 7 regs here ...
1777 "movl %%eax, %%ebp \n\t"
1780 "movzwl (%2, %%ebp), %%eax \n\t"
1781 "movzwl 2(%2, %%ebp), %%ebx \n\t"
1782 "movq (%1, %%ebp, 8), %%mm1 \n\t"
1783 "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
1784 "movd (%3, %%eax), %%mm0 \n\t"
1785 "movd (%3, %%ebx), %%mm2 \n\t"
1786 "punpcklbw %%mm7, %%mm0 \n\t"
1787 "punpcklbw %%mm7, %%mm2 \n\t"
1788 "pmaddwd %%mm1, %%mm0 \n\t"
1789 "pmaddwd %%mm2, %%mm3 \n\t"
/* second group of 4 taps per output sample */
1791 "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
1792 "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
1793 "movd 4(%3, %%eax), %%mm4 \n\t"
1794 "movd 4(%3, %%ebx), %%mm2 \n\t"
1795 "punpcklbw %%mm7, %%mm4 \n\t"
1796 "punpcklbw %%mm7, %%mm2 \n\t"
1797 "pmaddwd %%mm1, %%mm4 \n\t"
1798 "pmaddwd %%mm2, %%mm5 \n\t"
1799 "paddd %%mm4, %%mm0 \n\t"
1800 "paddd %%mm5, %%mm3 \n\t"
1802 "psrad $8, %%mm0 \n\t"
1803 "psrad $8, %%mm3 \n\t"
1804 "packssdw %%mm3, %%mm0 \n\t"
1805 "pmaddwd %%mm6, %%mm0 \n\t"
1806 "packssdw %%mm0, %%mm0 \n\t"
1807 "movd %%mm0, (%4, %%ebp) \n\t"
1808 "addl $4, %%ebp \n\t"
1813 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* arbitrary filterSize: inner loop walks the source in 4-byte steps */
1819 int counter= -2*dstW;
1820 // filter-= counter*filterSize/2;
1821 filterPos-= counter/2;
1824 "pxor %%mm7, %%mm7 \n\t"
1825 "movq "MANGLE(w02)", %%mm6 \n\t"
1828 "movl %2, %%ecx \n\t"
1829 "movzwl (%%ecx, %0), %%eax \n\t"
1830 "movzwl 2(%%ecx, %0), %%ebx \n\t"
1831 "movl %5, %%ecx \n\t"
1832 "pxor %%mm4, %%mm4 \n\t"
1833 "pxor %%mm5, %%mm5 \n\t"
1835 "movq (%1), %%mm1 \n\t"
1836 "movq (%1, %6), %%mm3 \n\t"
1837 "movd (%%ecx, %%eax), %%mm0 \n\t"
1838 "movd (%%ecx, %%ebx), %%mm2 \n\t"
1839 "punpcklbw %%mm7, %%mm0 \n\t"
1840 "punpcklbw %%mm7, %%mm2 \n\t"
1841 "pmaddwd %%mm1, %%mm0 \n\t"
1842 "pmaddwd %%mm2, %%mm3 \n\t"
1843 "paddd %%mm3, %%mm5 \n\t"
1844 "paddd %%mm0, %%mm4 \n\t"
1846 "addl $4, %%ecx \n\t"
1847 "cmpl %4, %%ecx \n\t"
1850 "psrad $8, %%mm4 \n\t"
1851 "psrad $8, %%mm5 \n\t"
1852 "packssdw %%mm5, %%mm4 \n\t"
1853 "pmaddwd %%mm6, %%mm4 \n\t"
1854 "packssdw %%mm4, %%mm4 \n\t"
1855 "movl %3, %%eax \n\t"
1856 "movd %%mm4, (%%eax, %0) \n\t"
1860 : "+r" (counter), "+r" (filter)
1861 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
1862 "m" (src), "r" (filterSize*2)
1863 : "%ebx", "%eax", "%ecx"
/* plain C fallback */
1868 for(i=0; i<dstW; i++)
1871 int srcPos= filterPos[i];
1873 // printf("filterPos: %d\n", filterPos[i]);
1874 for(j=0; j<filterSize; j++)
1876 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
1877 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1879 // filter += hFilterSize;
1880 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
1885 // *** horizontal scale Y line to temp buffer
/*
 * hyscale: horizontally scale one luma line into dst (16-bit intermediate
 * samples). Packed/RGB source formats are first converted to 8-bit luma in
 * formatConvBuffer. Then one of four paths runs: the generic hScale filter,
 * runtime-generated MMX2 "funny" code, a hand-written x86 bilinear loop, or
 * the C bilinear fallback.
 */
1886 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
1887 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
1888 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
1889 int srcFormat, uint8_t *formatConvBuffer)
1891 if(srcFormat==IMGFMT_YUY2)
1893 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
1894 src= formatConvBuffer;
1896 else if(srcFormat==IMGFMT_BGR32)
1898 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
1899 src= formatConvBuffer;
1901 else if(srcFormat==IMGFMT_BGR24)
1903 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
1904 src= formatConvBuffer;
1906 else if(srcFormat==IMGFMT_RGB32)
1908 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
1909 src= formatConvBuffer;
1911 else if(srcFormat==IMGFMT_RGB24)
1913 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
1914 src= formatConvBuffer;
1918 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
1919 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
1921 if(!(flags&SWS_FAST_BILINEAR))
1924 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
1926 else // Fast Bilinear upscale / crap downscale
/* MMX2 path: set up per-pixel fractional steps, then jump into the
 * runtime-generated funnyYCode */
1934 "pxor %%mm7, %%mm7 \n\t"
1935 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
1936 "movd %5, %%mm6 \n\t" // xInc&0xFFFF
1937 "punpcklwd %%mm6, %%mm6 \n\t"
1938 "punpcklwd %%mm6, %%mm6 \n\t"
1939 "movq %%mm6, %%mm2 \n\t"
1940 "psllq $16, %%mm2 \n\t"
1941 "paddw %%mm6, %%mm2 \n\t"
1942 "psllq $16, %%mm2 \n\t"
1943 "paddw %%mm6, %%mm2 \n\t"
1944 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
1945 "movq %%mm2, %%mm4 \n\t"
1946 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
1947 "punpcklwd %%mm6, %%mm6 \n\t"
1948 "punpcklwd %%mm6, %%mm6 \n\t"
1949 "xorl %%eax, %%eax \n\t" // i
1950 "movl %0, %%esi \n\t" // src
1951 "movl %1, %%edi \n\t" // buf1
1952 "movl %3, %%edx \n\t" // (xInc*4)>>16
1953 "xorl %%ecx, %%ecx \n\t"
1954 "xorl %%ebx, %%ebx \n\t"
1955 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
1957 #define FUNNY_Y_CODE \
1958 PREFETCH" 1024(%%esi) \n\t"\
1959 PREFETCH" 1056(%%esi) \n\t"\
1960 PREFETCH" 1088(%%esi) \n\t"\
1962 "movq %%mm4, %%mm2 \n\t"\
1963 "xorl %%ecx, %%ecx \n\t"
1974 :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
1975 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
1976 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
/* patch up the last few pixels the fast path may have read past */
1978 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
1983 //NO MMX just normal asm ...
1985 "xorl %%eax, %%eax \n\t" // i
1986 "xorl %%ebx, %%ebx \n\t" // xx
1987 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
/* loop body unrolled twice; add/adc propagates the 16.16 fixed-point step */
1990 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
1991 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
1992 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1993 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1994 "shll $16, %%edi \n\t"
1995 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1996 "movl %1, %%edi \n\t"
1997 "shrl $9, %%esi \n\t"
1998 "movw %%si, (%%edi, %%eax, 2) \n\t"
1999 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2000 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2002 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2003 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2004 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2005 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2006 "shll $16, %%edi \n\t"
2007 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2008 "movl %1, %%edi \n\t"
2009 "shrl $9, %%esi \n\t"
2010 "movw %%si, 2(%%edi, %%eax, 2) \n\t"
2011 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2012 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2015 "addl $2, %%eax \n\t"
2016 "cmpl %2, %%eax \n\t"
2020 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2021 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2024 } //if MMX2 cant be used
/* plain C bilinear fallback, 16.16 fixed-point position in xpos */
2028 unsigned int xpos=0;
2029 for(i=0;i<dstWidth;i++)
2031 register unsigned int xx=xpos>>16;
2032 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2033 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
/*
 * hcscale: horizontally scale one chroma line pair (U in src1, V in src2)
 * into dst and dst+2048. Mirrors hyscale: optional format conversion into
 * formatConvBuffer (+2048 for the second plane), then generic hScale,
 * MMX2 funnyUVCode, hand-written x86 bilinear, or the C fallback.
 */
2040 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2041 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2042 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2043 int srcFormat, uint8_t *formatConvBuffer)
2045 if(srcFormat==IMGFMT_YUY2)
2047 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2048 src1= formatConvBuffer;
2049 src2= formatConvBuffer+2048;
2051 else if(srcFormat==IMGFMT_BGR32)
2053 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2054 src1= formatConvBuffer;
2055 src2= formatConvBuffer+2048;
2057 else if(srcFormat==IMGFMT_BGR24)
2059 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2060 src1= formatConvBuffer;
2061 src2= formatConvBuffer+2048;
2063 else if(srcFormat==IMGFMT_RGB32)
2065 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2066 src1= formatConvBuffer;
2067 src2= formatConvBuffer+2048;
2069 else if(srcFormat==IMGFMT_RGB24)
2071 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2072 src1= formatConvBuffer;
2073 src2= formatConvBuffer+2048;
2075 else if(isGray(srcFormat))
2081 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
2082 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2084 if(!(flags&SWS_FAST_BILINEAR))
2087 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2088 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2090 else // Fast Bilinear upscale / crap downscale
/* MMX2 path: same fractional-step setup as hyscale, run once per plane */
2098 "pxor %%mm7, %%mm7 \n\t"
2099 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
2100 "movd %5, %%mm6 \n\t" // xInc&0xFFFF
2101 "punpcklwd %%mm6, %%mm6 \n\t"
2102 "punpcklwd %%mm6, %%mm6 \n\t"
2103 "movq %%mm6, %%mm2 \n\t"
2104 "psllq $16, %%mm2 \n\t"
2105 "paddw %%mm6, %%mm2 \n\t"
2106 "psllq $16, %%mm2 \n\t"
2107 "paddw %%mm6, %%mm2 \n\t"
2108 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
2109 "movq %%mm2, %%mm4 \n\t"
2110 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
2111 "punpcklwd %%mm6, %%mm6 \n\t"
2112 "punpcklwd %%mm6, %%mm6 \n\t"
2113 "xorl %%eax, %%eax \n\t" // i
2114 "movl %0, %%esi \n\t" // src
2115 "movl %1, %%edi \n\t" // buf1
2116 "movl %3, %%edx \n\t" // (xInc*4)>>16
2117 "xorl %%ecx, %%ecx \n\t"
2118 "xorl %%ebx, %%ebx \n\t"
2119 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
2121 #define FUNNYUVCODE \
2122 PREFETCH" 1024(%%esi) \n\t"\
2123 PREFETCH" 1056(%%esi) \n\t"\
2124 PREFETCH" 1088(%%esi) \n\t"\
2126 "movq %%mm4, %%mm2 \n\t"\
2127 "xorl %%ecx, %%ecx \n\t"
/* second plane: switch esi to src2 and edi to dst+2048 (byte offset 4096) */
2138 "xorl %%eax, %%eax \n\t" // i
2139 "movl %6, %%esi \n\t" // src
2140 "movl %1, %%edi \n\t" // buf1
2141 "addl $4096, %%edi \n\t"
2153 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
2154 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
2155 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
/* patch up the last few pixels the fast path may have read past */
2157 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2159 // printf("%d %d %d\n", dstWidth, i, srcW);
2160 dst[i] = src1[srcW-1]*128;
2161 dst[i+2048] = src2[srcW-1]*128;
/* non-MMX2 x86 bilinear: interleaves U and V per iteration */
2168 "xorl %%eax, %%eax \n\t" // i
2169 "xorl %%ebx, %%ebx \n\t" // xx
2170 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2173 "movl %0, %%esi \n\t"
2174 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
2175 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
2176 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2177 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2178 "shll $16, %%edi \n\t"
2179 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2180 "movl %1, %%edi \n\t"
2181 "shrl $9, %%esi \n\t"
2182 "movw %%si, (%%edi, %%eax, 2) \n\t"
2184 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
2185 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
2186 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2187 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2188 "shll $16, %%edi \n\t"
2189 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2190 "movl %1, %%edi \n\t"
2191 "shrl $9, %%esi \n\t"
2192 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
2194 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2195 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2196 "addl $1, %%eax \n\t"
2197 "cmpl %2, %%eax \n\t"
2200 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
2202 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2205 } //if MMX2 cant be used
/* plain C bilinear fallback, 16.16 fixed-point position in xpos */
2209 unsigned int xpos=0;
2210 for(i=0;i<dstWidth;i++)
2212 register unsigned int xx=xpos>>16;
2213 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2214 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2215 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2217 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2218 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2226 static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
2227 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
2229 /* load a few things into local vars to make the code more readable and faster */
2230 const int srcW= c->srcW;
2231 const int dstW= c->dstW;
2232 const int dstH= c->dstH;
2233 const int chrDstW= c->chrDstW;
2234 const int lumXInc= c->lumXInc;
2235 const int chrXInc= c->chrXInc;
2236 const int dstFormat= c->dstFormat;
2237 const int flags= c->flags;
2238 const int canMMX2BeUsed= c->canMMX2BeUsed;
2239 int16_t *vLumFilterPos= c->vLumFilterPos;
2240 int16_t *vChrFilterPos= c->vChrFilterPos;
2241 int16_t *hLumFilterPos= c->hLumFilterPos;
2242 int16_t *hChrFilterPos= c->hChrFilterPos;
2243 int16_t *vLumFilter= c->vLumFilter;
2244 int16_t *vChrFilter= c->vChrFilter;
2245 int16_t *hLumFilter= c->hLumFilter;
2246 int16_t *hChrFilter= c->hChrFilter;
2247 int16_t *lumMmxFilter= c->lumMmxFilter;
2248 int16_t *chrMmxFilter= c->chrMmxFilter;
2249 const int vLumFilterSize= c->vLumFilterSize;
2250 const int vChrFilterSize= c->vChrFilterSize;
2251 const int hLumFilterSize= c->hLumFilterSize;
2252 const int hChrFilterSize= c->hChrFilterSize;
2253 int16_t **lumPixBuf= c->lumPixBuf;
2254 int16_t **chrPixBuf= c->chrPixBuf;
2255 const int vLumBufSize= c->vLumBufSize;
2256 const int vChrBufSize= c->vChrBufSize;
2257 uint8_t *funnyYCode= c->funnyYCode;
2258 uint8_t *funnyUVCode= c->funnyUVCode;
2259 uint8_t *formatConvBuffer= c->formatConvBuffer;
2261 /* vars which will change and which we need to store back in the context */
2263 int lumBufIndex= c->lumBufIndex;
2264 int chrBufIndex= c->chrBufIndex;
2265 int lastInLumBuf= c->lastInLumBuf;
2266 int lastInChrBuf= c->lastInChrBuf;
2271 if(c->srcFormat == IMGFMT_I420){
2272 src[0]= srcParam[0];
2273 src[1]= srcParam[2];
2274 src[2]= srcParam[1];
2275 srcStride[0]= srcStrideParam[0];
2276 srcStride[1]= srcStrideParam[2];
2277 srcStride[2]= srcStrideParam[1];
2279 else if(c->srcFormat==IMGFMT_YV12){
2280 src[0]= srcParam[0];
2281 src[1]= srcParam[1];
2282 src[2]= srcParam[2];
2283 srcStride[0]= srcStrideParam[0];
2284 srcStride[1]= srcStrideParam[1];
2285 srcStride[2]= srcStrideParam[2];
2287 else if(isPacked(c->srcFormat)){
2290 src[2]= srcParam[0];
2291 srcStride[0]= srcStrideParam[0];
2293 srcStride[2]= srcStrideParam[0]<<1;
2295 else if(isGray(c->srcFormat)){
2296 src[0]= srcParam[0];
2299 srcStride[0]= srcStrideParam[0];
2304 if(c->dstFormat == IMGFMT_I420){
2305 dst[0]= dstParam[0];
2306 dst[1]= dstParam[2];
2307 dst[2]= dstParam[1];
2310 dst[0]= dstParam[0];
2311 dst[1]= dstParam[1];
2312 dst[2]= dstParam[2];
2315 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2316 //dstStride[0],dstStride[1],dstStride[2]);
2318 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2320 static int firstTime=1; //FIXME move this into the context perhaps
2321 if(flags & SWS_PRINT_INFO && firstTime)
2323 fprintf(stderr, "SwScaler: Warning: dstStride is not aligned!\n"
2324 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2329 /* Note the user might start scaling the picture in the middle so this will not get executed
2330 this is not really intended but works currently, so people might do it */
2339 for(;dstY < dstH; dstY++){
2340 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2341 unsigned char *uDest=dst[1]+dstStride[1]*(dstY>>1);
2342 unsigned char *vDest=dst[2]+dstStride[2]*(dstY>>1);
2343 const int chrDstY= isHalfChrV(dstFormat) ? (dstY>>1) : dstY;
2345 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2346 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2347 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2348 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2350 //handle holes (FAST_BILINEAR & weird filters)
2351 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2352 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2353 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2354 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2355 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2357 // Do we have enough lines in this slice to output the dstY line
2358 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
2360 //Do horizontal scaling
2361 while(lastInLumBuf < lastLumSrcY)
2363 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2365 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2366 ASSERT(lumBufIndex < 2*vLumBufSize)
2367 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2368 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2369 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2370 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2371 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2372 funnyYCode, c->srcFormat, formatConvBuffer);
2375 while(lastInChrBuf < lastChrSrcY)
2377 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
2378 uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
2380 ASSERT(chrBufIndex < 2*vChrBufSize)
2381 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2382 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2383 //FIXME replace parameters through context struct (some at least)
2384 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
2385 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2386 funnyUVCode, c->srcFormat, formatConvBuffer);
2389 //wrap buf index around to stay inside the ring buffer
2390 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2391 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2393 else // not enough lines left in this slice -> load the rest in the buffer
2395 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2396 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2397 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2398 vChrBufSize, vLumBufSize);
2400 //Do horizontal scaling
2401 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2403 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2405 ASSERT(lumBufIndex < 2*vLumBufSize)
2406 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2407 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2408 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2409 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2410 funnyYCode, c->srcFormat, formatConvBuffer);
2413 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
2415 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
2416 uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
2418 ASSERT(chrBufIndex < 2*vChrBufSize)
2419 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2420 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2421 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
2422 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2423 funnyUVCode, c->srcFormat, formatConvBuffer);
2426 //wrap buf index around to stay inside the ring buffer
2427 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2428 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2429 break; //we cant output a dstY line so lets try with the next slice
2433 b5Dither= dither8[dstY&1];
2434 g6Dither= dither4[dstY&1];
2435 g5Dither= dither8[dstY&1];
2436 r5Dither= dither8[(dstY+1)&1];
2440 if(isPlanarYUV(dstFormat)) //YV12 like
2442 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2443 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2445 int16_t *lumBuf = lumPixBuf[0];
2446 int16_t *chrBuf= chrPixBuf[0];
2447 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
2451 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2452 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2454 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2455 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2456 dest, uDest, vDest, dstW,
2457 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
2462 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2463 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2465 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2466 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2467 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2469 int chrAlpha= vChrFilter[2*dstY+1];
2471 RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2472 dest, dstW, chrAlpha, dstFormat, flags);
2474 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2476 int lumAlpha= vLumFilter[2*dstY+1];
2477 int chrAlpha= vChrFilter[2*dstY+1];
2479 RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2480 dest, dstW, lumAlpha, chrAlpha, dstFormat, flags);
2485 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2486 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2487 dest, dstW, dstFormat,
2488 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
2492 else // hmm looks like we cant use MMX here without overwriting this arrays tail
2494 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2495 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2496 if(isPlanarYUV(dstFormat)) //YV12
2498 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2500 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2501 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2502 dest, uDest, vDest, dstW);
2506 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2507 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2509 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2510 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2511 dest, dstW, dstFormat);
2517 __asm __volatile(SFENCE:::"memory");
2518 __asm __volatile(EMMS:::"memory");
2520 /* store changed local vars back in the context */
2522 c->lumBufIndex= lumBufIndex;
2523 c->chrBufIndex= chrBufIndex;
2524 c->lastInLumBuf= lastInLumBuf;
2525 c->lastInChrBuf= lastInChrBuf;