2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * the C code (not assembly, mmx, ...) of this file can be used
21 * under the LGPL license too
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
/* CPU-capability-specific data-prefetch opcodes used inside the asm templates:
 * 3DNow provides prefetch/prefetchw, MMX2 provides prefetchnta/prefetcht0, and
 * the fallback expands to an assembler comment (no-op) so the asm strings still
 * assemble.  NOTE(review): the surrounding #if/#elif/#else/#endif lines are not
 * visible in this listing — confirm the selection logic against the full file. */
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif defined (HAVE_MMX2)
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
/* Store fence, needed after non-temporal (movntq) stores; no-op when the
 * non-temporal path is not used.  Selecting conditionals not visible here. */
51 #define SFENCE "sfence"
53 #define SFENCE " # nop"
/* Packed unsigned byte average: MMX2 has native pavgb; 3DNow's pavgusb is the
 * equivalent here.  (The selecting conditional lines are not visible in this
 * listing — confirm against the full file.) */
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
/* 64-bit store: non-temporal movntq where available, plain movq otherwise.
 * The REAL_/wrapper pair forces macro-expansion of the arguments before they
 * are stringified with # into the asm template. */
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
70 #include "swscale_altivec_template.c"
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $16, %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
185 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
186 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
187 "r" (dest), "m" (dstW),
188 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
189 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
191 #define YSCALEYUV2PACKEDX \
193 "xor %%"REG_a", %%"REG_a" \n\t"\
197 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
198 "mov (%%"REG_d"), %%"REG_S" \n\t"\
199 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
200 "movq %%mm3, %%mm4 \n\t"\
203 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
204 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
205 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
206 "add $16, %%"REG_d" \n\t"\
207 "mov (%%"REG_d"), %%"REG_S" \n\t"\
208 "pmulhw %%mm0, %%mm2 \n\t"\
209 "pmulhw %%mm0, %%mm5 \n\t"\
210 "paddw %%mm2, %%mm3 \n\t"\
211 "paddw %%mm5, %%mm4 \n\t"\
212 "test %%"REG_S", %%"REG_S" \n\t"\
215 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
218 "movq %%mm1, %%mm7 \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
223 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm1 \n\t"\
229 "paddw %%mm5, %%mm7 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
233 #define YSCALEYUV2PACKEDX_END \
234 :: "r" (&c->redDither), \
235 "m" (dummy), "m" (dummy), "m" (dummy),\
236 "r" (dest), "m" (dstW) \
237 : "%"REG_a, "%"REG_d, "%"REG_S \
240 #define YSCALEYUV2PACKEDX_ACCURATE \
242 "xor %%"REG_a", %%"REG_a" \n\t"\
246 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
247 "mov (%%"REG_d"), %%"REG_S" \n\t"\
248 "pxor %%mm4, %%mm4 \n\t"\
249 "pxor %%mm5, %%mm5 \n\t"\
250 "pxor %%mm6, %%mm6 \n\t"\
251 "pxor %%mm7, %%mm7 \n\t"\
254 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
255 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
256 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
257 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
258 "movq %%mm0, %%mm3 \n\t"\
259 "punpcklwd %%mm1, %%mm0 \n\t"\
260 "punpckhwd %%mm1, %%mm3 \n\t"\
261 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
262 "pmaddwd %%mm1, %%mm0 \n\t"\
263 "pmaddwd %%mm1, %%mm3 \n\t"\
264 "paddd %%mm0, %%mm4 \n\t"\
265 "paddd %%mm3, %%mm5 \n\t"\
266 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
267 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
268 "add $16, %%"REG_d" \n\t"\
269 "test %%"REG_S", %%"REG_S" \n\t"\
270 "movq %%mm2, %%mm0 \n\t"\
271 "punpcklwd %%mm3, %%mm2 \n\t"\
272 "punpckhwd %%mm3, %%mm0 \n\t"\
273 "pmaddwd %%mm1, %%mm2 \n\t"\
274 "pmaddwd %%mm1, %%mm0 \n\t"\
275 "paddd %%mm2, %%mm6 \n\t"\
276 "paddd %%mm0, %%mm7 \n\t"\
278 "psrad $16, %%mm4 \n\t"\
279 "psrad $16, %%mm5 \n\t"\
280 "psrad $16, %%mm6 \n\t"\
281 "psrad $16, %%mm7 \n\t"\
282 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
283 "packssdw %%mm5, %%mm4 \n\t"\
284 "packssdw %%mm7, %%mm6 \n\t"\
285 "paddw %%mm0, %%mm4 \n\t"\
286 "paddw %%mm0, %%mm6 \n\t"\
287 "movq %%mm4, "U_TEMP"(%0) \n\t"\
288 "movq %%mm6, "V_TEMP"(%0) \n\t"\
290 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
291 "mov (%%"REG_d"), %%"REG_S" \n\t"\
292 "pxor %%mm1, %%mm1 \n\t"\
293 "pxor %%mm5, %%mm5 \n\t"\
294 "pxor %%mm7, %%mm7 \n\t"\
295 "pxor %%mm6, %%mm6 \n\t"\
298 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
299 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
300 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
301 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
302 "movq %%mm0, %%mm3 \n\t"\
303 "punpcklwd %%mm4, %%mm0 \n\t"\
304 "punpckhwd %%mm4, %%mm3 \n\t"\
305 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
306 "pmaddwd %%mm4, %%mm0 \n\t"\
307 "pmaddwd %%mm4, %%mm3 \n\t"\
308 "paddd %%mm0, %%mm1 \n\t"\
309 "paddd %%mm3, %%mm5 \n\t"\
310 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
311 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
312 "add $16, %%"REG_d" \n\t"\
313 "test %%"REG_S", %%"REG_S" \n\t"\
314 "movq %%mm2, %%mm0 \n\t"\
315 "punpcklwd %%mm3, %%mm2 \n\t"\
316 "punpckhwd %%mm3, %%mm0 \n\t"\
317 "pmaddwd %%mm4, %%mm2 \n\t"\
318 "pmaddwd %%mm4, %%mm0 \n\t"\
319 "paddd %%mm2, %%mm7 \n\t"\
320 "paddd %%mm0, %%mm6 \n\t"\
322 "psrad $16, %%mm1 \n\t"\
323 "psrad $16, %%mm5 \n\t"\
324 "psrad $16, %%mm7 \n\t"\
325 "psrad $16, %%mm6 \n\t"\
326 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
327 "packssdw %%mm5, %%mm1 \n\t"\
328 "packssdw %%mm6, %%mm7 \n\t"\
329 "paddw %%mm0, %%mm1 \n\t"\
330 "paddw %%mm0, %%mm7 \n\t"\
331 "movq "U_TEMP"(%0), %%mm3 \n\t"\
332 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* Shared YUV->RGB conversion tail.  Expects 16-bit luma words in mm1/mm7
 * (Y1/Y2) and chroma in mm3 (U) / mm4 (V); %0 points at the per-context
 * offset/coefficient table (U_OFFSET..Y_COEFF).  Subtracts the offsets,
 * multiplies by the coefficients, interleaves per-pixel, and leaves packed
 * 8-bit results as mm2=B, mm4=G, mm5=R (low/high halves from mm0/mm3/mm6)
 * with mm7 zeroed for the following WRITE* packer macro. */
334 #define YSCALEYUV2RGBX \
335 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
336 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
337 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
338 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
339 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
340 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
341 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
342 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
343 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
344 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
345 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
346 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
347 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
348 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
349 "paddw %%mm3, %%mm4 \n\t"\
350 "movq %%mm2, %%mm0 \n\t"\
351 "movq %%mm5, %%mm6 \n\t"\
352 "movq %%mm4, %%mm3 \n\t"\
353 "punpcklwd %%mm2, %%mm2 \n\t"\
354 "punpcklwd %%mm5, %%mm5 \n\t"\
355 "punpcklwd %%mm4, %%mm4 \n\t"\
356 "paddw %%mm1, %%mm2 \n\t"\
357 "paddw %%mm1, %%mm5 \n\t"\
358 "paddw %%mm1, %%mm4 \n\t"\
359 "punpckhwd %%mm0, %%mm0 \n\t"\
360 "punpckhwd %%mm6, %%mm6 \n\t"\
361 "punpckhwd %%mm3, %%mm3 \n\t"\
362 "paddw %%mm7, %%mm0 \n\t"\
363 "paddw %%mm7, %%mm6 \n\t"\
364 "paddw %%mm7, %%mm3 \n\t"\
365 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
366 "packuswb %%mm0, %%mm2 \n\t"\
367 "packuswb %%mm6, %%mm5 \n\t"\
368 "packuswb %%mm3, %%mm4 \n\t"\
369 "pxor %%mm7, %%mm7 \n\t"
371 #define FULL_YSCALEYUV2RGB \
372 "pxor %%mm7, %%mm7 \n\t"\
373 "movd %6, %%mm6 \n\t" /*yalpha1*/\
374 "punpcklwd %%mm6, %%mm6 \n\t"\
375 "punpcklwd %%mm6, %%mm6 \n\t"\
376 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
377 "punpcklwd %%mm5, %%mm5 \n\t"\
378 "punpcklwd %%mm5, %%mm5 \n\t"\
379 "xor %%"REG_a", %%"REG_a" \n\t"\
382 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
383 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
384 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
385 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
386 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
387 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
388 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
389 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
390 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
392 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
393 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
394 "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
395 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
396 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
397 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
398 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
399 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
402 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
403 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
404 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
405 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
406 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
407 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
408 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
411 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
412 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
413 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
414 "paddw %%mm1, %%mm3 \n\t" /* B*/\
415 "paddw %%mm1, %%mm0 \n\t" /* R*/\
416 "packuswb %%mm3, %%mm3 \n\t"\
418 "packuswb %%mm0, %%mm0 \n\t"\
419 "paddw %%mm4, %%mm2 \n\t"\
420 "paddw %%mm2, %%mm1 \n\t" /* G*/\
422 "packuswb %%mm1, %%mm1 \n\t"
425 #define REAL_YSCALEYUV2PACKED(index, c) \
426 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
427 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
428 "psraw $3, %%mm0 \n\t"\
429 "psraw $3, %%mm1 \n\t"\
430 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
431 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
432 "xor "#index", "#index" \n\t"\
435 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
436 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
437 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
439 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
440 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
441 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
442 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
443 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
444 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
445 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
446 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
447 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
448 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
449 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
450 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
451 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
452 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
456 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
457 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
458 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/* Wrapper so index/c are macro-expanded before REAL_* stringifies them with #
 * inside its asm template. */
461 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
463 #define REAL_YSCALEYUV2RGB(index, c) \
464 "xor "#index", "#index" \n\t"\
467 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
468 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
469 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
470 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
471 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
472 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
473 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
474 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
475 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
476 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
477 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
478 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
479 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
480 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
481 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
482 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
483 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
484 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
485 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
486 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
487 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
488 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
489 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
490 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
491 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
492 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
493 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
494 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
495 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
496 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
497 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
498 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
499 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
500 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
501 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
502 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
503 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
504 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
505 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
506 "paddw %%mm3, %%mm4 \n\t"\
507 "movq %%mm2, %%mm0 \n\t"\
508 "movq %%mm5, %%mm6 \n\t"\
509 "movq %%mm4, %%mm3 \n\t"\
510 "punpcklwd %%mm2, %%mm2 \n\t"\
511 "punpcklwd %%mm5, %%mm5 \n\t"\
512 "punpcklwd %%mm4, %%mm4 \n\t"\
513 "paddw %%mm1, %%mm2 \n\t"\
514 "paddw %%mm1, %%mm5 \n\t"\
515 "paddw %%mm1, %%mm4 \n\t"\
516 "punpckhwd %%mm0, %%mm0 \n\t"\
517 "punpckhwd %%mm6, %%mm6 \n\t"\
518 "punpckhwd %%mm3, %%mm3 \n\t"\
519 "paddw %%mm7, %%mm0 \n\t"\
520 "paddw %%mm7, %%mm6 \n\t"\
521 "paddw %%mm7, %%mm3 \n\t"\
522 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
523 "packuswb %%mm0, %%mm2 \n\t"\
524 "packuswb %%mm6, %%mm5 \n\t"\
525 "packuswb %%mm3, %%mm4 \n\t"\
526 "pxor %%mm7, %%mm7 \n\t"
/* Wrapper so index/c are macro-expanded before REAL_* stringifies them with #
 * inside its asm template. */
527 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
529 #define REAL_YSCALEYUV2PACKED1(index, c) \
530 "xor "#index", "#index" \n\t"\
533 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
534 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
535 "psraw $7, %%mm3 \n\t" \
536 "psraw $7, %%mm4 \n\t" \
537 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
538 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
539 "psraw $7, %%mm1 \n\t" \
540 "psraw $7, %%mm7 \n\t" \
542 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
544 #define REAL_YSCALEYUV2RGB1(index, c) \
545 "xor "#index", "#index" \n\t"\
548 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
549 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
550 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
551 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
552 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
553 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
554 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
555 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
556 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
557 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
558 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
559 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
560 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
561 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
562 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
563 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
564 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
565 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
566 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
567 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
568 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
569 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
570 "paddw %%mm3, %%mm4 \n\t"\
571 "movq %%mm2, %%mm0 \n\t"\
572 "movq %%mm5, %%mm6 \n\t"\
573 "movq %%mm4, %%mm3 \n\t"\
574 "punpcklwd %%mm2, %%mm2 \n\t"\
575 "punpcklwd %%mm5, %%mm5 \n\t"\
576 "punpcklwd %%mm4, %%mm4 \n\t"\
577 "paddw %%mm1, %%mm2 \n\t"\
578 "paddw %%mm1, %%mm5 \n\t"\
579 "paddw %%mm1, %%mm4 \n\t"\
580 "punpckhwd %%mm0, %%mm0 \n\t"\
581 "punpckhwd %%mm6, %%mm6 \n\t"\
582 "punpckhwd %%mm3, %%mm3 \n\t"\
583 "paddw %%mm7, %%mm0 \n\t"\
584 "paddw %%mm7, %%mm6 \n\t"\
585 "paddw %%mm7, %%mm3 \n\t"\
586 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
587 "packuswb %%mm0, %%mm2 \n\t"\
588 "packuswb %%mm6, %%mm5 \n\t"\
589 "packuswb %%mm3, %%mm4 \n\t"\
590 "pxor %%mm7, %%mm7 \n\t"
591 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
593 #define REAL_YSCALEYUV2PACKED1b(index, c) \
594 "xor "#index", "#index" \n\t"\
597 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
598 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
599 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
600 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
601 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
602 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
603 "psrlw $8, %%mm3 \n\t" \
604 "psrlw $8, %%mm4 \n\t" \
605 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
606 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
607 "psraw $7, %%mm1 \n\t" \
608 "psraw $7, %%mm7 \n\t"
609 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
611 // do vertical chrominance interpolation
612 #define REAL_YSCALEYUV2RGB1b(index, c) \
613 "xor "#index", "#index" \n\t"\
616 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
617 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
618 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
619 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
620 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
621 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
622 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
623 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
624 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
625 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
626 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
627 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
628 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
629 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
630 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
631 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
632 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
633 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
634 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
635 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
636 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
637 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
638 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
639 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
640 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
641 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
642 "paddw %%mm3, %%mm4 \n\t"\
643 "movq %%mm2, %%mm0 \n\t"\
644 "movq %%mm5, %%mm6 \n\t"\
645 "movq %%mm4, %%mm3 \n\t"\
646 "punpcklwd %%mm2, %%mm2 \n\t"\
647 "punpcklwd %%mm5, %%mm5 \n\t"\
648 "punpcklwd %%mm4, %%mm4 \n\t"\
649 "paddw %%mm1, %%mm2 \n\t"\
650 "paddw %%mm1, %%mm5 \n\t"\
651 "paddw %%mm1, %%mm4 \n\t"\
652 "punpckhwd %%mm0, %%mm0 \n\t"\
653 "punpckhwd %%mm6, %%mm6 \n\t"\
654 "punpckhwd %%mm3, %%mm3 \n\t"\
655 "paddw %%mm7, %%mm0 \n\t"\
656 "paddw %%mm7, %%mm6 \n\t"\
657 "paddw %%mm7, %%mm3 \n\t"\
658 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
659 "packuswb %%mm0, %%mm2 \n\t"\
660 "packuswb %%mm6, %%mm5 \n\t"\
661 "packuswb %%mm3, %%mm4 \n\t"\
662 "pxor %%mm7, %%mm7 \n\t"
663 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
665 #define REAL_WRITEBGR32(dst, dstw, index) \
666 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
667 "movq %%mm2, %%mm1 \n\t" /* B */\
668 "movq %%mm5, %%mm6 \n\t" /* R */\
669 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
670 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
671 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
672 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
673 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
674 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
675 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
676 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
677 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
678 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
680 MOVNTQ(%%mm0, (dst, index, 4))\
681 MOVNTQ(%%mm2, 8(dst, index, 4))\
682 MOVNTQ(%%mm1, 16(dst, index, 4))\
683 MOVNTQ(%%mm3, 24(dst, index, 4))\
685 "add $8, "#index" \n\t"\
686 "cmp "#dstw", "#index" \n\t"\
688 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
690 #define REAL_WRITEBGR16(dst, dstw, index) \
691 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
692 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
693 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
694 "psrlq $3, %%mm2 \n\t"\
696 "movq %%mm2, %%mm1 \n\t"\
697 "movq %%mm4, %%mm3 \n\t"\
699 "punpcklbw %%mm7, %%mm3 \n\t"\
700 "punpcklbw %%mm5, %%mm2 \n\t"\
701 "punpckhbw %%mm7, %%mm4 \n\t"\
702 "punpckhbw %%mm5, %%mm1 \n\t"\
704 "psllq $3, %%mm3 \n\t"\
705 "psllq $3, %%mm4 \n\t"\
707 "por %%mm3, %%mm2 \n\t"\
708 "por %%mm4, %%mm1 \n\t"\
710 MOVNTQ(%%mm2, (dst, index, 2))\
711 MOVNTQ(%%mm1, 8(dst, index, 2))\
713 "add $8, "#index" \n\t"\
714 "cmp "#dstw", "#index" \n\t"\
716 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
718 #define REAL_WRITEBGR15(dst, dstw, index) \
719 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
720 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
721 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
722 "psrlq $3, %%mm2 \n\t"\
723 "psrlq $1, %%mm5 \n\t"\
725 "movq %%mm2, %%mm1 \n\t"\
726 "movq %%mm4, %%mm3 \n\t"\
728 "punpcklbw %%mm7, %%mm3 \n\t"\
729 "punpcklbw %%mm5, %%mm2 \n\t"\
730 "punpckhbw %%mm7, %%mm4 \n\t"\
731 "punpckhbw %%mm5, %%mm1 \n\t"\
733 "psllq $2, %%mm3 \n\t"\
734 "psllq $2, %%mm4 \n\t"\
736 "por %%mm3, %%mm2 \n\t"\
737 "por %%mm4, %%mm1 \n\t"\
739 MOVNTQ(%%mm2, (dst, index, 2))\
740 MOVNTQ(%%mm1, 8(dst, index, 2))\
742 "add $8, "#index" \n\t"\
743 "cmp "#dstw", "#index" \n\t"\
745 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
747 #define WRITEBGR24OLD(dst, dstw, index) \
748 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
749 "movq %%mm2, %%mm1 \n\t" /* B */\
750 "movq %%mm5, %%mm6 \n\t" /* R */\
751 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
752 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
753 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
754 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
755 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
756 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
757 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
758 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
759 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
760 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
762 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
763 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
764 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
765 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
766 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
767 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
768 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
769 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
771 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
772 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
773 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
774 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
775 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
776 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
777 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
778 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
779 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
780 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
781 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
782 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
783 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
785 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
786 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
787 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
788 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
789 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
790 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
791 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
792 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
794 MOVNTQ(%%mm0, (dst))\
795 MOVNTQ(%%mm2, 8(dst))\
796 MOVNTQ(%%mm3, 16(dst))\
797 "add $24, "#dst" \n\t"\
799 "add $8, "#index" \n\t"\
800 "cmp "#dstw", "#index" \n\t"\
803 #define WRITEBGR24MMX(dst, dstw, index) \
804 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
805 "movq %%mm2, %%mm1 \n\t" /* B */\
806 "movq %%mm5, %%mm6 \n\t" /* R */\
807 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
808 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
809 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
810 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
811 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
812 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
813 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
814 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
815 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
816 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
818 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
819 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
820 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
821 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
823 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
824 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
825 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
826 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
828 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
829 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
830 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
831 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
833 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
834 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
835 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
836 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
837 MOVNTQ(%%mm0, (dst))\
839 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
840 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
841 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
842 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
843 MOVNTQ(%%mm6, 8(dst))\
845 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
846 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
847 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
848 MOVNTQ(%%mm5, 16(dst))\
850 "add $24, "#dst" \n\t"\
852 "add $8, "#index" \n\t"\
853 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24MMX2: MMX2 (pshufw-based) variant that packs B (mm2), G (mm4) and
 * R (mm5) word lanes into 24-bit BGR and stores 24 bytes per iteration via
 * three MOVNTQ writes using the ff_M24A/ff_M24B/ff_M24C byte masks; it then
 * advances dst by 24 and index by 8 and compares index against dstw.
 * NOTE(review): the original line numbers jump (862->864, 901->…), so blank
 * continuation lines and the trailing conditional branch are missing from
 * this listing — confirm against the full source file. */
856 #define WRITEBGR24MMX2(dst, dstw, index) \
857 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
858 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
859 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
860 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
861 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
862 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
864 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
865 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
866 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
868 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
869 "por %%mm1, %%mm6 \n\t"\
870 "por %%mm3, %%mm6 \n\t"\
871 MOVNTQ(%%mm6, (dst))\
873 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
874 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
875 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
876 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
878 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
879 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
880 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
882 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
883 "por %%mm3, %%mm6 \n\t"\
884 MOVNTQ(%%mm6, 8(dst))\
886 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
887 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
888 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
890 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
891 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
892 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
894 "por %%mm1, %%mm3 \n\t"\
895 "por %%mm3, %%mm6 \n\t"\
896 MOVNTQ(%%mm6, 16(dst))\
898 "add $24, "#dst" \n\t"\
900 "add $8, "#index" \n\t"\
901 "cmp "#dstw", "#index" \n\t"
/* WRITEBGR24: dispatch macro selecting the MMX2 or plain-MMX 24-bit writer.
 * NOTE(review): the two definitions are mutually exclusive in the full source
 * (guarded by an #ifdef HAVE_MMX2 / #else not visible in this listing). */
906 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
909 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/* REAL_WRITEYUY2 / WRITEYUY2: interleave the Y/U/V words held in mm1/mm3/mm4
 * (with mm7 as the high packing half) into packed YUYV and store 16 bytes per
 * iteration with two MOVNTQ writes at (dst + index*2); index advances by 8.
 * The WRITEYUY2 wrapper forces macro-argument expansion before stringization.
 * NOTE(review): line-number gaps (919->921, 925->927) indicate lines missing
 * from this listing, including the loop branch after the cmp. */
912 #define REAL_WRITEYUY2(dst, dstw, index) \
913 "packuswb %%mm3, %%mm3 \n\t"\
914 "packuswb %%mm4, %%mm4 \n\t"\
915 "packuswb %%mm7, %%mm1 \n\t"\
916 "punpcklbw %%mm4, %%mm3 \n\t"\
917 "movq %%mm1, %%mm7 \n\t"\
918 "punpcklbw %%mm3, %%mm1 \n\t"\
919 "punpckhbw %%mm3, %%mm7 \n\t"\
921 MOVNTQ(%%mm1, (dst, index, 2))\
922 MOVNTQ(%%mm7, 8(dst, index, 2))\
924 "add $8, "#index" \n\t"\
925 "cmp "#dstw", "#index" \n\t"
927 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* yuv2yuvX: vertically scale/filter planar YUV into separate Y (dest), U
 * (uDest) and V (vDest) planes.  The visible code chooses between the
 * YSCALEYUV2YV12X_ACCURATE and YSCALEYUV2YV12X MMX paths based on
 * SWS_ACCURATE_RND, with AltiVec (yuv2yuvX_altivec_real) and plain C
 * (yuv2yuvXinC) fallbacks.  Chroma passes run at offset 0 and at
 * AV_STRINGIFY(VOF) into the chroma source.
 * NOTE(review): the function braces, #ifdef HAVE_MMX/#elif HAVE_ALTIVEC
 * structure and several statements are missing from this listing (original
 * line numbers jump 932->935, 938->941, …) — consult the full file. */
930 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
931 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
932 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
935 if (c->flags & SWS_ACCURATE_RND){
937 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
938 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
941 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
944 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
945 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
948 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
952 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
953 chrFilter, chrSrc, chrFilterSize,
954 dest, uDest, vDest, dstW, chrDstW);
956 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
957 chrFilter, chrSrc, chrFilterSize,
958 dest, uDest, vDest, dstW, chrDstW);
959 #endif //!HAVE_ALTIVEC
960 #endif /* HAVE_MMX */
/* yuv2nv12X: NV12/NV21 vertical scaling; the visible body simply forwards to
 * the generic C implementation yuv2nv12XinC (dstFormat selects the chroma
 * interleave order there).  NOTE(review): function braces are missing from
 * this listing. */
963 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
964 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
965 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
967 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
968 chrFilter, chrSrc, chrFilterSize,
969 dest, uDest, dstW, chrDstW, dstFormat);
/* yuv2yuv1: unscaled (1:1) vertical pass — copies/clips one line of luma and
 * one line of each chroma plane.  The fragments show asm operand lists over
 * lumSrc/chrSrc (chroma V at offset VOFW) plus the C fallback that shifts the
 * 14-bit intermediates right by 7 and clamps to 0..255.
 * NOTE(review): most of this function (asm bodies, braces, the u/v lower
 * clamps) is missing from this listing — original line numbers jump
 * 973->980->987->995. */
972 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
973 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
980 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
987 :: "r" (chrSrc + VOFW + chrDstW), "r" (vDest + chrDstW),
995 :: "r" (lumSrc + dstW), "r" (dest + dstW),
1001 for (i=0; i<dstW; i++)
1003 int val= lumSrc[i]>>7;
1014 for (i=0; i<chrDstW; i++)
1017 int v=chrSrc[i + VOFW]>>7;
1021 else if (u>255) u=255;
1023 else if (v>255) v=255;
/* yuv2packedX: vertical scale + YUV->packed-RGB conversion in one pass.
 * Two switch ladders over c->dstFormat (accurate-rounding vs fast), each
 * pairing a YSCALEYUV2PACKEDX[_ACCURATE] prologue with the matching writer
 * (WRITEBGR32/24/15/16, WRITEYUY2); 15/16-bit paths add ordered dither via
 * the b5/g5|g6/r5 dither tables first.  Falls back to altivec_yuv2packedX
 * for the listed formats, else yuv2packedXinC.
 * NOTE(review): case labels, braces and the #ifdef structure are partially
 * missing from this listing (original line numbers jump throughout). */
1034 * vertical scale YV12 to RGB
1036 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1037 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1038 uint8_t *dest, long dstW, long dstY)
1042 if (c->flags & SWS_ACCURATE_RND){
1043 switch(c->dstFormat){
1045 YSCALEYUV2PACKEDX_ACCURATE
1047 WRITEBGR32(%4, %5, %%REGa)
1049 YSCALEYUV2PACKEDX_END
1052 YSCALEYUV2PACKEDX_ACCURATE
1054 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1055 "add %4, %%"REG_c" \n\t"
1056 WRITEBGR24(%%REGc, %5, %%REGa)
1059 :: "r" (&c->redDither),
1060 "m" (dummy), "m" (dummy), "m" (dummy),
1061 "r" (dest), "m" (dstW)
1062 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1065 case PIX_FMT_BGR555:
1066 YSCALEYUV2PACKEDX_ACCURATE
1068 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1070 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1071 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1072 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1075 WRITEBGR15(%4, %5, %%REGa)
1076 YSCALEYUV2PACKEDX_END
1078 case PIX_FMT_BGR565:
1079 YSCALEYUV2PACKEDX_ACCURATE
1081 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1083 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1084 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1085 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1088 WRITEBGR16(%4, %5, %%REGa)
1089 YSCALEYUV2PACKEDX_END
1091 case PIX_FMT_YUYV422:
1092 YSCALEYUV2PACKEDX_ACCURATE
1093 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1095 "psraw $3, %%mm3 \n\t"
1096 "psraw $3, %%mm4 \n\t"
1097 "psraw $3, %%mm1 \n\t"
1098 "psraw $3, %%mm7 \n\t"
1099 WRITEYUY2(%4, %5, %%REGa)
1100 YSCALEYUV2PACKEDX_END
1104 switch(c->dstFormat)
1109 WRITEBGR32(%4, %5, %%REGa)
1110 YSCALEYUV2PACKEDX_END
1115 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1116 "add %4, %%"REG_c" \n\t"
1117 WRITEBGR24(%%REGc, %5, %%REGa)
1119 :: "r" (&c->redDither),
1120 "m" (dummy), "m" (dummy), "m" (dummy),
1121 "r" (dest), "m" (dstW)
1122 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1125 case PIX_FMT_BGR555:
1128 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1130 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1131 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1132 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1135 WRITEBGR15(%4, %5, %%REGa)
1136 YSCALEYUV2PACKEDX_END
1138 case PIX_FMT_BGR565:
1141 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1143 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1144 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1145 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1148 WRITEBGR16(%4, %5, %%REGa)
1149 YSCALEYUV2PACKEDX_END
1151 case PIX_FMT_YUYV422:
1153 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1155 "psraw $3, %%mm3 \n\t"
1156 "psraw $3, %%mm4 \n\t"
1157 "psraw $3, %%mm1 \n\t"
1158 "psraw $3, %%mm7 \n\t"
1159 WRITEYUY2(%4, %5, %%REGa)
1160 YSCALEYUV2PACKEDX_END
1164 #endif /* HAVE_MMX */
1166 /* The following list of supported dstFormat values should
1167 match what's found in the body of altivec_yuv2packedX() */
1168 if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1169 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1170 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1171 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1172 chrFilter, chrSrc, chrFilterSize,
1176 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1177 chrFilter, chrSrc, chrFilterSize,
/* yuv2packed2: vertical bilinear blend of two source lines (buf0/buf1 with
 * weights yalpha1 = yalpha^4095 / yalpha, likewise for chroma) followed by
 * YUV->packed conversion.  Visible paths: an RGB32 writer, a BGR24 bit-shuffle
 * writer (with movntq / movd store variants), dithered BGR555 and BGR565
 * packers, then (after #ifdef HAVE_MMX ends) a per-format C fallback using the
 * yuvtab_* lookup tables and clip_table*, and finally the YSCALEYUV2RGB-based
 * switch using the shared write macros with ESP_OFFSET register spilling.
 * NOTE(review): large portions (asm prologues, case labels, braces, #ifdef
 * structure) are missing from this listing — original line numbers jump
 * throughout (e.g. 1188->1192, 1217->1227, 1336->1341). */
1182 * vertical bilinear scale YV12 to RGB
1184 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1185 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1187 int yalpha1=yalpha^4095;
1188 int uvalpha1=uvalpha^4095;
1192 if (flags&SWS_FULL_CHR_H_INT)
1202 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1203 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1205 "movq %%mm3, %%mm1 \n\t"
1206 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1207 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1209 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1210 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1212 "add $4, %%"REG_a" \n\t"
1213 "cmp %5, %%"REG_a" \n\t"
1216 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1217 "m" (yalpha1), "m" (uvalpha1)
1227 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1228 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1230 "movq %%mm3, %%mm1 \n\t"
1231 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1232 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1234 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1235 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1236 "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1237 "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1238 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1239 "movq %%mm1, %%mm2 \n\t"
1240 "psllq $48, %%mm1 \n\t" // 000000BG
1241 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1243 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1244 "psrld $16, %%mm2 \n\t" // R000R000
1245 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1246 "por %%mm2, %%mm1 \n\t" // RBGRR000
1248 "mov %4, %%"REG_b" \n\t"
1249 "add %%"REG_a", %%"REG_b" \n\t"
1253 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1254 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1256 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1257 "psrlq $32, %%mm3 \n\t"
1258 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1259 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1261 "add $4, %%"REG_a" \n\t"
1262 "cmp %5, %%"REG_a" \n\t"
1265 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1266 "m" (yalpha1), "m" (uvalpha1)
1267 : "%"REG_a, "%"REG_b
1270 case PIX_FMT_BGR555:
1275 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1276 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1277 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1279 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1280 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1281 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1283 "psrlw $3, %%mm3 \n\t"
1284 "psllw $2, %%mm1 \n\t"
1285 "psllw $7, %%mm0 \n\t"
1286 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1287 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1289 "por %%mm3, %%mm1 \n\t"
1290 "por %%mm1, %%mm0 \n\t"
1292 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1294 "add $4, %%"REG_a" \n\t"
1295 "cmp %5, %%"REG_a" \n\t"
1298 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1299 "m" (yalpha1), "m" (uvalpha1)
1303 case PIX_FMT_BGR565:
1308 "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
1309 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1310 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1312 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1313 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1314 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1316 "psrlw $3, %%mm3 \n\t"
1317 "psllw $3, %%mm1 \n\t"
1318 "psllw $8, %%mm0 \n\t"
1319 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1320 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1322 "por %%mm3, %%mm1 \n\t"
1323 "por %%mm1, %%mm0 \n\t"
1325 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1327 "add $4, %%"REG_a" \n\t"
1328 "cmp %5, %%"REG_a" \n\t"
1331 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1332 "m" (yalpha1), "m" (uvalpha1)
1336 #endif /* HAVE_MMX */
1341 if (dstFormat==PIX_FMT_RGB32)
1344 #ifdef WORDS_BIGENDIAN
1347 for (i=0;i<dstW;i++){
1348 // vertical linear interpolation && yuv2rgb in a single step:
1349 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1350 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1351 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1352 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1353 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1354 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1358 else if (dstFormat==PIX_FMT_BGR24)
1361 for (i=0;i<dstW;i++){
1362 // vertical linear interpolation && yuv2rgb in a single step:
1363 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1364 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1365 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1366 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1367 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1368 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1372 else if (dstFormat==PIX_FMT_BGR565)
1375 for (i=0;i<dstW;i++){
1376 // vertical linear interpolation && yuv2rgb in a single step:
1377 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1378 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1379 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1381 ((uint16_t*)dest)[i] =
1382 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1383 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1384 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1387 else if (dstFormat==PIX_FMT_BGR555)
1390 for (i=0;i<dstW;i++){
1391 // vertical linear interpolation && yuv2rgb in a single step:
1392 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1393 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1394 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1396 ((uint16_t*)dest)[i] =
1397 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1398 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1399 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1407 switch(c->dstFormat)
1409 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1412 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1413 "mov %4, %%"REG_b" \n\t"
1414 "push %%"REG_BP" \n\t"
1415 YSCALEYUV2RGB(%%REGBP, %5)
1416 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1417 "pop %%"REG_BP" \n\t"
1418 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1420 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1426 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1427 "mov %4, %%"REG_b" \n\t"
1428 "push %%"REG_BP" \n\t"
1429 YSCALEYUV2RGB(%%REGBP, %5)
1430 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1431 "pop %%"REG_BP" \n\t"
1432 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1433 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1437 case PIX_FMT_BGR555:
1439 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1440 "mov %4, %%"REG_b" \n\t"
1441 "push %%"REG_BP" \n\t"
1442 YSCALEYUV2RGB(%%REGBP, %5)
1443 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1445 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1446 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1447 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1450 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1451 "pop %%"REG_BP" \n\t"
1452 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1454 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1458 case PIX_FMT_BGR565:
1460 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1461 "mov %4, %%"REG_b" \n\t"
1462 "push %%"REG_BP" \n\t"
1463 YSCALEYUV2RGB(%%REGBP, %5)
1464 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1466 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1467 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1468 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1471 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1472 "pop %%"REG_BP" \n\t"
1473 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1474 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1478 case PIX_FMT_YUYV422:
1480 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1481 "mov %4, %%"REG_b" \n\t"
1482 "push %%"REG_BP" \n\t"
1483 YSCALEYUV2PACKED(%%REGBP, %5)
1484 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1485 "pop %%"REG_BP" \n\t"
1486 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1487 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1494 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
/* yuv2packed1: convert one YV12 line to packed RGB/YUYV without vertical
 * scaling.  For full-chroma-interpolation it defers to yuv2packed2 with
 * buf1 == buf0.  Otherwise it branches on uvalpha: < 2048 uses the single
 * chroma line (YSCALEYUV2RGB1 / YSCALEYUV2PACKED1, a deliberate 0.5-pixel
 * chroma approximation per the comment), else the averaging variants
 * (YSCALEYUV2RGB1b / YSCALEYUV2PACKED1b).  Each case spills REG_b via
 * ESP_OFFSET and pushes REG_BP around the write macro, then falls back to
 * the C macro expansions YSCALE_YUV_2_RGB1[B]_C at the end.
 * NOTE(review): braces, case labels and asm delimiters are partially missing
 * from this listing (original line numbering jumps throughout). */
1498 * YV12 to RGB without scaling or interpolating
1500 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1501 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1503 const int yalpha1=0;
1506 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1507 const int yalpha= 4096; //FIXME ...
1509 if (flags&SWS_FULL_CHR_H_INT)
1511 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1516 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1522 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1523 "mov %4, %%"REG_b" \n\t"
1524 "push %%"REG_BP" \n\t"
1525 YSCALEYUV2RGB1(%%REGBP, %5)
1526 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1527 "pop %%"REG_BP" \n\t"
1528 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1530 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1536 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1537 "mov %4, %%"REG_b" \n\t"
1538 "push %%"REG_BP" \n\t"
1539 YSCALEYUV2RGB1(%%REGBP, %5)
1540 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1541 "pop %%"REG_BP" \n\t"
1542 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1544 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1548 case PIX_FMT_BGR555:
1550 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1551 "mov %4, %%"REG_b" \n\t"
1552 "push %%"REG_BP" \n\t"
1553 YSCALEYUV2RGB1(%%REGBP, %5)
1554 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1556 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1557 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1558 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1560 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1561 "pop %%"REG_BP" \n\t"
1562 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1564 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1568 case PIX_FMT_BGR565:
1570 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1571 "mov %4, %%"REG_b" \n\t"
1572 "push %%"REG_BP" \n\t"
1573 YSCALEYUV2RGB1(%%REGBP, %5)
1574 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1576 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1577 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1578 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1581 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1582 "pop %%"REG_BP" \n\t"
1583 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1585 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1589 case PIX_FMT_YUYV422:
1591 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1592 "mov %4, %%"REG_b" \n\t"
1593 "push %%"REG_BP" \n\t"
1594 YSCALEYUV2PACKED1(%%REGBP, %5)
1595 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1596 "pop %%"REG_BP" \n\t"
1597 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1599 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1611 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1612 "mov %4, %%"REG_b" \n\t"
1613 "push %%"REG_BP" \n\t"
1614 YSCALEYUV2RGB1b(%%REGBP, %5)
1615 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1616 "pop %%"REG_BP" \n\t"
1617 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1619 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1625 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1626 "mov %4, %%"REG_b" \n\t"
1627 "push %%"REG_BP" \n\t"
1628 YSCALEYUV2RGB1b(%%REGBP, %5)
1629 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1630 "pop %%"REG_BP" \n\t"
1631 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1633 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1637 case PIX_FMT_BGR555:
1639 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1640 "mov %4, %%"REG_b" \n\t"
1641 "push %%"REG_BP" \n\t"
1642 YSCALEYUV2RGB1b(%%REGBP, %5)
1643 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1645 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1646 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1647 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1649 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1650 "pop %%"REG_BP" \n\t"
1651 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1653 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1657 case PIX_FMT_BGR565:
1659 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1660 "mov %4, %%"REG_b" \n\t"
1661 "push %%"REG_BP" \n\t"
1662 YSCALEYUV2RGB1b(%%REGBP, %5)
1663 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1665 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1666 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1667 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1670 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1671 "pop %%"REG_BP" \n\t"
1672 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1674 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1678 case PIX_FMT_YUYV422:
1680 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1681 "mov %4, %%"REG_b" \n\t"
1682 "push %%"REG_BP" \n\t"
1683 YSCALEYUV2PACKED1b(%%REGBP, %5)
1684 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1685 "pop %%"REG_BP" \n\t"
1686 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1688 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1694 #endif /* HAVE_MMX */
1697 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1699 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
/* yuy2ToY: extract the luma bytes (even bytes) of a YUYV line into dst.
 * MMX path: mask with bm01010101, pack, store 8 Y bytes per iteration
 * (counting a negative index up to zero); C fallback copies src[2*i].
 * NOTE(review): asm delimiters, the loop label and the C loop body are
 * missing from this listing. */
1703 //FIXME yuy2* can read up to 7 samples too much
1705 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1709 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1710 "mov %0, %%"REG_a" \n\t"
1712 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1713 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1714 "pand %%mm2, %%mm0 \n\t"
1715 "pand %%mm2, %%mm1 \n\t"
1716 "packuswb %%mm1, %%mm0 \n\t"
1717 "movq %%mm0, (%2, %%"REG_a") \n\t"
1718 "add $8, %%"REG_a" \n\t"
1720 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1725 for (i=0; i<width; i++)
/* yuy2ToUV: deinterleave the chroma bytes of a YUYV line into dstU/dstV.
 * MMX path processes 4 chroma pairs per iteration: shift out Y, pack, then
 * split U (masked with bm01010101) from V (shifted); movd stores 4 bytes to
 * each plane.  C fallback reads src1[4*i+1]/src1[4*i+3].  The assert
 * documents that src1 and src2 must alias the same line.
 * NOTE(review): asm delimiters and loop labels are missing from this listing. */
1730 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1734 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1735 "mov %0, %%"REG_a" \n\t"
1737 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1738 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1739 "psrlw $8, %%mm0 \n\t"
1740 "psrlw $8, %%mm1 \n\t"
1741 "packuswb %%mm1, %%mm0 \n\t"
1742 "movq %%mm0, %%mm1 \n\t"
1743 "psrlw $8, %%mm0 \n\t"
1744 "pand %%mm4, %%mm1 \n\t"
1745 "packuswb %%mm0, %%mm0 \n\t"
1746 "packuswb %%mm1, %%mm1 \n\t"
1747 "movd %%mm0, (%3, %%"REG_a") \n\t"
1748 "movd %%mm1, (%2, %%"REG_a") \n\t"
1749 "add $4, %%"REG_a" \n\t"
1751 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1756 for (i=0; i<width; i++)
1758 dstU[i]= src1[4*i + 1];
1759 dstV[i]= src1[4*i + 3];
1762 assert(src1 == src2);
/* uyvyToY: extract luma (odd bytes) from a UYVY line; MMX path shifts each
 * word right by 8 instead of masking, then packs and stores 8 Y bytes per
 * iteration.  C fallback copies every second byte.
 * NOTE(review): asm delimiters, loop label and the C loop body are missing
 * from this listing. */
1765 /* This is almost identical to the previous, and exists only because
1766 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1767 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1771 "mov %0, %%"REG_a" \n\t"
1773 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1774 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1775 "psrlw $8, %%mm0 \n\t"
1776 "psrlw $8, %%mm1 \n\t"
1777 "packuswb %%mm1, %%mm0 \n\t"
1778 "movq %%mm0, (%2, %%"REG_a") \n\t"
1779 "add $8, %%"REG_a" \n\t"
1781 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1786 for (i=0; i<width; i++)
/* uyvyToUV: deinterleave chroma from a UYVY line (U at byte 0, V at byte 2 of
 * each 4-byte group, per the C fallback).  Mirrors yuy2ToUV but masks with
 * bm01010101 first instead of shifting.  src1 must equal src2 (assert).
 * NOTE(review): asm delimiters and loop labels are missing from this listing. */
1791 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1795 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1796 "mov %0, %%"REG_a" \n\t"
1798 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1799 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1800 "pand %%mm4, %%mm0 \n\t"
1801 "pand %%mm4, %%mm1 \n\t"
1802 "packuswb %%mm1, %%mm0 \n\t"
1803 "movq %%mm0, %%mm1 \n\t"
1804 "psrlw $8, %%mm0 \n\t"
1805 "pand %%mm4, %%mm1 \n\t"
1806 "packuswb %%mm0, %%mm0 \n\t"
1807 "packuswb %%mm1, %%mm1 \n\t"
1808 "movd %%mm0, (%3, %%"REG_a") \n\t"
1809 "movd %%mm1, (%2, %%"REG_a") \n\t"
1810 "add $4, %%"REG_a" \n\t"
1812 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1817 for (i=0; i<width; i++)
1819 dstU[i]= src1[4*i + 0];
1820 dstV[i]= src1[4*i + 2];
1823 assert(src1 == src2);
/* bgr32ToY: scalar conversion of 32-bit pixels (B in the low byte, G, R in
 * successively higher bytes) to luma using the RY/GY/BY weights with rounding
 * term (33<<(RGB2YUV_SHIFT-1)).  NOTE(review): braces and the `int i;`
 * declaration are missing from this listing. */
1826 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1829 for (i=0; i<width; i++)
1831 int b= ((uint32_t*)src)[i]&0xFF;
1832 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1833 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1835 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/* bgr32ToUV: scalar chroma conversion averaging two adjacent 32-bit pixels
 * (a, e) via masked pairwise sums l (R+B channels) and h (G channel), then
 * applying RU/GU/BU and RV/GV/BV with an extra >>1 for the 2-pixel average.
 * NOTE(review): the extraction of g and r from l/h is missing from this
 * listing (lines 1850-1852 absent); only b = l&0x3FF is visible. */
1839 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1842 assert(src1 == src2);
1843 for (i=0; i<width; i++)
1845 const int a= ((uint32_t*)src1)[2*i+0];
1846 const int e= ((uint32_t*)src1)[2*i+1];
1847 const int l= (a&0xFF00FF) + (e&0xFF00FF);
1848 const int h= (a&0x00FF00) + (e&0x00FF00);
1849 const int b= l&0x3FF;
1853 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1854 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
/* bgr24ToY: convert 24-bit BGR to luma.  MMX path: 8 pixels per iteration,
 * loading 4x movd (3-byte stride), pmaddwd against ff_bgr2YCoeff, optional
 * extra precision reduction when FAST_BGR2YV12 is not defined (psrad $8),
 * horizontal add via pmaddwd with ff_w1111, then pack and bias with
 * ff_bgr2YOffset before an 8-byte store.  C fallback uses the RY/GY/BY
 * weighted sum with rounding.  NOTE(review): asm delimiters, loop labels,
 * #endif lines and the b/g/r loads of the C path are missing from this
 * listing. */
1858 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1862 "mov %2, %%"REG_a" \n\t"
1863 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1864 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1865 "pxor %%mm7, %%mm7 \n\t"
1866 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1869 PREFETCH" 64(%0, %%"REG_d") \n\t"
1870 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1871 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1872 "punpcklbw %%mm7, %%mm0 \n\t"
1873 "punpcklbw %%mm7, %%mm1 \n\t"
1874 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1875 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1876 "punpcklbw %%mm7, %%mm2 \n\t"
1877 "punpcklbw %%mm7, %%mm3 \n\t"
1878 "pmaddwd %%mm6, %%mm0 \n\t"
1879 "pmaddwd %%mm6, %%mm1 \n\t"
1880 "pmaddwd %%mm6, %%mm2 \n\t"
1881 "pmaddwd %%mm6, %%mm3 \n\t"
1882 #ifndef FAST_BGR2YV12
1883 "psrad $8, %%mm0 \n\t"
1884 "psrad $8, %%mm1 \n\t"
1885 "psrad $8, %%mm2 \n\t"
1886 "psrad $8, %%mm3 \n\t"
1888 "packssdw %%mm1, %%mm0 \n\t"
1889 "packssdw %%mm3, %%mm2 \n\t"
1890 "pmaddwd %%mm5, %%mm0 \n\t"
1891 "pmaddwd %%mm5, %%mm2 \n\t"
1892 "packssdw %%mm2, %%mm0 \n\t"
1893 "psraw $7, %%mm0 \n\t"
1895 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1896 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1897 "punpcklbw %%mm7, %%mm4 \n\t"
1898 "punpcklbw %%mm7, %%mm1 \n\t"
1899 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1900 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1901 "punpcklbw %%mm7, %%mm2 \n\t"
1902 "punpcklbw %%mm7, %%mm3 \n\t"
1903 "pmaddwd %%mm6, %%mm4 \n\t"
1904 "pmaddwd %%mm6, %%mm1 \n\t"
1905 "pmaddwd %%mm6, %%mm2 \n\t"
1906 "pmaddwd %%mm6, %%mm3 \n\t"
1907 #ifndef FAST_BGR2YV12
1908 "psrad $8, %%mm4 \n\t"
1909 "psrad $8, %%mm1 \n\t"
1910 "psrad $8, %%mm2 \n\t"
1911 "psrad $8, %%mm3 \n\t"
1913 "packssdw %%mm1, %%mm4 \n\t"
1914 "packssdw %%mm3, %%mm2 \n\t"
1915 "pmaddwd %%mm5, %%mm4 \n\t"
1916 "pmaddwd %%mm5, %%mm2 \n\t"
1917 "add $24, %%"REG_d" \n\t"
1918 "packssdw %%mm2, %%mm4 \n\t"
1919 "psraw $7, %%mm4 \n\t"
1921 "packuswb %%mm4, %%mm0 \n\t"
1922 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1924 "movq %%mm0, (%1, %%"REG_a") \n\t"
1925 "add $8, %%"REG_a" \n\t"
1927 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1928 : "%"REG_a, "%"REG_d
1932 for (i=0; i<width; i++)
1938 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1940 #endif /* HAVE_MMX */
/* bgr24ToUV: convert 24-bit BGR to subsampled chroma (averaging pixel pairs).
 * MMX path: loads pixel pairs (movq + psrlq on MMX2/3DNow, paired movd adds
 * otherwise), averages (psrlw $1 / $2), applies ff_bgr2UCoeff (U, in mm6) and
 * ff_bgr2VCoeff (V, in mm1/mm3) via pmaddwd, reduces with ff_w1111, packs two
 * U/V quads, biases with ff_bgr2UVOffset and movd-stores 4 bytes per plane.
 * C fallback sums adjacent pixels' b/g/r and applies RU/GU/BU and RV/GV/BV
 * with the extra >>1.  src1 must equal src2 (assert at the end).
 * NOTE(review): asm delimiters, loop labels, several #else/#endif lines and
 * blank lines are missing from this listing. */
1943 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1947 "mov %3, %%"REG_a" \n\t"
1948 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1949 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1950 "pxor %%mm7, %%mm7 \n\t"
1951 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1952 "add %%"REG_d", %%"REG_d" \n\t"
1955 PREFETCH" 64(%0, %%"REG_d") \n\t"
1956 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1957 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1958 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1959 "movq %%mm0, %%mm1 \n\t"
1960 "movq %%mm2, %%mm3 \n\t"
1961 "psrlq $24, %%mm0 \n\t"
1962 "psrlq $24, %%mm2 \n\t"
1965 "punpcklbw %%mm7, %%mm0 \n\t"
1966 "punpcklbw %%mm7, %%mm2 \n\t"
1968 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1969 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1970 "punpcklbw %%mm7, %%mm0 \n\t"
1971 "punpcklbw %%mm7, %%mm2 \n\t"
1972 "paddw %%mm2, %%mm0 \n\t"
1973 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1974 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1975 "punpcklbw %%mm7, %%mm4 \n\t"
1976 "punpcklbw %%mm7, %%mm2 \n\t"
1977 "paddw %%mm4, %%mm2 \n\t"
1978 "psrlw $1, %%mm0 \n\t"
1979 "psrlw $1, %%mm2 \n\t"
1981 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1982 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1984 "pmaddwd %%mm0, %%mm1 \n\t"
1985 "pmaddwd %%mm2, %%mm3 \n\t"
1986 "pmaddwd %%mm6, %%mm0 \n\t"
1987 "pmaddwd %%mm6, %%mm2 \n\t"
1988 #ifndef FAST_BGR2YV12
1989 "psrad $8, %%mm0 \n\t"
1990 "psrad $8, %%mm1 \n\t"
1991 "psrad $8, %%mm2 \n\t"
1992 "psrad $8, %%mm3 \n\t"
1994 "packssdw %%mm2, %%mm0 \n\t"
1995 "packssdw %%mm3, %%mm1 \n\t"
1996 "pmaddwd %%mm5, %%mm0 \n\t"
1997 "pmaddwd %%mm5, %%mm1 \n\t"
1998 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1999 "psraw $7, %%mm0 \n\t"
2001 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2002 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2003 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2004 "movq %%mm4, %%mm1 \n\t"
2005 "movq %%mm2, %%mm3 \n\t"
2006 "psrlq $24, %%mm4 \n\t"
2007 "psrlq $24, %%mm2 \n\t"
2010 "punpcklbw %%mm7, %%mm4 \n\t"
2011 "punpcklbw %%mm7, %%mm2 \n\t"
2013 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2014 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2015 "punpcklbw %%mm7, %%mm4 \n\t"
2016 "punpcklbw %%mm7, %%mm2 \n\t"
2017 "paddw %%mm2, %%mm4 \n\t"
2018 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2019 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2020 "punpcklbw %%mm7, %%mm5 \n\t"
2021 "punpcklbw %%mm7, %%mm2 \n\t"
2022 "paddw %%mm5, %%mm2 \n\t"
2023 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2024 "psrlw $2, %%mm4 \n\t"
2025 "psrlw $2, %%mm2 \n\t"
2027 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2028 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2030 "pmaddwd %%mm4, %%mm1 \n\t"
2031 "pmaddwd %%mm2, %%mm3 \n\t"
2032 "pmaddwd %%mm6, %%mm4 \n\t"
2033 "pmaddwd %%mm6, %%mm2 \n\t"
2034 #ifndef FAST_BGR2YV12
2035 "psrad $8, %%mm4 \n\t"
2036 "psrad $8, %%mm1 \n\t"
2037 "psrad $8, %%mm2 \n\t"
2038 "psrad $8, %%mm3 \n\t"
2040 "packssdw %%mm2, %%mm4 \n\t"
2041 "packssdw %%mm3, %%mm1 \n\t"
2042 "pmaddwd %%mm5, %%mm4 \n\t"
2043 "pmaddwd %%mm5, %%mm1 \n\t"
2044 "add $24, %%"REG_d" \n\t"
2045 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2046 "psraw $7, %%mm4 \n\t"
2048 "movq %%mm0, %%mm1 \n\t"
2049 "punpckldq %%mm4, %%mm0 \n\t"
2050 "punpckhdq %%mm4, %%mm1 \n\t"
2051 "packsswb %%mm1, %%mm0 \n\t"
2052 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2054 "movd %%mm0, (%1, %%"REG_a") \n\t"
2055 "punpckhdq %%mm0, %%mm0 \n\t"
2056 "movd %%mm0, (%2, %%"REG_a") \n\t"
2057 "add $4, %%"REG_a" \n\t"
2059 : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2060 : "%"REG_a, "%"REG_d
2064 for (i=0; i<width; i++)
2066 int b= src1[6*i + 0] + src1[6*i + 3];
2067 int g= src1[6*i + 1] + src1[6*i + 4];
2068 int r= src1[6*i + 2] + src1[6*i + 5];
2070 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2071 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2073 #endif /* HAVE_MMX */
2074 assert(src1 == src2);
/* rgb16ToY: scalar 16-bit (565, r in the top 5 bits) to luma; the 2*RY/2*BY
 * weighting and >>(RGB2YUV_SHIFT-2) compensate for the reduced bit depths.
 * NOTE(review): the b and g extractions (d&0x1F, (d>>5)&0x3F) are missing
 * from this listing. */
2077 static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2080 for (i=0; i<width; i++)
2082 int d= ((uint16_t*)src)[i];
2085 int r= (d>>11)&0x1F;
2087 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/* rgb16ToUV: scalar chroma from 16-bit 565 pixels, averaging two pixels by
 * summing the masked halves of a 32-bit load (dl/dh trick) before extracting
 * r (top 7 bits of the doubled field) and applying the U/V weights.
 * NOTE(review): the b/g extraction lines are missing from this listing. */
2091 static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2095 for (i=0; i<width; i++)
2097 int d0= ((uint32_t*)src1)[i];
2099 int dl= (d0&0x07E0F81F);
2100 int dh= ((d0>>5)&0x07C0F83F);
2102 int dh2= (dh>>11) + (dh<<21);
2106 int r= (d>>11)&0x7F;
2108 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2109 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
/* rgb15ToY: scalar 15-bit (555, r in bits 10..14) to luma with
 * >>(RGB2YUV_SHIFT-3) depth compensation.  NOTE(review): the b/g
 * extraction lines are missing from this listing. */
2113 static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2116 for (i=0; i<width; i++)
2118 int d= ((uint16_t*)src)[i];
2121 int r= (d>>10)&0x1F;
2123 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/* rgb15ToUV: scalar chroma from 15-bit 555 pixels; same paired-pixel masked
 * sum technique as rgb16ToUV with 555 masks.  NOTE(review): the b/g
 * extraction lines are missing from this listing. */
2127 static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2131 for (i=0; i<width; i++)
2133 int d0= ((uint32_t*)src1)[i];
2135 int dl= (d0&0x03E07C1F);
2136 int dh= ((d0>>5)&0x03E0F81F);
2138 int dh2= (dh>>11) + (dh<<21);
2142 int r= (d>>10)&0x7F;
2144 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2145 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
/* rgb32ToY: like bgr32ToY but with r in the low byte and b in bits 16..23
 * (opposite channel order).  NOTE(review): braces and the `int i;`
 * declaration are missing from this listing. */
2150 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2153 for (i=0; i<width; i++)
2155 int r= ((uint32_t*)src)[i]&0xFF;
2156 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2157 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2159 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/* rgb32ToUV: chroma counterpart of rgb32ToY, pairwise-averaging two pixels
 * via masked sums l/h; here r = l&0x3FF (low channel) — mirror image of
 * bgr32ToUV.  NOTE(review): the g and b extractions from l/h are missing
 * from this listing. */
2163 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2167 for (i=0; i<width; i++)
2169 const int a= ((uint32_t*)src1)[2*i+0];
2170 const int e= ((uint32_t*)src1)[2*i+1];
2171 const int l= (a&0xFF00FF) + (e&0xFF00FF);
2172 const int h= (a&0x00FF00) + (e&0x00FF00);
2173 const int r= l&0x3FF;
2177 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2178 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
/* rgb24ToY: scalar 24-bit RGB to luma (same weighted sum as rgb32ToY).
 * NOTE(review): the per-pixel r/g/b byte loads are missing from this
 * listing (original lines 2186-2190 absent). */
2182 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2185 for (i=0; i<width; i++)
2191 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/* rgb24ToUV: scalar chroma from 24-bit RGB, summing two adjacent pixels per
 * channel (r at offset 0, g at 1, b at 2 of each 3-byte pixel) and applying
 * the U/V weights with the extra >>1 for the average.  NOTE(review): braces
 * and the `int i;` declaration are missing from this listing. */
2195 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2199 for (i=0; i<width; i++)
2201 int r= src1[6*i + 0] + src1[6*i + 3];
2202 int g= src1[6*i + 1] + src1[6*i + 4];
2203 int b= src1[6*i + 2] + src1[6*i + 5];
2205 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2206 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
// Convert a line of 16 bpp (5-6-5) pixels to 8-bit Y. The 5-bit r/b fields
// are doubled so they sit on the same scale as the 6-bit green before the
// common (RGB2YUV_SHIFT-2) shift; +16 is the limited-range luma offset.
// NOTE(review): the r/g extractions and braces are elided in this extraction;
// code left byte-identical.
2210 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2213     for (i=0; i<width; i++)
2215         int d= ((uint16_t*)src)[i];
2218         int b= (d>>11)&0x1F;   // top 5-bit field (bits 11..15)
2220         dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
// Convert a line of 16 bpp (5-6-5) pixels to U/V, averaging two adjacent
// pixels per output sample via a single 32-bit load and field-masked sums.
// Requires both chroma source pointers to be the same buffer (asserted).
// NOTE(review): braces and the r/g extractions are elided in this extraction;
// code left byte-identical.
2224 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2227     assert(src1 == src2);   // pixel pairs are read from src1 only
2228     for (i=0; i<width; i++)
2230         int d0= ((uint32_t*)src1)[i];                       // two 16 bpp pixels at once
2232         int dl= (d0&0x07E0F81F);                            // even pixel's 5-6-5 fields
2233         int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);     // plus the swapped odd pixel's fields
2236         int b= (d>>11)&0x3F;                                // summed 5-bit field needs 6 bits
2238         dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2239         dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
// Convert a line of 15 bpp (5-5-5) pixels to 8-bit Y; all fields are 5-bit,
// hence the uniform (RGB2YUV_SHIFT-3) shift. +16 is the luma offset.
// NOTE(review): the r/g extractions and braces are elided in this extraction;
// code left byte-identical.
2243 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2246     for (i=0; i<width; i++)
2248         int d= ((uint16_t*)src)[i];
2251         int b= (d>>10)&0x1F;   // 5-bit field at bits 10..14
2253         dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
// Convert a line of 15 bpp (5-5-5) pixels to U/V, averaging two adjacent
// pixels per output sample (32-bit load, masked field sums). Requires both
// chroma source pointers to alias the same buffer (asserted).
// NOTE(review): braces and the r/g extractions are elided in this extraction;
// code left byte-identical.
2257 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2260     assert(src1 == src2);   // pixel pairs are read from src1 only
2261     for (i=0; i<width; i++)
2263         int d0= ((uint32_t*)src1)[i];                       // two 15 bpp pixels at once
2265         int dl= (d0&0x03E07C1F);                            // even pixel's 5-5-5 fields
2266         int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);     // plus the swapped odd pixel's fields
2269         int b= (d>>10)&0x3F;                                // summed 5-bit field needs 6 bits
2271         dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2272         dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
// Paletted input to 8-bit Y: look up each index in the 32-bit palette and
// take the low byte of the entry.
// NOTE(review): braces and the declaration of d are elided in this
// extraction; the palette presumably stores Y in its low byte — confirm
// against the palette builder. Code left byte-identical.
2276 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2279     for (i=0; i<width; i++)
2283         dst[i]= pal[d] & 0xFF;   // low byte of the palette entry
// Paletted input to U/V: look up each index from src1 in the 32-bit palette.
// Requires both chroma source pointers to alias the same buffer (asserted).
// NOTE(review): the lines extracting U/V from p and the braces are elided in
// this extraction; code left byte-identical.
2287 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2290     assert(src1 == src2);   // indices are read from src1 only
2291     for (i=0; i<width; i++)
2293         int p= pal[src1[i]];   // full 32-bit palette entry
2300 // Bilinear / Bicubic scaling
/*
 * Generic horizontal scaler: for each of dstW output samples, convolve
 * filterSize source bytes starting at filterPos[i] with the int16 filter
 * coefficients; the C reference path shows the contract: result is
 * val>>7, clipped to [0, 2^15-1] (see av_clip at the bottom).
 * Dispatches between MMX fast paths specialized for filterSize 4 and 8,
 * a generic MMX loop, an AltiVec call, and the portable C loop — the
 * preprocessor conditionals selecting them are elided in this extraction.
 * NOTE(review): many original lines (asm volatile(...) openers, braces,
 * labels, #if/#else directives) are elided here; code left byte-identical.
 */
2301 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2302                                   int16_t *filter, int16_t *filterPos, long filterSize)
2305     assert(filterSize % 4 == 0 && filterSize>0);
2306     if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2308         long counter= -2*dstW;
2310         filterPos-= counter/2;
// 4-tap MMX path: one pmaddwd pair per pair of output samples; %%ebp is
// repurposed as the loop counter (hence the push/pop around the loop).
2314         "push %%"REG_b"                 \n\t"
2316         "pxor %%mm7, %%mm7              \n\t"
2317         "movq "MANGLE(w02)", %%mm6      \n\t"
2318         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2319         "mov %%"REG_a", %%"REG_BP"      \n\t"
2322         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2323         "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2324         "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2325         "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2326         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2327         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2328         "punpcklbw %%mm7, %%mm0         \n\t"
2329         "punpcklbw %%mm7, %%mm2         \n\t"
2330         "pmaddwd %%mm1, %%mm0           \n\t"
2331         "pmaddwd %%mm2, %%mm3           \n\t"
2332         "psrad $8, %%mm0                \n\t"
2333         "psrad $8, %%mm3                \n\t"
2334         "packssdw %%mm3, %%mm0          \n\t"
2335         "pmaddwd %%mm6, %%mm0           \n\t"
2336         "packssdw %%mm0, %%mm0          \n\t"
2337         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2338         "add $4, %%"REG_BP"             \n\t"
2341         "pop %%"REG_BP"                 \n\t"
2343         "pop %%"REG_b"                  \n\t"
2346         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// 8-tap MMX path: same structure, two pmaddwd accumulation rounds per output.
2352     else if (filterSize==8)
2354         long counter= -2*dstW;
2356         filterPos-= counter/2;
2360         "push %%"REG_b"                 \n\t"
2362         "pxor %%mm7, %%mm7              \n\t"
2363         "movq "MANGLE(w02)", %%mm6      \n\t"
2364         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2365         "mov %%"REG_a", %%"REG_BP"      \n\t"
2368         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2369         "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2370         "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2371         "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2372         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2373         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2374         "punpcklbw %%mm7, %%mm0         \n\t"
2375         "punpcklbw %%mm7, %%mm2         \n\t"
2376         "pmaddwd %%mm1, %%mm0           \n\t"
2377         "pmaddwd %%mm2, %%mm3           \n\t"
// second half of the 8-tap filter (coefficients 4..7, source bytes +4)
2379         "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2380         "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2381         "movd 4(%3, %%"REG_a"), %%mm4   \n\t"
2382         "movd 4(%3, %%"REG_b"), %%mm2   \n\t"
2383         "punpcklbw %%mm7, %%mm4         \n\t"
2384         "punpcklbw %%mm7, %%mm2         \n\t"
2385         "pmaddwd %%mm1, %%mm4           \n\t"
2386         "pmaddwd %%mm2, %%mm5           \n\t"
2387         "paddd %%mm4, %%mm0             \n\t"
2388         "paddd %%mm5, %%mm3             \n\t"
2390         "psrad $8, %%mm0                \n\t"
2391         "psrad $8, %%mm3                \n\t"
2392         "packssdw %%mm3, %%mm0          \n\t"
2393         "pmaddwd %%mm6, %%mm0           \n\t"
2394         "packssdw %%mm0, %%mm0          \n\t"
2395         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2396         "add $4, %%"REG_BP"             \n\t"
2399         "pop %%"REG_BP"                 \n\t"
2401         "pop %%"REG_b"                  \n\t"
2404         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// Generic MMX path: inner loop walks the filter 4 coefficients at a time
// (mm4/mm5 accumulate two output samples), compared against 'offset' (end of
// the current filter row) to terminate.
2412         uint8_t *offset = src+filterSize;
2413         long counter= -2*dstW;
2414         //filter-= counter*filterSize/2;
2415         filterPos-= counter/2;
2418         "pxor %%mm7, %%mm7              \n\t"
2419         "movq "MANGLE(w02)", %%mm6      \n\t"
2422         "mov %2, %%"REG_c"              \n\t"
2423         "movzwl (%%"REG_c", %0), %%eax  \n\t"
2424         "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2425         "mov %5, %%"REG_c"              \n\t"
2426         "pxor %%mm4, %%mm4              \n\t"
2427         "pxor %%mm5, %%mm5              \n\t"
2429         "movq (%1), %%mm1               \n\t"
2430         "movq (%1, %6), %%mm3           \n\t"
2431         "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2432         "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2433         "punpcklbw %%mm7, %%mm0         \n\t"
2434         "punpcklbw %%mm7, %%mm2         \n\t"
2435         "pmaddwd %%mm1, %%mm0           \n\t"
2436         "pmaddwd %%mm2, %%mm3           \n\t"
2437         "paddd %%mm3, %%mm5             \n\t"
2438         "paddd %%mm0, %%mm4             \n\t"
2440         "add $4, %%"REG_c"              \n\t"
2441         "cmp %4, %%"REG_c"              \n\t"
2444         "psrad $8, %%mm4                \n\t"
2445         "psrad $8, %%mm5                \n\t"
2446         "packssdw %%mm5, %%mm4          \n\t"
2447         "pmaddwd %%mm6, %%mm4           \n\t"
2448         "packssdw %%mm4, %%mm4          \n\t"
2449         "mov %3, %%"REG_a"              \n\t"
2450         "movd %%mm4, (%%"REG_a", %0)    \n\t"
2454         : "+r" (counter), "+r" (filter)
2455         : "m" (filterPos), "m" (dst), "m"(offset),
2456           "m" (src), "r" (filterSize*2)
2457         : "%"REG_a, "%"REG_c, "%"REG_d
// AltiVec implementation — presumably selected by an elided preprocessor
// branch; verify against the full file.
2462     hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
// Portable C reference loop: this is the behavior the asm paths must match.
2465     for (i=0; i<dstW; i++)
2468         int srcPos= filterPos[i];
2470         //printf("filterPos: %d\n", filterPos[i]);
2471         for (j=0; j<filterSize; j++)
2473             //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2474             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2476         //filter += hFilterSize;
2477         dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2480 #endif /* HAVE_ALTIVEC */
2481 #endif /* HAVE_MMX */
2483 // *** horizontal scale Y line to temp buffer
/*
 * Horizontally scale one luma line into dst (16-bit intermediate, 7 extra
 * fraction bits — see the *128 / <<7 in the fallback paths).
 * Step 1: if the input is not plain 8-bit luma, convert it into
 * formatConvBuffer with the matching *ToY helper and retarget src.
 * Note: PIX_FMT_RGB32 maps to bgr32ToY and PIX_FMT_BGR32 to rgb32ToY —
 * the helper names reflect byte order, as in the original source.
 * Step 2: run either the generic hScale() filter or a fast-bilinear path
 * (MMX2 "funny" generated code / plain x86 asm / C), chosen by flags,
 * canMMX2BeUsed and compile-time arch.
 * NOTE(review): this extraction elides many original lines (braces,
 * asm volatile(...) openers, several #if/#else directives, loop labels);
 * code left byte-identical.
 */
2484 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2485                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2486                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2487                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2488                                    int32_t *mmx2FilterPos, uint8_t *pal)
2490     if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2492         RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2493         src= formatConvBuffer;
2495     else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2497         RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2498         src= formatConvBuffer;
2500     else if (srcFormat==PIX_FMT_RGB32)
2502         RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2503         src= formatConvBuffer;
2505     else if (srcFormat==PIX_FMT_BGR24)
2507         RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2508         src= formatConvBuffer;
2510     else if (srcFormat==PIX_FMT_BGR565)
2512         RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2513         src= formatConvBuffer;
2515     else if (srcFormat==PIX_FMT_BGR555)
2517         RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2518         src= formatConvBuffer;
2520     else if (srcFormat==PIX_FMT_BGR32)
2522         RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2523         src= formatConvBuffer;
2525     else if (srcFormat==PIX_FMT_RGB24)
2527         RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2528         src= formatConvBuffer;
2530     else if (srcFormat==PIX_FMT_RGB565)
2532         RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2533         src= formatConvBuffer;
2535     else if (srcFormat==PIX_FMT_RGB555)
2537         RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2538         src= formatConvBuffer;
2540     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2542         RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2543         src= formatConvBuffer;
2547     // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2548     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2550         if (!(flags&SWS_FAST_BILINEAR))
2553         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2555     else // Fast Bilinear upscale / crap downscale
2557 #if defined(ARCH_X86)
// MMX2 path: jumps into runtime-generated code (funnyYCode); %ebx must be
// preserved manually (ebxsave) since it may be the PIC/frame register.
2561         uint64_t ebxsave __attribute__((aligned(8)));
2567         "mov               %%"REG_b", %5 \n\t"
2569         "pxor             %%mm7, %%mm7  \n\t"
2570         "mov                %0, %%"REG_c" \n\t"
2571         "mov                %1, %%"REG_D" \n\t"
2572         "mov                %2, %%"REG_d" \n\t"
2573         "mov                %3, %%"REG_b" \n\t"
2574         "xor       %%"REG_a", %%"REG_a" \n\t" // i
2575         PREFETCH"        (%%"REG_c")    \n\t"
2576         PREFETCH"      32(%%"REG_c")    \n\t"
2577         PREFETCH"      64(%%"REG_c")    \n\t"
// Two FUNNY_Y_CODE variants (64-bit vs 32-bit addressing) — the #if/#else
// between them is elided in this extraction, so they appear back to back.
2581 #define FUNNY_Y_CODE \
2582         "movl            (%%"REG_b"), %%esi \n\t"\
2584         "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2585         "add           %%"REG_S", %%"REG_c" \n\t"\
2586         "add           %%"REG_a", %%"REG_D" \n\t"\
2587         "xor           %%"REG_a", %%"REG_a" \n\t"\
2591 #define FUNNY_Y_CODE \
2592         "movl (%%"REG_b"), %%esi        \n\t"\
2594         "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2595         "add %%"REG_a", %%"REG_D"       \n\t"\
2596         "xor %%"REG_a", %%"REG_a"       \n\t"\
2598 #endif /* ARCH_X86_64 */
2610         "mov %5, %%"REG_b"              \n\t"
2612         :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2617         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Fix the right edge: where i*xInc would read past srcW-1, replicate the
// last source pixel (<<7 scaling via *128) instead.
2622         for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2626 #endif /* HAVE_MMX2 */
// Non-MMX2 x86 path: xInc is split into a 16.16 integer step (xInc_shr16)
// and fractional step (xInc_mask); the addw/adc pair advances xx with carry.
2627     long xInc_shr16 = xInc >> 16;
2628     uint16_t xInc_mask = xInc & 0xffff;
2629     //NO MMX just normal asm ...
2631     "xor %%"REG_a", %%"REG_a"            \n\t" // i
2632     "xor %%"REG_d", %%"REG_d"            \n\t" // xx
2633     "xorl %%ecx, %%ecx                   \n\t" // 2*xalpha
// loop body is unrolled twice: two output samples per iteration
2636     "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
2637     "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
2638     "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2639     "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2640     "shll      $16, %%edi                \n\t"
2641     "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2642     "mov        %1, %%"REG_D"            \n\t"
2643     "shrl       $9, %%esi                \n\t"
2644     "movw     %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2645     "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
2646     "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry
2648     "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
2649     "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
2650     "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2651     "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2652     "shll      $16, %%edi                \n\t"
2653     "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2654     "mov        %1, %%"REG_D"            \n\t"
2655     "shrl       $9, %%esi                \n\t"
2656     "movw     %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2657     "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
2658     "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry
2661     "add        $2, %%"REG_a"            \n\t"
2662     "cmp        %2, %%"REG_a"            \n\t"
2666     :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2667     : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2670     } //if MMX2 can't be used
// Portable C fallback: 16.16 fixed-point source position, 7-bit blend factor
// (xalpha), output scaled by 128.
2674     unsigned int xpos=0;
2675     for (i=0;i<dstWidth;i++)
2677         register unsigned int xx=xpos>>16;
2678         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2679         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2682 #endif /* defined(ARCH_X86) */
/*
 * Horizontally scale one pair of chroma lines: U goes to dst[0..] and V to
 * dst[VOFW..]. Structure mirrors hyscale(): first convert non-native input
 * formats into formatConvBuffer (U plane) / formatConvBuffer+VOFW (V plane)
 * via the matching *ToUV helper, then scale with hScale() or a fast-bilinear
 * path (MMX2 "funny" code / plain x86 asm / C).
 * As in hyscale, PIX_FMT_RGB32 maps to bgr32ToUV and vice versa — helper
 * names reflect byte order, per the original source.
 * NOTE(review): this extraction elides many original lines (braces,
 * asm volatile(...) openers, #if/#else directives, loop labels);
 * code left byte-identical.
 */
2686 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2687                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2688                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2689                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2690                                    int32_t *mmx2FilterPos, uint8_t *pal)
2692     if (srcFormat==PIX_FMT_YUYV422)
2694         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2695         src1= formatConvBuffer;
2696         src2= formatConvBuffer+VOFW;
2698     else if (srcFormat==PIX_FMT_UYVY422)
2700         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2701         src1= formatConvBuffer;
2702         src2= formatConvBuffer+VOFW;
2704     else if (srcFormat==PIX_FMT_RGB32)
2706         RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2707         src1= formatConvBuffer;
2708         src2= formatConvBuffer+VOFW;
2710     else if (srcFormat==PIX_FMT_BGR24)
2712         RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2713         src1= formatConvBuffer;
2714         src2= formatConvBuffer+VOFW;
2716     else if (srcFormat==PIX_FMT_BGR565)
2718         RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2719         src1= formatConvBuffer;
2720         src2= formatConvBuffer+VOFW;
2722     else if (srcFormat==PIX_FMT_BGR555)
2724         RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2725         src1= formatConvBuffer;
2726         src2= formatConvBuffer+VOFW;
2728     else if (srcFormat==PIX_FMT_BGR32)
2730         RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2731         src1= formatConvBuffer;
2732         src2= formatConvBuffer+VOFW;
2734     else if (srcFormat==PIX_FMT_RGB24)
2736         RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2737         src1= formatConvBuffer;
2738         src2= formatConvBuffer+VOFW;
2740     else if (srcFormat==PIX_FMT_RGB565)
2742         RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2743         src1= formatConvBuffer;
2744         src2= formatConvBuffer+VOFW;
2746     else if (srcFormat==PIX_FMT_RGB555)
2748         RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2749         src1= formatConvBuffer;
2750         src2= formatConvBuffer+VOFW;
2752     else if (isGray(srcFormat))
2756     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2758         RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2759         src1= formatConvBuffer;
2760         src2= formatConvBuffer+VOFW;
2764     // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2765     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2767         if (!(flags&SWS_FAST_BILINEAR))
// filter path: U and V are scaled independently with the same chroma filter
2770         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2771         RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2773     else // Fast Bilinear upscale / crap downscale
2775 #if defined(ARCH_X86)
// MMX2 path: jumps into runtime-generated code (funnyUVCode), run once per
// plane; %ebx saved/restored manually (ebxsave).
2779         uint64_t ebxsave __attribute__((aligned(8)));
2785         "mov %%"REG_b", %6              \n\t"
2787         "pxor %%mm7, %%mm7              \n\t"
2788         "mov %0, %%"REG_c"              \n\t"
2789         "mov %1, %%"REG_D"              \n\t"
2790         "mov %2, %%"REG_d"              \n\t"
2791         "mov %3, %%"REG_b"              \n\t"
2792         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2793         PREFETCH" (%%"REG_c")           \n\t"
2794         PREFETCH" 32(%%"REG_c")         \n\t"
2795         PREFETCH" 64(%%"REG_c")         \n\t"
// Two FUNNY_UV_CODE variants (64-bit vs 32-bit addressing) — the #if/#else
// between them is elided in this extraction, so they appear back to back.
2799 #define FUNNY_UV_CODE \
2800         "movl       (%%"REG_b"), %%esi  \n\t"\
2802         "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2803         "add           %%"REG_S", %%"REG_c" \n\t"\
2804         "add           %%"REG_a", %%"REG_D" \n\t"\
2805         "xor           %%"REG_a", %%"REG_a" \n\t"\
2809 #define FUNNY_UV_CODE \
2810         "movl (%%"REG_b"), %%esi        \n\t"\
2812         "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2813         "add %%"REG_a", %%"REG_D"       \n\t"\
2814         "xor %%"REG_a", %%"REG_a"       \n\t"\
2816 #endif /* ARCH_X86_64 */
// second plane: reset counters, point at src2 (%5) and dst+VOF
2822         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2823         "mov %5, %%"REG_c"              \n\t" // src
2824         "mov %1, %%"REG_D"              \n\t" // buf1
2825         "add $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
2826         PREFETCH" (%%"REG_c")           \n\t"
2827         PREFETCH" 32(%%"REG_c")         \n\t"
2828         PREFETCH" 64(%%"REG_c")         \n\t"
2836         "mov %6, %%"REG_b"              \n\t"
2838         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2839            "m" (funnyUVCode), "m" (src2)
2843         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Fix the right edge for both planes: replicate last source pixel (*128 = <<7)
// where i*xInc would read past srcW-1.
2848         for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2850             //printf("%d %d %d\n", dstWidth, i, srcW);
2851             dst[i] = src1[srcW-1]*128;
2852             dst[i+VOFW] = src2[srcW-1]*128;
2857 #endif /* HAVE_MMX2 */
// Non-MMX2 x86 path: 16.16 fixed-point stepping via addw/adc, both planes
// interleaved in one loop iteration.
2858     long xInc_shr16 = (long) (xInc >> 16);
2859     uint16_t xInc_mask = xInc & 0xffff;
2861     "xor %%"REG_a", %%"REG_a"            \n\t" // i
2862     "xor %%"REG_d", %%"REG_d"            \n\t" // xx
2863     "xorl %%ecx, %%ecx                   \n\t" // 2*xalpha
2866     "mov %0, %%"REG_S"                   \n\t"
2867     "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2868     "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2869     "subl %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2870     "imull %%ecx, %%esi                  \n\t" //(src[xx+1] - src[xx])*2*xalpha
2871     "shll $16, %%edi                     \n\t"
2872     "addl %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2873     "mov %1, %%"REG_D"                   \n\t"
2874     "shrl $9, %%esi                      \n\t"
2875     "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
// same interpolation for the second plane (src2 via %5, dst offset by VOF)
2877     "movzbl (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2878     "movzbl 1(%5, %%"REG_d"), %%esi      \n\t" //src[xx+1]
2879     "subl %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2880     "imull %%ecx, %%esi                  \n\t" //(src[xx+1] - src[xx])*2*xalpha
2881     "shll $16, %%edi                     \n\t"
2882     "addl %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2883     "mov %1, %%"REG_D"                   \n\t"
2884     "shrl $9, %%esi                      \n\t"
2885     "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2887     "addw %4, %%cx                       \n\t" //2*xalpha += xInc&0xFF
2888     "adc %3, %%"REG_d"                   \n\t" //xx+= xInc>>8 + carry
2889     "add $1, %%"REG_a"                   \n\t"
2890     "cmp %2, %%"REG_a"                   \n\t"
2893     /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2894        which is needed to support GCC-4.0 */
2895 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2896     :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2898     :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2901     : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2904     } //if MMX2 can't be used
// Portable C fallback. Two alternative blend formulations appear back to
// back below — an #if/#else between them was elided in this extraction.
2908     unsigned int xpos=0;
2909     for (i=0;i<dstWidth;i++)
2911         register unsigned int xx=xpos>>16;
2912         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2913         dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2914         dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2916         dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2917         dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2921 #endif /* defined(ARCH_X86) */
/*
 * Main scaling entry point for this template instantiation.
 * Copies per-frame parameters out of the SwsContext into locals, then for
 * each destination line: horizontally scales the needed source lines into
 * the lum/chr ring buffers (hyscale/hcscale), and vertically scales +
 * converts via yuv2nv12X / yuv2yuv1 / yuv2packed1 / yuv2packed2 /
 * yuv2packedX depending on the destination format and filter sizes.
 * Returns the number of output lines written (dstY - lastDstY).
 * NOTE(review): this extraction elides many original lines — braces, several
 * declarations (dstY, lastDstY, pal, i), and #if/#else directives — so the
 * visible text is not the complete function; code left byte-identical.
 */
2925 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2926                            int srcSliceH, uint8_t* dst[], int dstStride[]){
2928     /* load a few things into local vars to make the code more readable? and faster */
2929     const int srcW= c->srcW;
2930     const int dstW= c->dstW;
2931     const int dstH= c->dstH;
2932     const int chrDstW= c->chrDstW;
2933     const int chrSrcW= c->chrSrcW;
2934     const int lumXInc= c->lumXInc;
2935     const int chrXInc= c->chrXInc;
2936     const int dstFormat= c->dstFormat;
2937     const int srcFormat= c->srcFormat;
2938     const int flags= c->flags;
2939     const int canMMX2BeUsed= c->canMMX2BeUsed;
2940     int16_t *vLumFilterPos= c->vLumFilterPos;
2941     int16_t *vChrFilterPos= c->vChrFilterPos;
2942     int16_t *hLumFilterPos= c->hLumFilterPos;
2943     int16_t *hChrFilterPos= c->hChrFilterPos;
2944     int16_t *vLumFilter= c->vLumFilter;
2945     int16_t *vChrFilter= c->vChrFilter;
2946     int16_t *hLumFilter= c->hLumFilter;
2947     int16_t *hChrFilter= c->hChrFilter;
2948     int32_t *lumMmxFilter= c->lumMmxFilter;
2949     int32_t *chrMmxFilter= c->chrMmxFilter;
2950     const int vLumFilterSize= c->vLumFilterSize;
2951     const int vChrFilterSize= c->vChrFilterSize;
2952     const int hLumFilterSize= c->hLumFilterSize;
2953     const int hChrFilterSize= c->hChrFilterSize;
2954     int16_t **lumPixBuf= c->lumPixBuf;
2955     int16_t **chrPixBuf= c->chrPixBuf;
2956     const int vLumBufSize= c->vLumBufSize;
2957     const int vChrBufSize= c->vChrBufSize;
2958     uint8_t *funnyYCode= c->funnyYCode;
2959     uint8_t *funnyUVCode= c->funnyUVCode;
2960     uint8_t *formatConvBuffer= c->formatConvBuffer;
2961     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2962     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);  // round-up divide
2966     /* vars which will change and which we need to store back in the context */
2968     int lumBufIndex= c->lumBufIndex;
2969     int chrBufIndex= c->chrBufIndex;
2970     int lastInLumBuf= c->lastInLumBuf;
2971     int lastInChrBuf= c->lastInChrBuf;
// packed input: fake per-plane strides so the rest of the code can treat it
// uniformly (NOTE(review): surrounding assignments elided by this extraction)
2973     if (isPacked(c->srcFormat)){
2980         srcStride[2]= srcStride[0];
2982     srcStride[1]<<= c->vChrDrop;
2983     srcStride[2]<<= c->vChrDrop;
2985     //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2986     //       (int)dst[0], (int)dst[1], (int)dst[2]);
2988 #if 0 //self test FIXME move to a vfilter or something
2990     static volatile int i=0;
2992     if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2993         selfTest(src, srcStride, c->srcW, c->srcH);
2998     //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2999     //dstStride[0],dstStride[1],dstStride[2]);
3001     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3003         static int firstTime=1; //FIXME move this into the context perhaps
3004         if (flags & SWS_PRINT_INFO && firstTime)
3006             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
3007                    "         ->cannot do aligned memory acesses anymore\n");
3012     /* Note the user might start scaling the picture in the middle so this will not get executed
3013        this is not really intended but works currently, so ppl might do it */
// Main output loop: one destination line per iteration.
// (dstY is declared in lines elided by this extraction.)
3024     for (;dstY < dstH; dstY++){
3025         unsigned char *dest =dst[0]+dstStride[0]*dstY;
3026         const int chrDstY= dstY>>c->chrDstVSubSample;
3027         unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3028         unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3030         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3031         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3032         const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3033         const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3035         //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3036         // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
3037         //handle holes (FAST_BILINEAR & weird filters)
3038         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3039         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3040         //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3041         ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
3042         ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
3044         // Do we have enough lines in this slice to output the dstY line
3045         if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3047             //Do horizontal scaling
3048             while(lastInLumBuf < lastLumSrcY)
3050                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3052                 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
3053                 ASSERT(lumBufIndex < 2*vLumBufSize)
3054                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3055                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3056                 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3057                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3058                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3059                                 funnyYCode, c->srcFormat, formatConvBuffer,
3060                                 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3063             while(lastInChrBuf < lastChrSrcY)
3065                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3066                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3068                 ASSERT(chrBufIndex < 2*vChrBufSize)
3069                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
3070                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3071                 //FIXME replace parameters through context struct (some at least)
3073                 if (!(isGray(srcFormat) || isGray(dstFormat)))
3074                     RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3075                                     flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3076                                     funnyUVCode, c->srcFormat, formatConvBuffer,
3077                                     c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3080             //wrap buf index around to stay inside the ring buffer
3081             if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3082             if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3084         else // not enough lines left in this slice -> load the rest in the buffer
3086             /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3087                firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3088                lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3089                vChrBufSize, vLumBufSize);*/
3091             //Do horizontal scaling
3092             while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3094                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3096                 ASSERT(lumBufIndex < 2*vLumBufSize)
3097                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3098                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3099                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3100                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3101                                 funnyYCode, c->srcFormat, formatConvBuffer,
3102                                 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3105             while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3107                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3108                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3110                 ASSERT(chrBufIndex < 2*vChrBufSize)
3111                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
3112                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3114                 if (!(isGray(srcFormat) || isGray(dstFormat)))
3115                     RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3116                                     flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3117                                     funnyUVCode, c->srcFormat, formatConvBuffer,
3118                                     c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3121             //wrap buf index around to stay inside the ring buffer
3122             if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3123             if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3124             break; //we can't output a dstY line so let's try with the next slice
// Load the per-line dither patterns used by the 15/16 bpp output writers.
3128         b5Dither= ff_dither8[dstY&1];
3129         g6Dither= ff_dither4[dstY&1];
3130         g5Dither= ff_dither8[dstY&1];
3131         r5Dither= ff_dither8[(dstY+1)&1];
// Positions in the ring buffers where the vertical filter's input starts.
3135         int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3136         int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
// Pack (line pointer, coefficient) tables consumed by the MMX vertical
// scalers: ACCURATE_RND packs coefficient pairs, the other layout splits
// each pointer into two 32-bit halves and duplicates the coefficient.
3139         if (flags & SWS_ACCURATE_RND){
3140             for (i=0; i<vLumFilterSize; i+=2){
3141                 lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i  ];
3142                 lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3143                 lumMmxFilter[2*i+2]=
3144                 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i    ]
3145                     + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3147             for (i=0; i<vChrFilterSize; i+=2){
3148                 chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i  ];
3149                 chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3150                 chrMmxFilter[2*i+2]=
3151                 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i    ]
3152                     + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3155             for (i=0; i<vLumFilterSize; i++)
3157                 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3158                 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;  // high half of 64-bit pointer
3159                 lumMmxFilter[4*i+2]=
3160                 lumMmxFilter[4*i+3]=
3161                     ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;  // coeff in both halves
3163             for (i=0; i<vChrFilterSize; i++)
3165                 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3166                 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3167                 chrMmxFilter[4*i+2]=
3168                 chrMmxFilter[4*i+3]=
3169                     ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
// Vertical scale + output, dispatched on the destination format.
3173         if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3174             const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3175             if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3176             RENAME(yuv2nv12X)(c,
3177                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3178                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3179                 dest, uDest, dstW, chrDstW, dstFormat);
3181         else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3183             const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3184             if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3185             if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
3187                 int16_t *lumBuf = lumPixBuf[0];
3188                 int16_t *chrBuf= chrPixBuf[0];
3189                 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3194                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3195                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3196                 dest, uDest, vDest, dstW, chrDstW);
3201             ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3202             ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3203             if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
3205                 int chrAlpha= vChrFilter[2*dstY+1];
3206                 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3207                     dest, dstW, chrAlpha, dstFormat, flags, dstY);
3209             else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
3211                 int lumAlpha= vLumFilter[2*dstY+1];
3212                 int chrAlpha= vChrFilter[2*dstY+1];
3214                 lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
3216                 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3217                 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3218                     dest, dstW, lumAlpha, chrAlpha, dstY);
3222                 RENAME(yuv2packedX)(c,
3223                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3224                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
// Non-MMX branch: same dispatch as above but without the MMX filter tables.
3229         else // hmm looks like we can't use MMX here without overwriting this array's tail
3231             int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3232             int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3233             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3234                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3235                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3237                     vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3238                     vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3239                     dest, uDest, dstW, chrDstW, dstFormat);
3241             else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3243                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3244                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3246                     vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3247                     vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3248                     dest, uDest, vDest, dstW, chrDstW);
3252                 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3253                 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3255                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3256                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
// Order non-temporal stores (SFENCE) and leave MMX state (EMMS) — the macros
// expand to nops on builds without the respective ISA (see file header).
3263     asm volatile(SFENCE:::"memory");
3264     asm volatile(EMMS:::"memory");
3266     /* store changed local vars back in the context */
3268     c->lumBufIndex= lumBufIndex;
3269     c->chrBufIndex= chrBufIndex;
3270     c->lastInLumBuf= lastInLumBuf;
3271     c->lastInChrBuf= lastInChrBuf;
// lastDstY is declared in elided lines; presumably the dstY value at entry,
// making this the count of lines output — confirm against the full file.
3273     return dstY - lastDstY;