2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
40 #define PREFETCH "prefetch" /* NOTE(review): presumably the HAVE_3DNOW branch — the opening #ifdef is not visible here */
41 #define PREFETCHW "prefetchw" /* prefetch with intent to write */
42 #elif defined (HAVE_MMX2)
43 #define PREFETCH "prefetchnta" /* SSE non-temporal prefetch */
44 #define PREFETCHW "prefetcht0"
46 #define PREFETCH " # nop" /* fallback branch: expands to an asm comment, i.e. no instruction */
47 #define PREFETCHW " # nop"
51 #define SFENCE "sfence" /* store fence; pairs with the non-temporal movntq stores below */
53 #define SFENCE " # nop" /* fallback when sfence is unavailable */
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" /* 3DNow! byte-average, same semantics as pavgb */
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" /* non-temporal (cache-bypassing) 64-bit store */
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" /* plain store fallback */
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) /* indirection so macro arguments expand before stringization */
70 #include "swscale_altivec_template.c"
73 #define YSCALEYUV2YV12X(x, offset, dest, width) /* multi-tap vertical scale: walk a coeff/src-pointer list at "offset"(%0), accumulate pmulhw products onto the rounder, >>3 and pack to bytes into dest. NOTE(review): asm open/loop-label/branch lines are not visible in this chunk */ \
75     "xor                          %%"REG_a", %%"REG_a" \n\t"\
76     "movq             "VROUNDER_OFFSET"(%0), %%mm3     \n\t"\
77     "movq                             %%mm3, %%mm4     \n\t"\
78     "lea                     " offset "(%0), %%"REG_d" \n\t"\
79     "mov                        (%%"REG_d"), %%"REG_S" \n\t"\
80     ASMALIGN(4) /* FIXME Unroll? */\
82     "movq                      8(%%"REG_d"), %%mm0     \n\t" /* filterCoeff */\
83     "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2        \n\t" /* srcData */\
84     "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
85     "add                                $16, %%"REG_d" \n\t"\
86     "mov                        (%%"REG_d"), %%"REG_S" \n\t"\
87     "test                         %%"REG_S", %%"REG_S" \n\t"\
88     "pmulhw                           %%mm0, %%mm2     \n\t"\
89     "pmulhw                           %%mm0, %%mm5     \n\t"\
90     "paddw                            %%mm2, %%mm3     \n\t"\
91     "paddw                            %%mm5, %%mm4     \n\t"\
93     "psraw                               $3, %%mm3     \n\t"\
94     "psraw                               $3, %%mm4     \n\t"\
95     "packuswb                         %%mm4, %%mm3     \n\t"\
96     MOVNTQ(%%mm3, (%1, %%REGa))\
97     "add                                 $8, %%"REG_a" \n\t"\
98     "cmp                                 %2, %%"REG_a" \n\t"\
99     "movq             "VROUNDER_OFFSET"(%0), %%mm3     \n\t"\
100     "movq                             %%mm3, %%mm4     \n\t"\
101     "lea                     " offset "(%0), %%"REG_d" \n\t"\
102     "mov                        (%%"REG_d"), %%"REG_S" \n\t"\
104     :: "r" (&c->redDither),\
105     "r" (dest), "g" (width)\
106     : "%"REG_a, "%"REG_d, "%"REG_S\
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) /* high-precision variant of YSCALEYUV2YV12X: pairs taps via punpck + pmaddwd so sums accumulate in 32 bits, then >>16, pack, add rounder, >>3, pack to bytes */ \
111     "lea                     " offset "(%0), %%"REG_d" \n\t"\
112     "xor                          %%"REG_a", %%"REG_a" \n\t"\
113     "pxor                             %%mm4, %%mm4     \n\t"\
114     "pxor                             %%mm5, %%mm5     \n\t"\
115     "pxor                             %%mm6, %%mm6     \n\t"\
116     "pxor                             %%mm7, %%mm7     \n\t"\
117     "mov                        (%%"REG_d"), %%"REG_S" \n\t"\
120     "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0        \n\t" /* srcData */\
121     "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
122     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"        \n\t"\
123     "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1        \n\t" /* srcData */\
124     "movq                             %%mm0, %%mm3     \n\t"\
125     "punpcklwd                        %%mm1, %%mm0     \n\t"\
126     "punpckhwd                        %%mm1, %%mm3     \n\t"\
127     "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1           \n\t" /* filterCoeff */\
128     "pmaddwd                          %%mm1, %%mm0     \n\t"\
129     "pmaddwd                          %%mm1, %%mm3     \n\t"\
130     "paddd                            %%mm0, %%mm4     \n\t"\
131     "paddd                            %%mm3, %%mm5     \n\t"\
132     "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
133     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"        \n\t"\
134     "add           $"STR(APCK_SIZE)", %%"REG_d"        \n\t"\
135     "test                         %%"REG_S", %%"REG_S" \n\t"\
136     "movq                             %%mm2, %%mm0     \n\t"\
137     "punpcklwd                        %%mm3, %%mm2     \n\t"\
138     "punpckhwd                        %%mm3, %%mm0     \n\t"\
139     "pmaddwd                          %%mm1, %%mm2     \n\t"\
140     "pmaddwd                          %%mm1, %%mm0     \n\t"\
141     "paddd                            %%mm2, %%mm6     \n\t"\
142     "paddd                            %%mm0, %%mm7     \n\t"\
144     "psrad                              $16, %%mm4     \n\t"\
145     "psrad                              $16, %%mm5     \n\t"\
146     "psrad                              $16, %%mm6     \n\t"\
147     "psrad                              $16, %%mm7     \n\t"\
148     "movq             "VROUNDER_OFFSET"(%0), %%mm0     \n\t"\
149     "packssdw                         %%mm5, %%mm4     \n\t"\
150     "packssdw                         %%mm7, %%mm6     \n\t"\
151     "paddw                            %%mm0, %%mm4     \n\t"\
152     "paddw                            %%mm0, %%mm6     \n\t"\
153     "psraw                               $3, %%mm4     \n\t"\
154     "psraw                               $3, %%mm6     \n\t"\
155     "packuswb                         %%mm6, %%mm4     \n\t"\
156     MOVNTQ(%%mm4, (%1, %%REGa))\
157     "add                                 $8, %%"REG_a" \n\t"\
158     "cmp                                 %2, %%"REG_a" \n\t"\
159     "lea                     " offset "(%0), %%"REG_d" \n\t"\
160     "pxor                             %%mm4, %%mm4     \n\t"\
161     "pxor                             %%mm5, %%mm5     \n\t"\
162     "pxor                             %%mm6, %%mm6     \n\t"\
163     "pxor                             %%mm7, %%mm7     \n\t"\
164     "mov                        (%%"REG_d"), %%"REG_S" \n\t"\
166     :: "r" (&c->redDither),\
167     "r" (dest), "g" (width)\
168     : "%"REG_a, "%"REG_d, "%"REG_S\
171 #define YSCALEYUV2YV121 /* 1-tap (copy) plane output: shift 16-bit intermediates down by 7 and pack to unsigned bytes; truncating (no rounding) variant */ \
172     "mov %2, %%"REG_a"                    \n\t"\
173     ASMALIGN(4) /* FIXME Unroll? */\
175     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
176     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
177     "psraw                 $7, %%mm0      \n\t"\
178     "psraw                 $7, %%mm1      \n\t"\
179     "packuswb           %%mm1, %%mm0      \n\t"\
180     MOVNTQ(%%mm0, (%1, %%REGa))\
181     "add                   $8, %%"REG_a"  \n\t"\
184 #define YSCALEYUV2YV121_ACCURATE /* like YSCALEYUV2YV121 but rounds: mm7 is built to 64 (1<<6) per word and added (saturating) before the >>7 */ \
185     "mov %2, %%"REG_a"                    \n\t"\
186     "pcmpeqw %%mm7, %%mm7                 \n\t"\
187     "psrlw                 $15, %%mm7     \n\t"\
188     "psllw                  $6, %%mm7     \n\t"\
189     ASMALIGN(4) /* FIXME Unroll? */\
191     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
192     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
193     "paddsw             %%mm7, %%mm0      \n\t"\
194     "paddsw             %%mm7, %%mm1      \n\t"\
195     "psraw                 $7, %%mm0      \n\t"\
196     "psraw                 $7, %%mm1      \n\t"\
197     "packuswb           %%mm1, %%mm0      \n\t"\
198     MOVNTQ(%%mm0, (%1, %%REGa))\
199     "add                   $8, %%"REG_a"  \n\t"\
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
209 #define YSCALEYUV2PACKEDX /* packed-output prologue: vertically filter chroma (U in mm3, V in mm4, V fetched at +VOF) then luma (Y1 in mm1, Y2 in mm7) using the per-context filter lists */ \
211     "xor                   %%"REG_a", %%"REG_a"     \n\t"\
215     "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
216     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
217     "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
218     "movq                      %%mm3, %%mm4         \n\t"\
221     "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
222     "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
223     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224     "add                         $16, %%"REG_d"     \n\t"\
225     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
226     "pmulhw                    %%mm0, %%mm2         \n\t"\
227     "pmulhw                    %%mm0, %%mm5         \n\t"\
228     "paddw                     %%mm2, %%mm3         \n\t"\
229     "paddw                     %%mm5, %%mm4         \n\t"\
230     "test                  %%"REG_S", %%"REG_S"     \n\t"\
233     "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
234     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
235     "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
236     "movq                      %%mm1, %%mm7         \n\t"\
239     "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
240     "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
241     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
242     "add                         $16, %%"REG_d"     \n\t"\
243     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
244     "pmulhw                    %%mm0, %%mm2         \n\t"\
245     "pmulhw                    %%mm0, %%mm5         \n\t"\
246     "paddw                     %%mm2, %%mm1         \n\t"\
247     "paddw                     %%mm5, %%mm7         \n\t"\
248     "test                  %%"REG_S", %%"REG_S"     \n\t"\
251 #define YSCALEYUV2PACKEDX_END /* operand/clobber list closing the asm statement begun by YSCALEYUV2PACKEDX (the asm( opener is on a line not visible in this chunk) */ \
252     :: "r" (&c->redDither), \
253     "m" (dummy), "m" (dummy), "m" (dummy),\
254     "r" (dest), "m" (dstW) \
255     : "%"REG_a, "%"REG_d, "%"REG_S \
258 #define YSCALEYUV2PACKEDX_ACCURATE /* high-precision packed prologue: pmaddwd 32-bit accumulation; chroma result is parked in U_TEMP/V_TEMP while luma is filtered, then reloaded into mm3/mm4 */ \
260     "xor                   %%"REG_a", %%"REG_a"     \n\t"\
264     "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
265     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
266     "pxor                      %%mm4, %%mm4         \n\t"\
267     "pxor                      %%mm5, %%mm5         \n\t"\
268     "pxor                      %%mm6, %%mm6         \n\t"\
269     "pxor                      %%mm7, %%mm7         \n\t"\
272     "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
273     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
274     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
275     "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
276     "movq                      %%mm0, %%mm3         \n\t"\
277     "punpcklwd                 %%mm1, %%mm0         \n\t"\
278     "punpckhwd                 %%mm1, %%mm3         \n\t"\
279     "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
280     "pmaddwd                   %%mm1, %%mm0         \n\t"\
281     "pmaddwd                   %%mm1, %%mm3         \n\t"\
282     "paddd                     %%mm0, %%mm4         \n\t"\
283     "paddd                     %%mm3, %%mm5         \n\t"\
284     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
285     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
286     "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
287     "test                  %%"REG_S", %%"REG_S"     \n\t"\
288     "movq                      %%mm2, %%mm0         \n\t"\
289     "punpcklwd                 %%mm3, %%mm2         \n\t"\
290     "punpckhwd                 %%mm3, %%mm0         \n\t"\
291     "pmaddwd                   %%mm1, %%mm2         \n\t"\
292     "pmaddwd                   %%mm1, %%mm0         \n\t"\
293     "paddd                     %%mm2, %%mm6         \n\t"\
294     "paddd                     %%mm0, %%mm7         \n\t"\
296     "psrad                       $16, %%mm4         \n\t"\
297     "psrad                       $16, %%mm5         \n\t"\
298     "psrad                       $16, %%mm6         \n\t"\
299     "psrad                       $16, %%mm7         \n\t"\
300     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
301     "packssdw                  %%mm5, %%mm4         \n\t"\
302     "packssdw                  %%mm7, %%mm6         \n\t"\
303     "paddw                     %%mm0, %%mm4         \n\t"\
304     "paddw                     %%mm0, %%mm6         \n\t"\
305     "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
306     "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
308     "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
309     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
310     "pxor                      %%mm1, %%mm1         \n\t"\
311     "pxor                      %%mm5, %%mm5         \n\t"\
312     "pxor                      %%mm7, %%mm7         \n\t"\
313     "pxor                      %%mm6, %%mm6         \n\t"\
316     "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
317     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
318     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
319     "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
320     "movq                      %%mm0, %%mm3         \n\t"\
321     "punpcklwd                 %%mm4, %%mm0         \n\t"\
322     "punpckhwd                 %%mm4, %%mm3         \n\t"\
323     "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4        \n\t" /* filterCoeff */\
324     "pmaddwd                   %%mm4, %%mm0         \n\t"\
325     "pmaddwd                   %%mm4, %%mm3         \n\t"\
326     "paddd                     %%mm0, %%mm1         \n\t"\
327     "paddd                     %%mm3, %%mm5         \n\t"\
328     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
329     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
330     "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
331     "test                  %%"REG_S", %%"REG_S"     \n\t"\
332     "movq                      %%mm2, %%mm0         \n\t"\
333     "punpcklwd                 %%mm3, %%mm2         \n\t"\
334     "punpckhwd                 %%mm3, %%mm0         \n\t"\
335     "pmaddwd                   %%mm4, %%mm2         \n\t"\
336     "pmaddwd                   %%mm4, %%mm0         \n\t"\
337     "paddd                     %%mm2, %%mm7         \n\t"\
338     "paddd                     %%mm0, %%mm6         \n\t"\
340     "psrad                       $16, %%mm1         \n\t"\
341     "psrad                       $16, %%mm5         \n\t"\
342     "psrad                       $16, %%mm7         \n\t"\
343     "psrad                       $16, %%mm6         \n\t"\
344     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
345     "packssdw                  %%mm5, %%mm1         \n\t"\
346     "packssdw                  %%mm6, %%mm7         \n\t"\
347     "paddw                     %%mm0, %%mm1         \n\t"\
348     "paddw                     %%mm0, %%mm7         \n\t"\
349     "movq               "U_TEMP"(%0), %%mm3         \n\t"\
350     "movq               "V_TEMP"(%0), %%mm4         \n\t"\
352 #define YSCALEYUV2RGBX /* YUV->RGB core: inputs mm1/mm7 = Y1/Y2, mm3/mm4 = U/V; applies per-context offsets/coefficients and leaves packed B in mm2, G in mm4, R in mm5 (mm0/mm3/mm6 hold the second pixel group, mm7 zeroed) */ \
353     "psubw  "U_OFFSET"(%0), %%mm3     \n\t" /* (U-128)8*/\
354     "psubw  "V_OFFSET"(%0), %%mm4     \n\t" /* (V-128)8*/\
355     "movq            %%mm3, %%mm2     \n\t" /* (U-128)8*/\
356     "movq            %%mm4, %%mm5     \n\t" /* (V-128)8*/\
357     "pmulhw "UG_COEFF"(%0), %%mm3     \n\t"\
358     "pmulhw "VG_COEFF"(%0), %%mm4     \n\t"\
359     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360     "pmulhw "UB_COEFF"(%0), %%mm2     \n\t"\
361     "pmulhw "VR_COEFF"(%0), %%mm5     \n\t"\
362     "psubw  "Y_OFFSET"(%0), %%mm1     \n\t" /* 8(Y-16)*/\
363     "psubw  "Y_OFFSET"(%0), %%mm7     \n\t" /* 8(Y-16)*/\
364     "pmulhw  "Y_COEFF"(%0), %%mm1     \n\t"\
365     "pmulhw  "Y_COEFF"(%0), %%mm7     \n\t"\
366     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367     "paddw           %%mm3, %%mm4     \n\t"\
368     "movq            %%mm2, %%mm0     \n\t"\
369     "movq            %%mm5, %%mm6     \n\t"\
370     "movq            %%mm4, %%mm3     \n\t"\
371     "punpcklwd       %%mm2, %%mm2     \n\t"\
372     "punpcklwd       %%mm5, %%mm5     \n\t"\
373     "punpcklwd       %%mm4, %%mm4     \n\t"\
374     "paddw           %%mm1, %%mm2     \n\t"\
375     "paddw           %%mm1, %%mm5     \n\t"\
376     "paddw           %%mm1, %%mm4     \n\t"\
377     "punpckhwd       %%mm0, %%mm0     \n\t"\
378     "punpckhwd       %%mm6, %%mm6     \n\t"\
379     "punpckhwd       %%mm3, %%mm3     \n\t"\
380     "paddw           %%mm7, %%mm0     \n\t"\
381     "paddw           %%mm7, %%mm6     \n\t"\
382     "paddw           %%mm7, %%mm3     \n\t"\
383     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
384     "packuswb        %%mm0, %%mm2     \n\t"\
385     "packuswb        %%mm6, %%mm5     \n\t"\
386     "packuswb        %%mm3, %%mm4     \n\t"\
387     "pxor            %%mm7, %%mm7     \n\t"
389 #define FULL_YSCALEYUV2RGB /* bilinear blend of two luma (buf0/buf1, weight %6=yalpha1) and two chroma lines (%2/%3, weight %7=uvalpha1), then YUV->RGB via MANGLE()d global constants; ends with B in mm3, R in mm0, G in mm1, each packed to bytes */ \
390     "pxor                 %%mm7, %%mm7    \n\t"\
391     "movd                    %6, %%mm6    \n\t" /*yalpha1*/\
392     "punpcklwd            %%mm6, %%mm6    \n\t"\
393     "punpcklwd            %%mm6, %%mm6    \n\t"\
394     "movd                    %7, %%mm5    \n\t" /*uvalpha1*/\
395     "punpcklwd            %%mm5, %%mm5    \n\t"\
396     "punpcklwd            %%mm5, %%mm5    \n\t"\
397     "xor              %%"REG_a", %%"REG_a"    \n\t"\
400     "movq  (%0, %%"REG_a",2), %%mm0       \n\t" /*buf0[eax]*/\
401     "movq  (%1, %%"REG_a",2), %%mm1       \n\t" /*buf1[eax]*/\
402     "movq  (%2, %%"REG_a",2), %%mm2       \n\t" /* uvbuf0[eax]*/\
403     "movq  (%3, %%"REG_a",2), %%mm3       \n\t" /* uvbuf1[eax]*/\
404     "psubw                %%mm1, %%mm0    \n\t" /* buf0[eax] - buf1[eax]*/\
405     "psubw                %%mm3, %%mm2    \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
406     "pmulhw               %%mm6, %%mm0    \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407     "pmulhw               %%mm5, %%mm2    \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
408     "psraw                   $4, %%mm1    \n\t" /* buf0[eax] - buf1[eax] >>4*/\
409     "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
410     "psraw                   $4, %%mm3    \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
411     "paddw                %%mm0, %%mm1    \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
412     "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
413     "paddw                %%mm2, %%mm3    \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
414     "psubw                %%mm0, %%mm4    \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
415     "psubw        "MANGLE(w80)", %%mm1    \n\t" /* 8(Y-16)*/\
416     "psubw       "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
417     "pmulhw    "MANGLE(yCoeff)", %%mm1    \n\t"\
420     "pmulhw               %%mm5, %%mm4    \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
421     "movq                 %%mm3, %%mm2    \n\t" /* (U-128)8*/\
422     "pmulhw   "MANGLE(ubCoeff)", %%mm3    \n\t"\
423     "psraw                   $4, %%mm0    \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
424     "pmulhw   "MANGLE(ugCoeff)", %%mm2    \n\t"\
425     "paddw                %%mm4, %%mm0    \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
426     "psubw       "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
429     "movq                 %%mm0, %%mm4    \n\t" /* (V-128)8*/\
430     "pmulhw   "MANGLE(vrCoeff)", %%mm0    \n\t"\
431     "pmulhw   "MANGLE(vgCoeff)", %%mm4    \n\t"\
432     "paddw                %%mm1, %%mm3    \n\t" /* B*/\
433     "paddw                %%mm1, %%mm0    \n\t" /* R*/\
434     "packuswb             %%mm3, %%mm3    \n\t"\
436     "packuswb             %%mm0, %%mm0    \n\t"\
437     "paddw                %%mm4, %%mm2    \n\t"\
438     "paddw                %%mm2, %%mm1    \n\t" /* G*/\
440     "packuswb             %%mm1, %%mm1    \n\t"
443 #define REAL_YSCALEYUV2PACKED(index, c) /* 2-line blend for packed-YUV output: pre-shifts the stored filter coeffs by 3, then blends chroma into mm3/mm4 (>>7) and luma into mm1/mm7 (>>7); no RGB conversion */ \
444     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
445     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
446     "psraw                $3, %%mm0                           \n\t"\
447     "psraw                $3, %%mm1                           \n\t"\
448     "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
449     "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
450     "xor            "#index", "#index"                        \n\t"\
453     "movq     (%2, "#index"), %%mm2              \n\t" /* uvbuf0[eax]*/\
454     "movq     (%3, "#index"), %%mm3              \n\t" /* uvbuf1[eax]*/\
455     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
456     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
457     "psubw             %%mm3, %%mm2              \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
458     "psubw             %%mm4, %%mm5              \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
459     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
460     "pmulhw            %%mm0, %%mm2              \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
461     "pmulhw            %%mm0, %%mm5              \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
462     "psraw                $7, %%mm3              \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
463     "psraw                $7, %%mm4              \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
464     "paddw             %%mm2, %%mm3              \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
465     "paddw             %%mm5, %%mm4              \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
466     "movq  (%0, "#index", 2), %%mm0              \n\t" /*buf0[eax]*/\
467     "movq  (%1, "#index", 2), %%mm1              \n\t" /*buf1[eax]*/\
468     "movq 8(%0, "#index", 2), %%mm6              \n\t" /*buf0[eax]*/\
469     "movq 8(%1, "#index", 2), %%mm7              \n\t" /*buf1[eax]*/\
470     "psubw             %%mm1, %%mm0              \n\t" /* buf0[eax] - buf1[eax]*/\
471     "psubw             %%mm7, %%mm6              \n\t" /* buf0[eax] - buf1[eax]*/\
472     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
473     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
474     "psraw                $7, %%mm1              \n\t" /* buf0[eax] - buf1[eax] >>4*/\
475     "psraw                $7, %%mm7              \n\t" /* buf0[eax] - buf1[eax] >>4*/\
476     "paddw             %%mm0, %%mm1              \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
477     "paddw             %%mm6, %%mm7              \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
479 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) /* indirection so args expand first */
481 #define REAL_YSCALEYUV2RGB(index, c) /* 2-line (bilinear) vertical blend + YUV->RGB; same register contract as YSCALEYUV2RGBX: leaves packed B/G/R in mm2/mm4/mm5, mm7 zeroed */ \
482     "xor            "#index", "#index"  \n\t"\
485     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
486     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
487     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
488     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
489     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
490     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
491     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
492     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
493     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
494     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
495     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
496     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
497     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
498     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
499     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
500     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
501     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
502     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
503     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
504     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
505     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
506     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
507     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
508     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
509     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
510     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
511     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
512     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
513     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
514     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
515     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
516     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
517     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
518     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
519     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
520     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
521     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
522     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
523     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
524     "paddw             %%mm3, %%mm4     \n\t"\
525     "movq              %%mm2, %%mm0     \n\t"\
526     "movq              %%mm5, %%mm6     \n\t"\
527     "movq              %%mm4, %%mm3     \n\t"\
528     "punpcklwd         %%mm2, %%mm2     \n\t"\
529     "punpcklwd         %%mm5, %%mm5     \n\t"\
530     "punpcklwd         %%mm4, %%mm4     \n\t"\
531     "paddw             %%mm1, %%mm2     \n\t"\
532     "paddw             %%mm1, %%mm5     \n\t"\
533     "paddw             %%mm1, %%mm4     \n\t"\
534     "punpckhwd         %%mm0, %%mm0     \n\t"\
535     "punpckhwd         %%mm6, %%mm6     \n\t"\
536     "punpckhwd         %%mm3, %%mm3     \n\t"\
537     "paddw             %%mm7, %%mm0     \n\t"\
538     "paddw             %%mm7, %%mm6     \n\t"\
539     "paddw             %%mm7, %%mm3     \n\t"\
540     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
541     "packuswb          %%mm0, %%mm2     \n\t"\
542     "packuswb          %%mm6, %%mm5     \n\t"\
543     "packuswb          %%mm3, %%mm4     \n\t"\
544     "pxor              %%mm7, %%mm7     \n\t"
545 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c) /* indirection so args expand first */
547 #define REAL_YSCALEYUV2PACKED1(index, c) /* single-input-line packed-YUV load: no blend, just >>7 of one luma and one chroma line */ \
548     "xor            "#index", "#index"  \n\t"\
551     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
552     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
553     "psraw                $7, %%mm3     \n\t" \
554     "psraw                $7, %%mm4     \n\t" \
555     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
556     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
557     "psraw                $7, %%mm1     \n\t" \
558     "psraw                $7, %%mm7     \n\t" \
560 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) /* indirection so args expand first */
562 #define REAL_YSCALEYUV2RGB1(index, c) /* single-input-line YUV->RGB (no vertical blend); same output contract: packed B/G/R in mm2/mm4/mm5, mm7 zeroed */ \
563     "xor            "#index", "#index"  \n\t"\
566     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
567     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
568     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
569     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
570     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
571     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
572     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
573     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
574     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
575     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
576     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
577     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
578     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
579     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
580     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
581     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
582     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
583     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
584     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
585     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
586     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
587     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
588     "paddw             %%mm3, %%mm4     \n\t"\
589     "movq              %%mm2, %%mm0     \n\t"\
590     "movq              %%mm5, %%mm6     \n\t"\
591     "movq              %%mm4, %%mm3     \n\t"\
592     "punpcklwd         %%mm2, %%mm2     \n\t"\
593     "punpcklwd         %%mm5, %%mm5     \n\t"\
594     "punpcklwd         %%mm4, %%mm4     \n\t"\
595     "paddw             %%mm1, %%mm2     \n\t"\
596     "paddw             %%mm1, %%mm5     \n\t"\
597     "paddw             %%mm1, %%mm4     \n\t"\
598     "punpckhwd         %%mm0, %%mm0     \n\t"\
599     "punpckhwd         %%mm6, %%mm6     \n\t"\
600     "punpckhwd         %%mm3, %%mm3     \n\t"\
601     "paddw             %%mm7, %%mm0     \n\t"\
602     "paddw             %%mm7, %%mm6     \n\t"\
603     "paddw             %%mm7, %%mm3     \n\t"\
604     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
605     "packuswb          %%mm0, %%mm2     \n\t"\
606     "packuswb          %%mm6, %%mm5     \n\t"\
607     "packuswb          %%mm3, %%mm4     \n\t"\
608     "pxor              %%mm7, %%mm7     \n\t"
609 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) /* indirection so args expand first */
611 #define REAL_YSCALEYUV2PACKED1b(index, c) /* packed-YUV load averaging the two chroma lines (add then >>8); luma from one line, >>7 */ \
612     "xor "#index", "#index"             \n\t"\
615     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
616     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
617     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
618     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
619     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
620     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
621     "psrlw                $8, %%mm3     \n\t" \
622     "psrlw                $8, %%mm4     \n\t" \
623     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
624     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
625     "psraw                $7, %%mm1     \n\t" \
626     "psraw                $7, %%mm7     \n\t"
627 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) /* indirection so args expand first */
629 // do vertical chrominance interpolation: averages uvbuf0/uvbuf1 before YUV->RGB
630 #define REAL_YSCALEYUV2RGB1b(index, c) /* like YSCALEYUV2RGB1 but chroma = (uvbuf0+uvbuf1)>>5 average of the two lines; output contract unchanged (B/G/R in mm2/mm4/mm5, mm7 zeroed) */ \
631     "xor            "#index", "#index"  \n\t"\
634     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
635     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
636     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
637     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
638     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
639     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
640     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
641     "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
642     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
643     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
644     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
645     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
646     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
647     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
648     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
649     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
650     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
651     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
652     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
653     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
654     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
655     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
656     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
657     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
658     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
659     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
660     "paddw             %%mm3, %%mm4     \n\t"\
661     "movq              %%mm2, %%mm0     \n\t"\
662     "movq              %%mm5, %%mm6     \n\t"\
663     "movq              %%mm4, %%mm3     \n\t"\
664     "punpcklwd         %%mm2, %%mm2     \n\t"\
665     "punpcklwd         %%mm5, %%mm5     \n\t"\
666     "punpcklwd         %%mm4, %%mm4     \n\t"\
667     "paddw             %%mm1, %%mm2     \n\t"\
668     "paddw             %%mm1, %%mm5     \n\t"\
669     "paddw             %%mm1, %%mm4     \n\t"\
670     "punpckhwd         %%mm0, %%mm0     \n\t"\
671     "punpckhwd         %%mm6, %%mm6     \n\t"\
672     "punpckhwd         %%mm3, %%mm3     \n\t"\
673     "paddw             %%mm7, %%mm0     \n\t"\
674     "paddw             %%mm7, %%mm6     \n\t"\
675     "paddw             %%mm7, %%mm3     \n\t"\
676     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
677     "packuswb          %%mm0, %%mm2     \n\t"\
678     "packuswb          %%mm6, %%mm5     \n\t"\
679     "packuswb          %%mm3, %%mm4     \n\t"\
680     "pxor              %%mm7, %%mm7     \n\t"
681 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) /* indirection so args expand first */
683 #define REAL_WRITEBGR32(dst, dstw, index) /* interleave packed B/G/R bytes (mm2/mm4/mm5, mm7=0) into four 0RGB dwords and store 32 bytes; advances index by 8 pixels */ \
684     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
685     "movq      %%mm2, %%mm1     \n\t" /* B */\
686     "movq      %%mm5, %%mm6     \n\t" /* R */\
687     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
688     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
689     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
690     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
691     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
692     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
693     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
694     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
695     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
696     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
698     MOVNTQ(%%mm0,   (dst, index, 4))\
699     MOVNTQ(%%mm2,  8(dst, index, 4))\
700     MOVNTQ(%%mm1, 16(dst, index, 4))\
701     MOVNTQ(%%mm3, 24(dst, index, 4))\
703     "add      $8, "#index"      \n\t"\
704     "cmp "#dstw", "#index"      \n\t"\
706 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index) /* indirection so args expand first */
708 #define REAL_WRITERGB16(dst, dstw, index) /* pack B/G/R (mm2/mm4/mm5) into RGB565: mask to 5/6/5 significant bits, shift into place, OR together, store 16 bytes (8 pixels) */ \
709     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
710     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
711     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
712     "psrlq           $3, %%mm2  \n\t"\
714     "movq         %%mm2, %%mm1  \n\t"\
715     "movq         %%mm4, %%mm3  \n\t"\
717     "punpcklbw    %%mm7, %%mm3  \n\t"\
718     "punpcklbw    %%mm5, %%mm2  \n\t"\
719     "punpckhbw    %%mm7, %%mm4  \n\t"\
720     "punpckhbw    %%mm5, %%mm1  \n\t"\
722     "psllq           $3, %%mm3  \n\t"\
723     "psllq           $3, %%mm4  \n\t"\
725     "por          %%mm3, %%mm2  \n\t"\
726     "por          %%mm4, %%mm1  \n\t"\
728     MOVNTQ(%%mm2,  (dst, index, 2))\
729     MOVNTQ(%%mm1, 8(dst, index, 2))\
731     "add             $8, "#index"   \n\t"\
732     "cmp        "#dstw", "#index"   \n\t"\
734 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) /* indirection so args expand first */
736 #define REAL_WRITERGB15(dst, dstw, index) /* pack B/G/R (mm2/mm4/mm5) into RGB555: all channels masked to 5 bits (note R gets an extra >>1), shifted and ORed, 8 pixels stored */ \
737     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
738     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
739     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
740     "psrlq           $3, %%mm2  \n\t"\
741     "psrlq           $1, %%mm5  \n\t"\
743     "movq         %%mm2, %%mm1  \n\t"\
744     "movq         %%mm4, %%mm3  \n\t"\
746     "punpcklbw    %%mm7, %%mm3  \n\t"\
747     "punpcklbw    %%mm5, %%mm2  \n\t"\
748     "punpckhbw    %%mm7, %%mm4  \n\t"\
749     "punpckhbw    %%mm5, %%mm1  \n\t"\
751     "psllq           $2, %%mm3  \n\t"\
752     "psllq           $2, %%mm4  \n\t"\
754     "por          %%mm3, %%mm2  \n\t"\
755     "por          %%mm4, %%mm1  \n\t"\
757     MOVNTQ(%%mm2,  (dst, index, 2))\
758     MOVNTQ(%%mm1, 8(dst, index, 2))\
760     "add             $8, "#index"   \n\t"\
761     "cmp        "#dstw", "#index"   \n\t"\
763 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) /* indirection so args expand first */
765 #define WRITEBGR24OLD(dst, dstw, index) /* legacy 24-bit writer: build four 0RGB dwords, then shift/mask them into three packed-RGB quadwords (24 output bytes = 8 pixels); superseded by WRITEBGR24MMX/MMX2 below */ \
766     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
767     "movq      %%mm2, %%mm1             \n\t" /* B */\
768     "movq      %%mm5, %%mm6             \n\t" /* R */\
769     "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
770     "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
771     "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
772     "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
773     "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
774     "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
775     "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
776     "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
777     "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
778     "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
780     "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
781     "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
782     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
783     "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
784     "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
785     "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
786     "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
787     "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
789     "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
790     "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
791     "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
792     "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
793     "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
794     "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
795     "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
796     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
797     "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
798     "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
799     "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
800     "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
801     "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
803     "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
804     "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
805     "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
806     "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
807     "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
808     "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
809     "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
810     "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
812     MOVNTQ(%%mm0,   (dst))\
813     MOVNTQ(%%mm2,  8(dst))\
814     MOVNTQ(%%mm3, 16(dst))\
815     "add      $24, "#dst"               \n\t"\
817     "add      $8, "#index"              \n\t"\
818     "cmp "#dstw", "#index"              \n\t"\
821 #define WRITEBGR24MMX(dst, dstw, index) /* plain-MMX 24-bit writer: build four 0RGB dwords then splice them into three packed quadwords via psllq/psrlq/punpckhdq; stores 24 bytes and advances dst by 24, index by 8 */ \
822     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
823     "movq      %%mm2, %%mm1     \n\t" /* B */\
824     "movq      %%mm5, %%mm6     \n\t" /* R */\
825     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
826     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
827     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
828     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
829     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
830     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
831     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
832     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
833     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
834     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
836     "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
837     "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
838     "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
839     "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
841     "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
842     "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
843     "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
844     "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
846     "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
847     "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
848     "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
849     "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
851     "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
852     "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
853     "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
854     "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
855     MOVNTQ(%%mm0, (dst))\
857     "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
858     "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
859     "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
860     "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
861     MOVNTQ(%%mm6, 8(dst))\
863     "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
864     "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
865     "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
866     MOVNTQ(%%mm5, 16(dst))\
868     "add      $24, "#dst"       \n\t"\
870     "add      $8, "#index"      \n\t"\
871     "cmp "#dstw", "#index"      \n\t"\
/* WRITEBGR24MMX2(dst, dstw, index): MMX2 packer using pshufw. Same contract
 * as WRITEBGR24MMX (inputs %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0; writes 24
 * bytes, advances dst by 24 and index by 8, leaves the index/dstw compare
 * for the caller's branch), but builds each output quadword by shuffling
 * component words into place and masking with the ff_M24A/ff_M24B/ff_M24C
 * byte-selection constants instead of the long shift/unpack sequence. */
874 #define WRITEBGR24MMX2(dst, dstw, index) \
875 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
876 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
877 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
/* First output quadword: pixels 0-2 (and the B/G halves of pixel 2). */\
878 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
879 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
880 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
882 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
883 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
884 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
886 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
887 "por %%mm1, %%mm6 \n\t"\
888 "por %%mm3, %%mm6 \n\t"\
889 MOVNTQ(%%mm6, (dst))\
/* Second output quadword: middle pixels (note %%mm4 is shifted, so the
 * original G register is consumed here). */\
891 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
892 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
893 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
894 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
896 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
897 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
898 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
900 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
901 "por %%mm3, %%mm6 \n\t"\
902 MOVNTQ(%%mm6, 8(dst))\
/* Third output quadword: pixels 5-7. */\
904 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
905 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
906 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
908 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
909 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
910 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
912 "por %%mm1, %%mm3 \n\t"\
913 "por %%mm3, %%mm6 \n\t"\
914 MOVNTQ(%%mm6, 16(dst))\
/* NOTE: %%mm7 holds ff_M24C afterwards, not zero. */\
916 "add $24, "#dst" \n\t"\
918 "add $8, "#index" \n\t"\
919 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24 selects the pshufw-based MMX2 packer when available, otherwise
 * the plain-MMX one. NOTE(review): these two definitions are presumably in
 * mutually exclusive HAVE_MMX2 preprocessor branches — the #ifdef/#else
 * guards are outside this excerpt; confirm against the full file. */
924 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
927 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/* REAL_WRITEYUY2(dst, dstw, index): packs one run of output into YUY2.
 * Saturates the word-sized components to bytes (%%mm3/%%mm4 carry chroma,
 * %%mm1/%%mm7 luma), interleaves them into Y U Y V byte order, and streams
 * 16 bytes to dst + 2*index.  Advances index by 8 and leaves the compare
 * against dstw for the caller's loop branch.  WRITEYUY2 is the usual
 * extra-expansion wrapper so macro arguments are expanded before pasting. */
930 #define REAL_WRITEYUY2(dst, dstw, index) \
931 "packuswb %%mm3, %%mm3 \n\t"\
932 "packuswb %%mm4, %%mm4 \n\t"\
933 "packuswb %%mm7, %%mm1 \n\t"\
934 "punpcklbw %%mm4, %%mm3 \n\t"\
935 "movq %%mm1, %%mm7 \n\t"\
936 "punpcklbw %%mm3, %%mm1 \n\t"\
937 "punpckhbw %%mm3, %%mm7 \n\t"\
939 MOVNTQ(%%mm1, (dst, index, 2))\
940 MOVNTQ(%%mm7, 8(dst, index, 2))\
942 "add $8, "#index" \n\t"\
943 "cmp "#dstw", "#index" \n\t"\
945 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* Vertically scale several source lines into one planar YUV output line:
 * applies lumFilter to lumFilterSize luma lines into dest, and chrFilter to
 * chrFilterSize chroma lines into uDest/vDest (vDest uses the VOF offset
 * into the chroma plane).  Unless SWS_BITEXACT is set, the MMX paths
 * (YSCALEYUV2YV12X / _ACCURATE, chosen by SWS_ACCURATE_RND) are used;
 * otherwise the AltiVec or plain-C fallbacks run.  NOTE(review): the
 * #ifdef/#else structure separating these paths is elided in this excerpt. */
948 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
949 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
950 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
953 if(!(c->flags & SWS_BITEXACT)){
954 if (c->flags & SWS_ACCURATE_RND){
/* Accurate-rounding MMX: chroma (U then V at VOF offset), then luma. */
956 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
957 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
960 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
/* Fast MMX variant, same plane order. */
963 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
964 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
967 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
/* Non-MMX fallbacks. */
973 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
974 chrFilter, chrSrc, chrFilterSize,
975 dest, uDest, vDest, dstW, chrDstW);
977 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
978 chrFilter, chrSrc, chrFilterSize,
979 dest, uDest, vDest, dstW, chrDstW);
980 #endif //!HAVE_ALTIVEC
/* NV12/NV21 vertical scaling output: no SIMD path exists for the
 * interleaved-chroma formats, so this template always delegates to the
 * generic C implementation yuv2nv12XinC. */
983 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
984 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
985 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
987 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
988 chrFilter, chrSrc, chrFilterSize,
989 dest, uDest, dstW, chrDstW, dstFormat);
/* 1:1 vertical output (no filtering): converts the 16-bit intermediate
 * samples back to 8-bit planes with rounding, i.e. (v + 64) >> 7, clipped
 * to 0..255.  When SWS_BITEXACT is unset the MMX loops run over up to three
 * planes (luma only if uDest is NULL); otherwise the C loops below do the
 * same shift-and-clip per sample. */
992 static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
993 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
997 if(!(c->flags & SWS_BITEXACT)){
998 long p= uDest ? 3 : 1;
/* Plane tables: sources point past the end so a negative counter can walk
 * them upward; counter[] holds each plane's width. */
999 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
1000 uint8_t *dst[3]= {dest, uDest, vDest};
1001 long counter[3] = {dstW, chrDstW, chrDstW};
1003 if (c->flags & SWS_ACCURATE_RND){
1006 YSCALEYUV2YV121_ACCURATE
1007 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1016 :: "r" (src[p]), "r" (dst[p] + counter[p]),
/* C fallback: luma ... */
1025 for (i=0; i<dstW; i++)
1027 int val= (lumSrc[i]+64)>>7;
/* ... then chroma, with explicit clipping (V lives at offset VOFW). */
1038 for (i=0; i<chrDstW; i++)
1040 int u=(chrSrc[i ]+64)>>7;
1041 int v=(chrSrc[i + VOFW]+64)>>7;
1045 else if (u>255) u=255;
1047 else if (v>255) v=255;
1057 * vertical scale YV12 to RGB
/* Vertically scale YV12 into one line of a packed destination format.
 * Structure: two parallel switch(c->dstFormat) blocks — the first using the
 * accurate-rounding YSCALEYUV2PACKEDX_ACCURATE kernel (SWS_ACCURATE_RND),
 * the second the fast YSCALEYUV2PACKEDX kernel — each ending in one of the
 * WRITE* store macros.  AltiVec handles a whitelist of 24/32-bit formats,
 * and everything else falls through to the C yuv2packedXinC. */
1059 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1060 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1061 uint8_t *dest, long dstW, long dstY)
1065 if(!(c->flags & SWS_BITEXACT)){
1066 if (c->flags & SWS_ACCURATE_RND){
1067 switch(c->dstFormat){
1069 YSCALEYUV2PACKEDX_ACCURATE
1071 WRITEBGR32(%4, %5, %%REGa)
1073 YSCALEYUV2PACKEDX_END
/* BGR24 needs dst advanced by 3 bytes/pixel: compute dest + 3*index
 * into REG_c before the store macro. */
1076 YSCALEYUV2PACKEDX_ACCURATE
1078 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1079 "add %4, %%"REG_c" \n\t"
1080 WRITEBGR24(%%REGc, %5, %%REGa)
1083 :: "r" (&c->redDither),
1084 "m" (dummy), "m" (dummy), "m" (dummy),
1085 "r" (dest), "m" (dstW)
1086 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
/* 15/16-bit formats add per-channel ordered-dither bytes before packing. */
1089 case PIX_FMT_RGB555:
1090 YSCALEYUV2PACKEDX_ACCURATE
1092 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1094 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1095 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1096 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1099 WRITERGB15(%4, %5, %%REGa)
1100 YSCALEYUV2PACKEDX_END
1102 case PIX_FMT_RGB565:
1103 YSCALEYUV2PACKEDX_ACCURATE
1105 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1107 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1108 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1109 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1112 WRITERGB16(%4, %5, %%REGa)
1113 YSCALEYUV2PACKEDX_END
/* YUYV: scale the 10-bit-ish intermediates down before WRITEYUY2 packs. */
1115 case PIX_FMT_YUYV422:
1116 YSCALEYUV2PACKEDX_ACCURATE
1117 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1119 "psraw $3, %%mm3 \n\t"
1120 "psraw $3, %%mm4 \n\t"
1121 "psraw $3, %%mm1 \n\t"
1122 "psraw $3, %%mm7 \n\t"
1123 WRITEYUY2(%4, %5, %%REGa)
1124 YSCALEYUV2PACKEDX_END
/* Fast (non-accurate) MMX path: same cases, plain kernel. */
1128 switch(c->dstFormat)
1133 WRITEBGR32(%4, %5, %%REGa)
1134 YSCALEYUV2PACKEDX_END
1139 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1140 "add %4, %%"REG_c" \n\t"
1141 WRITEBGR24(%%REGc, %5, %%REGa)
1143 :: "r" (&c->redDither),
1144 "m" (dummy), "m" (dummy), "m" (dummy),
1145 "r" (dest), "m" (dstW)
1146 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1149 case PIX_FMT_RGB555:
1152 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1154 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1155 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1156 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1159 WRITERGB15(%4, %5, %%REGa)
1160 YSCALEYUV2PACKEDX_END
1162 case PIX_FMT_RGB565:
1165 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1167 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1168 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1169 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1172 WRITERGB16(%4, %5, %%REGa)
1173 YSCALEYUV2PACKEDX_END
1175 case PIX_FMT_YUYV422:
1177 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1179 "psraw $3, %%mm3 \n\t"
1180 "psraw $3, %%mm4 \n\t"
1181 "psraw $3, %%mm1 \n\t"
1182 "psraw $3, %%mm7 \n\t"
1183 WRITEYUY2(%4, %5, %%REGa)
1184 YSCALEYUV2PACKEDX_END
1189 #endif /* HAVE_MMX */
1191 /* The following list of supported dstFormat values should
1192 match what's found in the body of altivec_yuv2packedX() */
1193 if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1194 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1195 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1196 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1197 chrFilter, chrSrc, chrFilterSize,
/* C fallback for all remaining formats / builds. */
1201 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1202 chrFilter, chrSrc, chrFilterSize,
1207 * vertical bilinear scale YV12 to RGB
/* Vertical bilinear scale YV12 -> packed RGB/YUYV: blends exactly two luma
 * lines (buf0/buf1, weight yalpha out of 4096) and two chroma lines
 * (uvbuf0/uvbuf1, weight uvalpha) while converting to the destination
 * format.  MMX paths per dstFormat when SWS_BITEXACT is unset; otherwise
 * the generic YSCALE_YUV_2_ANYRGB_C expansion at the bottom handles it. */
1209 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1210 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
/* Complementary weights: blend = (a*w1 + b*w) with w1 = 4095 - w. */
1212 int yalpha1=4095- yalpha;
1213 int uvalpha1=4095-uvalpha;
1217 if (flags&SWS_FULL_CHR_H_INT)
/* 32-bit path: interleave B/G into mm3, widen R, and store two quadwords
 * of BGR0 pixels per iteration (4 pixels). */
1227 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1228 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1230 "movq %%mm3, %%mm1 \n\t"
1231 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1232 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1234 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1235 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1237 "add $4, %%"REG_a" \n\t"
1238 "cmp %5, %%"REG_a" \n\t"
1241 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1242 "m" (yalpha1), "m" (uvalpha1)
/* 24-bit path: build BGR0 pixels as above, then mask/shift the padding
 * byte out to form contiguous BGR triplets (bm00000111 / bm11111000). */
1252 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1253 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1255 "movq %%mm3, %%mm1 \n\t"
1256 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1257 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1259 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1260 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1261 "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1262 "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1263 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1264 "movq %%mm1, %%mm2 \n\t"
1265 "psllq $48, %%mm1 \n\t" // 000000BG
1266 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1268 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1269 "psrld $16, %%mm2 \n\t" // R000R000
1270 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1271 "por %%mm2, %%mm1 \n\t" // RBGRR000
/* dest + index (REG_b) gives the byte address; 12 bytes stored per
 * 4-pixel iteration.  NOTE(review): the movntq vs movd/psrlq store pair
 * below looks like an MMX2-vs-MMX alternative — the surrounding #ifdef is
 * elided here; confirm in the full file. */
1273 "mov %4, %%"REG_b" \n\t"
1274 "add %%"REG_a", %%"REG_b" \n\t"
1278 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1279 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1281 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1282 "psrlq $32, %%mm3 \n\t"
1283 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1284 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1286 "add $4, %%"REG_a" \n\t"
1287 "cmp %5, %%"REG_a" \n\t"
1290 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1291 "m" (yalpha1), "m" (uvalpha1)
1292 : "%"REG_a, "%"REG_b
/* 15-bit path: dither, widen to words, shift each channel into its 5-bit
 * field (B>>3, G<<2, R<<7) and mask/OR into one word per pixel. */
1295 case PIX_FMT_BGR555:
1300 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1301 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1302 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1304 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1305 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1306 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1308 "psrlw $3, %%mm3 \n\t"
1309 "psllw $2, %%mm1 \n\t"
1310 "psllw $7, %%mm0 \n\t"
1311 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1312 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1314 "por %%mm3, %%mm1 \n\t"
1315 "por %%mm1, %%mm0 \n\t"
1317 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1319 "add $4, %%"REG_a" \n\t"
1320 "cmp %5, %%"REG_a" \n\t"
1323 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1324 "m" (yalpha1), "m" (uvalpha1)
/* 16-bit path: same as 15-bit but 6-bit green (G<<3, R<<8, 16-bit masks). */
1328 case PIX_FMT_BGR565:
1333 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1334 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1335 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1337 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1338 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1339 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1341 "psrlw $3, %%mm3 \n\t"
1342 "psllw $3, %%mm1 \n\t"
1343 "psllw $8, %%mm0 \n\t"
1344 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1345 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1347 "por %%mm3, %%mm1 \n\t"
1348 "por %%mm1, %%mm0 \n\t"
1350 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1352 "add $4, %%"REG_a" \n\t"
1353 "cmp %5, %%"REG_a" \n\t"
1356 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1357 "m" (yalpha1), "m" (uvalpha1)
1361 #endif /* HAVE_MMX */
/* C fallbacks: table-driven YUV->RGB via the yuvtab_*/clip_table lookups,
 * one branch per destination format. */
1366 if (dstFormat==PIX_FMT_RGB32)
1369 #ifdef WORDS_BIGENDIAN
1372 for (i=0;i<dstW;i++){
1373 // vertical linear interpolation && yuv2rgb in a single step:
1374 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1375 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1376 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1377 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1378 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1379 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1383 else if (dstFormat==PIX_FMT_BGR24)
1386 for (i=0;i<dstW;i++){
1387 // vertical linear interpolation && yuv2rgb in a single step:
1388 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1389 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1390 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1391 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1392 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1393 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1397 else if (dstFormat==PIX_FMT_BGR565)
1400 for (i=0;i<dstW;i++){
1401 // vertical linear interpolation && yuv2rgb in a single step:
1402 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1403 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1404 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1406 ((uint16_t*)dest)[i] =
1407 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1408 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1409 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1412 else if (dstFormat==PIX_FMT_BGR555)
1415 for (i=0;i<dstW;i++){
1416 // vertical linear interpolation && yuv2rgb in a single step:
1417 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1418 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1419 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1421 ((uint16_t*)dest)[i] =
1422 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1423 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1424 clip_table15r[(Y + yuvtab_3343[V]) >>13];
/* MMX RGB555/565/YUYV (non-full-chroma) variants: save REG_b (PIC/frame
 * register) to ESP_OFFSET in the context, borrow REG_BP as the pixel
 * counter, and restore both after each store macro. */
1432 if(!(c->flags & SWS_BITEXACT)){
1433 switch(c->dstFormat)
1435 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1438 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1439 "mov %4, %%"REG_b" \n\t"
1440 "push %%"REG_BP" \n\t"
1441 YSCALEYUV2RGB(%%REGBP, %5)
1442 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1443 "pop %%"REG_BP" \n\t"
1444 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1446 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1452 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1453 "mov %4, %%"REG_b" \n\t"
1454 "push %%"REG_BP" \n\t"
1455 YSCALEYUV2RGB(%%REGBP, %5)
1456 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1457 "pop %%"REG_BP" \n\t"
1458 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1459 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1463 case PIX_FMT_RGB555:
1465 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1466 "mov %4, %%"REG_b" \n\t"
1467 "push %%"REG_BP" \n\t"
1468 YSCALEYUV2RGB(%%REGBP, %5)
1469 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1471 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1472 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1473 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1476 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1477 "pop %%"REG_BP" \n\t"
1478 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1480 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1484 case PIX_FMT_RGB565:
1486 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1487 "mov %4, %%"REG_b" \n\t"
1488 "push %%"REG_BP" \n\t"
1489 YSCALEYUV2RGB(%%REGBP, %5)
1490 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1492 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1493 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1494 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1497 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1498 "pop %%"REG_BP" \n\t"
1499 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1500 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1504 case PIX_FMT_YUYV422:
1506 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1507 "mov %4, %%"REG_b" \n\t"
1508 "push %%"REG_BP" \n\t"
1509 YSCALEYUV2PACKED(%%REGBP, %5)
1510 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1511 "pop %%"REG_BP" \n\t"
1512 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1513 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* Generic C expansion for every format the MMX paths did not handle. */
1521 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1525 * YV12 to RGB without scaling or interpolating
/* YV12 -> packed output with no vertical scaling or interpolation of luma.
 * uvalpha < 2048 selects the YSCALEYUV2*1 kernels that read a single chroma
 * line (a deliberate half-pixel chroma shift, noted below, for speed);
 * otherwise the *1b kernels average uvbuf0 and uvbuf1.  Full-chroma
 * requests are forwarded to yuv2packed2 with zero luma blending; C
 * fallbacks use the YSCALE_YUV_2_ANYRGB_C expansions at the bottom. */
1527 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1528 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1530 const int yalpha1=0;
1533 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1534 const int yalpha= 4096; //FIXME ...
1536 if (flags&SWS_FULL_CHR_H_INT)
1538 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1543 if(!(flags & SWS_BITEXACT)){
1544 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
/* Single-chroma-line kernels (YSCALEYUV2RGB1 / YSCALEYUV2PACKED1).
 * Register protocol as in yuv2packed2: REG_b saved to ESP_OFFSET(%5),
 * REG_BP borrowed as the counter, 8280(%5) == DSTW_OFFSET in the ctx. */
1550 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1551 "mov %4, %%"REG_b" \n\t"
1552 "push %%"REG_BP" \n\t"
1553 YSCALEYUV2RGB1(%%REGBP, %5)
1554 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1555 "pop %%"REG_BP" \n\t"
1556 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1558 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1564 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1565 "mov %4, %%"REG_b" \n\t"
1566 "push %%"REG_BP" \n\t"
1567 YSCALEYUV2RGB1(%%REGBP, %5)
1568 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1569 "pop %%"REG_BP" \n\t"
1570 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1572 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1576 case PIX_FMT_RGB555:
1578 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1579 "mov %4, %%"REG_b" \n\t"
1580 "push %%"REG_BP" \n\t"
1581 YSCALEYUV2RGB1(%%REGBP, %5)
1582 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1584 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1585 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1586 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1588 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1589 "pop %%"REG_BP" \n\t"
1590 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1592 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1596 case PIX_FMT_RGB565:
1598 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1599 "mov %4, %%"REG_b" \n\t"
1600 "push %%"REG_BP" \n\t"
1601 YSCALEYUV2RGB1(%%REGBP, %5)
1602 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1604 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1605 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1606 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1609 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1610 "pop %%"REG_BP" \n\t"
1611 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1613 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1617 case PIX_FMT_YUYV422:
1619 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1620 "mov %4, %%"REG_b" \n\t"
1621 "push %%"REG_BP" \n\t"
1622 YSCALEYUV2PACKED1(%%REGBP, %5)
1623 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1624 "pop %%"REG_BP" \n\t"
1625 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1627 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* uvalpha >= 2048: the *1b kernels average the two chroma input lines. */
1639 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1640 "mov %4, %%"REG_b" \n\t"
1641 "push %%"REG_BP" \n\t"
1642 YSCALEYUV2RGB1b(%%REGBP, %5)
1643 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1644 "pop %%"REG_BP" \n\t"
1645 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1647 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1653 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1654 "mov %4, %%"REG_b" \n\t"
1655 "push %%"REG_BP" \n\t"
1656 YSCALEYUV2RGB1b(%%REGBP, %5)
1657 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1658 "pop %%"REG_BP" \n\t"
1659 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1661 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1665 case PIX_FMT_RGB555:
1667 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1668 "mov %4, %%"REG_b" \n\t"
1669 "push %%"REG_BP" \n\t"
1670 YSCALEYUV2RGB1b(%%REGBP, %5)
1671 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1673 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1674 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1675 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1677 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1678 "pop %%"REG_BP" \n\t"
1679 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1681 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1685 case PIX_FMT_RGB565:
1687 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1688 "mov %4, %%"REG_b" \n\t"
1689 "push %%"REG_BP" \n\t"
1690 YSCALEYUV2RGB1b(%%REGBP, %5)
1691 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1693 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1694 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1695 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1698 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1699 "pop %%"REG_BP" \n\t"
1700 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1702 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1706 case PIX_FMT_YUYV422:
1708 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1709 "mov %4, %%"REG_b" \n\t"
1710 "push %%"REG_BP" \n\t"
1711 YSCALEYUV2PACKED1b(%%REGBP, %5)
1712 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1713 "pop %%"REG_BP" \n\t"
1714 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1716 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1723 #endif /* HAVE_MMX */
/* C fallbacks mirror the two chroma modes above. */
1726 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1728 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1732 //FIXME yuy2* can read up to 7 samples too much
/* Extract the luma plane from YUYV (Y at even byte positions): MMX masks
 * each 16-byte group with bm01010101 and packs the surviving low bytes,
 * 8 output pixels per iteration.  The loop counter starts at -width and
 * counts up, with src/dst passed pointing past their ends. */
1734 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1738 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1739 "mov %0, %%"REG_a" \n\t"
1741 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1742 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1743 "pand %%mm2, %%mm0 \n\t"
1744 "pand %%mm2, %%mm1 \n\t"
1745 "packuswb %%mm1, %%mm0 \n\t"
1746 "movq %%mm0, (%2, %%"REG_a") \n\t"
1747 "add $8, %%"REG_a" \n\t"
1749 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
/* C fallback: dst[i] = src[2*i] (body elided in this excerpt). */
1754 for (i=0; i<width; i++)
/* Extract U and V planes from YUYV (U at byte 1, V at byte 3 of every
 * 4-byte group).  MMX shifts each pixel pair right by 8 to expose the
 * chroma bytes, packs, then separates V (high bytes, to dstV via %3) from
 * U (masked low bytes, to dstU via %2), 4 chroma samples per iteration.
 * src2 must alias src1 (asserted): packed input has only one source line. */
1759 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1763 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1764 "mov %0, %%"REG_a" \n\t"
1766 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1767 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1768 "psrlw $8, %%mm0 \n\t"
1769 "psrlw $8, %%mm1 \n\t"
1770 "packuswb %%mm1, %%mm0 \n\t"
1771 "movq %%mm0, %%mm1 \n\t"
1772 "psrlw $8, %%mm0 \n\t"
1773 "pand %%mm4, %%mm1 \n\t"
1774 "packuswb %%mm0, %%mm0 \n\t"
1775 "packuswb %%mm1, %%mm1 \n\t"
1776 "movd %%mm0, (%3, %%"REG_a") \n\t"
1777 "movd %%mm1, (%2, %%"REG_a") \n\t"
1778 "add $4, %%"REG_a" \n\t"
1780 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
/* C fallback: same byte positions, scalar. */
1785 for (i=0; i<width; i++)
1787 dstU[i]= src1[4*i + 1];
1788 dstV[i]= src1[4*i + 3];
1791 assert(src1 == src2);
1794 /* This is almost identical to the previous version, and exists only because
1795 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extract the luma plane from UYVY (Y at odd byte positions): identical
 * structure to yuy2ToY, but a psrlw $8 replaces the even-byte mask. */
1796 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1800 "mov %0, %%"REG_a" \n\t"
1802 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1803 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1804 "psrlw $8, %%mm0 \n\t"
1805 "psrlw $8, %%mm1 \n\t"
1806 "packuswb %%mm1, %%mm0 \n\t"
1807 "movq %%mm0, (%2, %%"REG_a") \n\t"
1808 "add $8, %%"REG_a" \n\t"
1810 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
/* C fallback: dst[i] = src[2*i+1] (body elided in this excerpt). */
1815 for (i=0; i<width; i++)
/* Extract U and V planes from UYVY (U at byte 0, V at byte 2 of every
 * 4-byte group): same dance as yuy2ToUV but with a pand of bm01010101
 * instead of the initial shift, since chroma sits in the even bytes.
 * src2 must alias src1 (asserted). */
1820 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1824 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1825 "mov %0, %%"REG_a" \n\t"
1827 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1828 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1829 "pand %%mm4, %%mm0 \n\t"
1830 "pand %%mm4, %%mm1 \n\t"
1831 "packuswb %%mm1, %%mm0 \n\t"
1832 "movq %%mm0, %%mm1 \n\t"
1833 "psrlw $8, %%mm0 \n\t"
1834 "pand %%mm4, %%mm1 \n\t"
1835 "packuswb %%mm0, %%mm0 \n\t"
1836 "packuswb %%mm1, %%mm1 \n\t"
1837 "movd %%mm0, (%3, %%"REG_a") \n\t"
1838 "movd %%mm1, (%2, %%"REG_a") \n\t"
1839 "add $4, %%"REG_a" \n\t"
1841 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
/* C fallback: same byte positions, scalar. */
1846 for (i=0; i<width; i++)
1848 dstU[i]= src1[4*i + 0];
1849 dstV[i]= src1[4*i + 2];
1852 assert(src1 == src2);
/* BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S):
 * generates a scalar packed-RGB -> luma converter.  Each pixel of `type`
 * is split into r/g/b by shift-then-mask, then combined as a weighted sum
 * with rounding: (RY*r + GY*g + BY*b + (33 << (S-1))) >> S.  The
 * instantiations below pre-shift the RY/GY/BY weights to compensate for
 * where each channel sits after masking, and S absorbs the per-format
 * extra precision (RGB2YUV_SHIFT + 7 or + 8). */
1855 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1856 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
1859 for (i=0; i<width; i++)\
1861 int b= (((type*)src)[i]>>shb)&maskb;\
1862 int g= (((type*)src)[i]>>shg)&maskg;\
1863 int r= (((type*)src)[i]>>shr)&maskr;\
1865 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1869 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY    , BY<< 8, RGB2YUV_SHIFT+8)
1870 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY    , BY<< 8, RGB2YUV_SHIFT+8)
1871 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
1872 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
1873 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1874 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
/* BGR2UV(...): generates two scalar packed-RGB -> chroma converters:
 *  - name:        one U/V sample per input pixel (mask-then-shift split,
 *                 weighted sum with rounding bias 257 << (S-1), shift S);
 *  - name##_half: horizontally 2:1 subsampled variant that sums each pair
 *                 of pixels per channel first (the (mask|(2*mask)) trick
 *                 keeps the carried bit of the per-channel sum), then
 *                 divides by shifting one bit further (>> (S+1)). */
1876 #define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1877 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1880 for (i=0; i<width; i++)\
1882 int b= (((type*)src)[i]&maskb)>>shb;\
1883 int g= (((type*)src)[i]&maskg)>>shg;\
1884 int r= (((type*)src)[i]&maskr)>>shr;\
1886 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1887 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1890 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1893 for (i=0; i<width; i++)\
1895 int pix0= ((type*)src)[2*i+0];\
1896 int pix1= ((type*)src)[2*i+1];\
1897 int g= (pix0&maskg)+(pix1&maskg);\
1898 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1899 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1903 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1904 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1908 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU    , BU<< 8, RV<< 8, GV    , BV<< 8, RGB2YUV_SHIFT+8)
1909 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU    , BU<< 8, RV<< 8, GV    , BV<< 8, RGB2YUV_SHIFT+8)
1910 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
1911 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
1912 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1913 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
/* MMX 24-bit packed RGB -> luma: loads the appropriate coefficient pair
 * (ff_bgr24toY* or ff_rgb24toY* depending on srcFormat), then per iteration
 * reads 4 pixels (12 bytes) as overlapping dwords, widens them to words,
 * dot-products with pmaddwd, adds the rounding offset, shifts down by 15
 * and stores 4 luma bytes.  Loop counter runs from -width up to 0. */
1916 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1919 if(srcFormat == PIX_FMT_BGR24){
1921 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1922 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1927 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1928 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1934 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1935 "mov %2, %%"REG_a" \n\t"
1936 "pxor %%mm7, %%mm7 \n\t"
1938 PREFETCH" 64(%0) \n\t"
/* Overlapping loads cover pixel boundaries that straddle dwords. */
1939 "movd (%0), %%mm0 \n\t"
1940 "movd 2(%0), %%mm1 \n\t"
1941 "movd 6(%0), %%mm2 \n\t"
1942 "movd 8(%0), %%mm3 \n\t"
1944 "punpcklbw %%mm7, %%mm0 \n\t"
1945 "punpcklbw %%mm7, %%mm1 \n\t"
1946 "punpcklbw %%mm7, %%mm2 \n\t"
1947 "punpcklbw %%mm7, %%mm3 \n\t"
1948 "pmaddwd %%mm5, %%mm0 \n\t"
1949 "pmaddwd %%mm6, %%mm1 \n\t"
1950 "pmaddwd %%mm5, %%mm2 \n\t"
1951 "pmaddwd %%mm6, %%mm3 \n\t"
1952 "paddd %%mm1, %%mm0 \n\t"
1953 "paddd %%mm3, %%mm2 \n\t"
1954 "paddd %%mm4, %%mm0 \n\t"
1955 "paddd %%mm4, %%mm2 \n\t"
1956 "psrad $15, %%mm0 \n\t"
1957 "psrad $15, %%mm2 \n\t"
1958 "packssdw %%mm2, %%mm0 \n\t"
1959 "packuswb %%mm0, %%mm0 \n\t"
1960 "movd %%mm0, (%1, %%"REG_a") \n\t"
1961 "add $4, %%"REG_a" \n\t"
1964 : "r" (dst+width), "g" (-width)
/* MMX 24-bit packed RGB -> chroma: the coefficient table for the current
 * byte order arrives as memory operand %4 (ff_bgr24toUV[...]), with the
 * 24-byte slot cached in %%mm6.  Per iteration, two overlapping dword
 * loads per pixel pair are widened and dot-producted against the U (%4 /
 * 8+%4) and V (16+%4 / %%mm6) coefficient pairs, biased by
 * ff_bgr24toUVOffset, shifted by 15 and stored as 4 U and 4 V bytes. */
1969 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1972 "movq 24+%4, %%mm6 \n\t"
1973 "mov %3, %%"REG_a" \n\t"
1974 "pxor %%mm7, %%mm7 \n\t"
1976 PREFETCH" 64(%0) \n\t"
/* First pixel pair: U accumulates in mm0, V in mm2. */
1977 "movd (%0), %%mm0 \n\t"
1978 "movd 2(%0), %%mm1 \n\t"
1979 "punpcklbw %%mm7, %%mm0 \n\t"
1980 "punpcklbw %%mm7, %%mm1 \n\t"
1981 "movq %%mm0, %%mm2 \n\t"
1982 "movq %%mm1, %%mm3 \n\t"
1983 "pmaddwd %4, %%mm0 \n\t"
1984 "pmaddwd 8+%4, %%mm1 \n\t"
1985 "pmaddwd 16+%4, %%mm2 \n\t"
1986 "pmaddwd %%mm6, %%mm3 \n\t"
1987 "paddd %%mm1, %%mm0 \n\t"
1988 "paddd %%mm3, %%mm2 \n\t"
/* Second pixel pair: U in mm1, V in mm4. */
1990 "movd 6(%0), %%mm1 \n\t"
1991 "movd 8(%0), %%mm3 \n\t"
1993 "punpcklbw %%mm7, %%mm1 \n\t"
1994 "punpcklbw %%mm7, %%mm3 \n\t"
1995 "movq %%mm1, %%mm4 \n\t"
1996 "movq %%mm3, %%mm5 \n\t"
1997 "pmaddwd %4, %%mm1 \n\t"
1998 "pmaddwd 8+%4, %%mm3 \n\t"
1999 "pmaddwd 16+%4, %%mm4 \n\t"
2000 "pmaddwd %%mm6, %%mm5 \n\t"
2001 "paddd %%mm3, %%mm1 \n\t"
2002 "paddd %%mm5, %%mm4 \n\t"
/* Bias, scale and pack both planes; store 4 samples each. */
2004 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
2005 "paddd %%mm3, %%mm0 \n\t"
2006 "paddd %%mm3, %%mm2 \n\t"
2007 "paddd %%mm3, %%mm1 \n\t"
2008 "paddd %%mm3, %%mm4 \n\t"
2009 "psrad $15, %%mm0 \n\t"
2010 "psrad $15, %%mm2 \n\t"
2011 "psrad $15, %%mm1 \n\t"
2012 "psrad $15, %%mm4 \n\t"
2013 "packssdw %%mm1, %%mm0 \n\t"
2014 "packssdw %%mm4, %%mm2 \n\t"
2015 "packuswb %%mm0, %%mm0 \n\t"
2016 "packuswb %%mm2, %%mm2 \n\t"
2017 "movd %%mm0, (%1, %%"REG_a") \n\t"
2018 "movd %%mm2, (%2, %%"REG_a") \n\t"
2019 "add $4, %%"REG_a" \n\t"
2022 : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
/* BGR24 -> luma wrapper: MMX builds call the shared bgr24ToY_mmx kernel;
 * otherwise the scalar loop computes the RY/GY/BY weighted sum with
 * rounding (the b/g/r byte reads are elided in this excerpt). */
2028 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2031 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
2034 for (i=0; i<width; i++)
2040 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2042 #endif /* HAVE_MMX */
/* BGR24 -> chroma wrapper: MMX builds use bgr24ToUV_mmx; the scalar path
 * reads B,G,R at byte offsets 0,1,2 and applies the RU../RV.. weighted
 * sums with rounding.  src2 must alias src1 (asserted). */
2045 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2048 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
2051 for (i=0; i<width; i++)
2053 int b= src1[3*i + 0];
2054 int g= src1[3*i + 1];
2055 int r= src1[3*i + 2];
2057 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2058 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2060 #endif /* HAVE_MMX */
2061 assert(src1 == src2);
/* BGR24 -> chroma with 2:1 horizontal subsampling: sums each channel over
 * a pair of adjacent pixels, then divides via the extra +1 in the final
 * shift.  Scalar only; src2 must alias src1 (asserted). */
2064 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2067 for (i=0; i<width; i++)
2069 int b= src1[6*i + 0] + src1[6*i + 3];
2070 int g= src1[6*i + 1] + src1[6*i + 4];
2071 int r= src1[6*i + 2] + src1[6*i + 5];
2073 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2074 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2076 assert(src1 == src2);
/* RGB24 -> luma wrapper: reuses the shared MMX kernel with swapped
 * coefficients (PIX_FMT_RGB24); scalar fallback mirrors bgr24ToY with the
 * channel reads elided in this excerpt. */
2079 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2082 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2085 for (i=0; i<width; i++)
2091 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/* RGB24 -> chroma wrapper: shared MMX kernel with RGB byte order; scalar
 * path reads R,G,B at offsets 0,1,2 (mirror image of bgr24ToUV). */
2096 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2101 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2103 for (i=0; i<width; i++)
2105 int r= src1[3*i + 0];
2106 int g= src1[3*i + 1];
2107 int b= src1[3*i + 2];
2109 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2110 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
/* RGB24 -> chroma with 2:1 horizontal subsampling: pairwise channel sums
 * (R,G,B byte order) then the +1-shift average, as in bgr24ToUV_half. */
2115 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2119 for (i=0; i<width; i++)
2121 int r= src1[6*i + 0] + src1[6*i + 3];
2122 int g= src1[6*i + 1] + src1[6*i + 4];
2123 int b= src1[6*i + 2] + src1[6*i + 5];
2125 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2126 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
/* Paletted input -> luma: looks each source index up in pal[] and keeps
 * the low byte.  NOTE(review): the line defining `d` (presumably
 * d = src[i]) is elided in this excerpt. */
2131 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2134 for (i=0; i<width; i++)
2138 dst[i]= pal[d] & 0xFF;
/* Paletted input -> chroma: fetches the palette entry per pixel; the
 * byte extraction into dstU/dstV is elided in this excerpt (presumably
 * the middle and high bytes of p).  src2 must alias src1 (asserted). */
2142 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2145 assert(src1 == src2);
2146 for (i=0; i<width; i++)
2148 int p= pal[src1[i]];
/*
 * Expand 1 bit/pixel MONOWHITE input to 8-bit luma: each source byte
 * yields 8 output pixels, MSB first, each scaled to 0 or 255.
 * NOTE(review): for MONOWHITE (0 = white) the source byte should be
 * inverted before unpacking; the line that loads/inverts 'd' and the
 * inner j-loop header are elided from this chunk — the visible body is
 * identical to monoblack2Y below, so verify against the full file.
 */
2155 static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2158 for (i=0; i<width/8; i++){
2161 dst[8*i+j]= ((d>>(7-j))&1)*255;
/*
 * Expand 1 bit/pixel MONOBLACK input to 8-bit luma: each source byte
 * yields 8 output pixels, MSB first, each bit scaled to 0 or 255.
 * NOTE(review): the load of 'd' (presumably d = src[i]) and the inner
 * j-loop header are elided from this chunk.
 */
2165 static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2168 for (i=0; i<width/8; i++){
2171 dst[8*i+j]= ((d>>(7-j))&1)*255;
2175 // bilinear / bicubic scaling
/*
 * Horizontal FIR scaling of one 8-bit input line into a 16-bit (14.2-ish
 * fixed point, accumulator >> 7) destination line.
 * Visible structure:
 *   - MMX inline-asm fast paths for filterSize 4 and 8 (unrolled
 *     pmaddwd accumulation, two output samples per iteration),
 *   - a generic MMX loop for arbitrary filterSize (multiple of 4),
 *   - an AltiVec call, and
 *   - a scalar C fallback clamped with FFMIN to (1<<15)-1.
 * NOTE(review): many lines (asm() statement heads, loop labels,
 * #if/#else directives, braces) are elided from this chunk; comments
 * below describe only the instructions that are visible. Code left
 * byte-identical — the asm is too order-sensitive to restyle safely.
 */
2176 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2177 int16_t *filter, int16_t *filterPos, long filterSize)
2180 assert(filterSize % 4 == 0 && filterSize>0);
2181 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2183 long counter= -2*dstW;
/* bias filterPos so it can be indexed by the negative counter */
2185 filterPos-= counter/2;
2189 "push %%"REG_b" \n\t"
2191 "pxor %%mm7, %%mm7 \n\t"
2192 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2193 "mov %%"REG_a", %%"REG_BP" \n\t"
/* load two adjacent source positions and their 4-tap coefficient sets */
2196 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2197 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2198 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2199 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2200 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2201 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2202 "punpcklbw %%mm7, %%mm0 \n\t"
2203 "punpcklbw %%mm7, %%mm2 \n\t"
2204 "pmaddwd %%mm1, %%mm0 \n\t"
2205 "pmaddwd %%mm2, %%mm3 \n\t"
/* horizontal add of the two dword partial sums, scale down, store 2x int16 */
2206 "movq %%mm0, %%mm4 \n\t"
2207 "punpckldq %%mm3, %%mm0 \n\t"
2208 "punpckhdq %%mm3, %%mm4 \n\t"
2209 "paddd %%mm4, %%mm0 \n\t"
2210 "psrad $7, %%mm0 \n\t"
2211 "packssdw %%mm0, %%mm0 \n\t"
2212 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2213 "add $4, %%"REG_BP" \n\t"
2216 "pop %%"REG_BP" \n\t"
2218 "pop %%"REG_b" \n\t"
2221 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2227 else if (filterSize==8)
2229 long counter= -2*dstW;
2231 filterPos-= counter/2;
2235 "push %%"REG_b" \n\t"
2237 "pxor %%mm7, %%mm7 \n\t"
2238 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2239 "mov %%"REG_a", %%"REG_BP" \n\t"
/* same scheme as the 4-tap path, but with a second pmaddwd round for taps 4..7 */
2242 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2243 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2244 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2245 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2246 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2247 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2248 "punpcklbw %%mm7, %%mm0 \n\t"
2249 "punpcklbw %%mm7, %%mm2 \n\t"
2250 "pmaddwd %%mm1, %%mm0 \n\t"
2251 "pmaddwd %%mm2, %%mm3 \n\t"
2253 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2254 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2255 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2256 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2257 "punpcklbw %%mm7, %%mm4 \n\t"
2258 "punpcklbw %%mm7, %%mm2 \n\t"
2259 "pmaddwd %%mm1, %%mm4 \n\t"
2260 "pmaddwd %%mm2, %%mm5 \n\t"
2261 "paddd %%mm4, %%mm0 \n\t"
2262 "paddd %%mm5, %%mm3 \n\t"
2263 "movq %%mm0, %%mm4 \n\t"
2264 "punpckldq %%mm3, %%mm0 \n\t"
2265 "punpckhdq %%mm3, %%mm4 \n\t"
2266 "paddd %%mm4, %%mm0 \n\t"
2267 "psrad $7, %%mm0 \n\t"
2268 "packssdw %%mm0, %%mm0 \n\t"
2269 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2270 "add $4, %%"REG_BP" \n\t"
2273 "pop %%"REG_BP" \n\t"
2275 "pop %%"REG_b" \n\t"
2278 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* generic MMX path: inner loop over the filter, 4 taps per iteration,
   accumulating two output samples in mm4/mm5 until 'offset' is reached */
2286 uint8_t *offset = src+filterSize;
2287 long counter= -2*dstW;
2288 //filter-= counter*filterSize/2;
2289 filterPos-= counter/2;
2292 "pxor %%mm7, %%mm7 \n\t"
2295 "mov %2, %%"REG_c" \n\t"
2296 "movzwl (%%"REG_c", %0), %%eax \n\t"
2297 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2298 "mov %5, %%"REG_c" \n\t"
2299 "pxor %%mm4, %%mm4 \n\t"
2300 "pxor %%mm5, %%mm5 \n\t"
2302 "movq (%1), %%mm1 \n\t"
2303 "movq (%1, %6), %%mm3 \n\t"
2304 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2305 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2306 "punpcklbw %%mm7, %%mm0 \n\t"
2307 "punpcklbw %%mm7, %%mm2 \n\t"
2308 "pmaddwd %%mm1, %%mm0 \n\t"
2309 "pmaddwd %%mm2, %%mm3 \n\t"
2310 "paddd %%mm3, %%mm5 \n\t"
2311 "paddd %%mm0, %%mm4 \n\t"
2313 "add $4, %%"REG_c" \n\t"
2314 "cmp %4, %%"REG_c" \n\t"
2317 "movq %%mm4, %%mm0 \n\t"
2318 "punpckldq %%mm5, %%mm4 \n\t"
2319 "punpckhdq %%mm5, %%mm0 \n\t"
2320 "paddd %%mm0, %%mm4 \n\t"
2321 "psrad $7, %%mm4 \n\t"
2322 "packssdw %%mm4, %%mm4 \n\t"
2323 "mov %3, %%"REG_a" \n\t"
2324 "movd %%mm4, (%%"REG_a", %0) \n\t"
2328 : "+r" (counter), "+r" (filter)
2329 : "m" (filterPos), "m" (dst), "m"(offset),
2330 "m" (src), "r" (filterSize*2)
2331 : "%"REG_a, "%"REG_c, "%"REG_d
/* AltiVec implementation (see swscale_altivec_template.c) */
2336 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
/* portable scalar fallback */
2339 for (i=0; i<dstW; i++)
2342 int srcPos= filterPos[i];
2344 //printf("filterPos: %d\n", filterPos[i]);
2345 for (j=0; j<filterSize; j++)
2347 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2348 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2350 //filter += hFilterSize;
2351 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2354 #endif /* HAVE_ALTIVEC */
2355 #endif /* HAVE_MMX */
2357 // *** horizontal scale Y line to temp buffer
/*
 * Horizontally scale one luma line into the 16-bit temp buffer 'dst'.
 * Steps visible in this chunk:
 *   1. If the source format is not planar 8-bit, convert the line to Y8
 *      into formatConvBuffer and scale from there.
 *   2. Pick a scaler: the generic hScale() (filter-based), the MMX2
 *      "funny code" runtime-generated scaler, a plain-x86 bilinear asm
 *      loop, or the portable C bilinear loop.
 *   3. Optionally expand/compress the luma range (JPEG <-> MPEG levels).
 * NOTE(review): asm() heads, labels, braces and several #if directives
 * are elided from this chunk; code left byte-identical because the asm
 * is too order-sensitive to restyle safely.
 */
2358 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2359 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2360 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2361 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2362 int32_t *mmx2FilterPos, uint32_t *pal)
/* --- input-format -> Y8 conversion dispatch --- */
2364 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2366 RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2367 src= formatConvBuffer;
2369 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2371 RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2372 src= formatConvBuffer;
2374 else if (srcFormat==PIX_FMT_RGB32)
2376 RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2377 src= formatConvBuffer;
2379 else if (srcFormat==PIX_FMT_RGB32_1)
/* _1 variants: alpha-first layout, skip it via ALT32_CORR byte offset */
2381 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2382 src= formatConvBuffer;
2384 else if (srcFormat==PIX_FMT_BGR24)
2386 RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2387 src= formatConvBuffer;
2389 else if (srcFormat==PIX_FMT_BGR565)
2391 RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2392 src= formatConvBuffer;
2394 else if (srcFormat==PIX_FMT_BGR555)
2396 RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2397 src= formatConvBuffer;
2399 else if (srcFormat==PIX_FMT_BGR32)
2401 RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2402 src= formatConvBuffer;
2404 else if (srcFormat==PIX_FMT_BGR32_1)
2406 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2407 src= formatConvBuffer;
2409 else if (srcFormat==PIX_FMT_RGB24)
2411 RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2412 src= formatConvBuffer;
2414 else if (srcFormat==PIX_FMT_RGB565)
2416 RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2417 src= formatConvBuffer;
2419 else if (srcFormat==PIX_FMT_RGB555)
2421 RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2422 src= formatConvBuffer;
2424 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2426 RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2427 src= formatConvBuffer;
2429 else if (srcFormat==PIX_FMT_MONOBLACK)
2431 RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
2432 src= formatConvBuffer;
2434 else if (srcFormat==PIX_FMT_MONOWHITE)
2436 RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
2437 src= formatConvBuffer;
/* --- scaler selection --- */
2441 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2442 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2444 if (!(flags&SWS_FAST_BILINEAR))
2447 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2449 else // fast bilinear upscale / crap downscale
2451 #if defined(ARCH_X86)
/* MMX2 path: jump into runtime-generated code blocks ("funnyYCode") */
2455 uint64_t ebxsave __attribute__((aligned(8)));
/* save ebx/rbx — it may be the PIC register and cannot be clobbered */
2461 "mov %%"REG_b", %5 \n\t"
2463 "pxor %%mm7, %%mm7 \n\t"
2464 "mov %0, %%"REG_c" \n\t"
2465 "mov %1, %%"REG_D" \n\t"
2466 "mov %2, %%"REG_d" \n\t"
2467 "mov %3, %%"REG_b" \n\t"
2468 "xor %%"REG_a", %%"REG_a" \n\t" // i
2469 PREFETCH" (%%"REG_c") \n\t"
2470 PREFETCH" 32(%%"REG_c") \n\t"
2471 PREFETCH" 64(%%"REG_c") \n\t"
/* 64-bit variant: funnyYCode entries hold src advances to add via REG_S */
2475 #define FUNNY_Y_CODE \
2476 "movl (%%"REG_b"), %%esi \n\t"\
2478 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2479 "add %%"REG_S", %%"REG_c" \n\t"\
2480 "add %%"REG_a", %%"REG_D" \n\t"\
2481 "xor %%"REG_a", %%"REG_a" \n\t"\
/* 32-bit variant of the same dispatch macro */
2485 #define FUNNY_Y_CODE \
2486 "movl (%%"REG_b"), %%esi \n\t"\
2488 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2489 "add %%"REG_a", %%"REG_D" \n\t"\
2490 "xor %%"REG_a", %%"REG_a" \n\t"\
2492 #endif /* ARCH_X86_64 */
/* restore the saved ebx/rbx */
2504 "mov %5, %%"REG_b" \n\t"
2506 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2511 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* pad the right edge: replicate the last source pixel (<<7 scale) */
2516 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2520 #endif /* HAVE_MMX2 */
2521 long xInc_shr16 = xInc >> 16;
2522 uint16_t xInc_mask = xInc & 0xffff;
2523 //NO MMX just normal asm ...
2525 "xor %%"REG_a", %%"REG_a" \n\t" // i
2526 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2527 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
/* bilinear interpolation, unrolled 2x: out = src[xx] + (src[xx+1]-src[xx])*alpha */
2530 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2531 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2532 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2533 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2534 "shll $16, %%edi \n\t"
2535 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2536 "mov %1, %%"REG_D" \n\t"
2537 "shrl $9, %%esi \n\t"
2538 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2539 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2540 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2542 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2543 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2544 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2545 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2546 "shll $16, %%edi \n\t"
2547 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2548 "mov %1, %%"REG_D" \n\t"
2549 "shrl $9, %%esi \n\t"
2550 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2551 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2552 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2555 "add $2, %%"REG_a" \n\t"
2556 "cmp %2, %%"REG_a" \n\t"
2560 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2561 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2564 } //if MMX2 can't be used
/* portable C bilinear fallback (16.16 fixed-point position in xpos) */
2568 unsigned int xpos=0;
2569 for (i=0;i<dstWidth;i++)
2571 register unsigned int xx=xpos>>16;
2572 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2573 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2576 #endif /* defined(ARCH_X86) */
/* --- luma range conversion (JPEG full range <-> MPEG limited range) --- */
2579 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2581 //FIXME all pal and rgb srcFormats could do this convertion as well
2582 //FIXME all scalers more complex than bilinear could do half of this transform
2584 for (i=0; i<dstWidth; i++)
2585 dst[i]= (dst[i]*14071 + 33561947)>>14;
2587 for (i=0; i<dstWidth; i++)
2588 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
/*
 * Horizontally scale one chroma line pair (U into dst[0..], V into
 * dst[VOFW..]) into the 16-bit temp buffer.
 * Mirrors hyscale(): format conversion into formatConvBuffer (U plane at
 * offset 0, V plane at offset VOFW), then scaler selection (generic
 * hScale, MMX2 funny code, plain-x86 bilinear asm, or C bilinear), then
 * optional chroma range conversion. RGB inputs additionally pick a
 * *_half converter when the chroma is horizontally sub-sampled.
 * NOTE(review): asm() heads, labels, braces and several #if directives
 * are elided from this chunk; code left byte-identical because the asm
 * is too order-sensitive to restyle safely.
 */
2593 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2594 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2595 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2596 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2597 int32_t *mmx2FilterPos, uint32_t *pal)
/* --- input-format -> planar U/V conversion dispatch --- */
2599 if (srcFormat==PIX_FMT_YUYV422)
2601 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2602 src1= formatConvBuffer;
2603 src2= formatConvBuffer+VOFW;
2605 else if (srcFormat==PIX_FMT_UYVY422)
2607 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2608 src1= formatConvBuffer;
2609 src2= formatConvBuffer+VOFW;
2611 else if (srcFormat==PIX_FMT_RGB32)
/* _half variants average 2 source pixels when chroma is h-subsampled */
2613 if(c->chrSrcHSubSample)
2614 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2616 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2617 src1= formatConvBuffer;
2618 src2= formatConvBuffer+VOFW;
2620 else if (srcFormat==PIX_FMT_RGB32_1)
2622 if(c->chrSrcHSubSample)
2623 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2625 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2626 src1= formatConvBuffer;
2627 src2= formatConvBuffer+VOFW;
2629 else if (srcFormat==PIX_FMT_BGR24)
2631 if(c->chrSrcHSubSample)
2632 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2634 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2635 src1= formatConvBuffer;
2636 src2= formatConvBuffer+VOFW;
2638 else if (srcFormat==PIX_FMT_BGR565)
2640 if(c->chrSrcHSubSample)
2641 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2643 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2644 src1= formatConvBuffer;
2645 src2= formatConvBuffer+VOFW;
2647 else if (srcFormat==PIX_FMT_BGR555)
2649 if(c->chrSrcHSubSample)
2650 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2652 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2653 src1= formatConvBuffer;
2654 src2= formatConvBuffer+VOFW;
2656 else if (srcFormat==PIX_FMT_BGR32)
2658 if(c->chrSrcHSubSample)
2659 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2661 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2662 src1= formatConvBuffer;
2663 src2= formatConvBuffer+VOFW;
2665 else if (srcFormat==PIX_FMT_BGR32_1)
2667 if(c->chrSrcHSubSample)
2668 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2670 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2671 src1= formatConvBuffer;
2672 src2= formatConvBuffer+VOFW;
2674 else if (srcFormat==PIX_FMT_RGB24)
2676 if(c->chrSrcHSubSample)
2677 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2679 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2680 src1= formatConvBuffer;
2681 src2= formatConvBuffer+VOFW;
2683 else if (srcFormat==PIX_FMT_RGB565)
2685 if(c->chrSrcHSubSample)
2686 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2688 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2689 src1= formatConvBuffer;
2690 src2= formatConvBuffer+VOFW;
2692 else if (srcFormat==PIX_FMT_RGB555)
2694 if(c->chrSrcHSubSample)
2695 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2697 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2698 src1= formatConvBuffer;
2699 src2= formatConvBuffer+VOFW;
/* grayscale / mono inputs carry no chroma — nothing to do */
2701 else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2705 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2707 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2708 src1= formatConvBuffer;
2709 src2= formatConvBuffer+VOFW;
/* --- scaler selection --- */
2713 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2714 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2716 if (!(flags&SWS_FAST_BILINEAR))
/* U then V through the generic filter scaler */
2719 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2720 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2722 else // fast bilinear upscale / crap downscale
2724 #if defined(ARCH_X86)
/* MMX2 path: jump into runtime-generated code blocks ("funnyUVCode") */
2728 uint64_t ebxsave __attribute__((aligned(8)));
/* save ebx/rbx — it may be the PIC register and cannot be clobbered */
2734 "mov %%"REG_b", %6 \n\t"
2736 "pxor %%mm7, %%mm7 \n\t"
2737 "mov %0, %%"REG_c" \n\t"
2738 "mov %1, %%"REG_D" \n\t"
2739 "mov %2, %%"REG_d" \n\t"
2740 "mov %3, %%"REG_b" \n\t"
2741 "xor %%"REG_a", %%"REG_a" \n\t" // i
2742 PREFETCH" (%%"REG_c") \n\t"
2743 PREFETCH" 32(%%"REG_c") \n\t"
2744 PREFETCH" 64(%%"REG_c") \n\t"
/* 64-bit variant of the funny-code dispatch macro */
2748 #define FUNNY_UV_CODE \
2749 "movl (%%"REG_b"), %%esi \n\t"\
2751 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2752 "add %%"REG_S", %%"REG_c" \n\t"\
2753 "add %%"REG_a", %%"REG_D" \n\t"\
2754 "xor %%"REG_a", %%"REG_a" \n\t"\
/* 32-bit variant of the same macro */
2758 #define FUNNY_UV_CODE \
2759 "movl (%%"REG_b"), %%esi \n\t"\
2761 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2762 "add %%"REG_a", %%"REG_D" \n\t"\
2763 "xor %%"REG_a", %%"REG_a" \n\t"\
2765 #endif /* ARCH_X86_64 */
/* second pass for the V plane: dest advanced by VOF */
2771 "xor %%"REG_a", %%"REG_a" \n\t" // i
2772 "mov %5, %%"REG_c" \n\t" // src
2773 "mov %1, %%"REG_D" \n\t" // buf1
2774 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2775 PREFETCH" (%%"REG_c") \n\t"
2776 PREFETCH" 32(%%"REG_c") \n\t"
2777 PREFETCH" 64(%%"REG_c") \n\t"
/* restore the saved ebx/rbx */
2785 "mov %6, %%"REG_b" \n\t"
2787 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2788 "m" (funnyUVCode), "m" (src2)
2792 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* pad the right edge of both planes with the last source pixel */
2797 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2799 //printf("%d %d %d\n", dstWidth, i, srcW);
2800 dst[i] = src1[srcW-1]*128;
2801 dst[i+VOFW] = src2[srcW-1]*128;
2806 #endif /* HAVE_MMX2 */
2807 long xInc_shr16 = (long) (xInc >> 16);
2808 uint16_t xInc_mask = xInc & 0xffff;
/* plain x86 bilinear asm: U sample then V sample per iteration */
2810 "xor %%"REG_a", %%"REG_a" \n\t" // i
2811 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2812 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2815 "mov %0, %%"REG_S" \n\t"
2816 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2817 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2818 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2819 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2820 "shll $16, %%edi \n\t"
2821 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2822 "mov %1, %%"REG_D" \n\t"
2823 "shrl $9, %%esi \n\t"
2824 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2826 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2827 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2828 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2829 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2830 "shll $16, %%edi \n\t"
2831 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2832 "mov %1, %%"REG_D" \n\t"
2833 "shrl $9, %%esi \n\t"
2834 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2836 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2837 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2838 "add $1, %%"REG_a" \n\t"
2839 "cmp %2, %%"REG_a" \n\t"
2842 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2843 which is needed to support GCC 4.0. */
2844 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2845 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2847 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2850 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2853 } //if MMX2 can't be used
/* portable C bilinear fallback for both planes (16.16 fixed point) */
2857 unsigned int xpos=0;
2858 for (i=0;i<dstWidth;i++)
2860 register unsigned int xx=xpos>>16;
2861 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2862 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2863 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2865 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2866 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2870 #endif /* defined(ARCH_X86) */
/* --- chroma range conversion (JPEG full range <-> MPEG limited range) --- */
2872 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2874 //FIXME all pal and rgb srcFormats could do this convertion as well
2875 //FIXME all scalers more complex than bilinear could do half of this transform
2877 for (i=0; i<dstWidth; i++){
2878 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2879 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2882 for (i=0; i<dstWidth; i++){
2883 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2884 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2890 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2891 int srcSliceH, uint8_t* dst[], int dstStride[]){
2893 /* load a few things into local vars to make the code more readable? and faster */
2894 const int srcW= c->srcW;
2895 const int dstW= c->dstW;
2896 const int dstH= c->dstH;
2897 const int chrDstW= c->chrDstW;
2898 const int chrSrcW= c->chrSrcW;
2899 const int lumXInc= c->lumXInc;
2900 const int chrXInc= c->chrXInc;
2901 const int dstFormat= c->dstFormat;
2902 const int srcFormat= c->srcFormat;
2903 const int flags= c->flags;
2904 const int canMMX2BeUsed= c->canMMX2BeUsed;
2905 int16_t *vLumFilterPos= c->vLumFilterPos;
2906 int16_t *vChrFilterPos= c->vChrFilterPos;
2907 int16_t *hLumFilterPos= c->hLumFilterPos;
2908 int16_t *hChrFilterPos= c->hChrFilterPos;
2909 int16_t *vLumFilter= c->vLumFilter;
2910 int16_t *vChrFilter= c->vChrFilter;
2911 int16_t *hLumFilter= c->hLumFilter;
2912 int16_t *hChrFilter= c->hChrFilter;
2913 int32_t *lumMmxFilter= c->lumMmxFilter;
2914 int32_t *chrMmxFilter= c->chrMmxFilter;
2915 const int vLumFilterSize= c->vLumFilterSize;
2916 const int vChrFilterSize= c->vChrFilterSize;
2917 const int hLumFilterSize= c->hLumFilterSize;
2918 const int hChrFilterSize= c->hChrFilterSize;
2919 int16_t **lumPixBuf= c->lumPixBuf;
2920 int16_t **chrPixBuf= c->chrPixBuf;
2921 const int vLumBufSize= c->vLumBufSize;
2922 const int vChrBufSize= c->vChrBufSize;
2923 uint8_t *funnyYCode= c->funnyYCode;
2924 uint8_t *funnyUVCode= c->funnyUVCode;
2925 uint8_t *formatConvBuffer= c->formatConvBuffer;
2926 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2927 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2931 /* vars which will change and which we need to store back in the context */
2933 int lumBufIndex= c->lumBufIndex;
2934 int chrBufIndex= c->chrBufIndex;
2935 int lastInLumBuf= c->lastInLumBuf;
2936 int lastInChrBuf= c->lastInChrBuf;
2938 if (isPacked(c->srcFormat)){
2939 pal= (uint32_t *)src[1];
2945 srcStride[2]= srcStride[0];
2947 srcStride[1]<<= c->vChrDrop;
2948 srcStride[2]<<= c->vChrDrop;
2950 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2951 // (int)dst[0], (int)dst[1], (int)dst[2]);
2953 #if 0 //self test FIXME move to a vfilter or something
2955 static volatile int i=0;
2957 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2958 selfTest(src, srcStride, c->srcW, c->srcH);
2963 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2964 //dstStride[0],dstStride[1],dstStride[2]);
2966 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2968 static int firstTime=1; //FIXME move this into the context perhaps
2969 if (flags & SWS_PRINT_INFO && firstTime)
2971 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2972 " ->cannot do aligned memory accesses anymore\n");
2977 /* Note the user might start scaling the picture in the middle so this
2978 will not get executed. This is not really intended but works
2979 currently, so people might do it. */
2990 for (;dstY < dstH; dstY++){
2991 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2992 const int chrDstY= dstY>>c->chrDstVSubSample;
2993 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2994 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2996 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2997 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2998 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2999 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3001 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3002 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3003 //handle holes (FAST_BILINEAR & weird filters)
3004 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3005 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3006 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3007 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
3008 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
3010 // Do we have enough lines in this slice to output the dstY line
3011 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3013 //Do horizontal scaling
3014 while(lastInLumBuf < lastLumSrcY)
3016 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3018 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3019 assert(lumBufIndex < 2*vLumBufSize);
3020 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3021 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3022 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3023 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3024 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3025 funnyYCode, c->srcFormat, formatConvBuffer,
3026 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3029 while(lastInChrBuf < lastChrSrcY)
3031 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3032 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3034 assert(chrBufIndex < 2*vChrBufSize);
3035 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3036 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3037 //FIXME replace parameters through context struct (some at least)
3039 if (!(isGray(srcFormat) || isGray(dstFormat)))
3040 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3041 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3042 funnyUVCode, c->srcFormat, formatConvBuffer,
3043 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3046 //wrap buf index around to stay inside the ring buffer
3047 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3048 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3050 else // not enough lines left in this slice -> load the rest in the buffer
3052 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3053 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3054 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3055 vChrBufSize, vLumBufSize);*/
3057 //Do horizontal scaling
3058 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3060 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3062 assert(lumBufIndex < 2*vLumBufSize);
3063 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3064 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3065 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3066 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3067 funnyYCode, c->srcFormat, formatConvBuffer,
3068 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3071 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3073 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3074 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3076 assert(chrBufIndex < 2*vChrBufSize);
3077 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3078 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3080 if (!(isGray(srcFormat) || isGray(dstFormat)))
3081 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3082 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3083 funnyUVCode, c->srcFormat, formatConvBuffer,
3084 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3087 //wrap buf index around to stay inside the ring buffer
3088 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3089 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3090 break; //we can't output a dstY line so let's try with the next slice
3094 c->blueDither= ff_dither8[dstY&1];
3095 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
3096 c->greenDither= ff_dither8[dstY&1];
3098 c->greenDither= ff_dither4[dstY&1];
3099 c->redDither= ff_dither8[(dstY+1)&1];
3103 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3104 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3107 if (flags & SWS_ACCURATE_RND){
3108 int s= APCK_SIZE / 8;
3109 for (i=0; i<vLumFilterSize; i+=2){
3110 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
3111 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
3112 lumMmxFilter[s*i+APCK_COEF/4 ]=
3113 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
3114 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3116 for (i=0; i<vChrFilterSize; i+=2){
3117 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
3118 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
3119 chrMmxFilter[s*i+APCK_COEF/4 ]=
3120 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
3121 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3124 for (i=0; i<vLumFilterSize; i++)
3126 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3127 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3128 lumMmxFilter[4*i+2]=
3129 lumMmxFilter[4*i+3]=
3130 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3132 for (i=0; i<vChrFilterSize; i++)
3134 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3135 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3136 chrMmxFilter[4*i+2]=
3137 chrMmxFilter[4*i+3]=
3138 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3142 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3143 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3144 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3145 RENAME(yuv2nv12X)(c,
3146 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3147 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3148 dest, uDest, dstW, chrDstW, dstFormat);
3150 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
3152 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3153 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3154 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3156 int16_t *lumBuf = lumPixBuf[0];
3157 int16_t *chrBuf= chrPixBuf[0];
3158 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3163 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3164 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3165 dest, uDest, vDest, dstW, chrDstW);
3170 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3171 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3172 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3174 int chrAlpha= vChrFilter[2*dstY+1];
3175 if(flags & SWS_FULL_CHR_H_INT){
3176 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
3177 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3178 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3181 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3182 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3185 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3187 int lumAlpha= vLumFilter[2*dstY+1];
3188 int chrAlpha= vChrFilter[2*dstY+1];
3190 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3192 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3193 if(flags & SWS_FULL_CHR_H_INT){
3194 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
3195 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3196 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3199 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3200 dest, dstW, lumAlpha, chrAlpha, dstY);
3205 if(flags & SWS_FULL_CHR_H_INT){
3207 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3208 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3211 RENAME(yuv2packedX)(c,
3212 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3213 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3219 else // hmm looks like we can't use MMX here without overwriting this array's tail
3221 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3222 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3223 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3224 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3225 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3227 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3228 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3229 dest, uDest, dstW, chrDstW, dstFormat);
3231 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
3233 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3234 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3236 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3237 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3238 dest, uDest, vDest, dstW, chrDstW);
3242 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3243 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3244 if(flags & SWS_FULL_CHR_H_INT){
3246 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3247 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3251 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3252 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3260 asm volatile(SFENCE:::"memory");
3261 asm volatile(EMMS:::"memory");
3263 /* store changed local vars back in the context */
3265 c->lumBufIndex= lumBufIndex;
3266 c->chrBufIndex= chrBufIndex;
3267 c->lastInLumBuf= lastInLumBuf;
3268 c->lastInChrBuf= lastInChrBuf;
3270 return dstY - lastDstY;