2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
40 #define PREFETCH "prefetch" /* 3DNow! data prefetch -- NOTE(review): the opening #ifdef (presumably HAVE_3DNOW) is missing from this chunk */
41 #define PREFETCHW "prefetchw" /* 3DNow! prefetch-for-write */
42 #elif defined (HAVE_MMX2)
43 #define PREFETCH "prefetchnta" /* SSE non-temporal prefetch (bypass cache pollution) */
44 #define PREFETCHW "prefetcht0" /* prefetch into all cache levels */
46 #define PREFETCH " # nop" /* no prefetch support: expands to an asm comment so callers need no #ifdefs */
47 #define PREFETCHW " # nop"
51 #define SFENCE "sfence" /* store fence -- orders the non-temporal movntq stores; nop'd below when unavailable */
53 #define SFENCE " # nop"
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" /* byte average -- MMX2 spelling; #ifdef guards missing from this chunk */
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" /* 3DNow! equivalent of pavgb */
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" /* non-temporal store (MMX2); requires SFENCE afterwards */
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" /* plain store fallback */
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) /* extra level so macro arguments get expanded before stringification */
70 #include "swscale_altivec_template.c"
73 #define YSCALEYUV2YV12X(x, offset, dest, width) /* vertical scale to planar 8-bit: walk the (pointer,coeff) filter list at "offset"(%0), accumulate pmulhw products onto the rounder, >>3, pack unsigned, store 8 px/iter -- NOTE(review): asm volatile(), loop labels and jump lines are missing from this chunk */ \
75     "xor %%"REG_a", %%"REG_a"                \n\t"\
76     "movq "VROUNDER_OFFSET"(%0), %%mm3       \n\t"\
77     "movq %%mm3, %%mm4                       \n\t"\
78     "lea " offset "(%0), %%"REG_d"           \n\t"\
79     "mov (%%"REG_d"), %%"REG_S"              \n\t"\
80     ASMALIGN(4) /* FIXME Unroll? */\
82     "movq 8(%%"REG_d"), %%mm0                \n\t" /* filterCoeff */\
83     "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84     "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85     "add $16, %%"REG_d"                      \n\t"\
86     "mov (%%"REG_d"), %%"REG_S"              \n\t"\
87     "test %%"REG_S", %%"REG_S"               \n\t" /* NULL src pointer terminates the filter list */\
88     "pmulhw %%mm0, %%mm2                     \n\t"\
89     "pmulhw %%mm0, %%mm5                     \n\t"\
90     "paddw %%mm2, %%mm3                      \n\t"\
91     "paddw %%mm5, %%mm4                      \n\t"\
93     "psraw $3, %%mm3                         \n\t"\
94     "psraw $3, %%mm4                         \n\t"\
95     "packuswb %%mm4, %%mm3                   \n\t"\
96     MOVNTQ(%%mm3, (%1, %%REGa))\
97     "add $8, %%"REG_a"                       \n\t"\
98     "cmp %2, %%"REG_a"                       \n\t"\
99     "movq "VROUNDER_OFFSET"(%0), %%mm3       \n\t" /* reload rounder for the next 8 pixels */\
100     "movq %%mm3, %%mm4                       \n\t"\
101     "lea " offset "(%0), %%"REG_d"           \n\t"\
102     "mov (%%"REG_d"), %%"REG_S"              \n\t"\
104     :: "r" (&c->redDither),\
105     "r" (dest), "g" (width)\
106     : "%"REG_a, "%"REG_d, "%"REG_S\
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) /* high-precision variant: processes two filter taps per step via punpck+pmaddwd into 32-bit accumulators (mm4..mm7) -- NOTE(review): asm volatile()/label lines missing from this chunk */ \
111     "lea " offset "(%0), %%"REG_d"           \n\t"\
112     "xor %%"REG_a", %%"REG_a"                \n\t"\
113     "pxor %%mm4, %%mm4                       \n\t"\
114     "pxor %%mm5, %%mm5                       \n\t"\
115     "pxor %%mm6, %%mm6                       \n\t"\
116     "pxor %%mm7, %%mm7                       \n\t"\
117     "mov (%%"REG_d"), %%"REG_S"              \n\t"\
120     "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121     "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t" /* second source line of the tap pair */\
123     "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124     "movq %%mm0, %%mm3                       \n\t"\
125     "punpcklwd %%mm1, %%mm0                  \n\t" /* interleave the two lines so pmaddwd applies both coeffs at once */\
126     "punpckhwd %%mm1, %%mm3                  \n\t"\
127     "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128     "pmaddwd %%mm1, %%mm0                    \n\t"\
129     "pmaddwd %%mm1, %%mm3                    \n\t"\
130     "paddd %%mm0, %%mm4                      \n\t"\
131     "paddd %%mm3, %%mm5                      \n\t"\
132     "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134     "add $"STR(APCK_SIZE)", %%"REG_d"        \n\t"\
135     "test %%"REG_S", %%"REG_S"               \n\t" /* NULL pointer terminates the packed filter list */\
136     "movq %%mm2, %%mm0                       \n\t"\
137     "punpcklwd %%mm3, %%mm2                  \n\t"\
138     "punpckhwd %%mm3, %%mm0                  \n\t"\
139     "pmaddwd %%mm1, %%mm2                    \n\t"\
140     "pmaddwd %%mm1, %%mm0                    \n\t"\
141     "paddd %%mm2, %%mm6                      \n\t"\
142     "paddd %%mm0, %%mm7                      \n\t"\
144     "psrad $16, %%mm4                        \n\t"\
145     "psrad $16, %%mm5                        \n\t"\
146     "psrad $16, %%mm6                        \n\t"\
147     "psrad $16, %%mm7                        \n\t"\
148     "movq "VROUNDER_OFFSET"(%0), %%mm0       \n\t"\
149     "packssdw %%mm5, %%mm4                   \n\t"\
150     "packssdw %%mm7, %%mm6                   \n\t"\
151     "paddw %%mm0, %%mm4                      \n\t"\
152     "paddw %%mm0, %%mm6                      \n\t"\
153     "psraw $3, %%mm4                         \n\t"\
154     "psraw $3, %%mm6                         \n\t"\
155     "packuswb %%mm6, %%mm4                   \n\t"\
156     MOVNTQ(%%mm4, (%1, %%REGa))\
157     "add $8, %%"REG_a"                       \n\t"\
158     "cmp %2, %%"REG_a"                       \n\t"\
159     "lea " offset "(%0), %%"REG_d"           \n\t" /* rewind filter list and clear accumulators for the next 8 px */\
160     "pxor %%mm4, %%mm4                       \n\t"\
161     "pxor %%mm5, %%mm5                       \n\t"\
162     "pxor %%mm6, %%mm6                       \n\t"\
163     "pxor %%mm7, %%mm7                       \n\t"\
164     "mov (%%"REG_d"), %%"REG_S"              \n\t"\
166     :: "r" (&c->redDither),\
167     "r" (dest), "g" (width)\
168     : "%"REG_a, "%"REG_d, "%"REG_S\
171 #define YSCALEYUV2YV121 /* 1-tap fast path: 16-bit intermediate >>7, pack to unsigned bytes, no rounding -- NOTE(review): asm volatile()/label lines missing from this chunk */ \
172     "mov %2, %%"REG_a"                       \n\t"\
173     ASMALIGN(4) /* FIXME Unroll? */\
175     "movq (%0, %%"REG_a", 2), %%mm0          \n\t"\
176     "movq 8(%0, %%"REG_a", 2), %%mm1         \n\t"\
177     "psraw $7, %%mm0                         \n\t"\
178     "psraw $7, %%mm1                         \n\t"\
179     "packuswb %%mm1, %%mm0                   \n\t"\
180     MOVNTQ(%%mm0, (%1, %%REGa))\
181     "add $8, %%"REG_a"                       \n\t"\
184 #define YSCALEYUV2YV121_ACCURATE /* like YSCALEYUV2YV121 but adds a rounding bias before >>7; mm7 is built as 1<<6 via pcmpeq/psrlw/psllw -- NOTE(review): asm volatile()/label lines missing from this chunk */ \
185     "mov %2, %%"REG_a"                       \n\t"\
186     "pcmpeqw %%mm7, %%mm7                    \n\t" /* all-ones */\
187     "psrlw $15, %%mm7                        \n\t" /* 0x0001 per word */\
188     "psllw $6, %%mm7                         \n\t" /* 0x0040 per word: round-to-nearest bias for >>7 */\
189     ASMALIGN(4) /* FIXME Unroll? */\
191     "movq (%0, %%"REG_a", 2), %%mm0          \n\t"\
192     "movq 8(%0, %%"REG_a", 2), %%mm1         \n\t"\
193     "paddsw %%mm7, %%mm0                     \n\t"\
194     "paddsw %%mm7, %%mm1                     \n\t"\
195     "psraw $7, %%mm0                         \n\t"\
196     "psraw $7, %%mm1                         \n\t"\
197     "packuswb %%mm1, %%mm0                   \n\t"\
198     MOVNTQ(%%mm0, (%1, %%REGa))\
199     "add $8, %%"REG_a"                       \n\t"\
203     :: "m" (-lumFilterSize), "m" (-chrFilterSize), /* NOTE(review): orphaned input-operand/clobber tail -- the asm statement it belongs to is not visible in this chunk; presumably part of a disabled/alternate vertical-scale loop */
204     "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205     "r" (dest), "m" (dstW),
206     "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207     : "%eax", "%ebx", "%ecx", "%edx", "%esi"
209 #define YSCALEYUV2PACKEDX /* vertical scale for packed output: first sums the chroma filter list (U in mm3, V in mm4), then the luma list (Y1 in mm1, Y2 in mm7) -- NOTE(review): asm volatile()/label/jump lines missing from this chunk */ \
211     "xor %%"REG_a", %%"REG_a"                \n\t"\
215     "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216     "mov (%%"REG_d"), %%"REG_S"              \n\t"\
217     "movq "VROUNDER_OFFSET"(%0), %%mm3       \n\t"\
218     "movq %%mm3, %%mm4                       \n\t"\
221     "movq 8(%%"REG_d"), %%mm0                \n\t" /* filterCoeff */\
222     "movq (%%"REG_S", %%"REG_a"), %%mm2      \n\t" /* UsrcData */\
223     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224     "add $16, %%"REG_d"                      \n\t"\
225     "mov (%%"REG_d"), %%"REG_S"              \n\t"\
226     "pmulhw %%mm0, %%mm2                     \n\t"\
227     "pmulhw %%mm0, %%mm5                     \n\t"\
228     "paddw %%mm2, %%mm3                      \n\t"\
229     "paddw %%mm5, %%mm4                      \n\t"\
230     "test %%"REG_S", %%"REG_S"               \n\t" /* NULL src pointer terminates the chroma filter list */\
233     "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
234     "mov (%%"REG_d"), %%"REG_S"              \n\t"\
235     "movq "VROUNDER_OFFSET"(%0), %%mm1       \n\t"\
236     "movq %%mm1, %%mm7                       \n\t"\
239     "movq 8(%%"REG_d"), %%mm0                \n\t" /* filterCoeff */\
240     "movq (%%"REG_S", %%"REG_a", 2), %%mm2   \n\t" /* Y1srcData */\
241     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5  \n\t" /* Y2srcData */\
242     "add $16, %%"REG_d"                      \n\t"\
243     "mov (%%"REG_d"), %%"REG_S"              \n\t"\
244     "pmulhw %%mm0, %%mm2                     \n\t"\
245     "pmulhw %%mm0, %%mm5                     \n\t"\
246     "paddw %%mm2, %%mm1                      \n\t"\
247     "paddw %%mm5, %%mm7                      \n\t"\
248     "test %%"REG_S", %%"REG_S"               \n\t" /* NULL src pointer terminates the luma filter list */\
251 #define YSCALEYUV2PACKEDX_END /* closing operand/clobber list shared by the YSCALEYUV2PACKEDX asm statements */ \
252     :: "r" (&c->redDither), \
253     "m" (dummy), "m" (dummy), "m" (dummy),\
254     "r" (dest), "m" (dstW) \
255     : "%"REG_a, "%"REG_d, "%"REG_S \
258 #define YSCALEYUV2PACKEDX_ACCURATE /* high-precision packed-output vertical scale: pmaddwd two taps at a time into dword accumulators; chroma result is parked in U_TEMP/V_TEMP while luma is summed -- NOTE(review): asm volatile()/label lines missing from this chunk */ \
260     "xor %%"REG_a", %%"REG_a"                \n\t"\
264     "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
265     "mov (%%"REG_d"), %%"REG_S"              \n\t"\
266     "pxor %%mm4, %%mm4                       \n\t"\
267     "pxor %%mm5, %%mm5                       \n\t"\
268     "pxor %%mm6, %%mm6                       \n\t"\
269     "pxor %%mm7, %%mm7                       \n\t"\
272     "movq (%%"REG_S", %%"REG_a"), %%mm0      \n\t" /* UsrcData */\
273     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
274     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
275     "movq (%%"REG_S", %%"REG_a"), %%mm1      \n\t" /* UsrcData */\
276     "movq %%mm0, %%mm3                       \n\t"\
277     "punpcklwd %%mm1, %%mm0                  \n\t"\
278     "punpckhwd %%mm1, %%mm3                  \n\t"\
279     "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1  \n\t" /* filterCoeff */\
280     "pmaddwd %%mm1, %%mm0                    \n\t"\
281     "pmaddwd %%mm1, %%mm3                    \n\t"\
282     "paddd %%mm0, %%mm4                      \n\t"\
283     "paddd %%mm3, %%mm5                      \n\t"\
284     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
285     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
286     "add $"STR(APCK_SIZE)", %%"REG_d"        \n\t"\
287     "test %%"REG_S", %%"REG_S"               \n\t" /* NULL terminates the packed chroma filter list */\
288     "movq %%mm2, %%mm0                       \n\t"\
289     "punpcklwd %%mm3, %%mm2                  \n\t"\
290     "punpckhwd %%mm3, %%mm0                  \n\t"\
291     "pmaddwd %%mm1, %%mm2                    \n\t"\
292     "pmaddwd %%mm1, %%mm0                    \n\t"\
293     "paddd %%mm2, %%mm6                      \n\t"\
294     "paddd %%mm0, %%mm7                      \n\t"\
296     "psrad $16, %%mm4                        \n\t"\
297     "psrad $16, %%mm5                        \n\t"\
298     "psrad $16, %%mm6                        \n\t"\
299     "psrad $16, %%mm7                        \n\t"\
300     "movq "VROUNDER_OFFSET"(%0), %%mm0       \n\t"\
301     "packssdw %%mm5, %%mm4                   \n\t"\
302     "packssdw %%mm7, %%mm6                   \n\t"\
303     "paddw %%mm0, %%mm4                      \n\t"\
304     "paddw %%mm0, %%mm6                      \n\t"\
305     "movq %%mm4, "U_TEMP"(%0)                \n\t" /* spill U so all 8 MMX regs are free for the luma pass */\
306     "movq %%mm6, "V_TEMP"(%0)                \n\t" /* spill V */\
308     "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
309     "mov (%%"REG_d"), %%"REG_S"              \n\t"\
310     "pxor %%mm1, %%mm1                       \n\t"\
311     "pxor %%mm5, %%mm5                       \n\t"\
312     "pxor %%mm7, %%mm7                       \n\t"\
313     "pxor %%mm6, %%mm6                       \n\t"\
316     "movq (%%"REG_S", %%"REG_a", 2), %%mm0   \n\t" /* Y1srcData */\
317     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2  \n\t" /* Y2srcData */\
318     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
319     "movq (%%"REG_S", %%"REG_a", 2), %%mm4   \n\t" /* Y1srcData */\
320     "movq %%mm0, %%mm3                       \n\t"\
321     "punpcklwd %%mm4, %%mm0                  \n\t"\
322     "punpckhwd %%mm4, %%mm3                  \n\t"\
323     "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
324     "pmaddwd %%mm4, %%mm0                    \n\t"\
325     "pmaddwd %%mm4, %%mm3                    \n\t"\
326     "paddd %%mm0, %%mm1                      \n\t"\
327     "paddd %%mm3, %%mm5                      \n\t"\
328     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3  \n\t" /* Y2srcData */\
329     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
330     "add $"STR(APCK_SIZE)", %%"REG_d"        \n\t"\
331     "test %%"REG_S", %%"REG_S"               \n\t" /* NULL terminates the packed luma filter list */\
332     "movq %%mm2, %%mm0                       \n\t"\
333     "punpcklwd %%mm3, %%mm2                  \n\t"\
334     "punpckhwd %%mm3, %%mm0                  \n\t"\
335     "pmaddwd %%mm4, %%mm2                    \n\t"\
336     "pmaddwd %%mm4, %%mm0                    \n\t"\
337     "paddd %%mm2, %%mm7                      \n\t"\
338     "paddd %%mm0, %%mm6                      \n\t"\
340     "psrad $16, %%mm1                        \n\t"\
341     "psrad $16, %%mm5                        \n\t"\
342     "psrad $16, %%mm7                        \n\t"\
343     "psrad $16, %%mm6                        \n\t"\
344     "movq "VROUNDER_OFFSET"(%0), %%mm0       \n\t"\
345     "packssdw %%mm5, %%mm1                   \n\t"\
346     "packssdw %%mm6, %%mm7                   \n\t"\
347     "paddw %%mm0, %%mm1                      \n\t"\
348     "paddw %%mm0, %%mm7                      \n\t"\
349     "movq "U_TEMP"(%0), %%mm3                \n\t" /* reload spilled U/V for the RGB stage: mm1=Y1 mm3=U mm4=V mm7=Y2 */\
350     "movq "V_TEMP"(%0), %%mm4                \n\t"\
352 #define YSCALEYUV2RGBX /* YUV->RGB matrix stage: in mm1=Y1 mm3=U mm4=V mm7=Y2; out mm2/mm4/mm5 = packed B/G/R bytes, mm7=0 (coeffs/offsets live in the context at %0) */ \
353     "psubw "U_OFFSET"(%0), %%mm3             \n\t" /* (U-128)8*/\
354     "psubw "V_OFFSET"(%0), %%mm4             \n\t" /* (V-128)8*/\
355     "movq %%mm3, %%mm2                       \n\t" /* (U-128)8*/\
356     "movq %%mm4, %%mm5                       \n\t" /* (V-128)8*/\
357     "pmulhw "UG_COEFF"(%0), %%mm3            \n\t"\
358     "pmulhw "VG_COEFF"(%0), %%mm4            \n\t"\
359 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360     "pmulhw "UB_COEFF"(%0), %%mm2            \n\t"\
361     "pmulhw "VR_COEFF"(%0), %%mm5            \n\t"\
362     "psubw "Y_OFFSET"(%0), %%mm1             \n\t" /* 8(Y-16)*/\
363     "psubw "Y_OFFSET"(%0), %%mm7             \n\t" /* 8(Y-16)*/\
364     "pmulhw "Y_COEFF"(%0), %%mm1             \n\t"\
365     "pmulhw "Y_COEFF"(%0), %%mm7             \n\t"\
366 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367     "paddw %%mm3, %%mm4                      \n\t" /* g = ug + vg */\
368     "movq %%mm2, %%mm0                       \n\t"\
369     "movq %%mm5, %%mm6                       \n\t"\
370     "movq %%mm4, %%mm3                       \n\t"\
371     "punpcklwd %%mm2, %%mm2                  \n\t" /* duplicate chroma words: each chroma sample covers 2 luma pixels */\
372     "punpcklwd %%mm5, %%mm5                  \n\t"\
373     "punpcklwd %%mm4, %%mm4                  \n\t"\
374     "paddw %%mm1, %%mm2                      \n\t"\
375     "paddw %%mm1, %%mm5                      \n\t"\
376     "paddw %%mm1, %%mm4                      \n\t"\
377     "punpckhwd %%mm0, %%mm0                  \n\t"\
378     "punpckhwd %%mm6, %%mm6                  \n\t"\
379     "punpckhwd %%mm3, %%mm3                  \n\t"\
380     "paddw %%mm7, %%mm0                      \n\t"\
381     "paddw %%mm7, %%mm6                      \n\t"\
382     "paddw %%mm7, %%mm3                      \n\t"\
383 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
384     "packuswb %%mm0, %%mm2                   \n\t" /* saturating pack clips to [0,255] */\
385     "packuswb %%mm6, %%mm5                   \n\t"\
386     "packuswb %%mm3, %%mm4                   \n\t"\
387     "pxor %%mm7, %%mm7                       \n\t"
389 #define FULL_YSCALEYUV2RGB /* blend two luma (buf0/buf1 by yalpha) and two chroma buffers (by uvalpha), then YUV->RGB via the MANGLE()d global coefficient tables -- NOTE(review): loop label and some interleaved lines (418-419, 427-428, 435, 439) are missing from this chunk */ \
390     "pxor %%mm7, %%mm7                       \n\t"\
391     "movd %6, %%mm6                          \n\t" /*yalpha1*/\
392     "punpcklwd %%mm6, %%mm6                  \n\t"\
393     "punpcklwd %%mm6, %%mm6                  \n\t"\
394     "movd %7, %%mm5                          \n\t" /*uvalpha1*/\
395     "punpcklwd %%mm5, %%mm5                  \n\t"\
396     "punpcklwd %%mm5, %%mm5                  \n\t"\
397     "xor %%"REG_a", %%"REG_a"                \n\t"\
400     "movq (%0, %%"REG_a",2), %%mm0           \n\t" /*buf0[eax]*/\
401     "movq (%1, %%"REG_a",2), %%mm1           \n\t" /*buf1[eax]*/\
402     "movq (%2, %%"REG_a",2), %%mm2           \n\t" /* uvbuf0[eax]*/\
403     "movq (%3, %%"REG_a",2), %%mm3           \n\t" /* uvbuf1[eax]*/\
404     "psubw %%mm1, %%mm0                      \n\t" /* buf0[eax] - buf1[eax]*/\
405     "psubw %%mm3, %%mm2                      \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
406     "pmulhw %%mm6, %%mm0                     \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407     "pmulhw %%mm5, %%mm2                     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
408     "psraw $4, %%mm1                         \n\t" /* buf0[eax] - buf1[eax] >>4*/\
409     "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
410     "psraw $4, %%mm3                         \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
411     "paddw %%mm0, %%mm1                      \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
412     "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
413     "paddw %%mm2, %%mm3                      \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
414     "psubw %%mm0, %%mm4                      \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
415     "psubw "MANGLE(w80)", %%mm1              \n\t" /* 8(Y-16)*/\
416     "psubw "MANGLE(w400)", %%mm3             \n\t" /* 8(U-128)*/\
417     "pmulhw "MANGLE(yCoeff)", %%mm1          \n\t"\
420     "pmulhw %%mm5, %%mm4                     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
421     "movq %%mm3, %%mm2                       \n\t" /* (U-128)8*/\
422     "pmulhw "MANGLE(ubCoeff)", %%mm3         \n\t"\
423     "psraw $4, %%mm0                         \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
424     "pmulhw "MANGLE(ugCoeff)", %%mm2         \n\t"\
425     "paddw %%mm4, %%mm0                      \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
426     "psubw "MANGLE(w400)", %%mm0             \n\t" /* (V-128)8*/\
429     "movq %%mm0, %%mm4                       \n\t" /* (V-128)8*/\
430     "pmulhw "MANGLE(vrCoeff)", %%mm0         \n\t"\
431     "pmulhw "MANGLE(vgCoeff)", %%mm4         \n\t"\
432     "paddw %%mm1, %%mm3                      \n\t" /* B*/\
433     "paddw %%mm1, %%mm0                      \n\t" /* R*/\
434     "packuswb %%mm3, %%mm3                   \n\t"\
436     "packuswb %%mm0, %%mm0                   \n\t"\
437     "paddw %%mm4, %%mm2                      \n\t"\
438     "paddw %%mm2, %%mm1                      \n\t" /* G*/\
440     "packuswb %%mm1, %%mm1                   \n\t"
443 #define REAL_YSCALEYUV2PACKED(index, c) /* 2-tap vertical blend for packed YUV output: pre-shifts the stored filter coeffs by 3, then interpolates chroma (mm3/mm4) and luma (mm1/mm7) -- NOTE(review): loop label lines missing from this chunk */ \
444     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
445     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
446     "psraw $3, %%mm0                         \n\t" /* reduce coeff precision to match the >>7 used below */\
447     "psraw $3, %%mm1                         \n\t"\
448     "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t" /* write shifted coeffs back so the loop body can reload them */\
449     "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
450     "xor "#index", "#index"                  \n\t"\
453     "movq (%2, "#index"), %%mm2              \n\t" /* uvbuf0[eax]*/\
454     "movq (%3, "#index"), %%mm3              \n\t" /* uvbuf1[eax]*/\
455     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
456     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
457     "psubw %%mm3, %%mm2                      \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
458     "psubw %%mm4, %%mm5                      \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
459     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
460     "pmulhw %%mm0, %%mm2                     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
461     "pmulhw %%mm0, %%mm5                     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
462     "psraw $7, %%mm3                         \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
463     "psraw $7, %%mm4                         \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
464     "paddw %%mm2, %%mm3                      \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
465     "paddw %%mm5, %%mm4                      \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
466     "movq (%0, "#index", 2), %%mm0           \n\t" /*buf0[eax]*/\
467     "movq (%1, "#index", 2), %%mm1           \n\t" /*buf1[eax]*/\
468     "movq 8(%0, "#index", 2), %%mm6          \n\t" /*buf0[eax]*/\
469     "movq 8(%1, "#index", 2), %%mm7          \n\t" /*buf1[eax]*/\
470     "psubw %%mm1, %%mm0                      \n\t" /* buf0[eax] - buf1[eax]*/\
471     "psubw %%mm7, %%mm6                      \n\t" /* buf0[eax] - buf1[eax]*/\
472     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
473     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
474     "psraw $7, %%mm1                         \n\t" /* buf0[eax] - buf1[eax] >>4*/\
475     "psraw $7, %%mm7                         \n\t" /* buf0[eax] - buf1[eax] >>4*/\
476     "paddw %%mm0, %%mm1                      \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
477     "paddw %%mm6, %%mm7                      \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
479 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) /* argument-expanding wrapper */
481 #define REAL_YSCALEYUV2RGB(index, c) /* 2-tap vertical blend + YUV->RGB: interpolate chroma and luma between buf0/buf1, then apply the per-context coefficients; out mm2/mm4/mm5 = B/G/R bytes, mm7=0 -- NOTE(review): loop label lines missing from this chunk */ \
482     "xor "#index", "#index"                  \n\t"\
485     "movq (%2, "#index"), %%mm2              \n\t" /* uvbuf0[eax]*/\
486     "movq (%3, "#index"), %%mm3              \n\t" /* uvbuf1[eax]*/\
487     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
488     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
489     "psubw %%mm3, %%mm2                      \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
490     "psubw %%mm4, %%mm5                      \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
491     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
492     "pmulhw %%mm0, %%mm2                     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
493     "pmulhw %%mm0, %%mm5                     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
494     "psraw $4, %%mm3                         \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
495     "psraw $4, %%mm4                         \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
496     "paddw %%mm2, %%mm3                      \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
497     "paddw %%mm5, %%mm4                      \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
498     "psubw "U_OFFSET"("#c"), %%mm3           \n\t" /* (U-128)8*/\
499     "psubw "V_OFFSET"("#c"), %%mm4           \n\t" /* (V-128)8*/\
500     "movq %%mm3, %%mm2                       \n\t" /* (U-128)8*/\
501     "movq %%mm4, %%mm5                       \n\t" /* (V-128)8*/\
502     "pmulhw "UG_COEFF"("#c"), %%mm3          \n\t"\
503     "pmulhw "VG_COEFF"("#c"), %%mm4          \n\t"\
504 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
505     "movq (%0, "#index", 2), %%mm0           \n\t" /*buf0[eax]*/\
506     "movq (%1, "#index", 2), %%mm1           \n\t" /*buf1[eax]*/\
507     "movq 8(%0, "#index", 2), %%mm6          \n\t" /*buf0[eax]*/\
508     "movq 8(%1, "#index", 2), %%mm7          \n\t" /*buf1[eax]*/\
509     "psubw %%mm1, %%mm0                      \n\t" /* buf0[eax] - buf1[eax]*/\
510     "psubw %%mm7, %%mm6                      \n\t" /* buf0[eax] - buf1[eax]*/\
511     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
512     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
513     "psraw $4, %%mm1                         \n\t" /* buf0[eax] - buf1[eax] >>4*/\
514     "psraw $4, %%mm7                         \n\t" /* buf0[eax] - buf1[eax] >>4*/\
515     "paddw %%mm0, %%mm1                      \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
516     "paddw %%mm6, %%mm7                      \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
517     "pmulhw "UB_COEFF"("#c"), %%mm2          \n\t"\
518     "pmulhw "VR_COEFF"("#c"), %%mm5          \n\t"\
519     "psubw "Y_OFFSET"("#c"), %%mm1           \n\t" /* 8(Y-16)*/\
520     "psubw "Y_OFFSET"("#c"), %%mm7           \n\t" /* 8(Y-16)*/\
521     "pmulhw "Y_COEFF"("#c"), %%mm1           \n\t"\
522     "pmulhw "Y_COEFF"("#c"), %%mm7           \n\t"\
523 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
524     "paddw %%mm3, %%mm4                      \n\t" /* g = ug + vg */\
525     "movq %%mm2, %%mm0                       \n\t"\
526     "movq %%mm5, %%mm6                       \n\t"\
527     "movq %%mm4, %%mm3                       \n\t"\
528     "punpcklwd %%mm2, %%mm2                  \n\t" /* duplicate chroma words across pixel pairs */\
529     "punpcklwd %%mm5, %%mm5                  \n\t"\
530     "punpcklwd %%mm4, %%mm4                  \n\t"\
531     "paddw %%mm1, %%mm2                      \n\t"\
532     "paddw %%mm1, %%mm5                      \n\t"\
533     "paddw %%mm1, %%mm4                      \n\t"\
534     "punpckhwd %%mm0, %%mm0                  \n\t"\
535     "punpckhwd %%mm6, %%mm6                  \n\t"\
536     "punpckhwd %%mm3, %%mm3                  \n\t"\
537     "paddw %%mm7, %%mm0                      \n\t"\
538     "paddw %%mm7, %%mm6                      \n\t"\
539     "paddw %%mm7, %%mm3                      \n\t"\
540 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
541     "packuswb %%mm0, %%mm2                   \n\t" /* saturating pack clips to [0,255] */\
542     "packuswb %%mm6, %%mm5                   \n\t"\
543     "packuswb %%mm3, %%mm4                   \n\t"\
544     "pxor %%mm7, %%mm7                       \n\t"
545 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c) /* argument-expanding wrapper */
547 #define REAL_YSCALEYUV2PACKED1(index, c) /* 1-buffer (no vertical blend) packed path: just >>7 the intermediates; mm1/mm7=luma, mm3/mm4=chroma -- NOTE(review): loop label lines missing from this chunk */ \
548     "xor "#index", "#index"                  \n\t"\
551     "movq (%2, "#index"), %%mm3              \n\t" /* uvbuf0[eax]*/\
552     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
553     "psraw $7, %%mm3                         \n\t" \
554     "psraw $7, %%mm4                         \n\t" \
555     "movq (%0, "#index", 2), %%mm1           \n\t" /*buf0[eax]*/\
556     "movq 8(%0, "#index", 2), %%mm7          \n\t" /*buf0[eax]*/\
557     "psraw $7, %%mm1                         \n\t" \
558     "psraw $7, %%mm7                         \n\t" \
560 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) /* argument-expanding wrapper */
562 #define REAL_YSCALEYUV2RGB1(index, c) /* 1-buffer YUV->RGB (no vertical interpolation): same matrix stage as REAL_YSCALEYUV2RGB but reads buf0 only -- NOTE(review): loop label lines missing from this chunk */ \
563     "xor "#index", "#index"                  \n\t"\
566     "movq (%2, "#index"), %%mm3              \n\t" /* uvbuf0[eax]*/\
567     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
568     "psraw $4, %%mm3                         \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
569     "psraw $4, %%mm4                         \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
570     "psubw "U_OFFSET"("#c"), %%mm3           \n\t" /* (U-128)8*/\
571     "psubw "V_OFFSET"("#c"), %%mm4           \n\t" /* (V-128)8*/\
572     "movq %%mm3, %%mm2                       \n\t" /* (U-128)8*/\
573     "movq %%mm4, %%mm5                       \n\t" /* (V-128)8*/\
574     "pmulhw "UG_COEFF"("#c"), %%mm3          \n\t"\
575     "pmulhw "VG_COEFF"("#c"), %%mm4          \n\t"\
576 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
577     "movq (%0, "#index", 2), %%mm1           \n\t" /*buf0[eax]*/\
578     "movq 8(%0, "#index", 2), %%mm7          \n\t" /*buf0[eax]*/\
579     "psraw $4, %%mm1                         \n\t" /* buf0[eax] - buf1[eax] >>4*/\
580     "psraw $4, %%mm7                         \n\t" /* buf0[eax] - buf1[eax] >>4*/\
581     "pmulhw "UB_COEFF"("#c"), %%mm2          \n\t"\
582     "pmulhw "VR_COEFF"("#c"), %%mm5          \n\t"\
583     "psubw "Y_OFFSET"("#c"), %%mm1           \n\t" /* 8(Y-16)*/\
584     "psubw "Y_OFFSET"("#c"), %%mm7           \n\t" /* 8(Y-16)*/\
585     "pmulhw "Y_COEFF"("#c"), %%mm1           \n\t"\
586     "pmulhw "Y_COEFF"("#c"), %%mm7           \n\t"\
587 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
588     "paddw %%mm3, %%mm4                      \n\t" /* g = ug + vg */\
589     "movq %%mm2, %%mm0                       \n\t"\
590     "movq %%mm5, %%mm6                       \n\t"\
591     "movq %%mm4, %%mm3                       \n\t"\
592     "punpcklwd %%mm2, %%mm2                  \n\t"\
593     "punpcklwd %%mm5, %%mm5                  \n\t"\
594     "punpcklwd %%mm4, %%mm4                  \n\t"\
595     "paddw %%mm1, %%mm2                      \n\t"\
596     "paddw %%mm1, %%mm5                      \n\t"\
597     "paddw %%mm1, %%mm4                      \n\t"\
598     "punpckhwd %%mm0, %%mm0                  \n\t"\
599     "punpckhwd %%mm6, %%mm6                  \n\t"\
600     "punpckhwd %%mm3, %%mm3                  \n\t"\
601     "paddw %%mm7, %%mm0                      \n\t"\
602     "paddw %%mm7, %%mm6                      \n\t"\
603     "paddw %%mm7, %%mm3                      \n\t"\
604 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
605     "packuswb %%mm0, %%mm2                   \n\t"\
606     "packuswb %%mm6, %%mm5                   \n\t"\
607     "packuswb %%mm3, %%mm4                   \n\t"\
608     "pxor %%mm7, %%mm7                       \n\t"
609 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) /* argument-expanding wrapper */
611 #define REAL_YSCALEYUV2PACKED1b(index, c) /* packed path averaging the two chroma buffers ((uvbuf0+uvbuf1)>>1 via the >>8 on the summed 16-bit values) -- NOTE(review): loop label lines missing from this chunk */ \
612     "xor "#index", "#index"                  \n\t"\
615     "movq (%2, "#index"), %%mm2              \n\t" /* uvbuf0[eax]*/\
616     "movq (%3, "#index"), %%mm3              \n\t" /* uvbuf1[eax]*/\
617     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
618     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
619     "paddw %%mm2, %%mm3                      \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
620     "paddw %%mm5, %%mm4                      \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
621     "psrlw $8, %%mm3                         \n\t" \
622     "psrlw $8, %%mm4                         \n\t" \
623     "movq (%0, "#index", 2), %%mm1           \n\t" /*buf0[eax]*/\
624     "movq 8(%0, "#index", 2), %%mm7          \n\t" /*buf0[eax]*/\
625     "psraw $7, %%mm1                         \n\t" \
626     "psraw $7, %%mm7                         \n\t"
627 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) /* argument-expanding wrapper */
629 // do vertical chrominance interpolation
630 #define REAL_YSCALEYUV2RGB1b(index, c) /* YUV->RGB with chroma averaged between uvbuf0/uvbuf1 (sum then >>5); luma from buf0 only -- NOTE(review): loop label lines missing from this chunk */ \
631     "xor "#index", "#index"                  \n\t"\
634     "movq (%2, "#index"), %%mm2              \n\t" /* uvbuf0[eax]*/\
635     "movq (%3, "#index"), %%mm3              \n\t" /* uvbuf1[eax]*/\
636     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
637     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
638     "paddw %%mm2, %%mm3                      \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
639     "paddw %%mm5, %%mm4                      \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
640     "psrlw $5, %%mm3                         \n\t" /*FIXME might overflow*/\
641     "psrlw $5, %%mm4                         \n\t" /*FIXME might overflow*/\
642     "psubw "U_OFFSET"("#c"), %%mm3           \n\t" /* (U-128)8*/\
643     "psubw "V_OFFSET"("#c"), %%mm4           \n\t" /* (V-128)8*/\
644     "movq %%mm3, %%mm2                       \n\t" /* (U-128)8*/\
645     "movq %%mm4, %%mm5                       \n\t" /* (V-128)8*/\
646     "pmulhw "UG_COEFF"("#c"), %%mm3          \n\t"\
647     "pmulhw "VG_COEFF"("#c"), %%mm4          \n\t"\
648 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
649     "movq (%0, "#index", 2), %%mm1           \n\t" /*buf0[eax]*/\
650     "movq 8(%0, "#index", 2), %%mm7          \n\t" /*buf0[eax]*/\
651     "psraw $4, %%mm1                         \n\t" /* buf0[eax] - buf1[eax] >>4*/\
652     "psraw $4, %%mm7                         \n\t" /* buf0[eax] - buf1[eax] >>4*/\
653     "pmulhw "UB_COEFF"("#c"), %%mm2          \n\t"\
654     "pmulhw "VR_COEFF"("#c"), %%mm5          \n\t"\
655     "psubw "Y_OFFSET"("#c"), %%mm1           \n\t" /* 8(Y-16)*/\
656     "psubw "Y_OFFSET"("#c"), %%mm7           \n\t" /* 8(Y-16)*/\
657     "pmulhw "Y_COEFF"("#c"), %%mm1           \n\t"\
658     "pmulhw "Y_COEFF"("#c"), %%mm7           \n\t"\
659 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
660     "paddw %%mm3, %%mm4                      \n\t" /* g = ug + vg */\
661     "movq %%mm2, %%mm0                       \n\t"\
662     "movq %%mm5, %%mm6                       \n\t"\
663     "movq %%mm4, %%mm3                       \n\t"\
664     "punpcklwd %%mm2, %%mm2                  \n\t"\
665     "punpcklwd %%mm5, %%mm5                  \n\t"\
666     "punpcklwd %%mm4, %%mm4                  \n\t"\
667     "paddw %%mm1, %%mm2                      \n\t"\
668     "paddw %%mm1, %%mm5                      \n\t"\
669     "paddw %%mm1, %%mm4                      \n\t"\
670     "punpckhwd %%mm0, %%mm0                  \n\t"\
671     "punpckhwd %%mm6, %%mm6                  \n\t"\
672     "punpckhwd %%mm3, %%mm3                  \n\t"\
673     "paddw %%mm7, %%mm0                      \n\t"\
674     "paddw %%mm7, %%mm6                      \n\t"\
675     "paddw %%mm7, %%mm3                      \n\t"\
676 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
677     "packuswb %%mm0, %%mm2                   \n\t"\
678     "packuswb %%mm6, %%mm5                   \n\t"\
679     "packuswb %%mm3, %%mm4                   \n\t"\
680     "pxor %%mm7, %%mm7                       \n\t"
681 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) /* argument-expanding wrapper */
683 #define REAL_WRITEBGR32(dst, dstw, index) /* interleave mm2/mm4/mm5 (B/G/R bytes, mm7=0) into 8 BGR0 dwords and store 32 bytes -- NOTE(review): loop label and jump lines missing from this chunk */ \
684 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
685     "movq %%mm2, %%mm1                       \n\t" /* B */\
686     "movq %%mm5, %%mm6                       \n\t" /* R */\
687     "punpcklbw %%mm4, %%mm2                  \n\t" /* GBGBGBGB 0 */\
688     "punpcklbw %%mm7, %%mm5                  \n\t" /* 0R0R0R0R 0 */\
689     "punpckhbw %%mm4, %%mm1                  \n\t" /* GBGBGBGB 2 */\
690     "punpckhbw %%mm7, %%mm6                  \n\t" /* 0R0R0R0R 2 */\
691     "movq %%mm2, %%mm0                       \n\t" /* GBGBGBGB 0 */\
692     "movq %%mm1, %%mm3                       \n\t" /* GBGBGBGB 2 */\
693     "punpcklwd %%mm5, %%mm0                  \n\t" /* 0RGB0RGB 0 */\
694     "punpckhwd %%mm5, %%mm2                  \n\t" /* 0RGB0RGB 1 */\
695     "punpcklwd %%mm6, %%mm1                  \n\t" /* 0RGB0RGB 2 */\
696     "punpckhwd %%mm6, %%mm3                  \n\t" /* 0RGB0RGB 3 */\
698     MOVNTQ(%%mm0, (dst, index, 4))\
699     MOVNTQ(%%mm2, 8(dst, index, 4))\
700     MOVNTQ(%%mm1, 16(dst, index, 4))\
701     MOVNTQ(%%mm3, 24(dst, index, 4))\
703     "add $8, "#index"                        \n\t"\
704     "cmp "#dstw", "#index"                   \n\t"\
706 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index) /* argument-expanding wrapper */
708 #define REAL_WRITERGB16(dst, dstw, index) /* pack mm2/mm4/mm5 (B/G/R bytes) into 5-6-5 words: mask to top 5/6 bits, shift into place, OR, store 16 bytes -- NOTE(review): loop label and jump lines missing from this chunk */ \
709     "pand "MANGLE(bF8)", %%mm2               \n\t" /* B */\
710     "pand "MANGLE(bFC)", %%mm4               \n\t" /* G */\
711     "pand "MANGLE(bF8)", %%mm5               \n\t" /* R */\
712     "psrlq $3, %%mm2                         \n\t"\
714     "movq %%mm2, %%mm1                       \n\t"\
715     "movq %%mm4, %%mm3                       \n\t"\
717     "punpcklbw %%mm7, %%mm3                  \n\t"\
718     "punpcklbw %%mm5, %%mm2                  \n\t"\
719     "punpckhbw %%mm7, %%mm4                  \n\t"\
720     "punpckhbw %%mm5, %%mm1                  \n\t"\
722     "psllq $3, %%mm3                         \n\t"\
723     "psllq $3, %%mm4                         \n\t"\
725     "por %%mm3, %%mm2                        \n\t"\
726     "por %%mm4, %%mm1                        \n\t"\
728     MOVNTQ(%%mm2, (dst, index, 2))\
729     MOVNTQ(%%mm1, 8(dst, index, 2))\
731     "add $8, "#index"                        \n\t"\
732     "cmp "#dstw", "#index"                   \n\t"\
734 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) /* argument-expanding wrapper */
736 #define REAL_WRITERGB15(dst, dstw, index) /* same as WRITERGB16 but 5-5-5: all channels masked to 5 bits (note extra "psrlq $1" on R) -- NOTE(review): loop label and jump lines missing from this chunk */ \
737     "pand "MANGLE(bF8)", %%mm2               \n\t" /* B */\
738     "pand "MANGLE(bF8)", %%mm4               \n\t" /* G */\
739     "pand "MANGLE(bF8)", %%mm5               \n\t" /* R */\
740     "psrlq $3, %%mm2                         \n\t"\
741     "psrlq $1, %%mm5                         \n\t"\
743     "movq %%mm2, %%mm1                       \n\t"\
744     "movq %%mm4, %%mm3                       \n\t"\
746     "punpcklbw %%mm7, %%mm3                  \n\t"\
747     "punpcklbw %%mm5, %%mm2                  \n\t"\
748     "punpckhbw %%mm7, %%mm4                  \n\t"\
749     "punpckhbw %%mm5, %%mm1                  \n\t"\
751     "psllq $2, %%mm3                         \n\t"\
752     "psllq $2, %%mm4                         \n\t"\
754     "por %%mm3, %%mm2                        \n\t"\
755     "por %%mm4, %%mm1                        \n\t"\
757     MOVNTQ(%%mm2, (dst, index, 2))\
758     MOVNTQ(%%mm1, 8(dst, index, 2))\
760     "add $8, "#index"                        \n\t"\
761     "cmp "#dstw", "#index"                   \n\t"\
763 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) /* argument-expanding wrapper */
765 #define WRITEBGR24OLD(dst, dstw, index) /* legacy 24-bit writer: expand to 0RGB dwords, then shift/mask/OR them into three packed 8-byte RGB groups -- NOTE(review): loop label and jump lines missing from this chunk */ \
766 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
767     "movq %%mm2, %%mm1                       \n\t" /* B */\
768     "movq %%mm5, %%mm6                       \n\t" /* R */\
769     "punpcklbw %%mm4, %%mm2                  \n\t" /* GBGBGBGB 0 */\
770     "punpcklbw %%mm7, %%mm5                  \n\t" /* 0R0R0R0R 0 */\
771     "punpckhbw %%mm4, %%mm1                  \n\t" /* GBGBGBGB 2 */\
772     "punpckhbw %%mm7, %%mm6                  \n\t" /* 0R0R0R0R 2 */\
773     "movq %%mm2, %%mm0                       \n\t" /* GBGBGBGB 0 */\
774     "movq %%mm1, %%mm3                       \n\t" /* GBGBGBGB 2 */\
775     "punpcklwd %%mm5, %%mm0                  \n\t" /* 0RGB0RGB 0 */\
776     "punpckhwd %%mm5, %%mm2                  \n\t" /* 0RGB0RGB 1 */\
777     "punpcklwd %%mm6, %%mm1                  \n\t" /* 0RGB0RGB 2 */\
778     "punpckhwd %%mm6, %%mm3                  \n\t" /* 0RGB0RGB 3 */\
780     "movq %%mm0, %%mm4                       \n\t" /* 0RGB0RGB 0 */\
781     "psrlq $8, %%mm0                         \n\t" /* 00RGB0RG 0 */\
782     "pand "MANGLE(bm00000111)", %%mm4        \n\t" /* 00000RGB 0 */\
783     "pand "MANGLE(bm11111000)", %%mm0        \n\t" /* 00RGB000 0.5 */\
784     "por %%mm4, %%mm0                        \n\t" /* 00RGBRGB 0 */\
785     "movq %%mm2, %%mm4                       \n\t" /* 0RGB0RGB 1 */\
786     "psllq $48, %%mm2                        \n\t" /* GB000000 1 */\
787     "por %%mm2, %%mm0                        \n\t" /* GBRGBRGB 0 */\
789     "movq %%mm4, %%mm2                       \n\t" /* 0RGB0RGB 1 */\
790     "psrld $16, %%mm4                        \n\t" /* 000R000R 1 */\
791     "psrlq $24, %%mm2                        \n\t" /* 0000RGB0 1.5 */\
792     "por %%mm4, %%mm2                        \n\t" /* 000RRGBR 1 */\
793     "pand "MANGLE(bm00001111)", %%mm2        \n\t" /* 0000RGBR 1 */\
794     "movq %%mm1, %%mm4                       \n\t" /* 0RGB0RGB 2 */\
795     "psrlq $8, %%mm1                         \n\t" /* 00RGB0RG 2 */\
796     "pand "MANGLE(bm00000111)", %%mm4        \n\t" /* 00000RGB 2 */\
797     "pand "MANGLE(bm11111000)", %%mm1        \n\t" /* 00RGB000 2.5 */\
798     "por %%mm4, %%mm1                        \n\t" /* 00RGBRGB 2 */\
799     "movq %%mm1, %%mm4                       \n\t" /* 00RGBRGB 2 */\
800     "psllq $32, %%mm1                        \n\t" /* BRGB0000 2 */\
801     "por %%mm1, %%mm2                        \n\t" /* BRGBRGBR 1 */\
803     "psrlq $32, %%mm4                        \n\t" /* 000000RG 2.5 */\
804     "movq %%mm3, %%mm5                       \n\t" /* 0RGB0RGB 3 */\
805     "psrlq $8, %%mm3                         \n\t" /* 00RGB0RG 3 */\
806     "pand "MANGLE(bm00000111)", %%mm5        \n\t" /* 00000RGB 3 */\
807     "pand "MANGLE(bm11111000)", %%mm3        \n\t" /* 00RGB000 3.5 */\
808     "por %%mm5, %%mm3                        \n\t" /* 00RGBRGB 3 */\
809     "psllq $16, %%mm3                        \n\t" /* RGBRGB00 3 */\
810     "por %%mm4, %%mm3                        \n\t" /* RGBRGBRG 2.5 */\
812     MOVNTQ(%%mm0, (dst))\
813     MOVNTQ(%%mm2, 8(dst))\
814     MOVNTQ(%%mm3, 16(dst))\
815     "add $24, "#dst"                         \n\t"\
817     "add $8, "#index"                        \n\t"\
818     "cmp "#dstw", "#index"                   \n\t"\
821 #define WRITEBGR24MMX(dst, dstw, index) /* plain-MMX 24-bit writer: build 0RGBRGB0 qwords via psllq/punpckhdq, then shift+OR into three 8-byte stores -- NOTE(review): loop label and jump lines missing from this chunk */ \
822 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
823     "movq %%mm2, %%mm1                       \n\t" /* B */\
824     "movq %%mm5, %%mm6                       \n\t" /* R */\
825     "punpcklbw %%mm4, %%mm2                  \n\t" /* GBGBGBGB 0 */\
826     "punpcklbw %%mm7, %%mm5                  \n\t" /* 0R0R0R0R 0 */\
827     "punpckhbw %%mm4, %%mm1                  \n\t" /* GBGBGBGB 2 */\
828     "punpckhbw %%mm7, %%mm6                  \n\t" /* 0R0R0R0R 2 */\
829     "movq %%mm2, %%mm0                       \n\t" /* GBGBGBGB 0 */\
830     "movq %%mm1, %%mm3                       \n\t" /* GBGBGBGB 2 */\
831     "punpcklwd %%mm5, %%mm0                  \n\t" /* 0RGB0RGB 0 */\
832     "punpckhwd %%mm5, %%mm2                  \n\t" /* 0RGB0RGB 1 */\
833     "punpcklwd %%mm6, %%mm1                  \n\t" /* 0RGB0RGB 2 */\
834     "punpckhwd %%mm6, %%mm3                  \n\t" /* 0RGB0RGB 3 */\
836     "movq %%mm0, %%mm4                       \n\t" /* 0RGB0RGB 0 */\
837     "movq %%mm2, %%mm6                       \n\t" /* 0RGB0RGB 1 */\
838     "movq %%mm1, %%mm5                       \n\t" /* 0RGB0RGB 2 */\
839     "movq %%mm3, %%mm7                       \n\t" /* 0RGB0RGB 3 */\
841     "psllq $40, %%mm0                        \n\t" /* RGB00000 0 */\
842     "psllq $40, %%mm2                        \n\t" /* RGB00000 1 */\
843     "psllq $40, %%mm1                        \n\t" /* RGB00000 2 */\
844     "psllq $40, %%mm3                        \n\t" /* RGB00000 3 */\
846     "punpckhdq %%mm4, %%mm0                  \n\t" /* 0RGBRGB0 0 */\
847     "punpckhdq %%mm6, %%mm2                  \n\t" /* 0RGBRGB0 1 */\
848     "punpckhdq %%mm5, %%mm1                  \n\t" /* 0RGBRGB0 2 */\
849     "punpckhdq %%mm7, %%mm3                  \n\t" /* 0RGBRGB0 3 */\
851     "psrlq $8, %%mm0                         \n\t" /* 00RGBRGB 0 */\
852     "movq %%mm2, %%mm6                       \n\t" /* 0RGBRGB0 1 */\
853     "psllq $40, %%mm2                        \n\t" /* GB000000 1 */\
854     "por %%mm2, %%mm0                        \n\t" /* GBRGBRGB 0 */\
855     MOVNTQ(%%mm0, (dst))\
857     "psrlq $24, %%mm6                        \n\t" /* 0000RGBR 1 */\
858     "movq %%mm1, %%mm5                       \n\t" /* 0RGBRGB0 2 */\
859     "psllq $24, %%mm1                        \n\t" /* BRGB0000 2 */\
860     "por %%mm1, %%mm6                        \n\t" /* BRGBRGBR 1 */\
861     MOVNTQ(%%mm6, 8(dst))\
863     "psrlq $40, %%mm5                        \n\t" /* 000000RG 2 */\
864     "psllq $8, %%mm3                         \n\t" /* RGBRGB00 3 */\
865     "por %%mm3, %%mm5                        \n\t" /* RGBRGBRG 2 */\
866     MOVNTQ(%%mm5, 16(dst))\
868     "add $24, "#dst"                         \n\t"\
870     "add $8, "#index"                        \n\t"\
871     "cmp "#dstw", "#index"                   \n\t"\
/* WRITEBGR24MMX2: same contract as WRITEBGR24MMX (pack B=%%mm2, G=%%mm4,
 * R=%%mm5, %%mm7=0 into 24 bytes of 24-bit BGR at dst) but uses pshufw and
 * the ff_M24A/B/C byte masks, which is only available with MMX2.
 * NOTE(review): loop-branch tail not visible in this chunk. */
874 #define WRITEBGR24MMX2(dst, dstw, index) \
875     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
876     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
877     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
878     "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
879     "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
880     "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
882     "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
883     "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
884     "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
886     "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
887     "por    %%mm1, %%mm6        \n\t"\
888     "por    %%mm3, %%mm6        \n\t"\
889     MOVNTQ(%%mm6, (dst))\
891     "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
892     "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
893     "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
894     "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
896     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3 */\
897     "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
898     "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
900     "por    %%mm1, %%mm3        \n\t" /* B5 G4 B4     G3 B3    G3 */\
901     "por    %%mm3, %%mm6        \n\t"\
902     MOVNTQ(%%mm6, 8(dst))\
904     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
905     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
906     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
908     "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
909     "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
910     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5 */\
912     "por    %%mm1, %%mm3        \n\t"\
913     "por    %%mm3, %%mm6        \n\t"\
914     MOVNTQ(%%mm6, 16(dst))\
916     "add      $24, "#dst"       \n\t"\
918     "add       $8, "#index"     \n\t"\
919     "cmp  "#dstw", "#index"     \n\t"\
/* WRITEBGR24 selects the MMX2 or plain-MMX 24-bit store macro.
 * NOTE(review): the #ifdef HAVE_MMX2 / #else guards that make these two
 * definitions mutually exclusive are elided from this chunk. */
924 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
927 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
/* REAL_WRITEYUY2: pack Y (%%mm1/%%mm7, word-sized), U (%%mm3) and V (%%mm4)
 * into interleaved YUYV and store 16 bytes at dst + index*2 via MOVNTQ;
 * advances index by 8. The REAL_/plain pair exists so that the macro
 * arguments get fully expanded before stringification. */
930 #define REAL_WRITEYUY2(dst, dstw, index) \
931     "packuswb  %%mm3, %%mm3     \n\t"\
932     "packuswb  %%mm4, %%mm4     \n\t"\
933     "packuswb  %%mm7, %%mm1     \n\t"\
934     "punpcklbw %%mm4, %%mm3     \n\t"\
935     "movq      %%mm1, %%mm7     \n\t"\
936     "punpcklbw %%mm3, %%mm1     \n\t"\
937     "punpckhbw %%mm3, %%mm7     \n\t"\
939     MOVNTQ(%%mm1, (dst, index, 2))\
940     MOVNTQ(%%mm7, 8(dst, index, 2))\
942     "add          $8, "#index"  \n\t"\
943     "cmp     "#dstw", "#index"  \n\t"\
945 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
/* Vertical multi-tap scaling to planar YV12: applies lumFilter/chrFilter to
 * the source line buffers and writes the luma plane to dest and the chroma
 * planes to uDest/vDest. Uses the MMX YSCALEYUV2YV12X(_ACCURATE) kernels
 * unless SWS_BITEXACT is set, an AltiVec path when available, and otherwise
 * falls back to the portable yuv2yuvXinC().
 * NOTE(review): the #ifdef HAVE_MMX/HAVE_ALTIVEC guards and several closing
 * braces are elided from this chunk. */
948 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
949                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
950                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
953     if(!(c->flags & SWS_BITEXACT)){
954         if (c->flags & SWS_ACCURATE_RND){
956                 YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
957                 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
960             YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
963                 YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
964                 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
967             YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
973     yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
974                           chrFilter, chrSrc, chrFilterSize,
975                           dest, uDest, vDest, dstW, chrDstW);
977     yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
978                 chrFilter, chrSrc, chrFilterSize,
979                 dest, uDest, vDest, dstW, chrDstW);
980 #endif //!HAVE_ALTIVEC
/* Vertical multi-tap scaling to NV12/NV21: no SIMD variant exists, so this
 * simply forwards to the portable yuv2nv12XinC() (chroma is interleaved in
 * uDest; dstFormat selects the U/V ordering). */
983 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
984                                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
985                                      uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
987     yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
988                  chrFilter, chrSrc, chrFilterSize,
989                  dest, uDest, dstW, chrDstW, dstFormat);
/* 1-tap (unscaled vertical) conversion to planar YV12: rounds the 15-bit
 * intermediate samples back to 8 bits. MMX path iterates the Y/U/V planes
 * via the src[]/dst[]/counter[] tables (p==1 skips chroma when uDest is
 * NULL); the C fallback computes (v+64)>>7 per sample with clipping.
 * NOTE(review): loop heads, asm statement wrappers and the clip branches
 * for <0 are elided from this chunk. */
992 static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
993                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
997     if(!(c->flags & SWS_BITEXACT)){
998         long p= uDest ? 3 : 1;
999         uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
1000         uint8_t *dst[3]= {dest, uDest, vDest};
1001         long counter[3] = {dstW, chrDstW, chrDstW};
1003         if (c->flags & SWS_ACCURATE_RND){
1006                     YSCALEYUV2YV121_ACCURATE
1007                     :: "r" (src[p]), "r" (dst[p] + counter[p]),
1016                     :: "r" (src[p]), "r" (dst[p] + counter[p]),
1025     for (i=0; i<dstW; i++)
1027         int val= (lumSrc[i]+64)>>7;
1038         for (i=0; i<chrDstW; i++)
1040             int u=(chrSrc[i       ]+64)>>7;
1041             int v=(chrSrc[i + VOFW]+64)>>7;
1045             else if (u>255) u=255;
1047             else if (v>255) v=255;
1057 * vertical scale YV12 to RGB
/* Vertical multi-tap scaling directly to packed RGB/YUYV output.
 * Dispatches on c->dstFormat: MMX kernels (accurate-rounding variants when
 * SWS_ACCURATE_RND is set) for RGB32/BGR24/RGB555/RGB565/YUYV422, with
 * ordered dither added for the 15/16-bit formats; AltiVec for the formats
 * altivec_yuv2packedX() supports; portable yuv2packedXinC() otherwise.
 * NOTE(review): case labels for RGB32/BGR24, asm statement wrappers and
 * several YSCALEYUV2PACKEDX invocations are elided from this chunk. */
1059 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1060                                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1061                                        uint8_t *dest, long dstW, long dstY)
1065     if(!(c->flags & SWS_BITEXACT)){
1066         if (c->flags & SWS_ACCURATE_RND){
1067             switch(c->dstFormat){
1069                 YSCALEYUV2PACKEDX_ACCURATE
1071                 WRITEBGR32(%4, %5, %%REGa)
1073                 YSCALEYUV2PACKEDX_END
1076                 YSCALEYUV2PACKEDX_ACCURATE
1078                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1079                 "add %4, %%"REG_c"                        \n\t"
1080                 WRITEBGR24(%%REGc, %5, %%REGa)
1083                 :: "r" (&c->redDither),
1084                 "m" (dummy), "m" (dummy), "m" (dummy),
1085                 "r" (dest), "m" (dstW)
1086                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1089             case PIX_FMT_RGB555:
1090                 YSCALEYUV2PACKEDX_ACCURATE
1092                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1094                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1095                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1096                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1099                 WRITERGB15(%4, %5, %%REGa)
1100                 YSCALEYUV2PACKEDX_END
1102             case PIX_FMT_RGB565:
1103                 YSCALEYUV2PACKEDX_ACCURATE
1105                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1107                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1108                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1109                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1112                 WRITERGB16(%4, %5, %%REGa)
1113                 YSCALEYUV2PACKEDX_END
1115             case PIX_FMT_YUYV422:
1116                 YSCALEYUV2PACKEDX_ACCURATE
1117                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1119                 "psraw $3, %%mm3    \n\t"
1120                 "psraw $3, %%mm4    \n\t"
1121                 "psraw $3, %%mm1    \n\t"
1122                 "psraw $3, %%mm7    \n\t"
1123                 WRITEYUY2(%4, %5, %%REGa)
1124                 YSCALEYUV2PACKEDX_END
1128             switch(c->dstFormat)
1133                 WRITEBGR32(%4, %5, %%REGa)
1134                 YSCALEYUV2PACKEDX_END
1139                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1140                 "add %4, %%"REG_c"                        \n\t"
1141                 WRITEBGR24(%%REGc, %5, %%REGa)
1143                 :: "r" (&c->redDither),
1144                 "m" (dummy), "m" (dummy), "m" (dummy),
1145                 "r" (dest), "m" (dstW)
1146                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1149             case PIX_FMT_RGB555:
1152                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1154                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1155                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1156                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1159                 WRITERGB15(%4, %5, %%REGa)
1160                 YSCALEYUV2PACKEDX_END
1162             case PIX_FMT_RGB565:
1165                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1167                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1168                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1169                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1172                 WRITERGB16(%4, %5, %%REGa)
1173                 YSCALEYUV2PACKEDX_END
1175             case PIX_FMT_YUYV422:
1177                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1179                 "psraw $3, %%mm3    \n\t"
1180                 "psraw $3, %%mm4    \n\t"
1181                 "psraw $3, %%mm1    \n\t"
1182                 "psraw $3, %%mm7    \n\t"
1183                 WRITEYUY2(%4, %5, %%REGa)
1184                 YSCALEYUV2PACKEDX_END
1189 #endif /* HAVE_MMX */
1191     /* The following list of supported dstFormat values should
1192        match what's found in the body of altivec_yuv2packedX() */
1193     if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1194         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1195         c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
1196             altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1197                     chrFilter, chrSrc, chrFilterSize,
1201     yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1202                    chrFilter, chrSrc, chrFilterSize,
1207 * vertical bilinear scale YV12 to RGB
/* Vertical bilinear (2-tap) scaling to packed output: blends buf0/buf1 with
 * weights yalpha1/yalpha (and uvbuf0/uvbuf1 with uvalpha1/uvalpha, weights
 * summing to 4096) and converts to the destination packed format.
 * Contains a SWS_FULL_CHR_H_INT MMX path for RGB32/BGR24/BGR555/BGR565, an
 * apparently disabled (presumably #if 0) scalar section using the old
 * yuvtab_*/clip_table lookup tables, the main MMX switch using the
 * YSCALEYUV2RGB/YSCALEYUV2PACKED kernels (8280 == DSTW_OFFSET), and a final
 * YSCALE_YUV_2_ANYRGB_C portable fallback.
 * NOTE(review): asm statement wrappers, case labels, #if 0/#endif guards and
 * many closing braces are elided from this chunk — verify structure against
 * the full file before editing. */
1209 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1210                           uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1212     int  yalpha1=4095- yalpha;
1213     int uvalpha1=4095-uvalpha;
1217     if (flags&SWS_FULL_CHR_H_INT)
1227             "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1228             "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1230             "movq      %%mm3, %%mm1 \n\t"
1231             "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1232             "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1234             MOVNTQ(%%mm3,  (%4, %%REGa, 4))
1235             MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1237             "add $4, %%"REG_a"  \n\t"
1238             "cmp %5, %%"REG_a"  \n\t"
1241             :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1242                "m" (yalpha1), "m" (uvalpha1)
1252             "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1253             "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1255             "movq      %%mm3, %%mm1 \n\t"
1256             "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1257             "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1259             "movq      %%mm3, %%mm2 \n\t" // BGR0BGR0
1260             "psrlq        $8, %%mm3 \n\t" // GR0BGR00
1261             "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1262             "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1263             "por       %%mm2, %%mm3 \n\t" // BGRBGR00
1264             "movq      %%mm1, %%mm2 \n\t"
1265             "psllq       $48, %%mm1 \n\t" // 000000BG
1266             "por       %%mm1, %%mm3 \n\t" // BGRBGRBG
1268             "movq      %%mm2, %%mm1 \n\t" // BGR0BGR0
1269             "psrld       $16, %%mm2 \n\t" // R000R000
1270             "psrlq       $24, %%mm1 \n\t" // 0BGR0000
1271             "por       %%mm2, %%mm1 \n\t" // RBGRR000
1273             "mov          %4, %%"REG_b"         \n\t"
1274             "add   %%"REG_a", %%"REG_b"         \n\t"
1278             "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2) \n\t"
1279             "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1281             "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)  \n\t"
1282             "psrlq $32, %%mm3                   \n\t"
1283             "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)  \n\t"
1284             "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)  \n\t"
1286             "add $4, %%"REG_a"  \n\t"
1287             "cmp %5, %%"REG_a"  \n\t"
1290             :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1291                "m" (yalpha1), "m" (uvalpha1)
1292             : "%"REG_a, "%"REG_b
1295         case PIX_FMT_BGR555:
1300             "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1301             "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1302             "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1304             "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1305             "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1306             "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1308             "psrlw $3, %%mm3    \n\t"
1309             "psllw $2, %%mm1    \n\t"
1310             "psllw $7, %%mm0    \n\t"
1311             "pand "MANGLE(g15Mask)", %%mm1  \n\t"
1312             "pand "MANGLE(r15Mask)", %%mm0  \n\t"
1314             "por %%mm3, %%mm1   \n\t"
1315             "por %%mm1, %%mm0   \n\t"
1317             MOVNTQ(%%mm0, (%4, %%REGa, 2))
1319             "add $4, %%"REG_a"  \n\t"
1320             "cmp %5, %%"REG_a"  \n\t"
1323             :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1324                "m" (yalpha1), "m" (uvalpha1)
1328         case PIX_FMT_BGR565:
1333             "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1334             "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1335             "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1337             "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1338             "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1339             "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1341             "psrlw $3, %%mm3    \n\t"
1342             "psllw $3, %%mm1    \n\t"
1343             "psllw $8, %%mm0    \n\t"
1344             "pand "MANGLE(g16Mask)", %%mm1  \n\t"
1345             "pand "MANGLE(r16Mask)", %%mm0  \n\t"
1347             "por %%mm3, %%mm1   \n\t"
1348             "por %%mm1, %%mm0   \n\t"
1350             MOVNTQ(%%mm0, (%4, %%REGa, 2))
1352             "add $4, %%"REG_a"  \n\t"
1353             "cmp %5, %%"REG_a"  \n\t"
1356             :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1357                "m" (yalpha1), "m" (uvalpha1)
1361 #endif /* HAVE_MMX */
1366     if (dstFormat==PIX_FMT_RGB32)
1369 #ifdef WORDS_BIGENDIAN
1372         for (i=0;i<dstW;i++){
1373             // vertical linear interpolation && yuv2rgb in a single step:
1374             int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1375             int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1376             int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1377             dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1378             dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1379             dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1383     else if (dstFormat==PIX_FMT_BGR24)
1386         for (i=0;i<dstW;i++){
1387             // vertical linear interpolation && yuv2rgb in a single step:
1388             int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1389             int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1390             int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1391             dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1392             dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1393             dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1397     else if (dstFormat==PIX_FMT_BGR565)
1400         for (i=0;i<dstW;i++){
1401             // vertical linear interpolation && yuv2rgb in a single step:
1402             int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1403             int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1404             int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1406             ((uint16_t*)dest)[i] =
1407                 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1408                 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1409                 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1412     else if (dstFormat==PIX_FMT_BGR555)
1415         for (i=0;i<dstW;i++){
1416             // vertical linear interpolation && yuv2rgb in a single step:
1417             int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1418             int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1419             int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1421             ((uint16_t*)dest)[i] =
1422                 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1423                 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1424                 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1432     if(!(c->flags & SWS_BITEXACT)){
1433         switch(c->dstFormat)
1435         //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1438             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1439             "mov        %4, %%"REG_b"               \n\t"
1440             "push %%"REG_BP"                        \n\t"
1441             YSCALEYUV2RGB(%%REGBP, %5)
1442             WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1443             "pop %%"REG_BP"                         \n\t"
1444             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1446             :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1452             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1453             "mov        %4, %%"REG_b"               \n\t"
1454             "push %%"REG_BP"                        \n\t"
1455             YSCALEYUV2RGB(%%REGBP, %5)
1456             WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1457             "pop %%"REG_BP"                         \n\t"
1458             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1459             :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1463         case PIX_FMT_RGB555:
1465             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1466             "mov        %4, %%"REG_b"               \n\t"
1467             "push %%"REG_BP"                        \n\t"
1468             YSCALEYUV2RGB(%%REGBP, %5)
1469             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1471             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1472             "paddusb "GREEN_DITHER"(%5), %%mm4    \n\t"
1473             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1476             WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1477             "pop %%"REG_BP"                         \n\t"
1478             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1480             :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1484         case PIX_FMT_RGB565:
1486             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1487             "mov        %4, %%"REG_b"               \n\t"
1488             "push %%"REG_BP"                        \n\t"
1489             YSCALEYUV2RGB(%%REGBP, %5)
1490             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1492             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1493             "paddusb "GREEN_DITHER"(%5), %%mm4    \n\t"
1494             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1497             WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1498             "pop %%"REG_BP"                         \n\t"
1499             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1500             :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1504         case PIX_FMT_YUYV422:
1506             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1507             "mov %4, %%"REG_b"                      \n\t"
1508             "push %%"REG_BP"                        \n\t"
1509             YSCALEYUV2PACKED(%%REGBP, %5)
1510             WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1511             "pop %%"REG_BP"                         \n\t"
1512             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1513             :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1521     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1525 * YV12 to RGB without scaling or interpolating
/* Unscaled (1-tap) conversion of a single YV12 line to packed output.
 * When uvalpha < 2048 it uses the single-chroma-line YSCALEYUV2RGB1/
 * YSCALEYUV2PACKED1 kernels (shifting chroma by 0.5 pixels, accepted for
 * speed); otherwise the "1b" kernels that average uvbuf0/uvbuf1. Full
 * horizontal chroma interpolation delegates to yuv2packed2 with yalpha=0.
 * 8280 == DSTW_OFFSET (the preprocessor can't use the symbol here).
 * NOTE(review): case labels, asm statement wrappers and the trailing C
 * fallback selection are partially elided from this chunk. */
1527 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1528                                   uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1530     const int yalpha1=0;
1533     uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1534     const int yalpha= 4096; //FIXME ...
1536     if (flags&SWS_FULL_CHR_H_INT)
1538         RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1543     if(!(flags & SWS_BITEXACT)){
1544         if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1550                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1551                 "mov        %4, %%"REG_b"               \n\t"
1552                 "push %%"REG_BP"                        \n\t"
1553                 YSCALEYUV2RGB1(%%REGBP, %5)
1554                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1555                 "pop %%"REG_BP"                         \n\t"
1556                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1558                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1564                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1565                 "mov        %4, %%"REG_b"               \n\t"
1566                 "push %%"REG_BP"                        \n\t"
1567                 YSCALEYUV2RGB1(%%REGBP, %5)
1568                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1569                 "pop %%"REG_BP"                         \n\t"
1570                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1572                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1576             case PIX_FMT_RGB555:
1578                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1579                 "mov        %4, %%"REG_b"               \n\t"
1580                 "push %%"REG_BP"                        \n\t"
1581                 YSCALEYUV2RGB1(%%REGBP, %5)
1582                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1584                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1585                 "paddusb "GREEN_DITHER"(%5), %%mm4    \n\t"
1586                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1588                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1589                 "pop %%"REG_BP"                         \n\t"
1590                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1592                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1596             case PIX_FMT_RGB565:
1598                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1599                 "mov        %4, %%"REG_b"               \n\t"
1600                 "push %%"REG_BP"                        \n\t"
1601                 YSCALEYUV2RGB1(%%REGBP, %5)
1602                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1604                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1605                 "paddusb "GREEN_DITHER"(%5), %%mm4    \n\t"
1606                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1609                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1610                 "pop %%"REG_BP"                         \n\t"
1611                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1613                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1617             case PIX_FMT_YUYV422:
1619                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1620                 "mov        %4, %%"REG_b"               \n\t"
1621                 "push %%"REG_BP"                        \n\t"
1622                 YSCALEYUV2PACKED1(%%REGBP, %5)
1623                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1624                 "pop %%"REG_BP"                         \n\t"
1625                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1627                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1639                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1640                 "mov        %4, %%"REG_b"               \n\t"
1641                 "push %%"REG_BP"                        \n\t"
1642                 YSCALEYUV2RGB1b(%%REGBP, %5)
1643                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1644                 "pop %%"REG_BP"                         \n\t"
1645                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1647                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1653                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1654                 "mov        %4, %%"REG_b"               \n\t"
1655                 "push %%"REG_BP"                        \n\t"
1656                 YSCALEYUV2RGB1b(%%REGBP, %5)
1657                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1658                 "pop %%"REG_BP"                         \n\t"
1659                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1661                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1665             case PIX_FMT_RGB555:
1667                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1668                 "mov        %4, %%"REG_b"               \n\t"
1669                 "push %%"REG_BP"                        \n\t"
1670                 YSCALEYUV2RGB1b(%%REGBP, %5)
1671                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1673                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1674                 "paddusb "GREEN_DITHER"(%5), %%mm4    \n\t"
1675                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1677                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1678                 "pop %%"REG_BP"                         \n\t"
1679                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1681                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1685             case PIX_FMT_RGB565:
1687                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1688                 "mov        %4, %%"REG_b"               \n\t"
1689                 "push %%"REG_BP"                        \n\t"
1690                 YSCALEYUV2RGB1b(%%REGBP, %5)
1691                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1693                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1694                 "paddusb "GREEN_DITHER"(%5), %%mm4    \n\t"
1695                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1698                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1699                 "pop %%"REG_BP"                         \n\t"
1700                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1702                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1706             case PIX_FMT_YUYV422:
1708                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1709                 "mov        %4, %%"REG_b"               \n\t"
1710                 "push %%"REG_BP"                        \n\t"
1711                 YSCALEYUV2PACKED1b(%%REGBP, %5)
1712                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1713                 "pop %%"REG_BP"                         \n\t"
1714                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1716                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1723 #endif /* HAVE_MMX */
1726         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1728         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1732 //FIXME yuy2* can read up to 7 samples too much
/* Extract the luma plane from YUYV input: MMX path masks out every second
 * byte (bm01010101 keeps the even Y bytes), packs and stores 8 Y per
 * iteration, looping a negative index up to zero; scalar fallback copies
 * src[2*i]. NOTE(review): asm wrapper, loop branch and the scalar loop body
 * are elided from this chunk. */
1734 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1738     "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1739     "mov                    %0, %%"REG_a"       \n\t"
1741     "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1742     "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1743     "pand                %%mm2, %%mm0           \n\t"
1744     "pand                %%mm2, %%mm1           \n\t"
1745     "packuswb            %%mm1, %%mm0           \n\t"
1746     "movq                %%mm0, (%2, %%"REG_a") \n\t"
1747     "add                    $8, %%"REG_a"       \n\t"
1749     : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1754     for (i=0; i<width; i++)
/* Extract U and V planes from YUYV input (chroma sits in the odd bytes):
 * MMX path shifts chroma bytes down, deinterleaves U (even chroma) from V
 * (odd chroma) using bm01010101, and stores 4 of each per iteration;
 * scalar fallback reads src1[4*i+1] / src1[4*i+3]. src1 must equal src2
 * (asserted). NOTE(review): asm wrapper and loop branch are elided. */
1759 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1763     "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1764     "mov                    %0, %%"REG_a"       \n\t"
1766     "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1767     "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1768     "psrlw                  $8, %%mm0           \n\t"
1769     "psrlw                  $8, %%mm1           \n\t"
1770     "packuswb            %%mm1, %%mm0           \n\t"
1771     "movq                %%mm0, %%mm1           \n\t"
1772     "psrlw                  $8, %%mm0           \n\t"
1773     "pand                %%mm4, %%mm1           \n\t"
1774     "packuswb            %%mm0, %%mm0           \n\t"
1775     "packuswb            %%mm1, %%mm1           \n\t"
1776     "movd                %%mm0, (%3, %%"REG_a") \n\t"
1777     "movd                %%mm1, (%2, %%"REG_a") \n\t"
1778     "add                    $4, %%"REG_a"       \n\t"
1780     : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1785     for (i=0; i<width; i++)
1787         dstU[i]= src1[4*i + 1];
1788         dstV[i]= src1[4*i + 3];
1791     assert(src1 == src2);
1794 /* This is almost identical to the previous, and exists only because
1795  * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extract the luma plane from UYVY input (Y sits in the odd bytes, so a
 * right shift by 8 selects it instead of the mask used for YUYV).
 * NOTE(review): asm wrapper, loop branch and the scalar loop body are
 * elided from this chunk. */
1796 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1800     "mov                  %0, %%"REG_a"         \n\t"
1802     "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1803     "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1804     "psrlw                $8, %%mm0             \n\t"
1805     "psrlw                $8, %%mm1             \n\t"
1806     "packuswb          %%mm1, %%mm0             \n\t"
1807     "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1808     "add                  $8, %%"REG_a"         \n\t"
1810     : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1815     for (i=0; i<width; i++)
/* Extract U and V planes from UYVY input (chroma sits in the even bytes,
 * selected with the bm01010101 mask rather than a shift); scalar fallback
 * reads src1[4*i] / src1[4*i+2]. src1 must equal src2 (asserted).
 * NOTE(review): asm wrapper and loop branch are elided from this chunk. */
1820 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1824     "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1825     "mov                    %0, %%"REG_a"       \n\t"
1827     "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1828     "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1829     "pand                %%mm4, %%mm0           \n\t"
1830     "pand                %%mm4, %%mm1           \n\t"
1831     "packuswb            %%mm1, %%mm0           \n\t"
1832     "movq                %%mm0, %%mm1           \n\t"
1833     "psrlw                  $8, %%mm0           \n\t"
1834     "pand                %%mm4, %%mm1           \n\t"
1835     "packuswb            %%mm0, %%mm0           \n\t"
1836     "packuswb            %%mm1, %%mm1           \n\t"
1837     "movd                %%mm0, (%3, %%"REG_a") \n\t"
1838     "movd                %%mm1, (%2, %%"REG_a") \n\t"
1839     "add                    $4, %%"REG_a"       \n\t"
1841     : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1846     for (i=0; i<width; i++)
1848         dstU[i]= src1[4*i + 0];
1849         dstV[i]= src1[4*i + 2];
1852     assert(src1 == src2);
/* BGR2Y: generates a scalar RGB/BGR -> luma converter. Each pixel of the
 * given integer `type` is unpacked with (shift, mask) per channel, then
 * Y = (RY*r + GY*g + BY*b + rounding) >> S, where 33<<(S-1) folds in the
 * +16 luma offset together with rounding to S fractional bits. The
 * instantiations below cover 32/16/15-bit RGB and BGR layouts. */
1855 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1856 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
1859     for (i=0; i<width; i++)\
1861         int b= (((type*)src)[i]>>shb)&maskb;\
1862         int g= (((type*)src)[i]>>shg)&maskg;\
1863         int r= (((type*)src)[i]>>shr)&maskr;\
1865         dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1869 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1870 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1871 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
1872 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
1873 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1874 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
/* BGR2UV: generates two scalar RGB/BGR -> chroma converters per layout:
 * `name` converts one output sample per input pixel, and `name##_half`
 * averages horizontally adjacent pixel pairs (for 2x chroma subsampling).
 * 257<<(S-1) folds the +128 chroma offset into the rounding term; the
 * _half variant sums two pixels, hence 257<<S and a shift of S+1. The
 * (mask|(2*mask)) trick lets two masked-channel sums share one addition
 * without cross-channel carry. */
1876 #define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1877 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1880     for (i=0; i<width; i++)\
1882         int b= (((type*)src)[i]&maskb)>>shb;\
1883         int g= (((type*)src)[i]&maskg)>>shg;\
1884         int r= (((type*)src)[i]&maskr)>>shr;\
1886         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1887         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1890 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1893     for (i=0; i<width; i++)\
1895         int pix0= ((type*)src)[2*i+0];\
1896         int pix1= ((type*)src)[2*i+1];\
1897         int g= (pix0&maskg)+(pix1&maskg);\
1898         int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1899         int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1903         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1904         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1908 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1909 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1910 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
1911 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
1912 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1913 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
/* MMX 24-bit RGB/BGR -> luma: loads 4 pixels (12 bytes) as four dwords,
 * zero-extends bytes to words, multiplies by the ff_bgr24toY*/
/* or ff_rgb24toY* coefficient pairs via pmaddwd, adds the luma offset,
 * shifts down by 15 and stores 4 Y bytes per iteration.
 * NOTE(review): the #else branch marker, asm wrapper and loop branch are
 * elided from this chunk. */
1916 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1919     if(srcFormat == PIX_FMT_BGR24){
1921         "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
1922         "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
1927         "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
1928         "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
1934     "movq "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
1935     "mov                        %2, %%"REG_a"  \n\t"
1936     "pxor                    %%mm7, %%mm7      \n\t"
1938     PREFETCH" 64(%0)                           \n\t"
1939     "movd                     (%0), %%mm0      \n\t"
1940     "movd                    2(%0), %%mm1      \n\t"
1941     "movd                    6(%0), %%mm2      \n\t"
1942     "movd                    8(%0), %%mm3      \n\t"
1944     "punpcklbw               %%mm7, %%mm0      \n\t"
1945     "punpcklbw               %%mm7, %%mm1      \n\t"
1946     "punpcklbw               %%mm7, %%mm2      \n\t"
1947     "punpcklbw               %%mm7, %%mm3      \n\t"
1948     "pmaddwd                 %%mm5, %%mm0      \n\t"
1949     "pmaddwd                 %%mm6, %%mm1      \n\t"
1950     "pmaddwd                 %%mm5, %%mm2      \n\t"
1951     "pmaddwd                 %%mm6, %%mm3      \n\t"
1952     "paddd                   %%mm1, %%mm0      \n\t"
1953     "paddd                   %%mm3, %%mm2      \n\t"
1954     "paddd                   %%mm4, %%mm0      \n\t"
1955     "paddd                   %%mm4, %%mm2      \n\t"
1956     "psrad                     $15, %%mm0      \n\t"
1957     "psrad                     $15, %%mm2      \n\t"
1958     "packssdw                %%mm2, %%mm0      \n\t"
1959     "packuswb                %%mm0, %%mm0      \n\t"
1960     "movd                %%mm0, (%1, %%"REG_a") \n\t"
1961     "add                        $4, %%"REG_a"  \n\t"
1964     : "r" (dst+width), "g" (-width)
/* MMX 24-bit RGB/BGR -> chroma: like bgr24ToY_mmx but computes U and V
 * simultaneously; %4 is the ff_bgr24toUV coefficient block selected by
 * srcFormat (offsets 0/8 = U pairs, 16/24 = V pairs), with the chroma
 * offset added before the >>15 and 4 U + 4 V bytes stored per iteration.
 * NOTE(review): asm wrapper and loop branch are elided from this chunk. */
1969 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1972     "movq                    24+%4, %%mm6       \n\t"
1973     "mov                        %3, %%"REG_a"   \n\t"
1974     "pxor                    %%mm7, %%mm7       \n\t"
1976     PREFETCH" 64(%0)                            \n\t"
1977     "movd                     (%0), %%mm0       \n\t"
1978     "movd                    2(%0), %%mm1       \n\t"
1979     "punpcklbw               %%mm7, %%mm0       \n\t"
1980     "punpcklbw               %%mm7, %%mm1       \n\t"
1981     "movq                    %%mm0, %%mm2       \n\t"
1982     "movq                    %%mm1, %%mm3       \n\t"
1983     "pmaddwd                    %4, %%mm0       \n\t"
1984     "pmaddwd                  8+%4, %%mm1       \n\t"
1985     "pmaddwd                 16+%4, %%mm2       \n\t"
1986     "pmaddwd                 %%mm6, %%mm3       \n\t"
1987     "paddd                   %%mm1, %%mm0       \n\t"
1988     "paddd                   %%mm3, %%mm2       \n\t"
1990     "movd                    6(%0), %%mm1       \n\t"
1991     "movd                    8(%0), %%mm3       \n\t"
1993     "punpcklbw               %%mm7, %%mm1       \n\t"
1994     "punpcklbw               %%mm7, %%mm3       \n\t"
1995     "movq                    %%mm1, %%mm4       \n\t"
1996     "movq                    %%mm3, %%mm5       \n\t"
1997     "pmaddwd                    %4, %%mm1       \n\t"
1998     "pmaddwd                  8+%4, %%mm3       \n\t"
1999     "pmaddwd                 16+%4, %%mm4       \n\t"
2000     "pmaddwd                 %%mm6, %%mm5       \n\t"
2001     "paddd                   %%mm3, %%mm1       \n\t"
2002     "paddd                   %%mm5, %%mm4       \n\t"
2004     "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3    \n\t"
2005     "paddd                   %%mm3, %%mm0       \n\t"
2006     "paddd                   %%mm3, %%mm2       \n\t"
2007     "paddd                   %%mm3, %%mm1       \n\t"
2008     "paddd                   %%mm3, %%mm4       \n\t"
2009     "psrad                     $15, %%mm0       \n\t"
2010     "psrad                     $15, %%mm2       \n\t"
2011     "psrad                     $15, %%mm1       \n\t"
2012     "psrad                     $15, %%mm4       \n\t"
2013     "packssdw                %%mm1, %%mm0       \n\t"
2014     "packssdw                %%mm4, %%mm2       \n\t"
2015     "packuswb                %%mm0, %%mm0       \n\t"
2016     "packuswb                %%mm2, %%mm2       \n\t"
2017     "movd                %%mm0, (%1, %%"REG_a") \n\t"
2018     "movd                %%mm2, (%2, %%"REG_a") \n\t"
2019     "add                        $4, %%"REG_a"   \n\t"
2022     : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
/* BGR24 -> luma wrapper: uses bgr24ToY_mmx when MMX is available, otherwise
 * a scalar loop (its b/g/r loads from src[3*i+0..2] are elided here). */
2028 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2031     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
2034     for (i=0; i<width; i++)
2040         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2042 #endif /* HAVE_MMX */
/* BGR24 -> chroma wrapper: MMX path or scalar per-pixel U/V computation;
 * src1 must equal src2 (asserted). */
2045 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2048     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
2051     for (i=0; i<width; i++)
2053         int b= src1[3*i + 0];
2054         int g= src1[3*i + 1];
2055         int r= src1[3*i + 2];
2057         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2058         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2060 #endif /* HAVE_MMX */
2061     assert(src1 == src2);
/* BGR24 -> chroma with horizontal 2:1 averaging: sums each adjacent pixel
 * pair per channel, hence the doubled rounding term and the extra +1 in
 * the shift. src1 must equal src2 (asserted). */
2064 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2067     for (i=0; i<width; i++)
2069         int b= src1[6*i + 0] + src1[6*i + 3];
2070         int g= src1[6*i + 1] + src1[6*i + 4];
2071         int r= src1[6*i + 2] + src1[6*i + 5];
2073         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2074         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2076     assert(src1 == src2);
/* RGB24 -> luma wrapper: same as bgr24ToY but with the RGB coefficient set
 * selected via PIX_FMT_RGB24; scalar r/g/b loads are elided here. */
2079 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2082     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2085     for (i=0; i<width; i++)
2091         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/* RGB24 -> chroma wrapper: MMX path with the RGB coefficient set, or a
 * scalar loop with r/g/b read in RGB byte order. */
2096 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2101     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2103     for (i=0; i<width; i++)
2105         int r= src1[3*i + 0];
2106         int g= src1[3*i + 1];
2107         int b= src1[3*i + 2];
2109         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2110         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
/* RGB24 -> chroma with horizontal 2:1 averaging (RGB byte order variant of
 * bgr24ToUV_half). */
2115 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2119     for (i=0; i<width; i++)
2121         int r= src1[6*i + 0] + src1[6*i + 3];
2122         int g= src1[6*i + 1] + src1[6*i + 4];
2123         int b= src1[6*i + 2] + src1[6*i + 5];
2125         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2126         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
/* Paletted -> luma: looks each source index up in pal[] and keeps the low
 * byte (Y). NOTE(review): the line declaring `d` (presumably d = src[i])
 * is elided from this chunk. */
2131 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2134     for (i=0; i<width; i++)
2138         dst[i]= pal[d] & 0xFF;
/* Paletted -> chroma: fetches each palette entry; the stores extracting the
 * U/V bytes from p are elided from this chunk. src1 must equal src2. */
2142 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2145     assert(src1 == src2);
2146     for (i=0; i<width; i++)
2148         int p= pal[src1[i]];
// 1 bit/pixel (white=0) to 8-bit luma: expand each bit of a source byte to
// 0 or 255, MSB first. NOTE(review): the load of `d` is on a dropped line; for
// monowhite it presumably inverts the byte (d = ~src[i]) so that a 0 bit maps
// to white — confirm against upstream, since the visible loop body is identical
// to monoblack2Y below.
2155 static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2158     for (i=0; i<width/8; i++){
2161             dst[8*i+j]= ((d>>(7-j))&1)*255;
// 1 bit/pixel (black=0) to 8-bit luma: each bit of the source byte (MSB first)
// becomes one output sample, 0 or 255. The `d = src[i]` load and the inner
// j-loop header are on lines not visible in this chunk.
2165 static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2168     for (i=0; i<width/8; i++){
2171             dst[8*i+j]= ((d>>(7-j))&1)*255;
2175 // bilinear / bicubic scaling
// Generic horizontal scaler: dst[i] = clip(sum_j src[filterPos[i]+j] *
// filter[filterSize*i+j] >> 7). Three MMX fast paths (filterSize 4, 8, generic)
// plus AltiVec and plain-C fallbacks; the #if/#elif guards selecting them are on
// lines not visible in this chunk. The >>7 normalization is visible both in the
// asm (psrad $7) and the C fallback (val>>7 with a 15-bit clamp).
2176 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2177 int16_t *filter, int16_t *filterPos, long filterSize)
2180     assert(filterSize % 4 == 0 && filterSize>0);
// --- MMX path, filterSize == 4: two output pixels per iteration ---
2181     if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2183         long counter= -2*dstW;
2185         filterPos-= counter/2;
2189         "push %%"REG_b"                         \n\t"
2191         "pxor %%mm7, %%mm7                      \n\t"
2192         "push %%"REG_BP"                        \n\t" // we use 7 regs here ...
2193         "mov %%"REG_a", %%"REG_BP"              \n\t"
2196         "movzwl (%2, %%"REG_BP"), %%eax         \n\t"
2197         "movzwl 2(%2, %%"REG_BP"), %%ebx        \n\t"
2198         "movq (%1, %%"REG_BP", 4), %%mm1        \n\t"
2199         "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
2200         "movd (%3, %%"REG_a"), %%mm0            \n\t"
2201         "movd (%3, %%"REG_b"), %%mm2            \n\t"
2202         "punpcklbw %%mm7, %%mm0                 \n\t"
2203         "punpcklbw %%mm7, %%mm2                 \n\t"
2204         "pmaddwd %%mm1, %%mm0                   \n\t"
2205         "pmaddwd %%mm2, %%mm3                   \n\t"
2206         "movq %%mm0, %%mm4                      \n\t"
2207         "punpckldq %%mm3, %%mm0                 \n\t"
2208         "punpckhdq %%mm3, %%mm4                 \n\t"
2209         "paddd %%mm4, %%mm0                     \n\t"
2210         "psrad $7, %%mm0                        \n\t"
2211         "packssdw %%mm0, %%mm0                  \n\t"
2212         "movd %%mm0, (%4, %%"REG_BP")           \n\t"
2213         "add $4, %%"REG_BP"                     \n\t"
2216         "pop %%"REG_BP"                         \n\t"
2218         "pop %%"REG_b"                          \n\t"
2221         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// --- MMX path, filterSize == 8: same scheme, two pmaddwd pairs per pixel ---
2227     else if (filterSize==8)
2229         long counter= -2*dstW;
2231         filterPos-= counter/2;
2235         "push %%"REG_b"                         \n\t"
2237         "pxor %%mm7, %%mm7                      \n\t"
2238         "push %%"REG_BP"                        \n\t" // we use 7 regs here ...
2239         "mov %%"REG_a", %%"REG_BP"              \n\t"
2242         "movzwl (%2, %%"REG_BP"), %%eax         \n\t"
2243         "movzwl 2(%2, %%"REG_BP"), %%ebx        \n\t"
2244         "movq (%1, %%"REG_BP", 8), %%mm1        \n\t"
2245         "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2246         "movd (%3, %%"REG_a"), %%mm0            \n\t"
2247         "movd (%3, %%"REG_b"), %%mm2            \n\t"
2248         "punpcklbw %%mm7, %%mm0                 \n\t"
2249         "punpcklbw %%mm7, %%mm2                 \n\t"
2250         "pmaddwd %%mm1, %%mm0                   \n\t"
2251         "pmaddwd %%mm2, %%mm3                   \n\t"
2253         "movq 8(%1, %%"REG_BP", 8), %%mm1       \n\t"
2254         "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2255         "movd 4(%3, %%"REG_a"), %%mm4           \n\t"
2256         "movd 4(%3, %%"REG_b"), %%mm2           \n\t"
2257         "punpcklbw %%mm7, %%mm4                 \n\t"
2258         "punpcklbw %%mm7, %%mm2                 \n\t"
2259         "pmaddwd %%mm1, %%mm4                   \n\t"
2260         "pmaddwd %%mm2, %%mm5                   \n\t"
2261         "paddd %%mm4, %%mm0                     \n\t"
2262         "paddd %%mm5, %%mm3                     \n\t"
2263         "movq %%mm0, %%mm4                      \n\t"
2264         "punpckldq %%mm3, %%mm0                 \n\t"
2265         "punpckhdq %%mm3, %%mm4                 \n\t"
2266         "paddd %%mm4, %%mm0                     \n\t"
2267         "psrad $7, %%mm0                        \n\t"
2268         "packssdw %%mm0, %%mm0                  \n\t"
2269         "movd %%mm0, (%4, %%"REG_BP")           \n\t"
2270         "add $4, %%"REG_BP"                     \n\t"
2273         "pop %%"REG_BP"                         \n\t"
2275         "pop %%"REG_b"                          \n\t"
2278         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// --- MMX path, arbitrary filterSize: inner loop accumulates 4 taps per step,
//     terminating when the walking src pointer reaches `offset` ---
2286         uint8_t *offset = src+filterSize;
2287         long counter= -2*dstW;
2288         //filter-= counter*filterSize/2;
2289         filterPos-= counter/2;
2292         "pxor %%mm7, %%mm7                      \n\t"
2295         "mov %2, %%"REG_c"                      \n\t"
2296         "movzwl (%%"REG_c", %0), %%eax          \n\t"
2297         "movzwl 2(%%"REG_c", %0), %%edx         \n\t"
2298         "mov %5, %%"REG_c"                      \n\t"
2299         "pxor %%mm4, %%mm4                      \n\t"
2300         "pxor %%mm5, %%mm5                      \n\t"
2302         "movq (%1), %%mm1                       \n\t"
2303         "movq (%1, %6), %%mm3                   \n\t"
2304         "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2305         "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2306         "punpcklbw %%mm7, %%mm0                 \n\t"
2307         "punpcklbw %%mm7, %%mm2                 \n\t"
2308         "pmaddwd %%mm1, %%mm0                   \n\t"
2309         "pmaddwd %%mm2, %%mm3                   \n\t"
2310         "paddd %%mm3, %%mm5                     \n\t"
2311         "paddd %%mm0, %%mm4                     \n\t"
2313         "add $4, %%"REG_c"                      \n\t"
2314         "cmp %4, %%"REG_c"                      \n\t"
2317         "movq %%mm4, %%mm0                      \n\t"
2318         "punpckldq %%mm5, %%mm4                 \n\t"
2319         "punpckhdq %%mm5, %%mm0                 \n\t"
2320         "paddd %%mm0, %%mm4                     \n\t"
2321         "psrad $7, %%mm4                        \n\t"
2322         "packssdw %%mm4, %%mm4                  \n\t"
2323         "mov %3, %%"REG_a"                      \n\t"
2324         "movd %%mm4, (%%"REG_a", %0)            \n\t"
2328         : "+r" (counter), "+r" (filter)
2329         : "m" (filterPos), "m" (dst), "m"(offset),
2330         "m" (src), "r" (filterSize*2)
2331         : "%"REG_a, "%"REG_c, "%"REG_d
// --- AltiVec fallback (guard lines not visible in this chunk) ---
2336     hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
// --- Plain C fallback: reference implementation of the convolution ---
2339     for (i=0; i<dstW; i++)
2342         int srcPos= filterPos[i];
2344         //printf("filterPos: %d\n", filterPos[i]);
2345         for (j=0; j<filterSize; j++)
2347             //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2348             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2350         //filter += hFilterSize;
2351         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2354 #endif /* HAVE_ALTIVEC */
2355 #endif /* HAVE_MMX */
2357 // *** horizontal scale Y line to temp buffer
// Horizontally scale one luma line into the 16-bit temp buffer `dst`.
// Steps: (1) convert any non-planar/packed srcFormat to 8-bit gray via the
// format-specific toY helper, redirecting `src` to formatConvBuffer;
// (2) scale with either the generic hScale filter, the MMX2 "funny code"
// fast-bilinear path, a plain-x86 asm bilinear loop, or the C bilinear loop;
// (3) optionally apply luma range conversion (JPEG<->MPEG levels).
// Many #if/#else guard lines and braces are not visible in this chunk.
2358 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2359 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2360 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2361 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2362 int32_t *mmx2FilterPos, uint32_t *pal)
// --- input format -> 8-bit luma conversion dispatch ---
2364     if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2366         RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2367         src= formatConvBuffer;
2369     else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2371         RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2372         src= formatConvBuffer;
2374     else if (srcFormat==PIX_FMT_RGB32)
2376         RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2377         src= formatConvBuffer;
2379     else if (srcFormat==PIX_FMT_RGB32_1)
2381         RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2382         src= formatConvBuffer;
2384     else if (srcFormat==PIX_FMT_BGR24)
2386         RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2387         src= formatConvBuffer;
2389     else if (srcFormat==PIX_FMT_BGR565)
2391         RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2392         src= formatConvBuffer;
2394     else if (srcFormat==PIX_FMT_BGR555)
2396         RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2397         src= formatConvBuffer;
2399     else if (srcFormat==PIX_FMT_BGR32)
2401         RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2402         src= formatConvBuffer;
2404     else if (srcFormat==PIX_FMT_BGR32_1)
2406         RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2407         src= formatConvBuffer;
2409     else if (srcFormat==PIX_FMT_RGB24)
2411         RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2412         src= formatConvBuffer;
2414     else if (srcFormat==PIX_FMT_RGB565)
2416         RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2417         src= formatConvBuffer;
2419     else if (srcFormat==PIX_FMT_RGB555)
2421         RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2422         src= formatConvBuffer;
2424     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2426         RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2427         src= formatConvBuffer;
2429     else if (srcFormat==PIX_FMT_MONOBLACK)
2431         RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
2432         src= formatConvBuffer;
2434     else if (srcFormat==PIX_FMT_MONOWHITE)
2436         RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
2437         src= formatConvBuffer;
// --- scaler selection: generic filter when not fast-bilinear or MMX2 unusable ---
2441     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2442     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2444     if (!(flags&SWS_FAST_BILINEAR))
2447     RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2449     else // fast bilinear upscale / crap downscale
// --- MMX2 "funny code" path: jumps into runtime-generated code (funnyYCode);
//     ebx is saved/restored around the call because it may be the PIC register ---
2451 #if defined(ARCH_X86)
2455         uint64_t ebxsave __attribute__((aligned(8)));
2461         "mov %%"REG_b", %5                      \n\t"
2463         "pxor %%mm7, %%mm7                      \n\t"
2464         "mov %0, %%"REG_c"                      \n\t"
2465         "mov %1, %%"REG_D"                      \n\t"
2466         "mov %2, %%"REG_d"                      \n\t"
2467         "mov %3, %%"REG_b"                      \n\t"
2468         "xor %%"REG_a", %%"REG_a"               \n\t" // i
2469         PREFETCH" (%%"REG_c")                   \n\t"
2470         PREFETCH" 32(%%"REG_c")                 \n\t"
2471         PREFETCH" 64(%%"REG_c")                 \n\t"
2475 #define FUNNY_Y_CODE \
2476         "movl (%%"REG_b"), %%esi        \n\t"\
2478         "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
2479         "add %%"REG_S", %%"REG_c"             \n\t"\
2480         "add %%"REG_a", %%"REG_D"             \n\t"\
2481         "xor %%"REG_a", %%"REG_a"             \n\t"\
2485 #define FUNNY_Y_CODE \
2486         "movl (%%"REG_b"), %%esi        \n\t"\
2488         "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2489         "add %%"REG_a", %%"REG_D"             \n\t"\
2490         "xor %%"REG_a", %%"REG_a"             \n\t"\
2492 #endif /* ARCH_X86_64 */
2504         "mov %5, %%"REG_b"                      \n\t"
2506         :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2511         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Fix up the rightmost pixels the fast path cannot interpolate (no src[xx+1]).
2516         for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2520 #endif /* HAVE_MMX2 */
// --- plain x86 asm bilinear path: 16.16 fixed-point position, two pixels
//     per loop iteration, output scaled to 7 extra fractional bits (shrl $9) ---
2521         long xInc_shr16 = xInc >> 16;
2522         uint16_t xInc_mask = xInc & 0xffff;
2523         //NO MMX just normal asm ...
2525         "xor %%"REG_a", %%"REG_a"               \n\t" // i
2526         "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2527         "xorl %%ecx, %%ecx                      \n\t" // 2*xalpha
2530         "movzbl  (%0, %%"REG_d"), %%edi         \n\t" //src[xx]
2531         "movzbl 1(%0, %%"REG_d"), %%esi         \n\t" //src[xx+1]
2532         "subl %%edi, %%esi                      \n\t" //src[xx+1] - src[xx]
2533         "imull %%ecx, %%esi                     \n\t" //(src[xx+1] - src[xx])*2*xalpha
2534         "shll $16, %%edi                        \n\t"
2535         "addl %%edi, %%esi                      \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2536         "mov %1, %%"REG_D"                      \n\t"
2537         "shrl $9, %%esi                         \n\t"
2538         "movw %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2539         "addw %4, %%cx                          \n\t" //2*xalpha += xInc&0xFF
2540         "adc %3, %%"REG_d"                      \n\t" //xx+= xInc>>8 + carry
2542         "movzbl  (%0, %%"REG_d"), %%edi         \n\t" //src[xx]
2543         "movzbl 1(%0, %%"REG_d"), %%esi         \n\t" //src[xx+1]
2544         "subl %%edi, %%esi                      \n\t" //src[xx+1] - src[xx]
2545         "imull %%ecx, %%esi                     \n\t" //(src[xx+1] - src[xx])*2*xalpha
2546         "shll $16, %%edi                        \n\t"
2547         "addl %%edi, %%esi                      \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2548         "mov %1, %%"REG_D"                      \n\t"
2549         "shrl $9, %%esi                         \n\t"
2550         "movw %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
2551         "addw %4, %%cx                          \n\t" //2*xalpha += xInc&0xFF
2552         "adc %3, %%"REG_d"                      \n\t" //xx+= xInc>>8 + carry
2555         "add $2, %%"REG_a"                      \n\t"
2556         "cmp %2, %%"REG_a"                      \n\t"
2560         :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2561         : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2564     } //if MMX2 can't be used
// --- portable C bilinear fallback: dst = src[xx]*128 + frac*(src[xx+1]-src[xx]) ---
2568         unsigned int xpos=0;
2569         for (i=0;i<dstWidth;i++)
2571             register unsigned int xx=xpos>>16;
2572             register unsigned int xalpha=(xpos&0xFFFF)>>9;
2573             dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2576 #endif /* defined(ARCH_X86) */
// --- luma range conversion on the 16-bit intermediate (non-RGB dst only) ---
2579     if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2581         //FIXME all pal and rgb srcFormats could do this convertion as well
2582         //FIXME all scalers more complex than bilinear could do half of this transform
2584             for (i=0; i<dstWidth; i++)
2585                 dst[i]= (dst[i]*14071 + 33561947)>>14;
2587             for (i=0; i<dstWidth; i++)
2588                 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
// Horizontally scale one chroma line pair (U into dst[0..], V into dst[VOFW..]).
// Mirrors hyscale: (1) format conversion via toUV helpers — using the *_half
// variants when chroma is horizontally subsampled in the source
// (c->chrSrcHSubSample); (2) generic hScale, MMX2 funny code, plain-asm, or C
// bilinear scaling; (3) chroma range conversion. Guard lines and braces are
// not visible in this chunk.
2593 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2594 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2595 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2596 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2597 int32_t *mmx2FilterPos, uint32_t *pal)
// --- input format -> planar U/V conversion dispatch ---
2599     if (srcFormat==PIX_FMT_YUYV422)
2601         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2602         src1= formatConvBuffer;
2603         src2= formatConvBuffer+VOFW;
2605     else if (srcFormat==PIX_FMT_UYVY422)
2607         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2608         src1= formatConvBuffer;
2609         src2= formatConvBuffer+VOFW;
2611     else if (srcFormat==PIX_FMT_RGB32)
2613         if(c->chrSrcHSubSample)
2614             RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2616             RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2617         src1= formatConvBuffer;
2618         src2= formatConvBuffer+VOFW;
2620     else if (srcFormat==PIX_FMT_RGB32_1)
2622         if(c->chrSrcHSubSample)
2623             RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2625             RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2626         src1= formatConvBuffer;
2627         src2= formatConvBuffer+VOFW;
2629     else if (srcFormat==PIX_FMT_BGR24)
2631         if(c->chrSrcHSubSample)
2632             RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2634             RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2635         src1= formatConvBuffer;
2636         src2= formatConvBuffer+VOFW;
2638     else if (srcFormat==PIX_FMT_BGR565)
2640         if(c->chrSrcHSubSample)
2641             RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2643             RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2644         src1= formatConvBuffer;
2645         src2= formatConvBuffer+VOFW;
2647     else if (srcFormat==PIX_FMT_BGR555)
2649         if(c->chrSrcHSubSample)
2650             RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2652             RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2653         src1= formatConvBuffer;
2654         src2= formatConvBuffer+VOFW;
2656     else if (srcFormat==PIX_FMT_BGR32)
2658         if(c->chrSrcHSubSample)
2659             RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2661             RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2662         src1= formatConvBuffer;
2663         src2= formatConvBuffer+VOFW;
2665     else if (srcFormat==PIX_FMT_BGR32_1)
2667         if(c->chrSrcHSubSample)
2668             RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2670             RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2671         src1= formatConvBuffer;
2672         src2= formatConvBuffer+VOFW;
2674     else if (srcFormat==PIX_FMT_RGB24)
2676         if(c->chrSrcHSubSample)
2677             RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2679             RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2680         src1= formatConvBuffer;
2681         src2= formatConvBuffer+VOFW;
2683     else if (srcFormat==PIX_FMT_RGB565)
2685         if(c->chrSrcHSubSample)
2686             RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2688             RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2689         src1= formatConvBuffer;
2690         src2= formatConvBuffer+VOFW;
2692     else if (srcFormat==PIX_FMT_RGB555)
2694         if(c->chrSrcHSubSample)
2695             RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2697             RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2698         src1= formatConvBuffer;
2699         src2= formatConvBuffer+VOFW;
// Gray and 1-bit formats carry no chroma: nothing to do (body on dropped lines).
2701     else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2705     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2707         RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2708         src1= formatConvBuffer;
2709         src2= formatConvBuffer+VOFW;
// --- scaler selection: two hScale passes (U then V at dst+VOFW) ---
2713     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2714     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2716     if (!(flags&SWS_FAST_BILINEAR))
2719     RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2720     RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2722     else // fast bilinear upscale / crap downscale
// --- MMX2 funny-code path, run once for U (src1) and once for V (src2 at %5) ---
2724 #if defined(ARCH_X86)
2728         uint64_t ebxsave __attribute__((aligned(8)));
2734         "mov %%"REG_b", %6                      \n\t"
2736         "pxor %%mm7, %%mm7                      \n\t"
2737         "mov %0, %%"REG_c"                      \n\t"
2738         "mov %1, %%"REG_D"                      \n\t"
2739         "mov %2, %%"REG_d"                      \n\t"
2740         "mov %3, %%"REG_b"                      \n\t"
2741         "xor %%"REG_a", %%"REG_a"               \n\t" // i
2742         PREFETCH" (%%"REG_c")                   \n\t"
2743         PREFETCH" 32(%%"REG_c")                 \n\t"
2744         PREFETCH" 64(%%"REG_c")                 \n\t"
2748 #define FUNNY_UV_CODE \
2749         "movl (%%"REG_b"), %%esi        \n\t"\
2751         "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
2752         "add %%"REG_S", %%"REG_c"             \n\t"\
2753         "add %%"REG_a", %%"REG_D"             \n\t"\
2754         "xor %%"REG_a", %%"REG_a"             \n\t"\
2758 #define FUNNY_UV_CODE \
2759         "movl (%%"REG_b"), %%esi        \n\t"\
2761         "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2762         "add %%"REG_a", %%"REG_D"             \n\t"\
2763         "xor %%"REG_a", %%"REG_a"             \n\t"\
2765 #endif /* ARCH_X86_64 */
// Second pass: V plane written at dst+VOF (destination advanced by VOF bytes).
2771         "xor %%"REG_a", %%"REG_a"               \n\t" // i
2772         "mov %5, %%"REG_c"                      \n\t" // src
2773         "mov %1, %%"REG_D"                      \n\t" // buf1
2774         "add $"AV_STRINGIFY(VOF)", %%"REG_D"    \n\t"
2775         PREFETCH" (%%"REG_c")                   \n\t"
2776         PREFETCH" 32(%%"REG_c")                 \n\t"
2777         PREFETCH" 64(%%"REG_c")                 \n\t"
2785         "mov %6, %%"REG_b"                      \n\t"
2787         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2788         "m" (funnyUVCode), "m" (src2)
2792         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Fix up rightmost pixels that have no src[xx+1] neighbour, for both planes.
2797         for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2799             //printf("%d %d %d\n", dstWidth, i, srcW);
2800             dst[i] = src1[srcW-1]*128;
2801             dst[i+VOFW] = src2[srcW-1]*128;
2806 #endif /* HAVE_MMX2 */
// --- plain x86 asm bilinear path: interpolates U and V in the same iteration ---
2807         long xInc_shr16 = (long) (xInc >> 16);
2808         uint16_t xInc_mask = xInc & 0xffff;
2810         "xor %%"REG_a", %%"REG_a"               \n\t" // i
2811         "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2812         "xorl %%ecx, %%ecx                      \n\t" // 2*xalpha
2815         "mov %0, %%"REG_S"                      \n\t"
2816         "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2817         "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2818         "subl %%edi, %%esi                      \n\t" //src[xx+1] - src[xx]
2819         "imull %%ecx, %%esi                     \n\t" //(src[xx+1] - src[xx])*2*xalpha
2820         "shll $16, %%edi                        \n\t"
2821         "addl %%edi, %%esi                      \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2822         "mov %1, %%"REG_D"                      \n\t"
2823         "shrl $9, %%esi                         \n\t"
2824         "movw %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2826         "movzbl  (%5, %%"REG_d"), %%edi         \n\t" //src[xx]
2827         "movzbl 1(%5, %%"REG_d"), %%esi         \n\t" //src[xx+1]
2828         "subl %%edi, %%esi                      \n\t" //src[xx+1] - src[xx]
2829         "imull %%ecx, %%esi                     \n\t" //(src[xx+1] - src[xx])*2*xalpha
2830         "shll $16, %%edi                        \n\t"
2831         "addl %%edi, %%esi                      \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2832         "mov %1, %%"REG_D"                      \n\t"
2833         "shrl $9, %%esi                         \n\t"
2834         "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2836         "addw %4, %%cx                          \n\t" //2*xalpha += xInc&0xFF
2837         "adc %3, %%"REG_d"                      \n\t" //xx+= xInc>>8 + carry
2838         "add $1, %%"REG_a"                      \n\t"
2839         "cmp %2, %%"REG_a"                      \n\t"
2842         /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2843            which is needed to support GCC 4.0. */
2844 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2845         :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2847         :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2850         : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2853     } //if MMX2 can't be used
// --- portable C bilinear fallback; both interpolation formulas visible below
//     are mathematically equivalent variants kept under different guards ---
2857         unsigned int xpos=0;
2858         for (i=0;i<dstWidth;i++)
2860             register unsigned int xx=xpos>>16;
2861             register unsigned int xalpha=(xpos&0xFFFF)>>9;
2862             dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2863             dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2865             dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2866             dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2870 #endif /* defined(ARCH_X86) */
// --- chroma range conversion on the 16-bit intermediate (non-RGB dst only) ---
2872     if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2874         //FIXME all pal and rgb srcFormats could do this convertion as well
2875         //FIXME all scalers more complex than bilinear could do half of this transform
2877             for (i=0; i<dstWidth; i++){
2878                 dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
2879                 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2882             for (i=0; i<dstWidth; i++){
2883                 dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2884                 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2890 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2891 int srcSliceH, uint8_t* dst[], int dstStride[]){
2893 /* load a few things into local vars to make the code more readable? and faster */
2894 const int srcW= c->srcW;
2895 const int dstW= c->dstW;
2896 const int dstH= c->dstH;
2897 const int chrDstW= c->chrDstW;
2898 const int chrSrcW= c->chrSrcW;
2899 const int lumXInc= c->lumXInc;
2900 const int chrXInc= c->chrXInc;
2901 const int dstFormat= c->dstFormat;
2902 const int srcFormat= c->srcFormat;
2903 const int flags= c->flags;
2904 const int canMMX2BeUsed= c->canMMX2BeUsed;
2905 int16_t *vLumFilterPos= c->vLumFilterPos;
2906 int16_t *vChrFilterPos= c->vChrFilterPos;
2907 int16_t *hLumFilterPos= c->hLumFilterPos;
2908 int16_t *hChrFilterPos= c->hChrFilterPos;
2909 int16_t *vLumFilter= c->vLumFilter;
2910 int16_t *vChrFilter= c->vChrFilter;
2911 int16_t *hLumFilter= c->hLumFilter;
2912 int16_t *hChrFilter= c->hChrFilter;
2913 int32_t *lumMmxFilter= c->lumMmxFilter;
2914 int32_t *chrMmxFilter= c->chrMmxFilter;
2915 const int vLumFilterSize= c->vLumFilterSize;
2916 const int vChrFilterSize= c->vChrFilterSize;
2917 const int hLumFilterSize= c->hLumFilterSize;
2918 const int hChrFilterSize= c->hChrFilterSize;
2919 int16_t **lumPixBuf= c->lumPixBuf;
2920 int16_t **chrPixBuf= c->chrPixBuf;
2921 const int vLumBufSize= c->vLumBufSize;
2922 const int vChrBufSize= c->vChrBufSize;
2923 uint8_t *funnyYCode= c->funnyYCode;
2924 uint8_t *funnyUVCode= c->funnyUVCode;
2925 uint8_t *formatConvBuffer= c->formatConvBuffer;
2926 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2927 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2929 uint32_t *pal=c->pal_yuv;
2931 /* vars which will change and which we need to store back in the context */
2933 int lumBufIndex= c->lumBufIndex;
2934 int chrBufIndex= c->chrBufIndex;
2935 int lastInLumBuf= c->lastInLumBuf;
2936 int lastInChrBuf= c->lastInChrBuf;
2938 if (isPacked(c->srcFormat)){
2944 srcStride[2]= srcStride[0];
2946 srcStride[1]<<= c->vChrDrop;
2947 srcStride[2]<<= c->vChrDrop;
2949 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2950 // (int)dst[0], (int)dst[1], (int)dst[2]);
2952 #if 0 //self test FIXME move to a vfilter or something
2954 static volatile int i=0;
2956 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2957 selfTest(src, srcStride, c->srcW, c->srcH);
2962 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2963 //dstStride[0],dstStride[1],dstStride[2]);
2965 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2967 static int warnedAlready=0; //FIXME move this into the context perhaps
2968 if (flags & SWS_PRINT_INFO && !warnedAlready)
2970 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2971 " ->cannot do aligned memory accesses anymore\n");
2976 /* Note the user might start scaling the picture in the middle so this
2977 will not get executed. This is not really intended but works
2978 currently, so people might do it. */
2989 for (;dstY < dstH; dstY++){
2990 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2991 const int chrDstY= dstY>>c->chrDstVSubSample;
2992 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2993 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2995 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2996 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2997 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2998 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3000 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3001 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3002 //handle holes (FAST_BILINEAR & weird filters)
3003 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3004 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3005 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3006 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
3007 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
3009 // Do we have enough lines in this slice to output the dstY line
3010 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3012 //Do horizontal scaling
3013 while(lastInLumBuf < lastLumSrcY)
3015 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3017 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3018 assert(lumBufIndex < 2*vLumBufSize);
3019 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3020 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3021 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3022 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3023 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3024 funnyYCode, c->srcFormat, formatConvBuffer,
3025 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3028 while(lastInChrBuf < lastChrSrcY)
3030 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3031 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3033 assert(chrBufIndex < 2*vChrBufSize);
3034 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3035 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3036 //FIXME replace parameters through context struct (some at least)
3038 if (!(isGray(srcFormat) || isGray(dstFormat)))
3039 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3040 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3041 funnyUVCode, c->srcFormat, formatConvBuffer,
3042 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3045 //wrap buf index around to stay inside the ring buffer
3046 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3047 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3049 else // not enough lines left in this slice -> load the rest in the buffer
3051 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3052 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3053 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3054 vChrBufSize, vLumBufSize);*/
3056 //Do horizontal scaling
3057 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3059 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3061 assert(lumBufIndex < 2*vLumBufSize);
3062 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3063 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3064 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3065 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3066 funnyYCode, c->srcFormat, formatConvBuffer,
3067 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3070 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3072 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3073 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3075 assert(chrBufIndex < 2*vChrBufSize);
3076 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3077 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3079 if (!(isGray(srcFormat) || isGray(dstFormat)))
3080 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3081 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3082 funnyUVCode, c->srcFormat, formatConvBuffer,
3083 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3086 //wrap buf index around to stay inside the ring buffer
3087 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3088 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3089 break; //we can't output a dstY line so let's try with the next slice
3093 c->blueDither= ff_dither8[dstY&1];
3094 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
3095 c->greenDither= ff_dither8[dstY&1];
3097 c->greenDither= ff_dither4[dstY&1];
3098 c->redDither= ff_dither8[(dstY+1)&1];
3102 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3103 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3106 if (flags & SWS_ACCURATE_RND){
3107 int s= APCK_SIZE / 8;
3108 for (i=0; i<vLumFilterSize; i+=2){
3109 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
3110 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
3111 lumMmxFilter[s*i+APCK_COEF/4 ]=
3112 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
3113 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3115 for (i=0; i<vChrFilterSize; i+=2){
3116 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
3117 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
3118 chrMmxFilter[s*i+APCK_COEF/4 ]=
3119 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
3120 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3123 for (i=0; i<vLumFilterSize; i++)
3125 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3126 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3127 lumMmxFilter[4*i+2]=
3128 lumMmxFilter[4*i+3]=
3129 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3131 for (i=0; i<vChrFilterSize; i++)
3133 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3134 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3135 chrMmxFilter[4*i+2]=
3136 chrMmxFilter[4*i+3]=
3137 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3141 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3142 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3143 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
/* NOTE(review): this is a fragment of the vertical-scaler output stage of
 * RENAME(swScale); the enclosing function begins before this chunk and a
 * number of intervening lines are elided, so the comments below describe
 * only what is visible here. The code dispatches one output scanline to a
 * format-specific vertical-scale routine: an MMX/asm-friendly path first,
 * then (second half) a plain-C fallback path for the buffer tail. */
/* NV12/NV21: interleaved-chroma planar output (MMX-capable path). */
3144 RENAME(yuv2nv12X)(c,
3145 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3146 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3147 dest, uDest, dstW, chrDstW, dstFormat);
/* Planar YUV (YV12-like) or 8-bit grayscale output. */
3149 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
/* chrSkipMask selects which output rows carry no chroma, based on the
 * vertical chroma subsampling of the destination. */
3151 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3152 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
/* 1-tap luma and chroma filters mean no vertical scaling is needed:
 * take the single buffered line directly (plain copy/round path). */
3153 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3155 int16_t *lumBuf = lumPixBuf[0];
3156 int16_t *chrBuf= chrPixBuf[0];
3157 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
/* General N-tap vertical filter for planar output (elided branch head). */
3162 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3163 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3164 dest, uDest, vDest, dstW, chrDstW);
/* Sanity-check that the filter never reads past the line ring buffers. */
3169 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3170 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
/* Packed (e.g. RGB) output: special-cased by vertical filter length. */
3171 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3173 int chrAlpha= vChrFilter[2*dstY+1];
/* Full horizontal chroma interpolation has no 1-tap fast path yet;
 * fall back to the generic C routine. */
3174 if(flags & SWS_FULL_CHR_H_INT){
3175 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
3176 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3177 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3180 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3181 dest, dstW, chrAlpha, dstFormat, flags, dstY);
/* 2-tap luma + 2-tap chroma: bilinear vertical blend for packed output. */
3184 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3186 int lumAlpha= vLumFilter[2*dstY+1];
3187 int chrAlpha= vChrFilter[2*dstY+1];
/* The 0x10001 multiply replicates the 16-bit coefficient into both
 * halves of a 32-bit word for the MMX filter tables — presumably so a
 * single pmaddwd-style op covers two pixels; TODO confirm against the
 * asm that consumes lumMmxFilter/chrMmxFilter. */
3189 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3191 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3192 if(flags & SWS_FULL_CHR_H_INT){
3193 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
3194 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3195 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3198 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3199 dest, dstW, lumAlpha, chrAlpha, dstY);
/* General N-tap packed-output path (branch head elided from this view). */
3204 if(flags & SWS_FULL_CHR_H_INT){
3206 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3207 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3210 RENAME(yuv2packedX)(c,
3211 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3212 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
/* Plain-C fallback: near the end of the ring buffers the MMX routines
 * would read/write past the arrays' tails, so use C-only routines. */
3218 else // hmm looks like we can't use MMX here without overwriting this array's tail
/* Recompute the source-line pointers into the luma/chroma ring buffers;
 * the +vLumBufSize/+vChrBufSize bias keeps the index non-negative
 * before the (elided) wrap-around handling. */
3220 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3221 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
/* C-path mirror of the NV12/NV21 dispatch above. */
3222 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3223 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3224 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3226 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3227 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3228 dest, uDest, dstW, chrDstW, dstFormat);
/* C-path mirror of the planar-YUV/gray dispatch above. */
3230 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
3232 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3233 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3235 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3236 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3237 dest, uDest, vDest, dstW, chrDstW);
/* Same ring-buffer bound checks as the fast path. */
3241 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3242 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
/* Packed-output C path (callee names elided from this view). */
3243 if(flags & SWS_FULL_CHR_H_INT){
3245 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3246 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3250 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3251 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
/* Drain non-temporal stores and leave MMX state so following x87/float
 * code is safe (SFENCE/EMMS expand to nops on builds without MMX — see
 * the macro definitions at the top of the file). */
3259 __asm__ volatile(SFENCE:::"memory");
3260 __asm__ volatile(EMMS:::"memory");
3262 /* store changed local vars back in the context */
3264 c->lumBufIndex= lumBufIndex;
3265 c->chrBufIndex= chrBufIndex;
3266 c->lastInLumBuf= lastInLumBuf;
3267 c->lastInChrBuf= lastInChrBuf;
/* Return the number of destination lines produced this call —
 * presumably lastDstY holds dstY's value at entry; TODO confirm, the
 * assignment is outside this chunk. */
3269 return dstY - lastDstY;