2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
/* CPU-capability-dependent instruction-name macros used inside the asm
 * strings below.  The selecting #if/#else/#endif preprocessor lines are
 * not visible in this excerpt; the groups below are the per-CPU
 * alternatives (3DNow / MMX2 / plain MMX). */
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif defined (HAVE_MMX2)
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
/* no prefetch instruction available: emit an assembler comment instead */
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
51 #define SFENCE "sfence"
53 #define SFENCE " # nop"
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
/* MOVNTQ: non-temporal store where available, plain movq otherwise */
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
/* extra indirection so arguments are macro-expanded before stringification */
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
70 #include "swscale_altivec_template.c"
/*
 * YSCALEYUV2YV12X(x, offset, dest, width): vertical-scaling MMX loop.
 * Walks the filter list at 'offset' inside the context (%0 = &c->redDither,
 * terminated by a NULL source pointer tested via "test"), multiplies 16-bit
 * source samples by the filter coefficient (pmulhw), accumulates onto the
 * rounder, shifts right by 3 and packs to unsigned bytes, storing 8 output
 * pixels per iteration with MOVNTQ into (dest, REG_a).
 * NOTE(review): the asm volatile( opener and the numeric loop labels with
 * their jump instructions are not part of this excerpt of the file.
 */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width): higher-precision
 * variant of YSCALEYUV2YV12X.  Processes two filter taps per iteration:
 * interleaves the samples of the paired lines (punpcklwd/punpckhwd) and
 * multiply-accumulates with packed coefficients via pmaddwd into four
 * 32-bit accumulators (mm4-mm7).  APCK_PTR2/APCK_COEF/APCK_SIZE describe
 * the packed-filter entry layout.  Final: >>16, packssdw, add rounder,
 * >>3, pack to bytes, MOVNTQ 8 pixels.
 * NOTE(review): asm opener and loop labels/branches missing from excerpt.
 */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV121: direct (no vertical filtering) plane store.
 * Shifts the 16-bit intermediate samples right by 7 and packs them to
 * unsigned bytes, writing 8 pixels per MOVNTQ.  %2 is loaded into REG_a
 * as the start index — presumably a negative offset counting up toward 0;
 * TODO confirm against the (not visible) loop branch.
 */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
/*
 * YSCALEYUV2YV121_ACCURATE: like YSCALEYUV2YV121 but with rounding.
 * mm7 is built up to the constant 64 in each word (pcmpeqw -> all ones,
 * psrlw $15 -> 1, psllw $6 -> 64) and added (saturating) before the >>7,
 * so the shift rounds to nearest instead of truncating.
 */
184 #define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddsw %%mm7, %%mm0 \n\t"\
194 "paddsw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/*
 * YSCALEYUV2PACKEDX: prologue for packed-pixel output with full vertical
 * filtering.  First loop accumulates chroma over the CHR filter list:
 * U ends in mm3, V (read at offset VOF from the same pointer) in mm4.
 * Second loop accumulates two quads of luma over the LUM filter list
 * into mm1 (Y1) and mm7 (Y2).  Both lists are NULL-terminated ("test").
 * NOTE(review): asm opener and loop labels/jumps are not in this excerpt;
 * the statement is closed by YSCALEYUV2PACKEDX_END.
 */
209 #define YSCALEYUV2PACKEDX \
211 "xor %%"REG_a", %%"REG_a" \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
233 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
234 "mov (%%"REG_d"), %%"REG_S" \n\t"\
235 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
236 "movq %%mm1, %%mm7 \n\t"\
239 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
240 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
241 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
242 "add $16, %%"REG_d" \n\t"\
243 "mov (%%"REG_d"), %%"REG_S" \n\t"\
244 "pmulhw %%mm0, %%mm2 \n\t"\
245 "pmulhw %%mm0, %%mm5 \n\t"\
246 "paddw %%mm2, %%mm1 \n\t"\
247 "paddw %%mm5, %%mm7 \n\t"\
248 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX_END: closes the inline-asm statement opened by the
 * YSCALEYUV2PACKEDX* prologues.  Operands: %0 = &c->redDither (context
 * base), %1-%3 = dummies, %4 = dest, %5 = dstW; clobbers the index,
 * filter-pointer and source registers used by the loops.
 */
251 #define YSCALEYUV2PACKEDX_END \
252 :: "r" (&c->redDither), \
253 "m" (dummy), "m" (dummy), "m" (dummy),\
254 "r" (dest), "m" (dstW) \
255 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * YSCALEYUV2PACKEDX_ACCURATE: high-precision packed-output prologue.
 * Same structure as YSCALEYUV2PACKEDX but accumulates two taps at a time
 * with pmaddwd into 32-bit accumulators (chroma first, then luma).
 * Because all eight MMX registers are needed for luma, the finished
 * chroma results are parked in the context at U_TEMP/V_TEMP and reloaded
 * into mm3/mm4 at the end, matching the register layout YSCALEYUV2RGBX
 * expects (Y1=mm1, Y2=mm7, U=mm3, V=mm4).
 * NOTE(review): asm opener and loop labels/jumps missing from excerpt.
 */
258 #define YSCALEYUV2PACKEDX_ACCURATE \
260 "xor %%"REG_a", %%"REG_a" \n\t"\
264 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
265 "mov (%%"REG_d"), %%"REG_S" \n\t"\
266 "pxor %%mm4, %%mm4 \n\t"\
267 "pxor %%mm5, %%mm5 \n\t"\
268 "pxor %%mm6, %%mm6 \n\t"\
269 "pxor %%mm7, %%mm7 \n\t"\
272 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
273 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
274 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
275 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
276 "movq %%mm0, %%mm3 \n\t"\
277 "punpcklwd %%mm1, %%mm0 \n\t"\
278 "punpckhwd %%mm1, %%mm3 \n\t"\
279 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
280 "pmaddwd %%mm1, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm3 \n\t"\
282 "paddd %%mm0, %%mm4 \n\t"\
283 "paddd %%mm3, %%mm5 \n\t"\
284 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
285 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
286 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
287 "test %%"REG_S", %%"REG_S" \n\t"\
288 "movq %%mm2, %%mm0 \n\t"\
289 "punpcklwd %%mm3, %%mm2 \n\t"\
290 "punpckhwd %%mm3, %%mm0 \n\t"\
291 "pmaddwd %%mm1, %%mm2 \n\t"\
292 "pmaddwd %%mm1, %%mm0 \n\t"\
293 "paddd %%mm2, %%mm6 \n\t"\
294 "paddd %%mm0, %%mm7 \n\t"\
296 "psrad $16, %%mm4 \n\t"\
297 "psrad $16, %%mm5 \n\t"\
298 "psrad $16, %%mm6 \n\t"\
299 "psrad $16, %%mm7 \n\t"\
300 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
301 "packssdw %%mm5, %%mm4 \n\t"\
302 "packssdw %%mm7, %%mm6 \n\t"\
303 "paddw %%mm0, %%mm4 \n\t"\
304 "paddw %%mm0, %%mm6 \n\t"\
305 "movq %%mm4, "U_TEMP"(%0) \n\t"\
306 "movq %%mm6, "V_TEMP"(%0) \n\t"\
308 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
309 "mov (%%"REG_d"), %%"REG_S" \n\t"\
310 "pxor %%mm1, %%mm1 \n\t"\
311 "pxor %%mm5, %%mm5 \n\t"\
312 "pxor %%mm7, %%mm7 \n\t"\
313 "pxor %%mm6, %%mm6 \n\t"\
316 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
317 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
318 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
319 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
320 "movq %%mm0, %%mm3 \n\t"\
321 "punpcklwd %%mm4, %%mm0 \n\t"\
322 "punpckhwd %%mm4, %%mm3 \n\t"\
323 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
324 "pmaddwd %%mm4, %%mm0 \n\t"\
325 "pmaddwd %%mm4, %%mm3 \n\t"\
326 "paddd %%mm0, %%mm1 \n\t"\
327 "paddd %%mm3, %%mm5 \n\t"\
328 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
329 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
330 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
331 "test %%"REG_S", %%"REG_S" \n\t"\
332 "movq %%mm2, %%mm0 \n\t"\
333 "punpcklwd %%mm3, %%mm2 \n\t"\
334 "punpckhwd %%mm3, %%mm0 \n\t"\
335 "pmaddwd %%mm4, %%mm2 \n\t"\
336 "pmaddwd %%mm4, %%mm0 \n\t"\
337 "paddd %%mm2, %%mm7 \n\t"\
338 "paddd %%mm0, %%mm6 \n\t"\
340 "psrad $16, %%mm1 \n\t"\
341 "psrad $16, %%mm5 \n\t"\
342 "psrad $16, %%mm7 \n\t"\
343 "psrad $16, %%mm6 \n\t"\
344 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
345 "packssdw %%mm5, %%mm1 \n\t"\
346 "packssdw %%mm6, %%mm7 \n\t"\
347 "paddw %%mm0, %%mm1 \n\t"\
348 "paddw %%mm0, %%mm7 \n\t"\
349 "movq "U_TEMP"(%0), %%mm3 \n\t"\
350 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/*
 * YSCALEYUV2RGBX: YUV -> RGB conversion core for the filtered path.
 * Input registers: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V; %0 points at the
 * context holding the per-conversion constants (U/V offsets, Y offset,
 * and the UB/UG/VG/VR/Y coefficient tables).  Subtracts the offsets,
 * multiplies with pmulhw, duplicates the chroma contributions across the
 * two luma quads (punpcklwd/punpckhwd with themselves) and adds.
 * Output: B packed in mm2, R in mm5, G in mm4, mm7 zeroed — the register
 * layout consumed by the WRITE* macros below.
 */
352 #define YSCALEYUV2RGBX \
353 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
354 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
355 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
356 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
357 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
358 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
359 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
361 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
362 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
363 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
364 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
365 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
366 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367 "paddw %%mm3, %%mm4 \n\t"\
368 "movq %%mm2, %%mm0 \n\t"\
369 "movq %%mm5, %%mm6 \n\t"\
370 "movq %%mm4, %%mm3 \n\t"\
371 "punpcklwd %%mm2, %%mm2 \n\t"\
372 "punpcklwd %%mm5, %%mm5 \n\t"\
373 "punpcklwd %%mm4, %%mm4 \n\t"\
374 "paddw %%mm1, %%mm2 \n\t"\
375 "paddw %%mm1, %%mm5 \n\t"\
376 "paddw %%mm1, %%mm4 \n\t"\
377 "punpckhwd %%mm0, %%mm0 \n\t"\
378 "punpckhwd %%mm6, %%mm6 \n\t"\
379 "punpckhwd %%mm3, %%mm3 \n\t"\
380 "paddw %%mm7, %%mm0 \n\t"\
381 "paddw %%mm7, %%mm6 \n\t"\
382 "paddw %%mm7, %%mm3 \n\t"\
383 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
384 "packuswb %%mm0, %%mm2 \n\t"\
385 "packuswb %%mm6, %%mm5 \n\t"\
386 "packuswb %%mm3, %%mm4 \n\t"\
387 "pxor %%mm7, %%mm7 \n\t"
/*
 * FULL_YSCALEYUV2RGB: bilinear blend of two luma lines (buf0/buf1, weight
 * yalpha1 in %6) and two chroma lines (uvbuf0/uvbuf1, weight uvalpha1 in
 * %7) followed by YUV -> RGB using the MANGLEd global constants (w80,
 * w400, yCoeff, ubCoeff, ugCoeff, vrCoeff, vgCoeff).  B, R and G each end
 * up packed in their own register (mm3, mm0, mm1 respectively).
 * NOTE(review): several interior lines of the original macro (loop label,
 * vCoeff lines between 417 and 420, etc.) are not in this excerpt.
 */
389 #define FULL_YSCALEYUV2RGB \
390 "pxor %%mm7, %%mm7 \n\t"\
391 "movd %6, %%mm6 \n\t" /*yalpha1*/\
392 "punpcklwd %%mm6, %%mm6 \n\t"\
393 "punpcklwd %%mm6, %%mm6 \n\t"\
394 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
395 "punpcklwd %%mm5, %%mm5 \n\t"\
396 "punpcklwd %%mm5, %%mm5 \n\t"\
397 "xor %%"REG_a", %%"REG_a" \n\t"\
400 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
401 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
402 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
403 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
404 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
405 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
406 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
408 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
409 "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
410 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
411 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
412 "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
413 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
414 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
415 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
416 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
417 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
420 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
421 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
422 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
423 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
424 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
425 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
426 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
429 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
430 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
431 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
432 "paddw %%mm1, %%mm3 \n\t" /* B*/\
433 "paddw %%mm1, %%mm0 \n\t" /* R*/\
434 "packuswb %%mm3, %%mm3 \n\t"\
436 "packuswb %%mm0, %%mm0 \n\t"\
437 "paddw %%mm4, %%mm2 \n\t"\
438 "paddw %%mm2, %%mm1 \n\t" /* G*/\
440 "packuswb %%mm1, %%mm1 \n\t"
/*
 * REAL_YSCALEYUV2PACKED(index, c): two-line blend for packed YUV output.
 * First pre-shifts the stored luma/chroma blend coefficients right by 3
 * (writing them back into the context at *_MMX_FILTER_OFFSET+8), then per
 * iteration blends uvbuf0/uvbuf1 chroma into mm3 (U) / mm4 (V) and
 * buf0/buf1 luma into mm1 / mm7 using pmulhw with the stored weights.
 * NOTE(review): loop label and branch are not in this excerpt.
 */
443 #define REAL_YSCALEYUV2PACKED(index, c) \
444 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
445 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
446 "psraw $3, %%mm0 \n\t"\
447 "psraw $3, %%mm1 \n\t"\
448 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
449 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
450 "xor "#index", "#index" \n\t"\
453 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
454 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
455 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
456 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
457 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
458 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
459 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
460 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
461 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
462 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
463 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
464 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
465 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
466 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
467 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
468 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
469 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
470 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
471 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
472 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
473 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
474 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
475 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
476 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
477 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/* expansion wrapper so 'index' and 'c' are macro-expanded before pasting */
479 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/*
 * REAL_YSCALEYUV2RGB(index, c): two-line blend plus YUV -> RGB in one
 * macro.  Blends uvbuf0/uvbuf1 chroma and buf0/buf1 luma with the weights
 * stored at *_MMX_FILTER_OFFSET+8 in the context #c, then applies the
 * same offset/coefficient conversion as YSCALEYUV2RGBX (tables addressed
 * relative to #c instead of %0).  Ends with B=mm2, R=mm5, G=mm4, mm7=0.
 * NOTE(review): loop label and branch are not in this excerpt.
 */
481 #define REAL_YSCALEYUV2RGB(index, c) \
482 "xor "#index", "#index" \n\t"\
485 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
486 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
487 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
488 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
489 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
490 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
491 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
492 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
493 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
494 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
495 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
496 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
497 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
498 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
499 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
500 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
501 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
502 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
503 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
504 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
505 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
506 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
507 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
508 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
509 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
510 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
511 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
512 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
513 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
514 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
515 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
516 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
517 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
518 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
519 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
520 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
521 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
522 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
523 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
524 "paddw %%mm3, %%mm4 \n\t"\
525 "movq %%mm2, %%mm0 \n\t"\
526 "movq %%mm5, %%mm6 \n\t"\
527 "movq %%mm4, %%mm3 \n\t"\
528 "punpcklwd %%mm2, %%mm2 \n\t"\
529 "punpcklwd %%mm5, %%mm5 \n\t"\
530 "punpcklwd %%mm4, %%mm4 \n\t"\
531 "paddw %%mm1, %%mm2 \n\t"\
532 "paddw %%mm1, %%mm5 \n\t"\
533 "paddw %%mm1, %%mm4 \n\t"\
534 "punpckhwd %%mm0, %%mm0 \n\t"\
535 "punpckhwd %%mm6, %%mm6 \n\t"\
536 "punpckhwd %%mm3, %%mm3 \n\t"\
537 "paddw %%mm7, %%mm0 \n\t"\
538 "paddw %%mm7, %%mm6 \n\t"\
539 "paddw %%mm7, %%mm3 \n\t"\
540 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
541 "packuswb %%mm0, %%mm2 \n\t"\
542 "packuswb %%mm6, %%mm5 \n\t"\
543 "packuswb %%mm3, %%mm4 \n\t"\
544 "pxor %%mm7, %%mm7 \n\t"
/* expansion wrapper so 'index' and 'c' are macro-expanded before pasting */
545 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-source (no vertical blend)
 * packed-YUV variant — reads only buf0/uvbuf0 and shifts everything
 * right by 7 into the mm3/mm4 (chroma) and mm1/mm7 (luma) registers.
 * NOTE(review): loop label and branch are not in this excerpt.
 */
547 #define REAL_YSCALEYUV2PACKED1(index, c) \
548 "xor "#index", "#index" \n\t"\
551 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
552 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
553 "psraw $7, %%mm3 \n\t" \
554 "psraw $7, %%mm4 \n\t" \
555 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
556 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
557 "psraw $7, %%mm1 \n\t" \
558 "psraw $7, %%mm7 \n\t" \
/* expansion wrapper so 'index' and 'c' are macro-expanded before pasting */
560 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * REAL_YSCALEYUV2RGB1(index, c): single-source (no vertical blend)
 * YUV -> RGB variant.  Reads only buf0/uvbuf0, shifts by 4 and applies
 * the same offset/coefficient conversion as REAL_YSCALEYUV2RGB.
 * Ends with B=mm2, R=mm5, G=mm4, mm7=0 for the WRITE* macros.
 * NOTE(review): loop label and branch are not in this excerpt.
 */
562 #define REAL_YSCALEYUV2RGB1(index, c) \
563 "xor "#index", "#index" \n\t"\
566 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
567 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
568 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
569 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
570 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
571 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
572 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
573 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
574 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
575 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
576 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
577 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
578 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
579 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
580 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
581 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
582 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
583 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
584 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
585 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
586 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
587 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
588 "paddw %%mm3, %%mm4 \n\t"\
589 "movq %%mm2, %%mm0 \n\t"\
590 "movq %%mm5, %%mm6 \n\t"\
591 "movq %%mm4, %%mm3 \n\t"\
592 "punpcklwd %%mm2, %%mm2 \n\t"\
593 "punpcklwd %%mm5, %%mm5 \n\t"\
594 "punpcklwd %%mm4, %%mm4 \n\t"\
595 "paddw %%mm1, %%mm2 \n\t"\
596 "paddw %%mm1, %%mm5 \n\t"\
597 "paddw %%mm1, %%mm4 \n\t"\
598 "punpckhwd %%mm0, %%mm0 \n\t"\
599 "punpckhwd %%mm6, %%mm6 \n\t"\
600 "punpckhwd %%mm3, %%mm3 \n\t"\
601 "paddw %%mm7, %%mm0 \n\t"\
602 "paddw %%mm7, %%mm6 \n\t"\
603 "paddw %%mm7, %%mm3 \n\t"\
604 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
605 "packuswb %%mm0, %%mm2 \n\t"\
606 "packuswb %%mm6, %%mm5 \n\t"\
607 "packuswb %%mm3, %%mm4 \n\t"\
608 "pxor %%mm7, %%mm7 \n\t"
/* expansion wrapper so 'index' and 'c' are macro-expanded before pasting */
609 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): packed-YUV variant that averages
 * the two chroma lines with equal weight (paddw then psrlw $8) while
 * luma still comes from buf0 only (>>7).
 * NOTE(review): loop label and branch are not in this excerpt.
 */
611 #define REAL_YSCALEYUV2PACKED1b(index, c) \
612 "xor "#index", "#index" \n\t"\
615 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
616 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
617 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
618 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
619 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
620 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
621 "psrlw $8, %%mm3 \n\t" \
622 "psrlw $8, %%mm4 \n\t" \
623 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
624 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
625 "psraw $7, %%mm1 \n\t" \
626 "psraw $7, %%mm7 \n\t"
/* expansion wrapper so 'index' and 'c' are macro-expanded before pasting */
627 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
629 // do vertical chrominance interpolation
/*
 * REAL_YSCALEYUV2RGB1b(index, c): YUV -> RGB variant with equal-weight
 * chroma averaging (paddw then psrlw $5 — flagged FIXME as a possible
 * overflow) and single-line luma (>>4).  Conversion then proceeds as in
 * REAL_YSCALEYUV2RGB1; ends with B=mm2, R=mm5, G=mm4, mm7=0.
 * NOTE(review): loop label and branch are not in this excerpt.
 */
630 #define REAL_YSCALEYUV2RGB1b(index, c) \
631 "xor "#index", "#index" \n\t"\
634 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
635 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
636 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
637 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
638 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
639 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
640 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
641 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
642 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
643 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
644 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
645 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
646 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
647 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
648 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
649 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
650 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
651 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
652 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
653 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
654 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
655 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
656 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
657 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
658 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
659 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
660 "paddw %%mm3, %%mm4 \n\t"\
661 "movq %%mm2, %%mm0 \n\t"\
662 "movq %%mm5, %%mm6 \n\t"\
663 "movq %%mm4, %%mm3 \n\t"\
664 "punpcklwd %%mm2, %%mm2 \n\t"\
665 "punpcklwd %%mm5, %%mm5 \n\t"\
666 "punpcklwd %%mm4, %%mm4 \n\t"\
667 "paddw %%mm1, %%mm2 \n\t"\
668 "paddw %%mm1, %%mm5 \n\t"\
669 "paddw %%mm1, %%mm4 \n\t"\
670 "punpckhwd %%mm0, %%mm0 \n\t"\
671 "punpckhwd %%mm6, %%mm6 \n\t"\
672 "punpckhwd %%mm3, %%mm3 \n\t"\
673 "paddw %%mm7, %%mm0 \n\t"\
674 "paddw %%mm7, %%mm6 \n\t"\
675 "paddw %%mm7, %%mm3 \n\t"\
676 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
677 "packuswb %%mm0, %%mm2 \n\t"\
678 "packuswb %%mm6, %%mm5 \n\t"\
679 "packuswb %%mm3, %%mm4 \n\t"\
680 "pxor %%mm7, %%mm7 \n\t"
/* expansion wrapper so 'index' and 'c' are macro-expanded before pasting */
681 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * REAL_WRITEBGR32(dst, dstw, index): store 8 pixels as 32-bit BGRX.
 * Input B=mm2, G=mm4, R=mm5, mm7=0; bytes are interleaved with punpck
 * into four 0RGB dword pairs and written with four MOVNTQs (32 bytes),
 * then the index is advanced by 8 and compared against dstw.
 * NOTE(review): the conditional jump after the cmp is not in this excerpt.
 */
683 #define REAL_WRITEBGR32(dst, dstw, index) \
684 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
685 "movq %%mm2, %%mm1 \n\t" /* B */\
686 "movq %%mm5, %%mm6 \n\t" /* R */\
687 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
688 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
689 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
690 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
691 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
692 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
693 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
694 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
695 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
696 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
698 MOVNTQ(%%mm0, (dst, index, 4))\
699 MOVNTQ(%%mm2, 8(dst, index, 4))\
700 MOVNTQ(%%mm1, 16(dst, index, 4))\
701 MOVNTQ(%%mm3, 24(dst, index, 4))\
703 "add $8, "#index" \n\t"\
704 "cmp "#dstw", "#index" \n\t"\
/* expansion wrapper so arguments are macro-expanded before pasting */
706 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/*
 * REAL_WRITERGB16(dst, dstw, index): store 8 pixels as RGB565.
 * Masks B and R to 5 bits (bF8) and G to 6 bits (bFC), shifts and ORs
 * the fields into 16-bit words, writing 16 bytes via two MOVNTQs.
 * NOTE(review): some interleaving lines and the loop branch of the
 * original macro are not visible in this excerpt.
 */
708 #define REAL_WRITERGB16(dst, dstw, index) \
709 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
710 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
711 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
712 "psrlq $3, %%mm2 \n\t"\
714 "movq %%mm2, %%mm1 \n\t"\
715 "movq %%mm4, %%mm3 \n\t"\
717 "punpcklbw %%mm7, %%mm3 \n\t"\
718 "punpcklbw %%mm5, %%mm2 \n\t"\
719 "punpckhbw %%mm7, %%mm4 \n\t"\
720 "punpckhbw %%mm5, %%mm1 \n\t"\
722 "psllq $3, %%mm3 \n\t"\
723 "psllq $3, %%mm4 \n\t"\
725 "por %%mm3, %%mm2 \n\t"\
726 "por %%mm4, %%mm1 \n\t"\
728 MOVNTQ(%%mm2, (dst, index, 2))\
729 MOVNTQ(%%mm1, 8(dst, index, 2))\
731 "add $8, "#index" \n\t"\
732 "cmp "#dstw", "#index" \n\t"\
/* expansion wrapper so arguments are macro-expanded before pasting */
734 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/*
 * REAL_WRITERGB15(dst, dstw, index): store 8 pixels as RGB555.
 * Like WRITERGB16 but all three channels are masked to 5 bits (bF8);
 * R gets an extra >>1 and the G shift is 2 instead of 3 to fit the
 * 5-5-5 layout.  16 bytes written via two MOVNTQs.
 * NOTE(review): some interior lines and the loop branch of the original
 * macro are not visible in this excerpt.
 */
736 #define REAL_WRITERGB15(dst, dstw, index) \
737 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
738 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
739 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
740 "psrlq $3, %%mm2 \n\t"\
741 "psrlq $1, %%mm5 \n\t"\
743 "movq %%mm2, %%mm1 \n\t"\
744 "movq %%mm4, %%mm3 \n\t"\
746 "punpcklbw %%mm7, %%mm3 \n\t"\
747 "punpcklbw %%mm5, %%mm2 \n\t"\
748 "punpckhbw %%mm7, %%mm4 \n\t"\
749 "punpckhbw %%mm5, %%mm1 \n\t"\
751 "psllq $2, %%mm3 \n\t"\
752 "psllq $2, %%mm4 \n\t"\
754 "por %%mm3, %%mm2 \n\t"\
755 "por %%mm4, %%mm1 \n\t"\
757 MOVNTQ(%%mm2, (dst, index, 2))\
758 MOVNTQ(%%mm1, 8(dst, index, 2))\
760 "add $8, "#index" \n\t"\
761 "cmp "#dstw", "#index" \n\t"\
/* expansion wrapper so arguments are macro-expanded before pasting */
763 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/*
 * WRITEBGR24OLD(dst, dstw, index): older 24-bit packed-RGB writer.
 * First interleaves B/G/R into four 0RGB dwords (as in WRITEBGR32),
 * then squeezes the pad bytes out with shift/mask/or sequences using the
 * bm0000* bit masks, producing three 8-byte stores (24 bytes = 8 pixels).
 * Advances dst by 24 and index by 8.  NOTE(review): the conditional jump
 * after the cmp is not in this excerpt.
 */
765 #define WRITEBGR24OLD(dst, dstw, index) \
766 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
767 "movq %%mm2, %%mm1 \n\t" /* B */\
768 "movq %%mm5, %%mm6 \n\t" /* R */\
769 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
770 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
771 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
772 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
773 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
774 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
775 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
776 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
777 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
778 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
780 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
781 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
782 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
783 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
784 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
785 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
786 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
787 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
789 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
790 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
791 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
792 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
793 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
794 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
795 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
796 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
797 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
798 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
799 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
800 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
801 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
803 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
804 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
805 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
806 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
807 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
808 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
809 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
810 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
812 MOVNTQ(%%mm0, (dst))\
813 MOVNTQ(%%mm2, 8(dst))\
814 MOVNTQ(%%mm3, 16(dst))\
815 "add $24, "#dst" \n\t"\
817 "add $8, "#index" \n\t"\
818 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMX(dst, dstw, index): plain-MMX 24-bit packed-RGB writer.
 * Interleaves B/G/R into four 0RGB dwords, mirrors each into 0RGBRGB0
 * form with psllq $40 + punpckhdq, then shifts/ORs adjacent quads into
 * three contiguous 8-byte stores (24 bytes = 8 pixels).  Advances dst by
 * 24 and index by 8.  NOTE(review): the conditional jump after the cmp
 * is not in this excerpt.
 */
821 #define WRITEBGR24MMX(dst, dstw, index) \
822 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
823 "movq %%mm2, %%mm1 \n\t" /* B */\
824 "movq %%mm5, %%mm6 \n\t" /* R */\
825 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
826 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
827 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
828 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
829 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
830 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
831 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
832 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
833 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
834 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
836 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
837 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
838 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
839 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
841 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
842 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
843 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
844 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
846 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
847 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
848 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
849 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
851 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
852 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
853 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
854 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
855 MOVNTQ(%%mm0, (dst))\
857 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
858 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
859 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
860 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
861 MOVNTQ(%%mm6, 8(dst))\
863 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
864 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
865 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
866 MOVNTQ(%%mm5, 16(dst))\
868 "add $24, "#dst" \n\t"\
870 "add $8, "#index" \n\t"\
871 "cmp "#dstw", "#index" \n\t"\
#define WRITEBGR24MMX2(dst, dstw, index) /* MMX2 packer: same contract as WRITEBGR24MMX but built on pshufw and the ff_M24A/B/C byte-select masks */ \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
    "add $24, "#dst" \n\t"\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index) /* MMX2 builds: pshufw-based variant */
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) /* non-MMX2 builds: shift/unpack variant */
/* Pack luma words (mm1/mm7 after packuswb) with chroma words (mm3/mm4 —
 * presumably U in mm3 and V in mm4, per the callers' register setup; TODO
 * confirm) into byte-interleaved YUYV and store 16 bytes per iteration.
 * index advances by 8 luma samples. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* Vertical filtering to planar YV12: applies lumFilter to the lumSrc lines
 * (dstW samples into dest) and chrFilter to the chrSrc lines (chrDstW samples
 * each into uDest and vDest, the V plane read at offset VOF in the source).
 * Dispatch: MMX accurate-rounding path when SWS_ACCURATE_RND is set, plain
 * MMX path otherwise, AltiVec or plain C when MMX is unavailable. */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
    if (c->flags & SWS_ACCURATE_RND){
        YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
        YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
        YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif /* HAVE_MMX */
/* Vertical filtering to NV12/NV21: no SIMD path here, simply forwards to the
 * generic C implementation (uDest receives the interleaved chroma plane;
 * dstFormat selects the UV byte order). */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
/* Unscaled (1-tap) vertical output to planar YV12: each output byte is
 * (src + 64) >> 7, clipped to 0..255.  The MMX path iterates over the three
 * planes via the src/dst/counter arrays (p==1 when uDest is NULL, so only the
 * luma plane is written); the C fallback below does the same arithmetic. */
static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
    long p= uDest ? 3 : 1;
    uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
    uint8_t *dst[3]= {dest, uDest, vDest};
    long counter[3] = {dstW, chrDstW, chrDstW};
    if (c->flags & SWS_ACCURATE_RND){
        YSCALEYUV2YV121_ACCURATE
        :: "r" (src[p]), "r" (dst[p] + counter[p]),
        :: "r" (src[p]), "r" (dst[p] + counter[p]),
    /* C fallback: luma plane */
    for (i=0; i<dstW; i++)
        int val= (lumSrc[i]+64)>>7;
    /* C fallback: chroma planes (V is read at offset VOFW in chrSrc) */
    for (i=0; i<chrDstW; i++)
        int u=(chrSrc[i ]+64)>>7;
        int v=(chrSrc[i + VOFW]+64)>>7;
        else if (u>255) u=255;
        else if (v>255) v=255;
1054 * vertical scale YV12 to RGB
/* Vertical filtering straight to a packed RGB/YUYV destination.  Two MMX
 * dispatch ladders over c->dstFormat: the first uses the accurate-rounding
 * YSCALEYUV2PACKEDX_ACCURATE prologue (SWS_ACCURATE_RND), the second the
 * faster YSCALEYUV2PACKEDX.  RGB555/565 paths add ordered dither constants
 * before packing; BGR24 computes dest + 3*index into REG_c because the
 * 24-bit writers need a byte pointer.  Falls back to AltiVec (for the listed
 * formats only) or the generic C routine. */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
    if (c->flags & SWS_ACCURATE_RND){
        switch(c->dstFormat){
        YSCALEYUV2PACKEDX_ACCURATE
        WRITEBGR32(%4, %5, %%REGa)
        YSCALEYUV2PACKEDX_END
        YSCALEYUV2PACKEDX_ACCURATE
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
        "add %4, %%"REG_c" \n\t"
        WRITEBGR24(%%REGc, %5, %%REGa)
        :: "r" (&c->redDither),
        "m" (dummy), "m" (dummy), "m" (dummy),
        "r" (dest), "m" (dstW)
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    case PIX_FMT_RGB555:
        YSCALEYUV2PACKEDX_ACCURATE
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
        "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
        "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
        WRITERGB15(%4, %5, %%REGa)
        YSCALEYUV2PACKEDX_END
    case PIX_FMT_RGB565:
        YSCALEYUV2PACKEDX_ACCURATE
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
        "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
        "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
        WRITERGB16(%4, %5, %%REGa)
        YSCALEYUV2PACKEDX_END
    case PIX_FMT_YUYV422:
        YSCALEYUV2PACKEDX_ACCURATE
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "psraw $3, %%mm3 \n\t"
        "psraw $3, %%mm4 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        WRITEYUY2(%4, %5, %%REGa)
        YSCALEYUV2PACKEDX_END
    switch(c->dstFormat)
        WRITEBGR32(%4, %5, %%REGa)
        YSCALEYUV2PACKEDX_END
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
        "add %4, %%"REG_c" \n\t"
        WRITEBGR24(%%REGc, %5, %%REGa)
        :: "r" (&c->redDither),
        "m" (dummy), "m" (dummy), "m" (dummy),
        "r" (dest), "m" (dstW)
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    case PIX_FMT_RGB555:
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
        "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
        "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
        WRITERGB15(%4, %5, %%REGa)
        YSCALEYUV2PACKEDX_END
    case PIX_FMT_RGB565:
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
        "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
        "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
        WRITERGB16(%4, %5, %%REGa)
        YSCALEYUV2PACKEDX_END
    case PIX_FMT_YUYV422:
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "psraw $3, %%mm3 \n\t"
        "psraw $3, %%mm4 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        WRITEYUY2(%4, %5, %%REGa)
        YSCALEYUV2PACKEDX_END
#endif /* HAVE_MMX */
    /* The following list of supported dstFormat values should
       match what's found in the body of altivec_yuv2packedX() */
    if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
        c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
        altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                             chrFilter, chrSrc, chrFilterSize,
    yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                   chrFilter, chrSrc, chrFilterSize,
1202 * vertical bilinear scale YV12 to RGB
/* Vertical bilinear interpolation between two source lines (buf0/buf1 for
 * luma, uvbuf0/uvbuf1 for chroma) with conversion to a packed destination.
 * yalpha/uvalpha (0..4096) weight buf1; yalpha1/uvalpha1 are the buf0
 * complements.  The SWS_FULL_CHR_H_INT branch and the yuvtab_* loops below
 * it appear to be legacy full-chroma C paths (surrounding preprocessor
 * conditionals are not visible here — NOTE(review): likely guarded out).
 * The main MMX switch saves/restores REG_b and REG_BP around each asm block
 * because both are used as data registers; 8280(%5) is DSTW_OFFSET within
 * the SwsContext (see note below).  Ends in the generic C macro fallback. */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
    int yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    if (flags&SWS_FULL_CHR_H_INT)
        "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
        "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
        "movq %%mm3, %%mm1 \n\t"
        "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
        "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
        MOVNTQ(%%mm3, (%4, %%REGa, 4))
        MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
        "add $4, %%"REG_a" \n\t"
        "cmp %5, %%"REG_a" \n\t"
        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
        "m" (yalpha1), "m" (uvalpha1)
        "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
        "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
        "movq %%mm3, %%mm1 \n\t"
        "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
        "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
        "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
        "psrlq $8, %%mm3 \n\t" // GR0BGR00
        "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
        "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
        "por %%mm2, %%mm3 \n\t" // BGRBGR00
        "movq %%mm1, %%mm2 \n\t"
        "psllq $48, %%mm1 \n\t" // 000000BG
        "por %%mm1, %%mm3 \n\t" // BGRBGRBG
        "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
        "psrld $16, %%mm2 \n\t" // R000R000
        "psrlq $24, %%mm1 \n\t" // 0BGR0000
        "por %%mm2, %%mm1 \n\t" // RBGRR000
        "mov %4, %%"REG_b" \n\t"
        "add %%"REG_a", %%"REG_b" \n\t"
        "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
        "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
        "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
        "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
        "add $4, %%"REG_a" \n\t"
        "cmp %5, %%"REG_a" \n\t"
        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
        "m" (yalpha1), "m" (uvalpha1)
        : "%"REG_a, "%"REG_b
    case PIX_FMT_BGR555:
        "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
        "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
        "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
        "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
        "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
        "psrlw $3, %%mm3 \n\t"
        "psllw $2, %%mm1 \n\t"
        "psllw $7, %%mm0 \n\t"
        "pand "MANGLE(g15Mask)", %%mm1 \n\t"
        "pand "MANGLE(r15Mask)", %%mm0 \n\t"
        "por %%mm3, %%mm1 \n\t"
        "por %%mm1, %%mm0 \n\t"
        MOVNTQ(%%mm0, (%4, %%REGa, 2))
        "add $4, %%"REG_a" \n\t"
        "cmp %5, %%"REG_a" \n\t"
        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
        "m" (yalpha1), "m" (uvalpha1)
    case PIX_FMT_BGR565:
        "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
        "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
        "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
        "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
        "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
        "psrlw $3, %%mm3 \n\t"
        "psllw $3, %%mm1 \n\t"
        "psllw $8, %%mm0 \n\t"
        "pand "MANGLE(g16Mask)", %%mm1 \n\t"
        "pand "MANGLE(r16Mask)", %%mm0 \n\t"
        "por %%mm3, %%mm1 \n\t"
        "por %%mm1, %%mm0 \n\t"
        MOVNTQ(%%mm0, (%4, %%REGa, 2))
        "add $4, %%"REG_a" \n\t"
        "cmp %5, %%"REG_a" \n\t"
        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
        "m" (yalpha1), "m" (uvalpha1)
#endif /* HAVE_MMX */
    if (dstFormat==PIX_FMT_RGB32)
#ifdef WORDS_BIGENDIAN
        for (i=0;i<dstW;i++){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
            int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
            int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
            dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
            dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
            dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
    else if (dstFormat==PIX_FMT_BGR24)
        for (i=0;i<dstW;i++){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
            int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
            int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
            dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
            dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
            dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
    else if (dstFormat==PIX_FMT_BGR565)
        for (i=0;i<dstW;i++){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
            int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
            int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
            ((uint16_t*)dest)[i] =
                clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                clip_table16r[(Y + yuvtab_3343[V]) >>13];
    else if (dstFormat==PIX_FMT_BGR555)
        for (i=0;i<dstW;i++){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
            int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
            int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
            ((uint16_t*)dest)[i] =
                clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                clip_table15r[(Y + yuvtab_3343[V]) >>13];
    switch(c->dstFormat)
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
    case PIX_FMT_RGB555:
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
        "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
        "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
    case PIX_FMT_RGB565:
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
        "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
        "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
    case PIX_FMT_YUYV422:
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1518 * YV12 to RGB without scaling or interpolating
/* YV12 to packed RGB/YUYV without vertical scaling or luma interpolation
 * (single luma line buf0).  When uvalpha < 2048 the chroma of uvbuf0 is used
 * as-is (the faster, half-pixel-shifted YSCALEYUV2*1 variants); otherwise the
 * two chroma lines are averaged (the *1b variants).  Full-chroma requests are
 * delegated to yuv2packed2 with yalpha=0.  Same REG_b/REG_BP save/restore
 * pattern and 8280==DSTW_OFFSET note as in yuv2packed2 above. */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
    const int yalpha1=0;
    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...
    if (flags&SWS_FULL_CHR_H_INT)
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
    if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB1(%%REGBP, %5)
        WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB1(%%REGBP, %5)
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
    case PIX_FMT_RGB555:
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB1(%%REGBP, %5)
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
        "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
        "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
    case PIX_FMT_RGB565:
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB1(%%REGBP, %5)
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
        "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
        "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
    case PIX_FMT_YUYV422:
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED1(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB1b(%%REGBP, %5)
        WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB1b(%%REGBP, %5)
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
    case PIX_FMT_RGB555:
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB1b(%%REGBP, %5)
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
        "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
        "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
    case PIX_FMT_RGB565:
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB1b(%%REGBP, %5)
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
        "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
        "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
    case PIX_FMT_YUYV422:
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED1b(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
#endif /* HAVE_MMX */
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1723 //FIXME yuy2* can read up to 7 samples too much
/* Extract the luma (even) bytes of a YUYV line into dst.  MMX path masks
 * with bm01010101 and packs 16 source bytes to 8 luma bytes per iteration;
 * the negative-index trick (REG_a counts up from -width to 0) lets the loop
 * end on a plain add.  C fallback loops per pixel. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
    "movq "MANGLE(bm01010101)", %%mm2 \n\t"
    "mov %0, %%"REG_a" \n\t"
    "movq (%1, %%"REG_a",2), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
    "pand %%mm2, %%mm0 \n\t"
    "pand %%mm2, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, (%2, %%"REG_a") \n\t"
    "add $8, %%"REG_a" \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    for (i=0; i<width; i++)
/* De-interleave the chroma bytes of a YUYV line: U (byte 1 of each 4-byte
 * group) into dstU, V (byte 3) into dstV, 4 samples per MMX iteration.
 * src2 is unused; the assert documents that callers pass src1 == src2. */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
    "movq "MANGLE(bm01010101)", %%mm4 \n\t"
    "mov %0, %%"REG_a" \n\t"
    "movq (%1, %%"REG_a",4), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "psrlw $8, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "pand %%mm4, %%mm1 \n\t"
    "packuswb %%mm0, %%mm0 \n\t"
    "packuswb %%mm1, %%mm1 \n\t"
    "movd %%mm0, (%3, %%"REG_a") \n\t"
    "movd %%mm1, (%2, %%"REG_a") \n\t"
    "add $4, %%"REG_a" \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    for (i=0; i<width; i++)
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    assert(src1 == src2);
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extract the luma (odd) bytes of a UYVY line into dst; the MMX path uses
 * a word right-shift instead of the bm01010101 mask used for YUYV. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
    "mov %0, %%"REG_a" \n\t"
    "movq (%1, %%"REG_a",2), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "psrlw $8, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, (%2, %%"REG_a") \n\t"
    "add $8, %%"REG_a" \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    for (i=0; i<width; i++)
/* De-interleave the chroma bytes of a UYVY line: U (byte 0 of each 4-byte
 * group) into dstU, V (byte 2) into dstV.  Mirror of yuy2ToUV with the
 * initial shift replaced by a mask.  Callers pass src1 == src2 (asserted). */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
    "movq "MANGLE(bm01010101)", %%mm4 \n\t"
    "mov %0, %%"REG_a" \n\t"
    "movq (%1, %%"REG_a",4), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
    "pand %%mm4, %%mm0 \n\t"
    "pand %%mm4, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "pand %%mm4, %%mm1 \n\t"
    "packuswb %%mm0, %%mm0 \n\t"
    "packuswb %%mm1, %%mm1 \n\t"
    "movd %%mm0, (%3, %%"REG_a") \n\t"
    "movd %%mm1, (%2, %%"REG_a") \n\t"
    "add $4, %%"REG_a" \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    for (i=0; i<width; i++)
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    assert(src1 == src2);
/* BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S):
 * generates a packed-RGB -> luma converter.  Each channel is extracted with
 * (pixel >> sh?) & mask?, weighted by the (pre-shifted) RY/GY/BY
 * coefficients, rounded with 33<<(S-1) and scaled down by S bits.
 * Instantiated below for 32/16/15-bit BGR and RGB layouts. */
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width)\
    for (i=0; i<width; i++)\
        int b= (((type*)src)[i]>>shb)&maskb;\
        int g= (((type*)src)[i]>>shg)&maskg;\
        int r= (((type*)src)[i]>>shr)&maskr;\
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
/* BGR2UV(...): generates two packed-RGB -> chroma converters per layout:
 * RENAME(name) samples every pixel, while RENAME(name_half) averages each
 * horizontal pixel pair (the masked pix0+pix1 sums keep the per-channel
 * carries inside mask|2*mask before shifting) and divides by 2 via the extra
 * +1 in the final shift.  RU/GU/BU and RV/GV/BV are the (pre-shifted) U and
 * V weights; rounding bias is 257<<(S-1) resp. 257<<S. */
#define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
    for (i=0; i<width; i++)\
        int b= (((type*)src)[i]&maskb)>>shb;\
        int g= (((type*)src)[i]&maskg)>>shg;\
        int r= (((type*)src)[i]&maskr)>>shr;\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
    for (i=0; i<width; i++)\
        int pix0= ((type*)src)[2*i+0];\
        int pix1= ((type*)src)[2*i+1];\
        int g= (pix0&maskg)+(pix1&maskg);\
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
/* Shared MMX core for 24-bit RGB/BGR -> luma: loads the coefficient pair
 * matching srcFormat into mm5/mm6, then per iteration converts 4 packed
 * 3-byte pixels (two pmaddwd per pixel pair), adds ff_bgr24toYOffset,
 * shifts down by 15 and stores 4 luma bytes. */
static inline void bgr24ToY_mmx(uint8_t *dst, uint8_t *src, long width, int srcFormat)
    if(srcFormat == PIX_FMT_BGR24){
        "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
        "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
        "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
        "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
    "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
    "mov %2, %%"REG_a" \n\t"
    "pxor %%mm7, %%mm7 \n\t"
    PREFETCH" 64(%0) \n\t"
    "movd (%0), %%mm0 \n\t"
    "movd 2(%0), %%mm1 \n\t"
    "movd 6(%0), %%mm2 \n\t"
    "movd 8(%0), %%mm3 \n\t"
    "punpcklbw %%mm7, %%mm0 \n\t"
    "punpcklbw %%mm7, %%mm1 \n\t"
    "punpcklbw %%mm7, %%mm2 \n\t"
    "punpcklbw %%mm7, %%mm3 \n\t"
    "pmaddwd %%mm5, %%mm0 \n\t"
    "pmaddwd %%mm6, %%mm1 \n\t"
    "pmaddwd %%mm5, %%mm2 \n\t"
    "pmaddwd %%mm6, %%mm3 \n\t"
    "paddd %%mm1, %%mm0 \n\t"
    "paddd %%mm3, %%mm2 \n\t"
    "paddd %%mm4, %%mm0 \n\t"
    "paddd %%mm4, %%mm2 \n\t"
    "psrad $15, %%mm0 \n\t"
    "psrad $15, %%mm2 \n\t"
    "packssdw %%mm2, %%mm0 \n\t"
    "packuswb %%mm0, %%mm0 \n\t"
    "movd %%mm0, (%1, %%"REG_a") \n\t"
    "add $4, %%"REG_a" \n\t"
    : "r" (dst+width), "g" (-width)
/* Shared MMX core for 24-bit RGB/BGR -> chroma: the four pmaddwd coefficient
 * quadwords are taken from the ff_bgr24toUV table row selected by srcFormat
 * (operand %4; the last quad is preloaded into mm6).  Per iteration it
 * computes 4 U and 4 V values, biases them with ff_bgr24toUVOffset, shifts
 * by 15 and stores 4 bytes to each of dstU (%1) and dstV (%2). */
static inline void bgr24ToUV_mmx(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
    "movq 24+%4, %%mm6 \n\t"
    "mov %3, %%"REG_a" \n\t"
    "pxor %%mm7, %%mm7 \n\t"
    PREFETCH" 64(%0) \n\t"
    "movd (%0), %%mm0 \n\t"
    "movd 2(%0), %%mm1 \n\t"
    "punpcklbw %%mm7, %%mm0 \n\t"
    "punpcklbw %%mm7, %%mm1 \n\t"
    "movq %%mm0, %%mm2 \n\t"
    "movq %%mm1, %%mm3 \n\t"
    "pmaddwd %4, %%mm0 \n\t"
    "pmaddwd 8+%4, %%mm1 \n\t"
    "pmaddwd 16+%4, %%mm2 \n\t"
    "pmaddwd %%mm6, %%mm3 \n\t"
    "paddd %%mm1, %%mm0 \n\t"
    "paddd %%mm3, %%mm2 \n\t"
    "movd 6(%0), %%mm1 \n\t"
    "movd 8(%0), %%mm3 \n\t"
    "punpcklbw %%mm7, %%mm1 \n\t"
    "punpcklbw %%mm7, %%mm3 \n\t"
    "movq %%mm1, %%mm4 \n\t"
    "movq %%mm3, %%mm5 \n\t"
    "pmaddwd %4, %%mm1 \n\t"
    "pmaddwd 8+%4, %%mm3 \n\t"
    "pmaddwd 16+%4, %%mm4 \n\t"
    "pmaddwd %%mm6, %%mm5 \n\t"
    "paddd %%mm3, %%mm1 \n\t"
    "paddd %%mm5, %%mm4 \n\t"
    "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
    "paddd %%mm3, %%mm0 \n\t"
    "paddd %%mm3, %%mm2 \n\t"
    "paddd %%mm3, %%mm1 \n\t"
    "paddd %%mm3, %%mm4 \n\t"
    "psrad $15, %%mm0 \n\t"
    "psrad $15, %%mm2 \n\t"
    "psrad $15, %%mm1 \n\t"
    "psrad $15, %%mm4 \n\t"
    "packssdw %%mm1, %%mm0 \n\t"
    "packssdw %%mm4, %%mm2 \n\t"
    "packuswb %%mm0, %%mm0 \n\t"
    "packuswb %%mm2, %%mm2 \n\t"
    "movd %%mm0, (%1, %%"REG_a") \n\t"
    "movd %%mm2, (%2, %%"REG_a") \n\t"
    "add $4, %%"REG_a" \n\t"
    : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
/* BGR24 -> luma: MMX builds delegate to the shared bgr24ToY_mmx core;
 * the C fallback applies the RY/GY/BY weights with rounding. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
    bgr24ToY_mmx(dst, src, width, PIX_FMT_BGR24);
    for (i=0; i<width; i++)
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
#endif /* HAVE_MMX */
/* BGR24 -> chroma: MMX builds use bgr24ToUV_mmx; the C fallback reads the
 * B,G,R bytes of each 3-byte pixel and applies the U/V weight sets.
 * src2 is unused; callers pass src1 == src2 (asserted). */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
    bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_BGR24);
    for (i=0; i<width; i++)
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
#endif /* HAVE_MMX */
    assert(src1 == src2);
/* BGR24 -> chroma with 2:1 horizontal subsampling: sums each pair of
 * adjacent pixels per channel and halves via the extra +1 in the shift. */
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
    for (i=0; i<width; i++)
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    assert(src1 == src2);
/* RGB24 -> luma: same as bgr24ToY but with the R/B byte order swapped
 * (the MMX core selects the rgb coefficient set via PIX_FMT_RGB24). */
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width)
    bgr24ToY_mmx(dst, src, width, PIX_FMT_RGB24);
    for (i=0; i<width; i++)
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/* RGB24 -> chroma: mirror of bgr24ToUV with R and B bytes swapped in the
 * C fallback and the rgb coefficient row selected in the MMX core. */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
    bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_RGB24);
    for (i=0; i<width; i++)
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
/* RGB24 -> chroma with 2:1 horizontal subsampling (pair averaging),
 * R/B order swapped relative to bgr24ToUV_half. */
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
    for (i=0; i<width; i++)
        int r= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int b= src1[6*i + 2] + src1[6*i + 5];
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
/* Paletted source -> luma: look each index up in pal and take the low byte
 * (presumably the Y component of the pre-converted palette — TODO confirm). */
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
    for (i=0; i<width; i++)
        dst[i]= pal[d] & 0xFF;
/* Paletted source -> chroma: per-pixel palette lookup; the U/V bytes are
 * extracted from the looked-up value p (extraction lines not visible here).
 * Callers pass src1 == src2 (asserted). */
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
    assert(src1 == src2);
    for (i=0; i<width; i++)
        int p= pal[src1[i]];
/* 1-bit monochrome -> luma: expands each source bit to a 0/255 byte;
 * MONOWHITE input is inverted first so a set bit always means white. */
static inline void RENAME(mono2Y)(uint8_t *dst, uint8_t *src, long width, int format)
    for (i=0; i<width/8; i++){
        int d= format == PIX_FMT_MONOBLACK ? src[i] : ~src[i];
        dst[i]= ((d>>j)&1)*255;
2156 // bilinear / bicubic scaling
/* Generic horizontal FIR scaler: for each output sample i, accumulate
 * filterSize source bytes starting at filterPos[i], weighted by
 * filter[filterSize*i + j], and store a 15-bit clipped result in dst.
 * Three MMX fast paths (filterSize 4, 8, generic) plus an AltiVec and a
 * plain-C fallback; the #if/#elif/#else guard lines are elided from this
 * view. The asm blocks depend on exact register/statement order -- do not
 * reorder. */
2157 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2158                                    int16_t *filter, int16_t *filterPos, long filterSize)
2161     assert(filterSize % 4 == 0 && filterSize>0);
/* ---- MMX path, filterSize == 4: two output samples per iteration. ---- */
2162     if (filterSize==4) // Always true for upscaling, sometimes for down, too.
/* counter runs from -2*dstW up to 0; filterPos is rebased so that
 * (filterPos + counter/2) indexes from 0. */
2164         long counter= -2*dstW;
2166         filterPos-= counter/2;
/* ebx is callee-saved (and PIC register on some ABIs): preserve it manually. */
2170         "push %%"REG_b"                 \n\t"
2172         "pxor %%mm7, %%mm7              \n\t"
2173         "movq "MANGLE(w02)", %%mm6      \n\t"
2174         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2175         "mov %%"REG_a", %%"REG_BP"      \n\t"
2178         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2179         "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2180         "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2181         "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2182         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2183         "movd (%3, %%"REG_b"), %%mm2    \n\t"
/* Widen the 4 source bytes to words, then multiply-accumulate against the
 * 4 filter taps (pmaddwd produces pairwise dword sums). */
2184         "punpcklbw %%mm7, %%mm0         \n\t"
2185         "punpcklbw %%mm7, %%mm2         \n\t"
2186         "pmaddwd %%mm1, %%mm0           \n\t"
2187         "pmaddwd %%mm2, %%mm3           \n\t"
2188         "psrad $8, %%mm0                \n\t"
2189         "psrad $8, %%mm3                \n\t"
2190         "packssdw %%mm3, %%mm0          \n\t"
2191         "pmaddwd %%mm6, %%mm0           \n\t"
2192         "packssdw %%mm0, %%mm0          \n\t"
2193         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2194         "add $4, %%"REG_BP"             \n\t"
2197         "pop %%"REG_BP"                 \n\t"
2199         "pop %%"REG_b"                  \n\t"
2202         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* ---- MMX path, filterSize == 8: same as above with two tap groups. ---- */
2208     else if (filterSize==8)
2210         long counter= -2*dstW;
2212         filterPos-= counter/2;
2216         "push %%"REG_b"                 \n\t"
2218         "pxor %%mm7, %%mm7              \n\t"
2219         "movq "MANGLE(w02)", %%mm6      \n\t"
2220         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2221         "mov %%"REG_a", %%"REG_BP"      \n\t"
2224         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2225         "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2226         "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2227         "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2228         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2229         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2230         "punpcklbw %%mm7, %%mm0         \n\t"
2231         "punpcklbw %%mm7, %%mm2         \n\t"
2232         "pmaddwd %%mm1, %%mm0           \n\t"
2233         "pmaddwd %%mm2, %%mm3           \n\t"
/* Second group of 4 taps (bytes 4..7), accumulated into mm0/mm3. */
2235         "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2236         "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2237         "movd 4(%3, %%"REG_a"), %%mm4   \n\t"
2238         "movd 4(%3, %%"REG_b"), %%mm2   \n\t"
2239         "punpcklbw %%mm7, %%mm4         \n\t"
2240         "punpcklbw %%mm7, %%mm2         \n\t"
2241         "pmaddwd %%mm1, %%mm4           \n\t"
2242         "pmaddwd %%mm2, %%mm5           \n\t"
2243         "paddd %%mm4, %%mm0             \n\t"
2244         "paddd %%mm5, %%mm3             \n\t"
2246         "psrad $8, %%mm0                \n\t"
2247         "psrad $8, %%mm3                \n\t"
2248         "packssdw %%mm3, %%mm0          \n\t"
2249         "pmaddwd %%mm6, %%mm0           \n\t"
2250         "packssdw %%mm0, %%mm0          \n\t"
2251         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2252         "add $4, %%"REG_BP"             \n\t"
2255         "pop %%"REG_BP"                 \n\t"
2257         "pop %%"REG_b"                  \n\t"
2260         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* ---- MMX path, arbitrary filterSize: inner loop walks the taps 8 at a
 * time until the filter pointer reaches 'offset' (= src+filterSize, used
 * purely as a loop bound register value). ---- */
2268         uint8_t *offset = src+filterSize;
2269         long counter= -2*dstW;
2270         //filter-= counter*filterSize/2;
2271         filterPos-= counter/2;
2274         "pxor %%mm7, %%mm7              \n\t"
2275         "movq "MANGLE(w02)", %%mm6      \n\t"
2278         "mov %2, %%"REG_c"              \n\t"
2279         "movzwl (%%"REG_c", %0), %%eax  \n\t"
2280         "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2281         "mov %5, %%"REG_c"              \n\t"
2282         "pxor %%mm4, %%mm4              \n\t"
2283         "pxor %%mm5, %%mm5              \n\t"
2285         "movq (%1), %%mm1               \n\t"
2286         "movq (%1, %6), %%mm3           \n\t"
2287         "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2288         "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2289         "punpcklbw %%mm7, %%mm0         \n\t"
2290         "punpcklbw %%mm7, %%mm2         \n\t"
2291         "pmaddwd %%mm1, %%mm0           \n\t"
2292         "pmaddwd %%mm2, %%mm3           \n\t"
2293         "paddd %%mm3, %%mm5             \n\t"
2294         "paddd %%mm0, %%mm4             \n\t"
2296         "add $4, %%"REG_c"              \n\t"
2297         "cmp %4, %%"REG_c"              \n\t"
2300         "psrad $8, %%mm4                \n\t"
2301         "psrad $8, %%mm5                \n\t"
2302         "packssdw %%mm5, %%mm4          \n\t"
2303         "pmaddwd %%mm6, %%mm4           \n\t"
2304         "packssdw %%mm4, %%mm4          \n\t"
2305         "mov %3, %%"REG_a"              \n\t"
2306         "movd %%mm4, (%%"REG_a", %0)    \n\t"
2310         : "+r" (counter), "+r" (filter)
2311         : "m" (filterPos), "m" (dst), "m"(offset),
2312           "m" (src), "r" (filterSize*2)
2313         : "%"REG_a, "%"REG_c, "%"REG_d
/* ---- AltiVec path (guard elided). ---- */
2318     hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
/* ---- Portable C fallback: straightforward dot product per output sample. ---- */
2321     for (i=0; i<dstW; i++)
2324         int srcPos= filterPos[i];
2326         //printf("filterPos: %d\n", filterPos[i]);
2327         for (j=0; j<filterSize; j++)
2329             //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2330             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2332         //filter += hFilterSize;
/* Results are clipped to 15 bits because downstream vertical filters
 * assume that range. */
2333         dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2336 #endif /* HAVE_ALTIVEC */
2337 #endif /* HAVE_MMX */
2339 // *** horizontal scale Y line to temp buffer
/* Horizontally scale one luma line into the 16-bit temp buffer dst.
 * Step 1: if the source is not already planar 8-bit luma, convert the line
 * into formatConvBuffer with the matching *ToY routine and scale from there.
 * Step 2: scale via (a) the generic hScale FIR, (b) the runtime-generated
 * MMX2 "funny" code, or (c) a fast-bilinear asm/C path, depending on flags
 * and canMMX2BeUsed. Step 3: optional JPEG<->MPEG range conversion.
 * NOTE(review): many #if/#else guard lines are elided in this view; the
 * asm statement boundaries are approximate. */
2340 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2341                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2342                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2343                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2344                                    int32_t *mmx2FilterPos, uint8_t *pal)
/* --- input format -> 8-bit gray line conversion dispatch --- */
2346     if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2348         RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2349         src= formatConvBuffer;
2351     else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2353         RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2354         src= formatConvBuffer;
2356     else if (srcFormat==PIX_FMT_RGB32)
2358         RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2359         src= formatConvBuffer;
2361     else if (srcFormat==PIX_FMT_RGB32_1)
/* _1 variants carry the pad byte first; ALT32_CORR skips it. */
2363         RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
2364         src= formatConvBuffer;
2366     else if (srcFormat==PIX_FMT_BGR24)
2368         RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2369         src= formatConvBuffer;
2371     else if (srcFormat==PIX_FMT_BGR565)
2373         RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2374         src= formatConvBuffer;
2376     else if (srcFormat==PIX_FMT_BGR555)
2378         RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2379         src= formatConvBuffer;
2381     else if (srcFormat==PIX_FMT_BGR32)
2383         RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2384         src= formatConvBuffer;
2386     else if (srcFormat==PIX_FMT_BGR32_1)
2388         RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
2389         src= formatConvBuffer;
2391     else if (srcFormat==PIX_FMT_RGB24)
2393         RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2394         src= formatConvBuffer;
2396     else if (srcFormat==PIX_FMT_RGB565)
2398         RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2399         src= formatConvBuffer;
2401     else if (srcFormat==PIX_FMT_RGB555)
2403         RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2404         src= formatConvBuffer;
2406     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2408         RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
2409         src= formatConvBuffer;
2411     else if (srcFormat==PIX_FMT_MONOBLACK ||srcFormat==PIX_FMT_MONOWHITE)
2413         RENAME(mono2Y)(formatConvBuffer, src, srcW, srcFormat);
2414         src= formatConvBuffer;
/* --- scaling path selection --- */
2418     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2419     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)
2421     if (!(flags&SWS_FAST_BILINEAR))
2424     RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2426     else // fast bilinear upscale / crap downscale
2428 #if defined(ARCH_X86)
/* MMX2 "funny code" path: jumps into runtime-generated scaling code via
 * mmx2Filter/mmx2FilterPos; ebx is saved to memory because it may be the
 * PIC register. */
2432         uint64_t ebxsave __attribute__((aligned(8)));
2438         "mov %%"REG_b", %5              \n\t"
2440         "pxor %%mm7, %%mm7              \n\t"
2441         "mov %0, %%"REG_c"              \n\t"
2442         "mov %1, %%"REG_D"              \n\t"
2443         "mov %2, %%"REG_d"              \n\t"
2444         "mov %3, %%"REG_b"              \n\t"
2445         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2446         PREFETCH" (%%"REG_c")           \n\t"
2447         PREFETCH" 32(%%"REG_c")         \n\t"
2448         PREFETCH" 64(%%"REG_c")         \n\t"
/* 64-bit variant of the block-advance glue between generated-code calls. */
2452 #define FUNNY_Y_CODE \
2453         "movl (%%"REG_b"), %%esi        \n\t"\
2455         "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2456         "add %%"REG_S", %%"REG_c"       \n\t"\
2457         "add %%"REG_a", %%"REG_D"       \n\t"\
2458         "xor %%"REG_a", %%"REG_a"       \n\t"\
/* 32-bit variant. */
2462 #define FUNNY_Y_CODE \
2463         "movl (%%"REG_b"), %%esi        \n\t"\
2465         "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2466         "add %%"REG_a", %%"REG_D"       \n\t"\
2467         "xor %%"REG_a", %%"REG_a"       \n\t"\
2469 #endif /* ARCH_X86_64 */
2481         "mov %5, %%"REG_b"              \n\t"
2483         :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2488         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* The generated code can read past srcW-1; patch the tail samples. */
2493         for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2497 #endif /* HAVE_MMX2 */
2498         long xInc_shr16 = xInc >> 16;
2499         uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
/* Plain-x86 fast bilinear: 16.16 fixed point position; two output samples
 * per loop iteration, interpolating src[xx] and src[xx+1]. */
2502         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2503         "xor %%"REG_d", %%"REG_d"       \n\t" // xx
2504         "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2507         "movzbl  (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2508         "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2509         "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2510         "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2511         "shll $16, %%edi                \n\t"
2512         "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2513         "mov %1, %%"REG_D"              \n\t"
2514         "shrl $9, %%esi                 \n\t"
2515         "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2516         "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
2517         "adc %3, %%"REG_d"              \n\t" //xx+= xInc>>8 + carry
2519         "movzbl  (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2520         "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2521         "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2522         "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2523         "shll $16, %%edi                \n\t"
2524         "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2525         "mov %1, %%"REG_D"              \n\t"
2526         "shrl $9, %%esi                 \n\t"
2527         "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2528         "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
2529         "adc %3, %%"REG_d"              \n\t" //xx+= xInc>>8 + carry
2532         "add $2, %%"REG_a"              \n\t"
2533         "cmp %2, %%"REG_a"              \n\t"
2537         :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2538         : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2541     } //if MMX2 can't be used
/* Portable C fast-bilinear fallback (non-x86). */
2545     unsigned int xpos=0;
2546     for (i=0;i<dstWidth;i++)
2548         register unsigned int xx=xpos>>16;
2549         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2550         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2553 #endif /* defined(ARCH_X86) */
/* Luma range conversion on the already-scaled 16-bit samples. */
2556     if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2558         //FIXME all pal and rgb srcFormats could do this convertion as well
2559         //FIXME all scalers more complex than bilinear could do half of this transform
/* JPEG (full) -> MPEG (limited) range. */
2561         for (i=0; i<dstWidth; i++)
2562             dst[i]= (dst[i]*14071 + 33561947)>>14;
/* MPEG -> JPEG; FFMIN avoids overflow past full-scale white. */
2564         for (i=0; i<dstWidth; i++)
2565             dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
/* Horizontally scale one chroma line pair (U into dst[0..], V into
 * dst[VOFW..]). Mirrors hyscale: first convert non-planar sources into
 * formatConvBuffer (U at offset 0, V at offset VOFW), choosing the *_half
 * variants when the chroma is horizontally subsampled; then scale with
 * hScale, the generated MMX2 code, or the fast-bilinear path; finally
 * apply optional range conversion to both planes.
 * NOTE(review): guard #if/#else lines are elided in this view. */
2570 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2571                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2572                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2573                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2574                                    int32_t *mmx2FilterPos, uint8_t *pal)
/* --- input format -> planar U/V conversion dispatch --- */
2576     if (srcFormat==PIX_FMT_YUYV422)
2578         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2579         src1= formatConvBuffer;
2580         src2= formatConvBuffer+VOFW;
2582     else if (srcFormat==PIX_FMT_UYVY422)
2584         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2585         src1= formatConvBuffer;
2586         src2= formatConvBuffer+VOFW;
2588     else if (srcFormat==PIX_FMT_RGB32)
2590         if(c->chrSrcHSubSample)
2591             RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2593             RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2594         src1= formatConvBuffer;
2595         src2= formatConvBuffer+VOFW;
2597     else if (srcFormat==PIX_FMT_RGB32_1)
2599         if(c->chrSrcHSubSample)
2600             RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2602             RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2603         src1= formatConvBuffer;
2604         src2= formatConvBuffer+VOFW;
2606     else if (srcFormat==PIX_FMT_BGR24)
2608         if(c->chrSrcHSubSample)
2609             RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2611             RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2612         src1= formatConvBuffer;
2613         src2= formatConvBuffer+VOFW;
2615     else if (srcFormat==PIX_FMT_BGR565)
2617         if(c->chrSrcHSubSample)
2618             RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2620             RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2621         src1= formatConvBuffer;
2622         src2= formatConvBuffer+VOFW;
2624     else if (srcFormat==PIX_FMT_BGR555)
2626         if(c->chrSrcHSubSample)
2627             RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2629             RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2630         src1= formatConvBuffer;
2631         src2= formatConvBuffer+VOFW;
2633     else if (srcFormat==PIX_FMT_BGR32)
2635         if(c->chrSrcHSubSample)
2636             RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2638             RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2639         src1= formatConvBuffer;
2640         src2= formatConvBuffer+VOFW;
2642     else if (srcFormat==PIX_FMT_BGR32_1)
2644         if(c->chrSrcHSubSample)
2645             RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2647             RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2648         src1= formatConvBuffer;
2649         src2= formatConvBuffer+VOFW;
2651     else if (srcFormat==PIX_FMT_RGB24)
2653         if(c->chrSrcHSubSample)
2654             RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2656             RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2657         src1= formatConvBuffer;
2658         src2= formatConvBuffer+VOFW;
2660     else if (srcFormat==PIX_FMT_RGB565)
2662         if(c->chrSrcHSubSample)
2663             RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2665             RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2666         src1= formatConvBuffer;
2667         src2= formatConvBuffer+VOFW;
2669     else if (srcFormat==PIX_FMT_RGB555)
2671         if(c->chrSrcHSubSample)
2672             RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2674             RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2675         src1= formatConvBuffer;
2676         src2= formatConvBuffer+VOFW;
/* Gray/mono sources have no chroma: nothing to do (body elided). */
2678     else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2682     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2684         RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2685         src1= formatConvBuffer;
2686         src2= formatConvBuffer+VOFW;
/* --- scaling path selection (same structure as hyscale) --- */
2690     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2691     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)
2693     if (!(flags&SWS_FAST_BILINEAR))
2696     RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2697     RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2699     else // fast bilinear upscale / crap downscale
2701 #if defined(ARCH_X86)
2705         uint64_t ebxsave __attribute__((aligned(8)));
2711         "mov %%"REG_b", %6              \n\t"
2713         "pxor %%mm7, %%mm7              \n\t"
2714         "mov %0, %%"REG_c"              \n\t"
2715         "mov %1, %%"REG_D"              \n\t"
2716         "mov %2, %%"REG_d"              \n\t"
2717         "mov %3, %%"REG_b"              \n\t"
2718         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2719         PREFETCH" (%%"REG_c")           \n\t"
2720         PREFETCH" 32(%%"REG_c")         \n\t"
2721         PREFETCH" 64(%%"REG_c")         \n\t"
/* Glue between generated-code invocations, 64-bit variant. */
2725 #define FUNNY_UV_CODE \
2726         "movl (%%"REG_b"), %%esi        \n\t"\
2728         "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2729         "add %%"REG_S", %%"REG_c"       \n\t"\
2730         "add %%"REG_a", %%"REG_D"       \n\t"\
2731         "xor %%"REG_a", %%"REG_a"       \n\t"\
/* 32-bit variant. */
2735 #define FUNNY_UV_CODE \
2736         "movl (%%"REG_b"), %%esi        \n\t"\
2738         "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2739         "add %%"REG_a", %%"REG_D"       \n\t"\
2740         "xor %%"REG_a", %%"REG_a"       \n\t"\
2742 #endif /* ARCH_X86_64 */
/* Second pass: run the generated code again for the V plane (src2),
 * writing at dst+VOF. */
2748         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2749         "mov %5, %%"REG_c"              \n\t" // src
2750         "mov %1, %%"REG_D"              \n\t" // buf1
2751         "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2752         PREFETCH" (%%"REG_c")           \n\t"
2753         PREFETCH" 32(%%"REG_c")         \n\t"
2754         PREFETCH" 64(%%"REG_c")         \n\t"
2762         "mov %6, %%"REG_b"              \n\t"
2764         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2765            "m" (funnyUVCode), "m" (src2)
2769         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* Patch tail samples that the generated code would read past srcW-1. */
2774         for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2776             //printf("%d %d %d\n", dstWidth, i, srcW);
2777             dst[i] = src1[srcW-1]*128;
2778             dst[i+VOFW] = src2[srcW-1]*128;
2783 #endif /* HAVE_MMX2 */
2784         long xInc_shr16 = (long) (xInc >> 16);
2785         uint16_t xInc_mask = xInc & 0xffff;
/* Plain-x86 fast bilinear: interpolates U and V in the same iteration,
 * sharing one 16.16 position counter. */
2787         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2788         "xor %%"REG_d", %%"REG_d"       \n\t" // xx
2789         "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2792         "mov %0, %%"REG_S"              \n\t"
2793         "movzbl  (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2794         "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2795         "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2796         "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2797         "shll $16, %%edi                \n\t"
2798         "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2799         "mov %1, %%"REG_D"              \n\t"
2800         "shrl $9, %%esi                 \n\t"
2801         "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2803         "movzbl  (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2804         "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2805         "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2806         "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2807         "shll $16, %%edi                \n\t"
2808         "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2809         "mov %1, %%"REG_D"              \n\t"
2810         "shrl $9, %%esi                 \n\t"
2811         "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2813         "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
2814         "adc %3, %%"REG_d"              \n\t" //xx+= xInc>>8 + carry
2815         "add $1, %%"REG_a"              \n\t"
2816         "cmp %2, %%"REG_a"              \n\t"
2819 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2820    which is needed to support GCC 4.0. */
2821 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2822         :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2824         :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2827         : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2830     } //if MMX2 can't be used
/* Portable C fast-bilinear fallback; two equivalent formulations are
 * present (one likely under an elided #ifdef -- confirm against full file). */
2834     unsigned int xpos=0;
2835     for (i=0;i<dstWidth;i++)
2837         register unsigned int xx=xpos>>16;
2838         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2839         dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2840         dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2842         dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2843         dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2847 #endif /* defined(ARCH_X86) */
/* Chroma range conversion, applied to both planes. */
2849     if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2851         //FIXME all pal and rgb srcFormats could do this convertion as well
2852         //FIXME all scalers more complex than bilinear could do half of this transform
2854         for (i=0; i<dstWidth; i++){
2855             dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
2856             dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2859         for (i=0; i<dstWidth; i++){
2860             dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2861             dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2867 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2868 int srcSliceH, uint8_t* dst[], int dstStride[]){
2870 /* load a few things into local vars to make the code more readable? and faster */
2871 const int srcW= c->srcW;
2872 const int dstW= c->dstW;
2873 const int dstH= c->dstH;
2874 const int chrDstW= c->chrDstW;
2875 const int chrSrcW= c->chrSrcW;
2876 const int lumXInc= c->lumXInc;
2877 const int chrXInc= c->chrXInc;
2878 const int dstFormat= c->dstFormat;
2879 const int srcFormat= c->srcFormat;
2880 const int flags= c->flags;
2881 const int canMMX2BeUsed= c->canMMX2BeUsed;
2882 int16_t *vLumFilterPos= c->vLumFilterPos;
2883 int16_t *vChrFilterPos= c->vChrFilterPos;
2884 int16_t *hLumFilterPos= c->hLumFilterPos;
2885 int16_t *hChrFilterPos= c->hChrFilterPos;
2886 int16_t *vLumFilter= c->vLumFilter;
2887 int16_t *vChrFilter= c->vChrFilter;
2888 int16_t *hLumFilter= c->hLumFilter;
2889 int16_t *hChrFilter= c->hChrFilter;
2890 int32_t *lumMmxFilter= c->lumMmxFilter;
2891 int32_t *chrMmxFilter= c->chrMmxFilter;
2892 const int vLumFilterSize= c->vLumFilterSize;
2893 const int vChrFilterSize= c->vChrFilterSize;
2894 const int hLumFilterSize= c->hLumFilterSize;
2895 const int hChrFilterSize= c->hChrFilterSize;
2896 int16_t **lumPixBuf= c->lumPixBuf;
2897 int16_t **chrPixBuf= c->chrPixBuf;
2898 const int vLumBufSize= c->vLumBufSize;
2899 const int vChrBufSize= c->vChrBufSize;
2900 uint8_t *funnyYCode= c->funnyYCode;
2901 uint8_t *funnyUVCode= c->funnyUVCode;
2902 uint8_t *formatConvBuffer= c->formatConvBuffer;
2903 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2904 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2908 /* vars which will change and which we need to store back in the context */
2910 int lumBufIndex= c->lumBufIndex;
2911 int chrBufIndex= c->chrBufIndex;
2912 int lastInLumBuf= c->lastInLumBuf;
2913 int lastInChrBuf= c->lastInChrBuf;
2915 if (isPacked(c->srcFormat)){
2922 srcStride[2]= srcStride[0];
2924 srcStride[1]<<= c->vChrDrop;
2925 srcStride[2]<<= c->vChrDrop;
2927 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2928 // (int)dst[0], (int)dst[1], (int)dst[2]);
2930 #if 0 //self test FIXME move to a vfilter or something
2932 static volatile int i=0;
2934 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2935 selfTest(src, srcStride, c->srcW, c->srcH);
2940 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2941 //dstStride[0],dstStride[1],dstStride[2]);
2943 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2945 static int firstTime=1; //FIXME move this into the context perhaps
2946 if (flags & SWS_PRINT_INFO && firstTime)
2948 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2949 " ->cannot do aligned memory accesses anymore\n");
2954 /* Note the user might start scaling the picture in the middle so this
2955 will not get executed. This is not really intended but works
2956 currently, so people might do it. */
2967 for (;dstY < dstH; dstY++){
2968 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2969 const int chrDstY= dstY>>c->chrDstVSubSample;
2970 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2971 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2973 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2974 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2975 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2976 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2978 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2979 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2980 //handle holes (FAST_BILINEAR & weird filters)
2981 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2982 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2983 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2984 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2985 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2987 // Do we have enough lines in this slice to output the dstY line
2988 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2990 //Do horizontal scaling
2991 while(lastInLumBuf < lastLumSrcY)
2993 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2995 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2996 assert(lumBufIndex < 2*vLumBufSize);
2997 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2998 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2999 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3000 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3001 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3002 funnyYCode, c->srcFormat, formatConvBuffer,
3003 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3006 while(lastInChrBuf < lastChrSrcY)
3008 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3009 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3011 assert(chrBufIndex < 2*vChrBufSize);
3012 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3013 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3014 //FIXME replace parameters through context struct (some at least)
3016 if (!(isGray(srcFormat) || isGray(dstFormat)))
3017 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3018 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3019 funnyUVCode, c->srcFormat, formatConvBuffer,
3020 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3023 //wrap buf index around to stay inside the ring buffer
3024 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3025 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3027 else // not enough lines left in this slice -> load the rest in the buffer
3029 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3030 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3031 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3032 vChrBufSize, vLumBufSize);*/
3034 //Do horizontal scaling
3035 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3037 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3039 assert(lumBufIndex < 2*vLumBufSize);
3040 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3041 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3042 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3043 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3044 funnyYCode, c->srcFormat, formatConvBuffer,
3045 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3048 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3050 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3051 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3053 assert(chrBufIndex < 2*vChrBufSize);
3054 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3055 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3057 if (!(isGray(srcFormat) || isGray(dstFormat)))
3058 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3059 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3060 funnyUVCode, c->srcFormat, formatConvBuffer,
3061 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3064 //wrap buf index around to stay inside the ring buffer
3065 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3066 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3067 break; //we can't output a dstY line so let's try with the next slice
3071 b5Dither= ff_dither8[dstY&1];
3072 g6Dither= ff_dither4[dstY&1];
3073 g5Dither= ff_dither8[dstY&1];
3074 r5Dither= ff_dither8[(dstY+1)&1];
3078 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3079 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3082 if (flags & SWS_ACCURATE_RND){
3083 int s= APCK_SIZE / 8;
3084 for (i=0; i<vLumFilterSize; i+=2){
3085 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
3086 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
3087 lumMmxFilter[s*i+APCK_COEF/4 ]=
3088 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
3089 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3091 for (i=0; i<vChrFilterSize; i+=2){
3092 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
3093 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
3094 chrMmxFilter[s*i+APCK_COEF/4 ]=
3095 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
3096 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3099 for (i=0; i<vLumFilterSize; i++)
3101 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3102 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3103 lumMmxFilter[4*i+2]=
3104 lumMmxFilter[4*i+3]=
3105 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3107 for (i=0; i<vChrFilterSize; i++)
3109 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3110 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3111 chrMmxFilter[4*i+2]=
3112 chrMmxFilter[4*i+3]=
3113 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3117 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3118 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3119 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3120 RENAME(yuv2nv12X)(c,
3121 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3122 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3123 dest, uDest, dstW, chrDstW, dstFormat);
3125 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
3127 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3128 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3129 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3131 int16_t *lumBuf = lumPixBuf[0];
3132 int16_t *chrBuf= chrPixBuf[0];
3133 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3138 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3139 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3140 dest, uDest, vDest, dstW, chrDstW);
3145 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3146 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3147 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3149 int chrAlpha= vChrFilter[2*dstY+1];
3150 if(flags & SWS_FULL_CHR_H_INT){
3151 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
3152 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3153 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3156 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3157 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3160 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3162 int lumAlpha= vLumFilter[2*dstY+1];
3163 int chrAlpha= vChrFilter[2*dstY+1];
3165 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3167 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3168 if(flags & SWS_FULL_CHR_H_INT){
3169 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
3170 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3171 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3174 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3175 dest, dstW, lumAlpha, chrAlpha, dstY);
3180 if(flags & SWS_FULL_CHR_H_INT){
3182 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3183 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3186 RENAME(yuv2packedX)(c,
3187 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3188 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3194 else // hmm looks like we can't use MMX here without overwriting this array's tail
3196 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3197 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3198 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3199 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3200 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3202 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3203 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3204 dest, uDest, dstW, chrDstW, dstFormat);
3206 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
3208 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3209 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3211 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3212 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3213 dest, uDest, vDest, dstW, chrDstW);
3217 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3218 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3219 if(flags & SWS_FULL_CHR_H_INT){
3221 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3222 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3226 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3227 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3235 asm volatile(SFENCE:::"memory");
3236 asm volatile(EMMS:::"memory");
3238 /* store changed local vars back in the context */
3240 c->lumBufIndex= lumBufIndex;
3241 c->chrBufIndex= chrBufIndex;
3242 c->lastInLumBuf= lastInLumBuf;
3243 c->lastInChrBuf= lastInChrBuf;
3245 return dstY - lastDstY;