2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
/*
 * CPU-feature dependent instruction mnemonics that get spliced into the
 * inline-asm strings below.  Each macro has several alternative definitions;
 * only some of the selecting preprocessor conditionals are visible in this
 * view, so the exact condition guarding each alternative must be checked
 * against the full file.
 */
/* Prefetch hints: first alternative (guarding condition not visible here). */
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif defined (HAVE_MMX2)
/* HAVE_MMX2 alternative. */
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
/* Fallback: expands to " # nop", which GNU as treats as a comment on x86, so no instruction is emitted. */
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
/* Store fence: either a real "sfence" or the same comment-only placeholder. */
51 #define SFENCE "sfence"
53 #define SFENCE " # nop"
/* Packed byte average: "pavgb", or "pavgusb" when HAVE_3DNOW selects the second alternative. Arguments are stringified as-is. */
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
/* 64-bit store: non-temporal "movntq", or plain "movq" in the other alternative. */
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
/* Extra level of indirection so that macro arguments are expanded before REAL_MOVNTQ stringifies them with '#'. */
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
70 #include "swscale_altivec_template.c"
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Inline-asm text that applies a multi-tap filter to 16-bit source lines and
 * stores 8 output bytes per iteration.
 *
 *   x      - string: byte displacement added to every source address
 *   offset - string: displacement from operand %0 to the filter list
 *   dest   - becomes asm operand %1 (output pointer)
 *   width  - becomes asm operand %2 (compared against the running index)
 *
 * Operand %0 is &c->redDither; VROUNDER_OFFSET(%0) supplies the initial
 * accumulator value.  The filter list is walked in 16-byte steps: a source
 * pointer at +0 and a coefficient quadword at +8; a zero source pointer is
 * what the "test" instruction checks for.  Accumulators mm3/mm4 are shifted
 * right by 3, packed with unsigned saturation and written to (%1, REG_a).
 * Clobbers REG_a, REG_d and REG_S.
 *
 * NOTE(review): the "asm volatile(" wrapper, the loop label and the
 * conditional branches are not present in this view of the file.
 */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Same parameters, operands (%0 = &c->redDither, %1 = dest, %2 = width) and
 * clobbers as YSCALEYUV2YV12X, but the filter sum is kept in 32-bit lanes:
 * two source lines are interleaved (punpcklwd/punpckhwd) and multiplied with
 * a coefficient pair via pmaddwd, accumulating into mm4..mm7.  Filter entries
 * are addressed through APCK_PTR2, APCK_COEF and APCK_SIZE (defined outside
 * this view).  After the taps are consumed the sums are shifted right by 16,
 * packed to words, the VROUNDER_OFFSET(%0) value is added, and the result is
 * shifted right by 3 and packed to unsigned bytes before the store.
 *
 * NOTE(review): the asm wrapper, loop label and branch instructions are not
 * present in this view of the file.
 */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV121
 *
 * Asm text for the unfiltered case: loads two quadwords of 16-bit samples
 * from (%0, REG_a, 2), shifts each word right by 7 (arithmetic), packs them
 * to 8 unsigned-saturated bytes and stores them at (%1, REG_a).  REG_a is
 * initialised from operand %2 and advanced by 8 per pass.
 *
 * NOTE(review): the loop label, the branch and the operand list that define
 * %0..%2 are not present in this view; this truncating variant adds no
 * rounding term before the shift.
 */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
/*
 * YSCALEYUV2YV121_ACCURATE
 *
 * Rounding variant of YSCALEYUV2YV121.  mm7 is set to 64 in every 16-bit
 * lane (all-ones from pcmpeqw, >>15 gives 1, <<6 gives 64) and added to each
 * sample before the arithmetic shift right by 7, i.e. half of the divisor is
 * added so the result is rounded rather than truncated.
 *
 * NOTE(review): the loop label, the branch and the operand list that define
 * %0..%2 are not present in this view.
 */
184 #define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddw %%mm7, %%mm0 \n\t"\
194 "paddw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/*
 * YSCALEYUV2PACKEDX
 *
 * Opening part of an asm statement (closed by YSCALEYUV2PACKEDX_END) that
 * filters chroma and luma for packed output.  Operand %0 is the context
 * base used with CHR_MMX_FILTER_OFFSET, LUM_MMX_FILTER_OFFSET and
 * VROUNDER_OFFSET.
 *
 * Chroma pass: accumulates U into mm3 and V into mm4 (V is read at
 * displacement VOF from the U pointer), both seeded from VROUNDER_OFFSET(%0).
 * Luma pass:   accumulates two quadwords of Y into mm1 and mm7, seeded the
 * same way.
 * Both passes walk 16-byte filter entries (pointer at +0, coefficients at +8)
 * and test the next pointer for zero.
 *
 * NOTE(review): the asm wrapper, labels and branch instructions are not
 * present in this view of the file.
 */
209 #define YSCALEYUV2PACKEDX \
211 "xor %%"REG_a", %%"REG_a" \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
233 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
234 "mov (%%"REG_d"), %%"REG_S" \n\t"\
235 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
236 "movq %%mm1, %%mm7 \n\t"\
239 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
240 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
241 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
242 "add $16, %%"REG_d" \n\t"\
243 "mov (%%"REG_d"), %%"REG_S" \n\t"\
244 "pmulhw %%mm0, %%mm2 \n\t"\
245 "pmulhw %%mm0, %%mm5 \n\t"\
246 "paddw %%mm2, %%mm1 \n\t"\
247 "paddw %%mm5, %%mm7 \n\t"\
248 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX_END
 *
 * Operand and clobber lists that close an asm statement opened by one of the
 * YSCALEYUV2PACKEDX macros:
 *   %0      = &c->redDither (register)
 *   %1..%3  = "dummy" memory operands, which keep dest and dstW at %4 and %5
 *   %4      = dest (register)
 *   %5      = dstW (memory)
 * Clobbers REG_a, REG_d and REG_S.  The expansion site must have "c",
 * "dummy", "dest" and "dstW" in scope.
 */
251 #define YSCALEYUV2PACKEDX_END \
252 :: "r" (&c->redDither), \
253 "m" (dummy), "m" (dummy), "m" (dummy),\
254 "r" (dest), "m" (dstW) \
255 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * YSCALEYUV2PACKEDX_ACCURATE
 *
 * Higher-precision counterpart of YSCALEYUV2PACKEDX.  Filter taps are
 * consumed two at a time: samples from two source lines are interleaved and
 * multiplied with a coefficient pair using pmaddwd, accumulating 32-bit sums.
 * Filter entries are addressed through APCK_PTR2, APCK_COEF and APCK_SIZE
 * (defined outside this view).
 *
 * Chroma pass: sums in mm4..mm7 are shifted right by 16, packed to words,
 * offset by VROUNDER_OFFSET(%0) and spilled to U_TEMP(%0) / V_TEMP(%0)
 * because the luma pass needs the registers.
 * Luma pass:   sums in mm1, mm5, mm7, mm6 are reduced the same way.
 *
 * Register state at the end of the visible text:
 *   mm1 = Y1, mm7 = Y2, mm3 = U (reloaded), mm4 = V (reloaded).
 *
 * NOTE(review): the asm wrapper, labels and branch instructions are not
 * present in this view of the file.
 */
258 #define YSCALEYUV2PACKEDX_ACCURATE \
260 "xor %%"REG_a", %%"REG_a" \n\t"\
264 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
265 "mov (%%"REG_d"), %%"REG_S" \n\t"\
266 "pxor %%mm4, %%mm4 \n\t"\
267 "pxor %%mm5, %%mm5 \n\t"\
268 "pxor %%mm6, %%mm6 \n\t"\
269 "pxor %%mm7, %%mm7 \n\t"\
272 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
273 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
274 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
275 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
276 "movq %%mm0, %%mm3 \n\t"\
277 "punpcklwd %%mm1, %%mm0 \n\t"\
278 "punpckhwd %%mm1, %%mm3 \n\t"\
279 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
280 "pmaddwd %%mm1, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm3 \n\t"\
282 "paddd %%mm0, %%mm4 \n\t"\
283 "paddd %%mm3, %%mm5 \n\t"\
284 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
285 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
286 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
287 "test %%"REG_S", %%"REG_S" \n\t"\
288 "movq %%mm2, %%mm0 \n\t"\
289 "punpcklwd %%mm3, %%mm2 \n\t"\
290 "punpckhwd %%mm3, %%mm0 \n\t"\
291 "pmaddwd %%mm1, %%mm2 \n\t"\
292 "pmaddwd %%mm1, %%mm0 \n\t"\
293 "paddd %%mm2, %%mm6 \n\t"\
294 "paddd %%mm0, %%mm7 \n\t"\
296 "psrad $16, %%mm4 \n\t"\
297 "psrad $16, %%mm5 \n\t"\
298 "psrad $16, %%mm6 \n\t"\
299 "psrad $16, %%mm7 \n\t"\
300 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
301 "packssdw %%mm5, %%mm4 \n\t"\
302 "packssdw %%mm7, %%mm6 \n\t"\
303 "paddw %%mm0, %%mm4 \n\t"\
304 "paddw %%mm0, %%mm6 \n\t"\
305 "movq %%mm4, "U_TEMP"(%0) \n\t"\
306 "movq %%mm6, "V_TEMP"(%0) \n\t"\
308 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
309 "mov (%%"REG_d"), %%"REG_S" \n\t"\
310 "pxor %%mm1, %%mm1 \n\t"\
311 "pxor %%mm5, %%mm5 \n\t"\
312 "pxor %%mm7, %%mm7 \n\t"\
313 "pxor %%mm6, %%mm6 \n\t"\
316 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
317 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
318 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
319 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
320 "movq %%mm0, %%mm3 \n\t"\
321 "punpcklwd %%mm4, %%mm0 \n\t"\
322 "punpckhwd %%mm4, %%mm3 \n\t"\
323 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
324 "pmaddwd %%mm4, %%mm0 \n\t"\
325 "pmaddwd %%mm4, %%mm3 \n\t"\
326 "paddd %%mm0, %%mm1 \n\t"\
327 "paddd %%mm3, %%mm5 \n\t"\
328 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
329 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
330 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
331 "test %%"REG_S", %%"REG_S" \n\t"\
332 "movq %%mm2, %%mm0 \n\t"\
333 "punpcklwd %%mm3, %%mm2 \n\t"\
334 "punpckhwd %%mm3, %%mm0 \n\t"\
335 "pmaddwd %%mm4, %%mm2 \n\t"\
336 "pmaddwd %%mm4, %%mm0 \n\t"\
337 "paddd %%mm2, %%mm7 \n\t"\
338 "paddd %%mm0, %%mm6 \n\t"\
340 "psrad $16, %%mm1 \n\t"\
341 "psrad $16, %%mm5 \n\t"\
342 "psrad $16, %%mm7 \n\t"\
343 "psrad $16, %%mm6 \n\t"\
344 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
345 "packssdw %%mm5, %%mm1 \n\t"\
346 "packssdw %%mm6, %%mm7 \n\t"\
347 "paddw %%mm0, %%mm1 \n\t"\
348 "paddw %%mm0, %%mm7 \n\t"\
349 "movq "U_TEMP"(%0), %%mm3 \n\t"\
350 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/*
 * YSCALEYUV2RGBX
 *
 * YUV -> RGB colour conversion stage that follows YSCALEYUV2PACKEDX*.
 * Input registers:  mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V (16-bit lanes).
 * All offsets and coefficients (U_OFFSET, V_OFFSET, Y_OFFSET, UG_COEFF,
 * VG_COEFF, UB_COEFF, VR_COEFF, Y_COEFF) are read relative to operand %0.
 *
 * Each chroma contribution is duplicated across two neighbouring lanes
 * (punpcklwd/punpckhwd of a register with itself) before the luma is added.
 * Output registers after the final packuswb: mm2 = B, mm4 = G, mm5 = R as
 * unsigned-saturated bytes, and mm7 cleared to zero.
 */
352 #define YSCALEYUV2RGBX \
353 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
354 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
355 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
356 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
357 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
358 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
359 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
361 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
362 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
363 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
364 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
365 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
366 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367 "paddw %%mm3, %%mm4 \n\t"\
368 "movq %%mm2, %%mm0 \n\t"\
369 "movq %%mm5, %%mm6 \n\t"\
370 "movq %%mm4, %%mm3 \n\t"\
371 "punpcklwd %%mm2, %%mm2 \n\t"\
372 "punpcklwd %%mm5, %%mm5 \n\t"\
373 "punpcklwd %%mm4, %%mm4 \n\t"\
374 "paddw %%mm1, %%mm2 \n\t"\
375 "paddw %%mm1, %%mm5 \n\t"\
376 "paddw %%mm1, %%mm4 \n\t"\
377 "punpckhwd %%mm0, %%mm0 \n\t"\
378 "punpckhwd %%mm6, %%mm6 \n\t"\
379 "punpckhwd %%mm3, %%mm3 \n\t"\
380 "paddw %%mm7, %%mm0 \n\t"\
381 "paddw %%mm7, %%mm6 \n\t"\
382 "paddw %%mm7, %%mm3 \n\t"\
383 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
384 "packuswb %%mm0, %%mm2 \n\t"\
385 "packuswb %%mm6, %%mm5 \n\t"\
386 "packuswb %%mm3, %%mm4 \n\t"\
387 "pxor %%mm7, %%mm7 \n\t"
/*
 * FULL_YSCALEYUV2RGB
 *
 * Two-line blend plus YUV -> RGB conversion without chroma duplication.
 * Operands as used by the visible text:
 *   %0 = buf0, %1 = buf1      (luma lines)
 *   %2 = uvbuf0, %3 = uvbuf1  (chroma lines; V is read at displacement VOF)
 *   %6 = yalpha1, %7 = uvalpha1 (blend weights, broadcast to all four lanes)
 * Each blend is computed as ((line0 - line1) * alpha >> 16) + (line1 >> 4).
 * Conversion constants (w80, w400, yCoeff, ubCoeff, ugCoeff, vrCoeff,
 * vgCoeff) are referenced through MANGLE(), i.e. as global symbols rather
 * than context-relative offsets.
 *
 * Output registers: mm3 = B, mm0 = R, mm1 = G, each packed with itself to
 * unsigned-saturated bytes; mm7 is zeroed at the start.
 *
 * NOTE(review): the loop label and some lines of the original macro are not
 * present in this view of the file.
 */
389 #define FULL_YSCALEYUV2RGB \
390 "pxor %%mm7, %%mm7 \n\t"\
391 "movd %6, %%mm6 \n\t" /*yalpha1*/\
392 "punpcklwd %%mm6, %%mm6 \n\t"\
393 "punpcklwd %%mm6, %%mm6 \n\t"\
394 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
395 "punpcklwd %%mm5, %%mm5 \n\t"\
396 "punpcklwd %%mm5, %%mm5 \n\t"\
397 "xor %%"REG_a", %%"REG_a" \n\t"\
400 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
401 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
402 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
403 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
404 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
405 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
406 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
408 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
409 "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
410 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
411 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
412 "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
413 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
414 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
415 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
416 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
417 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
420 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
421 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
422 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
423 "psraw $4, %%mm0 \n\t" /* uvbuf1[eax+2048] >>4*/\
424 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
425 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
426 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
429 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
430 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
431 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
432 "paddw %%mm1, %%mm3 \n\t" /* B*/\
433 "paddw %%mm1, %%mm0 \n\t" /* R*/\
434 "packuswb %%mm3, %%mm3 \n\t"\
436 "packuswb %%mm0, %%mm0 \n\t"\
437 "paddw %%mm4, %%mm2 \n\t"\
438 "paddw %%mm2, %%mm1 \n\t" /* G*/\
440 "packuswb %%mm1, %%mm1 \n\t"
/*
 * REAL_YSCALEYUV2PACKED(index, c) / YSCALEYUV2PACKED(index, c)
 *
 * Blends two luma lines (%0 = buf0, %1 = buf1) and two chroma lines
 * (%2 = uvbuf0, %3 = uvbuf1) for packed YUV output.
 *   index - register used as the running index (zeroed here)
 *   c     - register holding the context base for the *_OFFSET accesses
 *
 * NOTE: the first six instructions load the quadwords at
 * CHR_MMX_FILTER_OFFSET+8 and LUM_MMX_FILTER_OFFSET+8, shift them right by 3
 * and WRITE THEM BACK, so the context memory is modified each time this code
 * executes.
 *
 * Each blend is ((line0 - line1) * alpha >> 16) + (line1 >> 7).
 * Result registers: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2.
 *
 * YSCALEYUV2PACKED is the public name; the wrapper lets macro arguments be
 * expanded before REAL_YSCALEYUV2PACKED stringifies them with '#'.
 *
 * NOTE(review): the loop label line is not present in this view.
 */
443 #define REAL_YSCALEYUV2PACKED(index, c) \
444 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
445 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
446 "psraw $3, %%mm0 \n\t"\
447 "psraw $3, %%mm1 \n\t"\
448 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
449 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
450 "xor "#index", "#index" \n\t"\
453 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
454 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
455 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
456 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
457 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
458 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
459 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
460 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
461 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
462 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
463 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
464 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
465 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
466 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
467 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
468 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
469 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
470 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
471 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
472 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
473 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
474 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
475 "psraw $7, %%mm7 \n\t" /* buf1[eax] (second quadword) >>7*/\
476 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
477 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
479 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/*
 * REAL_YSCALEYUV2RGB(index, c) / YSCALEYUV2RGB(index, c)
 *
 * Blends two luma lines (%0 = buf0, %1 = buf1) and two chroma lines
 * (%2 = uvbuf0, %3 = uvbuf1) and converts the result to RGB.
 *   index - register used as the running index (zeroed here)
 *   c     - register holding the context base for the *_OFFSET / *_COEFF
 *           accesses
 *
 * Blend weights are read from CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c); each blend is
 * ((line0 - line1) * alpha >> 16) + (line1 >> 4).
 * The colour conversion tail is the same instruction sequence as
 * YSCALEYUV2RGBX, with "c" in place of operand %0.
 * Output registers: mm2 = B, mm4 = G, mm5 = R (unsigned bytes), mm7 = 0.
 *
 * YSCALEYUV2RGB is the public name; the wrapper lets macro arguments be
 * expanded before they are stringified.
 *
 * NOTE(review): the loop label line is not present in this view.
 */
481 #define REAL_YSCALEYUV2RGB(index, c) \
482 "xor "#index", "#index" \n\t"\
485 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
486 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
487 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
488 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
489 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
490 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
491 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
492 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
493 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
494 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
495 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
496 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
497 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
498 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
499 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
500 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
501 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
502 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
503 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
504 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
505 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
506 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
507 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
508 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
509 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
510 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
511 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
512 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
513 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
514 "psraw $4, %%mm7 \n\t" /* buf1[eax] (second quadword) >>4*/\
515 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
516 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
517 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
518 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
519 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
520 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
521 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
522 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
523 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
524 "paddw %%mm3, %%mm4 \n\t"\
525 "movq %%mm2, %%mm0 \n\t"\
526 "movq %%mm5, %%mm6 \n\t"\
527 "movq %%mm4, %%mm3 \n\t"\
528 "punpcklwd %%mm2, %%mm2 \n\t"\
529 "punpcklwd %%mm5, %%mm5 \n\t"\
530 "punpcklwd %%mm4, %%mm4 \n\t"\
531 "paddw %%mm1, %%mm2 \n\t"\
532 "paddw %%mm1, %%mm5 \n\t"\
533 "paddw %%mm1, %%mm4 \n\t"\
534 "punpckhwd %%mm0, %%mm0 \n\t"\
535 "punpckhwd %%mm6, %%mm6 \n\t"\
536 "punpckhwd %%mm3, %%mm3 \n\t"\
537 "paddw %%mm7, %%mm0 \n\t"\
538 "paddw %%mm7, %%mm6 \n\t"\
539 "paddw %%mm7, %%mm3 \n\t"\
540 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
541 "packuswb %%mm0, %%mm2 \n\t"\
542 "packuswb %%mm6, %%mm5 \n\t"\
543 "packuswb %%mm3, %%mm4 \n\t"\
544 "pxor %%mm7, %%mm7 \n\t"
545 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/*
 * REAL_YSCALEYUV2PACKED1(index, c) / YSCALEYUV2PACKED1(index, c)
 *
 * Single-line variant for packed YUV output: no blending, only one luma
 * line (%0) and one chroma line (%2) are read, and every sample is shifted
 * right by 7.  The "c" argument is accepted but not referenced by the
 * visible text.
 * Result registers: mm3 = U, mm4 = V (read at displacement VOF),
 * mm1 = Y1, mm7 = Y2.
 *
 * YSCALEYUV2PACKED1 is the public name forwarding to the REAL_ macro.
 *
 * NOTE(review): the loop label line is not present in this view.
 */
547 #define REAL_YSCALEYUV2PACKED1(index, c) \
548 "xor "#index", "#index" \n\t"\
551 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
552 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
553 "psraw $7, %%mm3 \n\t" \
554 "psraw $7, %%mm4 \n\t" \
555 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
556 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
557 "psraw $7, %%mm1 \n\t" \
558 "psraw $7, %%mm7 \n\t" \
560 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * REAL_YSCALEYUV2RGB1(index, c) / YSCALEYUV2RGB1(index, c)
 *
 * Single-line YUV -> RGB conversion: only buf0 (%0) and uvbuf0 (%2) are
 * read, there is no blending with a second line.  Samples are shifted right
 * by 4 and then run through the same colour-conversion sequence as
 * REAL_YSCALEYUV2RGB, using offsets/coefficients relative to register "c".
 * Output registers: mm2 = B, mm4 = G, mm5 = R (unsigned bytes), mm7 = 0.
 *
 * YSCALEYUV2RGB1 is the public name forwarding to the REAL_ macro.
 *
 * NOTE(review): the loop label line is not present in this view.
 */
562 #define REAL_YSCALEYUV2RGB1(index, c) \
563 "xor "#index", "#index" \n\t"\
566 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
567 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
568 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
569 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
570 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
571 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
572 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
573 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
574 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
575 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
576 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
577 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
578 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
579 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
580 "psraw $4, %%mm7 \n\t" /* buf0[eax] (second quadword) >>4*/\
581 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
582 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
583 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
584 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
585 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
586 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
587 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
588 "paddw %%mm3, %%mm4 \n\t"\
589 "movq %%mm2, %%mm0 \n\t"\
590 "movq %%mm5, %%mm6 \n\t"\
591 "movq %%mm4, %%mm3 \n\t"\
592 "punpcklwd %%mm2, %%mm2 \n\t"\
593 "punpcklwd %%mm5, %%mm5 \n\t"\
594 "punpcklwd %%mm4, %%mm4 \n\t"\
595 "paddw %%mm1, %%mm2 \n\t"\
596 "paddw %%mm1, %%mm5 \n\t"\
597 "paddw %%mm1, %%mm4 \n\t"\
598 "punpckhwd %%mm0, %%mm0 \n\t"\
599 "punpckhwd %%mm6, %%mm6 \n\t"\
600 "punpckhwd %%mm3, %%mm3 \n\t"\
601 "paddw %%mm7, %%mm0 \n\t"\
602 "paddw %%mm7, %%mm6 \n\t"\
603 "paddw %%mm7, %%mm3 \n\t"\
604 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
605 "packuswb %%mm0, %%mm2 \n\t"\
606 "packuswb %%mm6, %%mm5 \n\t"\
607 "packuswb %%mm3, %%mm4 \n\t"\
608 "pxor %%mm7, %%mm7 \n\t"
609 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * REAL_YSCALEYUV2PACKED1b(index, c) / YSCALEYUV2PACKED1b(index, c)
 *
 * Like REAL_YSCALEYUV2PACKED1 for luma (single line %0, shifted right by 7),
 * but chroma is the sum of two lines (%2 = uvbuf0, %3 = uvbuf1) shifted
 * right logically by 8, i.e. their average at the same scale as a single
 * line shifted by 7.  The "c" argument is not referenced by the visible
 * text.
 * Result registers: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2.
 *
 * YSCALEYUV2PACKED1b is the public name forwarding to the REAL_ macro.
 *
 * NOTE(review): the loop label line is not present in this view.
 */
611 #define REAL_YSCALEYUV2PACKED1b(index, c) \
612 "xor "#index", "#index" \n\t"\
615 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
616 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
617 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
618 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
619 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
620 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
621 "psrlw $8, %%mm3 \n\t" \
622 "psrlw $8, %%mm4 \n\t" \
623 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
624 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
625 "psraw $7, %%mm1 \n\t" \
626 "psraw $7, %%mm7 \n\t"
627 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
629 // do vertical chrominance interpolation
/*
 * REAL_YSCALEYUV2RGB1b(index, c) / YSCALEYUV2RGB1b(index, c)
 *
 * Single luma line (%0) with chroma taken as the sum of two lines
 * (%2 = uvbuf0, %3 = uvbuf1) shifted right logically by 5, followed by the
 * same colour-conversion sequence as REAL_YSCALEYUV2RGB1.  The existing
 * FIXME notes that the 16-bit chroma sum may overflow before the shift.
 * Output registers: mm2 = B, mm4 = G, mm5 = R (unsigned bytes), mm7 = 0.
 *
 * YSCALEYUV2RGB1b is the public name forwarding to the REAL_ macro.
 *
 * NOTE(review): the loop label line is not present in this view.
 */
630 #define REAL_YSCALEYUV2RGB1b(index, c) \
631 "xor "#index", "#index" \n\t"\
634 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
635 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
636 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
637 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
638 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
639 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
640 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
641 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
642 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
643 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
644 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
645 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
646 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
647 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
648 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
649 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
650 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
651 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
652 "psraw $4, %%mm7 \n\t" /* buf0[eax] (second quadword) >>4*/\
653 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
654 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
655 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
656 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
657 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
658 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
659 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
660 "paddw %%mm3, %%mm4 \n\t"\
661 "movq %%mm2, %%mm0 \n\t"\
662 "movq %%mm5, %%mm6 \n\t"\
663 "movq %%mm4, %%mm3 \n\t"\
664 "punpcklwd %%mm2, %%mm2 \n\t"\
665 "punpcklwd %%mm5, %%mm5 \n\t"\
666 "punpcklwd %%mm4, %%mm4 \n\t"\
667 "paddw %%mm1, %%mm2 \n\t"\
668 "paddw %%mm1, %%mm5 \n\t"\
669 "paddw %%mm1, %%mm4 \n\t"\
670 "punpckhwd %%mm0, %%mm0 \n\t"\
671 "punpckhwd %%mm6, %%mm6 \n\t"\
672 "punpckhwd %%mm3, %%mm3 \n\t"\
673 "paddw %%mm7, %%mm0 \n\t"\
674 "paddw %%mm7, %%mm6 \n\t"\
675 "paddw %%mm7, %%mm3 \n\t"\
676 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
677 "packuswb %%mm0, %%mm2 \n\t"\
678 "packuswb %%mm6, %%mm5 \n\t"\
679 "packuswb %%mm3, %%mm4 \n\t"\
680 "pxor %%mm7, %%mm7 \n\t"
681 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * REAL_WRITEBGR32(dst, dstw, index) / WRITEBGR32(dst, dstw, index)
 *
 * Output stage: interleaves 8 pixels held as planar bytes into 32-bit
 * pixels and stores them.
 *   Input registers: mm2 = B, mm4 = G, mm5 = R, mm7 = 0.
 *   dst, index - registers; four quadwords are written at dst + index*4
 *                (+0, +8, +16, +24)
 *   dstw       - operand compared against index after index is advanced by 8
 * mm7 supplies the zero byte of each pixel.  mm0, mm1, mm3 and mm6 are
 * overwritten.
 *
 * WRITEBGR32 is the public name; the wrapper lets macro arguments be
 * expanded before they are stringified.
 *
 * NOTE(review): the branch that follows the "cmp" is not present in this
 * view.
 */
683 #define REAL_WRITEBGR32(dst, dstw, index) \
684 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
685 "movq %%mm2, %%mm1 \n\t" /* B */\
686 "movq %%mm5, %%mm6 \n\t" /* R */\
687 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
688 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
689 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
690 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
691 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
692 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
693 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
694 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
695 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
696 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
698 MOVNTQ(%%mm0, (dst, index, 4))\
699 MOVNTQ(%%mm2, 8(dst, index, 4))\
700 MOVNTQ(%%mm1, 16(dst, index, 4))\
701 MOVNTQ(%%mm3, 24(dst, index, 4))\
703 "add $8, "#index" \n\t"\
704 "cmp "#dstw", "#index" \n\t"\
706 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/*
 * REAL_WRITERGB16(dst, dstw, index) / WRITERGB16(dst, dstw, index)
 *
 * Output stage producing 16-bit pixels from planar bytes in mm2 = B,
 * mm4 = G, mm5 = R (mm7 is used as the zero high byte for G).
 * B and R are masked with the global constant bF8 and G with bFC (both
 * referenced through MANGLE and defined outside this view).  B is shifted
 * right by 3, B/R and G/0 are byte-interleaved into 16-bit lanes, G is
 * shifted left by 3 and OR-ed in.  Two quadwords (8 pixels) are stored at
 * dst + index*2; index is advanced by 8 and compared with dstw.
 *
 * WRITERGB16 is the public name forwarding to the REAL_ macro.
 *
 * NOTE(review): the branch that follows the "cmp" is not present in this
 * view.
 */
708 #define REAL_WRITERGB16(dst, dstw, index) \
709 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
710 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
711 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
712 "psrlq $3, %%mm2 \n\t"\
714 "movq %%mm2, %%mm1 \n\t"\
715 "movq %%mm4, %%mm3 \n\t"\
717 "punpcklbw %%mm7, %%mm3 \n\t"\
718 "punpcklbw %%mm5, %%mm2 \n\t"\
719 "punpckhbw %%mm7, %%mm4 \n\t"\
720 "punpckhbw %%mm5, %%mm1 \n\t"\
722 "psllq $3, %%mm3 \n\t"\
723 "psllq $3, %%mm4 \n\t"\
725 "por %%mm3, %%mm2 \n\t"\
726 "por %%mm4, %%mm1 \n\t"\
728 MOVNTQ(%%mm2, (dst, index, 2))\
729 MOVNTQ(%%mm1, 8(dst, index, 2))\
731 "add $8, "#index" \n\t"\
732 "cmp "#dstw", "#index" \n\t"\
734 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/*
 * REAL_WRITERGB15(dst, dstw, index) / WRITERGB15(dst, dstw, index)
 *
 * Same structure as REAL_WRITERGB16 but for the 15-bit layout: all three
 * components are masked with bF8, B is shifted right by 3, R is shifted
 * right by 1, and G is shifted left by 2 (instead of 3) after the byte
 * interleave.  Two quadwords (8 pixels) are stored at dst + index*2; index
 * is advanced by 8 and compared with dstw.
 *
 * WRITERGB15 is the public name forwarding to the REAL_ macro.
 *
 * NOTE(review): the branch that follows the "cmp" is not present in this
 * view.
 */
736 #define REAL_WRITERGB15(dst, dstw, index) \
737 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
738 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
739 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
740 "psrlq $3, %%mm2 \n\t"\
741 "psrlq $1, %%mm5 \n\t"\
743 "movq %%mm2, %%mm1 \n\t"\
744 "movq %%mm4, %%mm3 \n\t"\
746 "punpcklbw %%mm7, %%mm3 \n\t"\
747 "punpcklbw %%mm5, %%mm2 \n\t"\
748 "punpckhbw %%mm7, %%mm4 \n\t"\
749 "punpckhbw %%mm5, %%mm1 \n\t"\
751 "psllq $2, %%mm3 \n\t"\
752 "psllq $2, %%mm4 \n\t"\
754 "por %%mm3, %%mm2 \n\t"\
755 "por %%mm4, %%mm1 \n\t"\
757 MOVNTQ(%%mm2, (dst, index, 2))\
758 MOVNTQ(%%mm1, 8(dst, index, 2))\
760 "add $8, "#index" \n\t"\
761 "cmp "#dstw", "#index" \n\t"\
763 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/*
 * WRITEBGR24OLD(dst, dstw, index)
 *
 * 24-bit output stage built from shifts and byte masks.
 *   Input registers: mm2 = B, mm4 = G, mm5 = R, mm7 = 0.
 * The first twelve instructions are the same 0RGB interleave as in
 * REAL_WRITEBGR32; the padding byte of each pixel is then squeezed out using
 * the global masks bm00000111, bm11111000 and bm00001111 (via MANGLE) so
 * that 8 pixels occupy three quadwords, stored at dst, dst+8 and dst+16.
 * Unlike the 32/16/15-bit writers, dst itself is advanced (by 24) and is not
 * indexed; index is advanced by 8 and compared with dstw.
 * "OLD" in the name suggests it is superseded by the MMX/MMX2 variants;
 * no use of it is visible in this view.
 *
 * NOTE(review): the branch that follows the "cmp" is not present in this
 * view.
 */
765 #define WRITEBGR24OLD(dst, dstw, index) \
766 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
767 "movq %%mm2, %%mm1 \n\t" /* B */\
768 "movq %%mm5, %%mm6 \n\t" /* R */\
769 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
770 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
771 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
772 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
773 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
774 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
775 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
776 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
777 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
778 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
780 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
781 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
782 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
783 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
784 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
785 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
786 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
787 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
789 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
790 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
791 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
792 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
793 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
794 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
795 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
796 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
797 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
798 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
799 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
800 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
801 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
803 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
804 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
805 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
806 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
807 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
808 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
809 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
810 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
812 MOVNTQ(%%mm0, (dst))\
813 MOVNTQ(%%mm2, 8(dst))\
814 MOVNTQ(%%mm3, 16(dst))\
815 "add $24, "#dst" \n\t"\
817 "add $8, "#index" \n\t"\
818 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMX(dst, dstw, index)
 *
 * 24-bit output stage using plain MMX shifts and unpacks (no mask
 * constants).
 *   Input registers: mm2 = B, mm4 = G, mm5 = R, mm7 = 0.
 * After the 0RGB interleave each of the four quadwords is compacted to
 * 0RGBRGB0 (psllq $40 + punpckhdq with a saved copy), then the pieces are
 * shifted and OR-ed into three fully packed quadwords that are stored at
 * dst, dst+8 and dst+16 as soon as each is ready.  All of mm0..mm7 are
 * overwritten, including mm7 (no longer zero afterwards).
 * dst is advanced by 24, index by 8, and index is compared with dstw.
 *
 * NOTE(review): the branch that follows the "cmp" is not present in this
 * view.
 */
821 #define WRITEBGR24MMX(dst, dstw, index) \
822 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
823 "movq %%mm2, %%mm1 \n\t" /* B */\
824 "movq %%mm5, %%mm6 \n\t" /* R */\
825 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
826 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
827 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
828 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
829 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
830 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
831 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
832 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
833 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
834 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
836 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
837 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
838 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
839 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
841 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
842 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
843 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
844 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
846 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
847 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
848 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
849 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
851 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
852 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
853 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
854 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
855 MOVNTQ(%%mm0, (dst))\
857 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
858 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
859 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
860 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
861 MOVNTQ(%%mm6, 8(dst))\
863 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
864 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
865 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
866 MOVNTQ(%%mm5, 16(dst))\
868 "add $24, "#dst" \n\t"\
870 "add $8, "#index" \n\t"\
871 "cmp "#dstw", "#index" \n\t"\
874 #define WRITEBGR24MMX2(dst, dstw, index) \
875 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
876 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
877 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
878 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
879 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
880 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
882 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
883 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
884 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
886 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
887 "por %%mm1, %%mm6 \n\t"\
888 "por %%mm3, %%mm6 \n\t"\
889 MOVNTQ(%%mm6, (dst))\
891 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
892 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
893 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
894 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
896 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
897 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
898 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
900 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
901 "por %%mm3, %%mm6 \n\t"\
902 MOVNTQ(%%mm6, 8(dst))\
904 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
905 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
906 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
908 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
909 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
910 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
912 "por %%mm1, %%mm3 \n\t"\
913 "por %%mm3, %%mm6 \n\t"\
914 MOVNTQ(%%mm6, 16(dst))\
916 "add $24, "#dst" \n\t"\
918 "add $8, "#index" \n\t"\
919 "cmp "#dstw", "#index" \n\t"\
924 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
927 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
930 #define REAL_WRITEYUY2(dst, dstw, index) \
931 "packuswb %%mm3, %%mm3 \n\t"\
932 "packuswb %%mm4, %%mm4 \n\t"\
933 "packuswb %%mm7, %%mm1 \n\t"\
934 "punpcklbw %%mm4, %%mm3 \n\t"\
935 "movq %%mm1, %%mm7 \n\t"\
936 "punpcklbw %%mm3, %%mm1 \n\t"\
937 "punpckhbw %%mm3, %%mm7 \n\t"\
939 MOVNTQ(%%mm1, (dst, index, 2))\
940 MOVNTQ(%%mm7, 8(dst, index, 2))\
942 "add $8, "#index" \n\t"\
943 "cmp "#dstw", "#index" \n\t"\
945 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
948 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
949 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
950 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
953 if (c->flags & SWS_ACCURATE_RND){
955 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
956 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
959 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
962 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
963 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
966 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
970 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
971 chrFilter, chrSrc, chrFilterSize,
972 dest, uDest, vDest, dstW, chrDstW);
974 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
975 chrFilter, chrSrc, chrFilterSize,
976 dest, uDest, vDest, dstW, chrDstW);
977 #endif //!HAVE_ALTIVEC
978 #endif /* HAVE_MMX */
/**
 * Thin wrapper around yuv2nv12XinC(): every argument except the context
 * `c` is forwarded unchanged, in the same order.
 */
981 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
982 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
983 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
985 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
986 chrFilter, chrSrc, chrFilterSize,
987 dest, uDest, dstW, chrDstW, dstFormat);
990 static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
991 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
994 long p= uDest ? 3 : 1;
995 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
996 uint8_t *dst[3]= {dest, uDest, vDest};
997 long counter[3] = {dstW, chrDstW, chrDstW};
999 if (c->flags & SWS_ACCURATE_RND){
1002 YSCALEYUV2YV121_ACCURATE
1003 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1012 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1021 for (i=0; i<dstW; i++)
1023 int val= (lumSrc[i]+64)>>7;
1034 for (i=0; i<chrDstW; i++)
1036 int u=(chrSrc[i ]+64)>>7;
1037 int v=(chrSrc[i + VOFW]+64)>>7;
1041 else if (u>255) u=255;
1043 else if (v>255) v=255;
1054 * vertical scale YV12 to RGB
1056 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1057 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1058 uint8_t *dest, long dstW, long dstY)
1062 if (c->flags & SWS_ACCURATE_RND){
1063 switch(c->dstFormat){
1065 YSCALEYUV2PACKEDX_ACCURATE
1067 WRITEBGR32(%4, %5, %%REGa)
1069 YSCALEYUV2PACKEDX_END
1072 YSCALEYUV2PACKEDX_ACCURATE
1074 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1075 "add %4, %%"REG_c" \n\t"
1076 WRITEBGR24(%%REGc, %5, %%REGa)
1079 :: "r" (&c->redDither),
1080 "m" (dummy), "m" (dummy), "m" (dummy),
1081 "r" (dest), "m" (dstW)
1082 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1085 case PIX_FMT_RGB555:
1086 YSCALEYUV2PACKEDX_ACCURATE
1088 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1090 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1091 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1092 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1095 WRITERGB15(%4, %5, %%REGa)
1096 YSCALEYUV2PACKEDX_END
1098 case PIX_FMT_RGB565:
1099 YSCALEYUV2PACKEDX_ACCURATE
1101 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1103 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1104 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1105 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1108 WRITERGB16(%4, %5, %%REGa)
1109 YSCALEYUV2PACKEDX_END
1111 case PIX_FMT_YUYV422:
1112 YSCALEYUV2PACKEDX_ACCURATE
1113 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1115 "psraw $3, %%mm3 \n\t"
1116 "psraw $3, %%mm4 \n\t"
1117 "psraw $3, %%mm1 \n\t"
1118 "psraw $3, %%mm7 \n\t"
1119 WRITEYUY2(%4, %5, %%REGa)
1120 YSCALEYUV2PACKEDX_END
1124 switch(c->dstFormat)
1129 WRITEBGR32(%4, %5, %%REGa)
1130 YSCALEYUV2PACKEDX_END
1135 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1136 "add %4, %%"REG_c" \n\t"
1137 WRITEBGR24(%%REGc, %5, %%REGa)
1139 :: "r" (&c->redDither),
1140 "m" (dummy), "m" (dummy), "m" (dummy),
1141 "r" (dest), "m" (dstW)
1142 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1145 case PIX_FMT_RGB555:
1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1150 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1151 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1152 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1155 WRITERGB15(%4, %5, %%REGa)
1156 YSCALEYUV2PACKEDX_END
1158 case PIX_FMT_RGB565:
1161 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1163 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1164 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1165 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1168 WRITERGB16(%4, %5, %%REGa)
1169 YSCALEYUV2PACKEDX_END
1171 case PIX_FMT_YUYV422:
1173 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1175 "psraw $3, %%mm3 \n\t"
1176 "psraw $3, %%mm4 \n\t"
1177 "psraw $3, %%mm1 \n\t"
1178 "psraw $3, %%mm7 \n\t"
1179 WRITEYUY2(%4, %5, %%REGa)
1180 YSCALEYUV2PACKEDX_END
1184 #endif /* HAVE_MMX */
1186 /* The following list of supported dstFormat values should
1187 match what's found in the body of altivec_yuv2packedX() */
1188 if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1189 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1190 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1191 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1192 chrFilter, chrSrc, chrFilterSize,
1196 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1197 chrFilter, chrSrc, chrFilterSize,
1202 * vertical bilinear scale YV12 to RGB
1204 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1205 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1207 int yalpha1=yalpha^4095;
1208 int uvalpha1=uvalpha^4095;
1212 if (flags&SWS_FULL_CHR_H_INT)
1222 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1223 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1225 "movq %%mm3, %%mm1 \n\t"
1226 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1227 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1229 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1230 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1232 "add $4, %%"REG_a" \n\t"
1233 "cmp %5, %%"REG_a" \n\t"
1236 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1237 "m" (yalpha1), "m" (uvalpha1)
1247 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1248 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1250 "movq %%mm3, %%mm1 \n\t"
1251 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1252 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1254 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1255 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1256 "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1257 "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1258 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1259 "movq %%mm1, %%mm2 \n\t"
1260 "psllq $48, %%mm1 \n\t" // 000000BG
1261 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1263 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1264 "psrld $16, %%mm2 \n\t" // R000R000
1265 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1266 "por %%mm2, %%mm1 \n\t" // RBGRR000
1268 "mov %4, %%"REG_b" \n\t"
1269 "add %%"REG_a", %%"REG_b" \n\t"
1273 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1274 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1276 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1277 "psrlq $32, %%mm3 \n\t"
1278 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1279 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1281 "add $4, %%"REG_a" \n\t"
1282 "cmp %5, %%"REG_a" \n\t"
1285 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1286 "m" (yalpha1), "m" (uvalpha1)
1287 : "%"REG_a, "%"REG_b
1290 case PIX_FMT_BGR555:
1295 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1296 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1297 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1299 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1300 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1301 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1303 "psrlw $3, %%mm3 \n\t"
1304 "psllw $2, %%mm1 \n\t"
1305 "psllw $7, %%mm0 \n\t"
1306 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1307 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1309 "por %%mm3, %%mm1 \n\t"
1310 "por %%mm1, %%mm0 \n\t"
1312 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1314 "add $4, %%"REG_a" \n\t"
1315 "cmp %5, %%"REG_a" \n\t"
1318 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1319 "m" (yalpha1), "m" (uvalpha1)
1323 case PIX_FMT_BGR565:
1328 "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
1329 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1330 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1332 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1333 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1334 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1336 "psrlw $3, %%mm3 \n\t"
1337 "psllw $3, %%mm1 \n\t"
1338 "psllw $8, %%mm0 \n\t"
1339 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1340 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1342 "por %%mm3, %%mm1 \n\t"
1343 "por %%mm1, %%mm0 \n\t"
1345 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1347 "add $4, %%"REG_a" \n\t"
1348 "cmp %5, %%"REG_a" \n\t"
1351 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1352 "m" (yalpha1), "m" (uvalpha1)
1356 #endif /* HAVE_MMX */
1361 if (dstFormat==PIX_FMT_RGB32)
1364 #ifdef WORDS_BIGENDIAN
1367 for (i=0;i<dstW;i++){
1368 // vertical linear interpolation && yuv2rgb in a single step:
1369 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1370 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1371 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1372 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1373 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1374 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1378 else if (dstFormat==PIX_FMT_BGR24)
1381 for (i=0;i<dstW;i++){
1382 // vertical linear interpolation && yuv2rgb in a single step:
1383 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1384 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1385 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1386 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1387 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1388 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1392 else if (dstFormat==PIX_FMT_BGR565)
1395 for (i=0;i<dstW;i++){
1396 // vertical linear interpolation && yuv2rgb in a single step:
1397 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1398 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1399 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1401 ((uint16_t*)dest)[i] =
1402 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1403 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1404 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1407 else if (dstFormat==PIX_FMT_BGR555)
1410 for (i=0;i<dstW;i++){
1411 // vertical linear interpolation && yuv2rgb in a single step:
1412 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1413 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1414 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1416 ((uint16_t*)dest)[i] =
1417 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1418 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1419 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1427 switch(c->dstFormat)
1429 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1432 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1433 "mov %4, %%"REG_b" \n\t"
1434 "push %%"REG_BP" \n\t"
1435 YSCALEYUV2RGB(%%REGBP, %5)
1436 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1437 "pop %%"REG_BP" \n\t"
1438 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1440 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1446 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1447 "mov %4, %%"REG_b" \n\t"
1448 "push %%"REG_BP" \n\t"
1449 YSCALEYUV2RGB(%%REGBP, %5)
1450 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1451 "pop %%"REG_BP" \n\t"
1452 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1453 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1457 case PIX_FMT_RGB555:
1459 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1460 "mov %4, %%"REG_b" \n\t"
1461 "push %%"REG_BP" \n\t"
1462 YSCALEYUV2RGB(%%REGBP, %5)
1463 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1465 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1466 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1467 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1470 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1471 "pop %%"REG_BP" \n\t"
1472 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1474 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1478 case PIX_FMT_RGB565:
1480 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1481 "mov %4, %%"REG_b" \n\t"
1482 "push %%"REG_BP" \n\t"
1483 YSCALEYUV2RGB(%%REGBP, %5)
1484 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1486 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1487 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1488 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1491 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1492 "pop %%"REG_BP" \n\t"
1493 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1494 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1498 case PIX_FMT_YUYV422:
1500 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501 "mov %4, %%"REG_b" \n\t"
1502 "push %%"REG_BP" \n\t"
1503 YSCALEYUV2PACKED(%%REGBP, %5)
1504 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1505 "pop %%"REG_BP" \n\t"
1506 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1507 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C)
1518 * YV12 to RGB without scaling or interpolating
1520 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1521 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1523 const int yalpha1=0;
1526 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1527 const int yalpha= 4096; //FIXME ...
1529 if (flags&SWS_FULL_CHR_H_INT)
1531 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1536 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1542 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1543 "mov %4, %%"REG_b" \n\t"
1544 "push %%"REG_BP" \n\t"
1545 YSCALEYUV2RGB1(%%REGBP, %5)
1546 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1547 "pop %%"REG_BP" \n\t"
1548 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1550 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1556 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1557 "mov %4, %%"REG_b" \n\t"
1558 "push %%"REG_BP" \n\t"
1559 YSCALEYUV2RGB1(%%REGBP, %5)
1560 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1561 "pop %%"REG_BP" \n\t"
1562 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1564 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1568 case PIX_FMT_RGB555:
1570 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1571 "mov %4, %%"REG_b" \n\t"
1572 "push %%"REG_BP" \n\t"
1573 YSCALEYUV2RGB1(%%REGBP, %5)
1574 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1576 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1577 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1578 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1580 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1581 "pop %%"REG_BP" \n\t"
1582 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1584 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1588 case PIX_FMT_RGB565:
1590 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1591 "mov %4, %%"REG_b" \n\t"
1592 "push %%"REG_BP" \n\t"
1593 YSCALEYUV2RGB1(%%REGBP, %5)
1594 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1596 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1597 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1598 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1601 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1602 "pop %%"REG_BP" \n\t"
1603 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1605 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1609 case PIX_FMT_YUYV422:
1611 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1612 "mov %4, %%"REG_b" \n\t"
1613 "push %%"REG_BP" \n\t"
1614 YSCALEYUV2PACKED1(%%REGBP, %5)
1615 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1616 "pop %%"REG_BP" \n\t"
1617 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1619 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1631 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1632 "mov %4, %%"REG_b" \n\t"
1633 "push %%"REG_BP" \n\t"
1634 YSCALEYUV2RGB1b(%%REGBP, %5)
1635 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1636 "pop %%"REG_BP" \n\t"
1637 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1639 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1645 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1646 "mov %4, %%"REG_b" \n\t"
1647 "push %%"REG_BP" \n\t"
1648 YSCALEYUV2RGB1b(%%REGBP, %5)
1649 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1650 "pop %%"REG_BP" \n\t"
1651 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1653 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1657 case PIX_FMT_RGB555:
1659 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1660 "mov %4, %%"REG_b" \n\t"
1661 "push %%"REG_BP" \n\t"
1662 YSCALEYUV2RGB1b(%%REGBP, %5)
1663 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1665 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1666 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1667 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1669 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1670 "pop %%"REG_BP" \n\t"
1671 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1673 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1677 case PIX_FMT_RGB565:
1679 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1680 "mov %4, %%"REG_b" \n\t"
1681 "push %%"REG_BP" \n\t"
1682 YSCALEYUV2RGB1b(%%REGBP, %5)
1683 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1685 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1686 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1687 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1690 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1691 "pop %%"REG_BP" \n\t"
1692 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1694 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1698 case PIX_FMT_YUYV422:
1700 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1701 "mov %4, %%"REG_b" \n\t"
1702 "push %%"REG_BP" \n\t"
1703 YSCALEYUV2PACKED1b(%%REGBP, %5)
1704 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1705 "pop %%"REG_BP" \n\t"
1706 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1708 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1714 #endif /* HAVE_MMX */
1717 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C)
1719 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C)
1723 //FIXME yuy2* can read up to 7 samples too many
/**
 * Reduce 2-byte-per-pixel packed input to one byte per pixel in dst.
 *
 * MMX path: the index register starts at -width and advances by 8; both
 * pointers are passed pre-biased to the end of their buffers
 * (src+width*2, dst+width). Each iteration loads 16 source bytes, ANDs
 * every 16-bit word with bm01010101, packs the two registers to 8 bytes
 * and stores them at dst + index.
 * NOTE(review): bm01010101 presumably keeps the low byte of each word,
 * i.e. the Y samples of YUYV -- confirm against its definition.
 */
1725 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1729 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1730 "mov %0, %%"REG_a" \n\t"
1732 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1733 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1734 "pand %%mm2, %%mm0 \n\t"
1735 "pand %%mm2, %%mm1 \n\t"
1736 "packuswb %%mm1, %%mm0 \n\t"
1737 "movq %%mm0, (%2, %%"REG_a") \n\t"
1738 "add $8, %%"REG_a" \n\t"
1740 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1745 for (i=0; i<width; i++)
/**
 * Split the chroma bytes of packed 4-bytes-per-chroma-pair input into
 * separate U and V rows.
 *
 * Scalar form: dstU[i] = src1[4*i + 1], dstV[i] = src1[4*i + 3].
 * MMX path: the index starts at -width and advances by 4, with the
 * pointers pre-biased (src1+width*4, dstU+width, dstV+width). Each
 * iteration loads 16 source bytes, keeps the high byte of every 16-bit
 * word (psrlw $8 + packuswb), then separates the resulting bytes into
 * two groups of 4: the bm01010101-masked group is stored to dstU (%2),
 * the shifted group to dstV (%3).
 * src2 is only checked to be equal to src1.
 */
1750 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1754 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1755 "mov %0, %%"REG_a" \n\t"
1757 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1758 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1759 "psrlw $8, %%mm0 \n\t"
1760 "psrlw $8, %%mm1 \n\t"
1761 "packuswb %%mm1, %%mm0 \n\t"
1762 "movq %%mm0, %%mm1 \n\t"
1763 "psrlw $8, %%mm0 \n\t"
1764 "pand %%mm4, %%mm1 \n\t"
1765 "packuswb %%mm0, %%mm0 \n\t"
1766 "packuswb %%mm1, %%mm1 \n\t"
1767 "movd %%mm0, (%3, %%"REG_a") \n\t"
1768 "movd %%mm1, (%2, %%"REG_a") \n\t"
1769 "add $4, %%"REG_a" \n\t"
1771 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1776 for (i=0; i<width; i++)
1778 dstU[i]= src1[4*i + 1];
1779 dstV[i]= src1[4*i + 3];
1782 assert(src1 == src2);
1785 /* This is almost identical to the previous, and exists only because
1786 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Reduce 2-byte-per-pixel packed input to one byte per pixel in dst,
 * taking the high byte of every 16-bit word.
 *
 * MMX path: the index starts at -width and advances by 8, with the
 * pointers pre-biased (src+width*2, dst+width). Each iteration loads 16
 * source bytes, shifts every word right by 8, packs the two registers
 * to 8 bytes and stores them at dst + index.
 */
1787 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1791 "mov %0, %%"REG_a" \n\t"
1793 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1794 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1795 "psrlw $8, %%mm0 \n\t"
1796 "psrlw $8, %%mm1 \n\t"
1797 "packuswb %%mm1, %%mm0 \n\t"
1798 "movq %%mm0, (%2, %%"REG_a") \n\t"
1799 "add $8, %%"REG_a" \n\t"
1801 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1806 for (i=0; i<width; i++)
/**
 * Split the chroma bytes of packed 4-bytes-per-chroma-pair input into
 * separate U and V rows.
 *
 * Scalar form: dstU[i] = src1[4*i + 0], dstV[i] = src1[4*i + 2].
 * MMX path: the index starts at -width and advances by 4, with the
 * pointers pre-biased (src1+width*4, dstU+width, dstV+width). Each
 * iteration loads 16 source bytes, masks every 16-bit word with
 * bm01010101 and packs to 8 bytes, then separates those bytes into two
 * groups of 4: the masked group is stored to dstU (%2), the group
 * shifted right by 8 to dstV (%3).
 * NOTE(review): bm01010101 presumably keeps the low byte of each word --
 * confirm against its definition.
 * src2 is only checked to be equal to src1.
 */
1811 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1815 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1816 "mov %0, %%"REG_a" \n\t"
1818 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1819 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1820 "pand %%mm4, %%mm0 \n\t"
1821 "pand %%mm4, %%mm1 \n\t"
1822 "packuswb %%mm1, %%mm0 \n\t"
1823 "movq %%mm0, %%mm1 \n\t"
1824 "psrlw $8, %%mm0 \n\t"
1825 "pand %%mm4, %%mm1 \n\t"
1826 "packuswb %%mm0, %%mm0 \n\t"
1827 "packuswb %%mm1, %%mm1 \n\t"
1828 "movd %%mm0, (%3, %%"REG_a") \n\t"
1829 "movd %%mm1, (%2, %%"REG_a") \n\t"
1830 "add $4, %%"REG_a" \n\t"
1832 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1837 for (i=0; i<width; i++)
1839 dstU[i]= src1[4*i + 0];
1840 dstV[i]= src1[4*i + 2];
1843 assert(src1 == src2);
/**
 * Generate RENAME(name)(dst, src, width): a scalar packed-RGB to luma
 * converter.
 *
 * For each of `width` pixels the source is read as one `type` element;
 * each component is extracted by shifting right (shr/shg/shb) and THEN
 * masking (maskr/maskg/maskb), so the masks apply to the shifted value.
 * The output byte is (RY*r + GY*g + BY*b + (33<<(S-1))) >> S; the
 * constant 33<<(S-1) equals 16.5 in units of 1<<S, i.e. an offset of 16
 * plus one half for rounding. Components may be left in place (shift 0)
 * when the matching coefficient argument is pre-scaled to compensate.
 */
1846 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1847 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width)\
1850 for (i=0; i<width; i++)\
1852 int b= (((type*)src)[i]>>shb)&maskb;\
1853 int g= (((type*)src)[i]>>shg)&maskg;\
1854 int r= (((type*)src)[i]>>shr)&maskr;\
1856 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1860 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1861 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1862 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1863 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1864 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1865 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
/**
 * Generate two scalar packed-RGB to chroma converters:
 *
 *  - RENAME(name): one U and one V byte per source pixel. Each component
 *    is extracted by masking (maskr/maskg/maskb) and THEN shifting right
 *    (shr/shg/shb) -- the opposite order from BGR2Y, so the masks here
 *    apply to the unshifted pixel. Output is
 *    (C_r*r + C_g*g + C_b*b + (257<<(S-1))) >> S; the constant equals
 *    128.5 in units of 1<<S (offset 128 plus one half for rounding).
 *
 *  - RENAME(name_half): one U and one V byte per PAIR of source pixels
 *    (elements 2*i and 2*i+1). Green is summed from the two masked
 *    pixels; blue and red are taken from (pix0+pix1-g) using masks
 *    widened by one bit (mask | 2*mask) to hold the carry of the sum.
 *    The coefficients are halved (>>1) so the pair sum yields an average.
 *
 * The `dummy` parameter is not referenced by either generated function.
 */
1867 #define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1868 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
1871 for (i=0; i<width; i++)\
1873 int b= (((type*)src)[i]&maskb)>>shb;\
1874 int g= (((type*)src)[i]&maskg)>>shg;\
1875 int r= (((type*)src)[i]&maskr)>>shr;\
1877 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1878 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1881 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
1884 for (i=0; i<width; i++)\
1886 int pix0= ((type*)src)[2*i+0];\
1887 int pix1= ((type*)src)[2*i+1];\
1888 int g= (pix0&maskg)+(pix1&maskg);\
1889 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1890 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1894 dstU[i]= (((RU)>>1)*r + ((GU)>>1)*g + ((BU)>>1)*b + (257<<((S)-1)))>>(S);\
1895 dstV[i]= (((RV)>>1)*r + ((GV)>>1)*g + ((BV)>>1)*b + (257<<((S)-1)))>>(S);\
1896 /* dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1897 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);*/\
1901 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1902 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1903 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1904 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1905 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1906 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1909 static inline void bgr24ToY_mmx(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1912 if(srcFormat == PIX_FMT_BGR24){
1914 "movq "MANGLE(ff_bgr24toY1Coeff)", %mm5 \n\t"
1915 "movq "MANGLE(ff_bgr24toY2Coeff)", %mm6 \n\t"
1919 "movq "MANGLE(ff_rgb24toY1Coeff)", %mm5 \n\t"
1920 "movq "MANGLE(ff_rgb24toY2Coeff)", %mm6 \n\t"
1925 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1926 "mov %2, %%"REG_a" \n\t"
1927 "pxor %%mm7, %%mm7 \n\t"
1929 PREFETCH" 64(%0) \n\t"
1930 "movd (%0), %%mm0 \n\t"
1931 "movd 2(%0), %%mm1 \n\t"
1932 "movd 6(%0), %%mm2 \n\t"
1933 "movd 8(%0), %%mm3 \n\t"
1935 "punpcklbw %%mm7, %%mm0 \n\t"
1936 "punpcklbw %%mm7, %%mm1 \n\t"
1937 "punpcklbw %%mm7, %%mm2 \n\t"
1938 "punpcklbw %%mm7, %%mm3 \n\t"
1939 "pmaddwd %%mm5, %%mm0 \n\t"
1940 "pmaddwd %%mm6, %%mm1 \n\t"
1941 "pmaddwd %%mm5, %%mm2 \n\t"
1942 "pmaddwd %%mm6, %%mm3 \n\t"
1943 "paddd %%mm1, %%mm0 \n\t"
1944 "paddd %%mm3, %%mm2 \n\t"
1945 "paddd %%mm4, %%mm0 \n\t"
1946 "paddd %%mm4, %%mm2 \n\t"
1947 "psrad $15, %%mm0 \n\t"
1948 "psrad $15, %%mm2 \n\t"
1949 "packssdw %%mm2, %%mm0 \n\t"
1950 "packuswb %%mm0, %%mm0 \n\t"
1951 "movd %%mm0, (%1, %%"REG_a") \n\t"
1952 "add $4, %%"REG_a" \n\t"
1955 : "r" (dst+width), "g" (-width)
1960 static inline void bgr24ToUV_mmx(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1963 "movq 24+%4, %%mm6 \n\t"
1964 "mov %3, %%"REG_a" \n\t"
1965 "pxor %%mm7, %%mm7 \n\t"
1967 PREFETCH" 64(%0) \n\t"
1968 "movd (%0), %%mm0 \n\t"
1969 "movd 2(%0), %%mm1 \n\t"
1970 "punpcklbw %%mm7, %%mm0 \n\t"
1971 "punpcklbw %%mm7, %%mm1 \n\t"
1972 "movq %%mm0, %%mm2 \n\t"
1973 "movq %%mm1, %%mm3 \n\t"
1974 "pmaddwd %4, %%mm0 \n\t"
1975 "pmaddwd 8+%4, %%mm1 \n\t"
1976 "pmaddwd 16+%4, %%mm2 \n\t"
1977 "pmaddwd %%mm6, %%mm3 \n\t"
1978 "paddd %%mm1, %%mm0 \n\t"
1979 "paddd %%mm3, %%mm2 \n\t"
1981 "movd 6(%0), %%mm1 \n\t"
1982 "movd 8(%0), %%mm3 \n\t"
1984 "punpcklbw %%mm7, %%mm1 \n\t"
1985 "punpcklbw %%mm7, %%mm3 \n\t"
1986 "movq %%mm1, %%mm4 \n\t"
1987 "movq %%mm3, %%mm5 \n\t"
1988 "pmaddwd %4, %%mm1 \n\t"
1989 "pmaddwd 8+%4, %%mm3 \n\t"
1990 "pmaddwd 16+%4, %%mm4 \n\t"
1991 "pmaddwd %%mm6, %%mm5 \n\t"
1992 "paddd %%mm3, %%mm1 \n\t"
1993 "paddd %%mm5, %%mm4 \n\t"
1995 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1996 "paddd %%mm3, %%mm0 \n\t"
1997 "paddd %%mm3, %%mm2 \n\t"
1998 "paddd %%mm3, %%mm1 \n\t"
1999 "paddd %%mm3, %%mm4 \n\t"
2000 "psrad $15, %%mm0 \n\t"
2001 "psrad $15, %%mm2 \n\t"
2002 "psrad $15, %%mm1 \n\t"
2003 "psrad $15, %%mm4 \n\t"
2004 "packssdw %%mm1, %%mm0 \n\t"
2005 "packssdw %%mm4, %%mm2 \n\t"
2006 "packuswb %%mm0, %%mm0 \n\t"
2007 "packuswb %%mm2, %%mm2 \n\t"
2008 "movd %%mm0, (%1, %%"REG_a") \n\t"
2009 "movd %%mm2, (%2, %%"REG_a") \n\t"
2010 "add $4, %%"REG_a" \n\t"
2013 : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
/**
 * Convert `width` packed 24-bit BGR pixels to one luma byte each.
 *
 * The HAVE_MMX build hands the whole row to bgr24ToY_mmx() tagged as
 * PIX_FMT_BGR24. The scalar loop computes
 * (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT,
 * where the constant is 16.5 in units of 1<<RGB2YUV_SHIFT (offset 16
 * plus one half for rounding).
 */
2019 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
2022 bgr24ToY_mmx(dst, src, width, PIX_FMT_BGR24);
2025 for (i=0; i<width; i++)
2031 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2033 #endif /* HAVE_MMX */
/**
 * Convert `width` packed 24-bit BGR pixels to one U and one V byte each.
 *
 * The HAVE_MMX build hands the whole row to bgr24ToUV_mmx() tagged as
 * PIX_FMT_BGR24. The scalar loop reads B, G, R from byte offsets 0, 1, 2
 * of each 3-byte pixel and computes
 * (C_r*r + C_g*g + C_b*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT,
 * where the constant is 128.5 in units of 1<<RGB2YUV_SHIFT.
 * src2 is only checked to be equal to src1.
 */
2036 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2039 bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_BGR24);
2042 for (i=0; i<width; i++)
2044 int b= src1[3*i + 0];
2045 int g= src1[3*i + 1];
2046 int r= src1[3*i + 2];
2048 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2049 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2051 #endif /* HAVE_MMX */
2052 assert(src1 == src2);
/**
 * Convert horizontally adjacent pairs of packed 24-bit BGR pixels to one
 * U and one V byte per pair.
 *
 * Output i covers the six source bytes src1[6*i .. 6*i+5]: B at offsets
 * 0 and 3, G at 1 and 4, R at 2 and 5. The two pixels' components are
 * summed, and the final shift is RGB2YUV_SHIFT+1 (with the rounding
 * constant scaled to 257<<RGB2YUV_SHIFT) so the extra bit averages the
 * pair.
 * src2 is only checked to be equal to src1.
 */
2055 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2058 for (i=0; i<width; i++)
2060 int b= src1[6*i + 0] + src1[6*i + 3];
2061 int g= src1[6*i + 1] + src1[6*i + 4];
2062 int r= src1[6*i + 2] + src1[6*i + 5];
2064 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2065 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2067 assert(src1 == src2);
/**
 * Convert `width` packed 24-bit RGB pixels to one luma byte each.
 *
 * Shares bgr24ToY_mmx() with the BGR variant, selecting the RGB byte
 * order by passing PIX_FMT_RGB24. The scalar loop computes
 * (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT.
 */
2070 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width)
2073 bgr24ToY_mmx(dst, src, width, PIX_FMT_RGB24);
2076 for (i=0; i<width; i++)
2082 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/**
 * Convert `width` packed 24-bit RGB pixels to one U and one V byte each.
 *
 * Shares bgr24ToUV_mmx() with the BGR variant, selecting the RGB byte
 * order by passing PIX_FMT_RGB24. The scalar loop reads R, G, B from
 * byte offsets 0, 1, 2 of each 3-byte pixel and computes
 * (C_r*r + C_g*g + C_b*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT.
 */
2087 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2092 bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_RGB24);
2094 for (i=0; i<width; i++)
2096 int r= src1[3*i + 0];
2097 int g= src1[3*i + 1];
2098 int b= src1[3*i + 2];
2100 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2101 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
/**
 * Convert horizontally adjacent pairs of packed 24-bit RGB pixels to one
 * U and one V byte per pair.
 *
 * Output i covers the six source bytes src1[6*i .. 6*i+5]: R at offsets
 * 0 and 3, G at 1 and 4, B at 2 and 5. The two pixels' components are
 * summed, and the final shift is RGB2YUV_SHIFT+1 (with the rounding
 * constant scaled to 257<<RGB2YUV_SHIFT) so the extra bit averages the
 * pair.
 *
 * The second term of each sum must address the second pixel of the pair
 * (offsets 3/4/5), exactly as in the BGR counterpart; adding the first
 * pixel to itself would ignore every odd source pixel.
 */
2106 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2110 for (i=0; i<width; i++)
2112 int r= src1[6*i + 0] + src1[6*i + 3];
2113 int g= src1[6*i + 1] + src1[6*i + 4];
2114 int b= src1[6*i + 2] + src1[6*i + 5];
2116 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2117 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2122 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2125 for (i=0; i<width; i++)
2129 dst[i]= pal[d] & 0xFF;
2133 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2136 assert(src1 == src2);
2137 for (i=0; i<width; i++)
2139 int p= pal[src1[i]];
2146 // bilinear / bicubic scaling
/*
 * Horizontally filter one line of 8-bit samples into 16-bit output samples.
 *
 * dst         output line, dstW samples
 * dstW        number of output samples
 * src         8-bit input line
 * srcW, xInc  only forwarded to hScale_altivec_real() in the code visible here
 * filter      coefficients, filterSize consecutive entries per output sample
 * filterPos   for each output sample, index of the first source sample it reads
 * filterSize  taps per output sample; the assert below requires a positive
 *             multiple of 4
 *
 * Several implementations are present: inline asm specialised for
 * filterSize 4 and 8, a general inline-asm loop, a call to
 * hScale_altivec_real(), and a plain C loop. The C loop accumulates
 * src[filterPos[i] + j] * filter[filterSize*i + j] over j, shifts the sum
 * right by 7 and clips it to 0 .. (1<<15)-1.
 * The preprocessor conditions that choose between the implementations are
 * closed by the HAVE_ALTIVEC / HAVE_MMX #endif lines at the end.
 */
2147 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2148 int16_t *filter, int16_t *filterPos, long filterSize)
2151 assert(filterSize % 4 == 0 && filterSize>0);
2152 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
// counter is negative, so subtracting counter/2 advances filterPos by dstW entries;
// presumably the asm then indexes with a negative offset counting up to zero.
2154 long counter= -2*dstW;
2156 filterPos-= counter/2;
// Each pass of the asm below handles two output samples: two filterPos entries
// are loaded (eax, ebx) and one 32-bit store writes two int16_t results.
2160 "push %%"REG_b" \n\t"
2162 "pxor %%mm7, %%mm7 \n\t"
2163 "movq "MANGLE(w02)", %%mm6 \n\t"
2164 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2165 "mov %%"REG_a", %%"REG_BP" \n\t"
2168 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2169 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2170 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2171 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2172 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2173 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2174 "punpcklbw %%mm7, %%mm0 \n\t"
2175 "punpcklbw %%mm7, %%mm2 \n\t"
2176 "pmaddwd %%mm1, %%mm0 \n\t"
2177 "pmaddwd %%mm2, %%mm3 \n\t"
2178 "psrad $8, %%mm0 \n\t"
2179 "psrad $8, %%mm3 \n\t"
2180 "packssdw %%mm3, %%mm0 \n\t"
2181 "pmaddwd %%mm6, %%mm0 \n\t"
2182 "packssdw %%mm0, %%mm0 \n\t"
2183 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2184 "add $4, %%"REG_BP" \n\t"
2187 "pop %%"REG_BP" \n\t"
2189 "pop %%"REG_b" \n\t"
2192 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2198 else if (filterSize==8)
// Same scheme as the 4-tap case, but each output sample uses two groups of
// four source bytes (offsets 0 and 4) whose products are added together.
2200 long counter= -2*dstW;
2202 filterPos-= counter/2;
2206 "push %%"REG_b" \n\t"
2208 "pxor %%mm7, %%mm7 \n\t"
2209 "movq "MANGLE(w02)", %%mm6 \n\t"
2210 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2211 "mov %%"REG_a", %%"REG_BP" \n\t"
2214 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2215 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2216 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2217 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2218 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2219 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2220 "punpcklbw %%mm7, %%mm0 \n\t"
2221 "punpcklbw %%mm7, %%mm2 \n\t"
2222 "pmaddwd %%mm1, %%mm0 \n\t"
2223 "pmaddwd %%mm2, %%mm3 \n\t"
2225 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2226 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2227 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2228 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2229 "punpcklbw %%mm7, %%mm4 \n\t"
2230 "punpcklbw %%mm7, %%mm2 \n\t"
2231 "pmaddwd %%mm1, %%mm4 \n\t"
2232 "pmaddwd %%mm2, %%mm5 \n\t"
2233 "paddd %%mm4, %%mm0 \n\t"
2234 "paddd %%mm5, %%mm3 \n\t"
2236 "psrad $8, %%mm0 \n\t"
2237 "psrad $8, %%mm3 \n\t"
2238 "packssdw %%mm3, %%mm0 \n\t"
2239 "pmaddwd %%mm6, %%mm0 \n\t"
2240 "packssdw %%mm0, %%mm0 \n\t"
2241 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2242 "add $4, %%"REG_BP" \n\t"
2245 "pop %%"REG_BP" \n\t"
2247 "pop %%"REG_b" \n\t"
2250 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// General case: `offset` (src + filterSize) is the value the inner loop's
// running source pointer is compared against after each step of 4 bytes.
2258 uint8_t *offset = src+filterSize;
2259 long counter= -2*dstW;
2260 //filter-= counter*filterSize/2;
2261 filterPos-= counter/2;
2264 "pxor %%mm7, %%mm7 \n\t"
2265 "movq "MANGLE(w02)", %%mm6 \n\t"
2268 "mov %2, %%"REG_c" \n\t"
2269 "movzwl (%%"REG_c", %0), %%eax \n\t"
2270 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2271 "mov %5, %%"REG_c" \n\t"
2272 "pxor %%mm4, %%mm4 \n\t"
2273 "pxor %%mm5, %%mm5 \n\t"
2275 "movq (%1), %%mm1 \n\t"
2276 "movq (%1, %6), %%mm3 \n\t"
2277 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2278 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2279 "punpcklbw %%mm7, %%mm0 \n\t"
2280 "punpcklbw %%mm7, %%mm2 \n\t"
2281 "pmaddwd %%mm1, %%mm0 \n\t"
2282 "pmaddwd %%mm2, %%mm3 \n\t"
2283 "paddd %%mm3, %%mm5 \n\t"
2284 "paddd %%mm0, %%mm4 \n\t"
2286 "add $4, %%"REG_c" \n\t"
2287 "cmp %4, %%"REG_c" \n\t"
2290 "psrad $8, %%mm4 \n\t"
2291 "psrad $8, %%mm5 \n\t"
2292 "packssdw %%mm5, %%mm4 \n\t"
2293 "pmaddwd %%mm6, %%mm4 \n\t"
2294 "packssdw %%mm4, %%mm4 \n\t"
2295 "mov %3, %%"REG_a" \n\t"
2296 "movd %%mm4, (%%"REG_a", %0) \n\t"
/* operands: %0 counter, %1 filter, %2 filterPos, %3 dst, %4 offset, %5 src, %6 filterSize*2 */
2300 : "+r" (counter), "+r" (filter)
2301 : "m" (filterPos), "m" (dst), "m"(offset),
2302 "m" (src), "r" (filterSize*2)
2303 : "%"REG_a, "%"REG_c, "%"REG_d
2308 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
// Plain C reference: one multiply-accumulate per tap, then shift and clip.
2311 for (i=0; i<dstW; i++)
2314 int srcPos= filterPos[i];
2316 //printf("filterPos: %d\n", filterPos[i]);
2317 for (j=0; j<filterSize; j++)
2319 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2320 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2322 //filter += hFilterSize;
2323 dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2326 #endif /* HAVE_ALTIVEC */
2327 #endif /* HAVE_MMX */
2329 // *** horizontal scale Y line to temp buffer
/*
 * Horizontally scale one luma line into the 16-bit temporary buffer `dst`.
 *
 * Step 1: for source formats that are not plain 8-bit luma, the line is
 *         first converted into formatConvBuffer by the matching *ToY helper
 *         and `src` is redirected to that buffer.
 * Step 2: the line is scaled to dstWidth samples, either through hScale()
 *         with the hLumFilter tables (when SWS_FAST_BILINEAR is not set) or
 *         through one of the fast bilinear paths (inline asm or the C loop).
 * Step 3: if c->srcRange differs from c->dstRange and the destination is
 *         neither RGB nor BGR, a linear range transform is applied in place.
 *
 * c                 scaler context (srcRange, dstRange, dstFormat are read)
 * dst, dstWidth     output line and its length
 * src, srcW         input line and its width
 * xInc              horizontal step; used as 16.16 fixed point by the
 *                   bilinear paths (integer part xInc>>16, fraction xInc&0xffff)
 * flags             SWS_* flags; only SWS_FAST_BILINEAR is tested here
 * canMMX2BeUsed     nonzero if the MMX2 fast-bilinear code may be used
 * hLumFilter, hLumFilterPos, hLumFilterSize  tables passed on to hScale()
 * srcFormat         pixel format of `src`
 * formatConvBuffer  scratch line for step 1
 * mmx2Filter, mmx2FilterPos  tables used as memory operands by the MMX2 asm
 * pal               palette, used for the 8-bit/4-bit palettized formats
 */
2330 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2331 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2332 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2333 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2334 int32_t *mmx2FilterPos, uint8_t *pal)
// 16-bit gray shares the packed-YUV luma extractors: GRAY16BE with YUYV422, GRAY16LE with UYVY422.
2336 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2338 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2339 src= formatConvBuffer;
2341 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2343 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2344 src= formatConvBuffer;
// Note the naming: PIX_FMT_RGB32 is handled by the bgr32* helper and
// PIX_FMT_BGR32 by the rgb32* helper; the *_1 variants skip ALT32_CORR bytes first.
2346 else if (srcFormat==PIX_FMT_RGB32)
2348 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2349 src= formatConvBuffer;
2351 else if (srcFormat==PIX_FMT_RGB32_1)
2353 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
2354 src= formatConvBuffer;
2356 else if (srcFormat==PIX_FMT_BGR24)
2358 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2359 src= formatConvBuffer;
2361 else if (srcFormat==PIX_FMT_BGR565)
2363 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2364 src= formatConvBuffer;
2366 else if (srcFormat==PIX_FMT_BGR555)
2368 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2369 src= formatConvBuffer;
2371 else if (srcFormat==PIX_FMT_BGR32)
2373 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2374 src= formatConvBuffer;
2376 else if (srcFormat==PIX_FMT_BGR32_1)
2378 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
2379 src= formatConvBuffer;
2381 else if (srcFormat==PIX_FMT_RGB24)
2383 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2384 src= formatConvBuffer;
2386 else if (srcFormat==PIX_FMT_RGB565)
2388 RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2389 src= formatConvBuffer;
2391 else if (srcFormat==PIX_FMT_RGB555)
2393 RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2394 src= formatConvBuffer;
2396 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2398 RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
2399 src= formatConvBuffer;
2403 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2404 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2406 if (!(flags&SWS_FAST_BILINEAR))
2409 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2411 else // fast bilinear upscale / crap downscale
2413 #if defined(ARCH_X86)
2417 uint64_t ebxsave __attribute__((aligned(8)));
2423 "mov %%"REG_b", %5 \n\t"
2425 "pxor %%mm7, %%mm7 \n\t"
2426 "mov %0, %%"REG_c" \n\t"
2427 "mov %1, %%"REG_D" \n\t"
2428 "mov %2, %%"REG_d" \n\t"
2429 "mov %3, %%"REG_b" \n\t"
2430 "xor %%"REG_a", %%"REG_a" \n\t" // i
2431 PREFETCH" (%%"REG_c") \n\t"
2432 PREFETCH" 32(%%"REG_c") \n\t"
2433 PREFETCH" 64(%%"REG_c") \n\t"
2437 #define FUNNY_Y_CODE \
2438 "movl (%%"REG_b"), %%esi \n\t"\
2440 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2441 "add %%"REG_S", %%"REG_c" \n\t"\
2442 "add %%"REG_a", %%"REG_D" \n\t"\
2443 "xor %%"REG_a", %%"REG_a" \n\t"\
2447 #define FUNNY_Y_CODE \
2448 "movl (%%"REG_b"), %%esi \n\t"\
2450 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2451 "add %%"REG_a", %%"REG_D" \n\t"\
2452 "xor %%"REG_a", %%"REG_a" \n\t"\
2454 #endif /* ARCH_X86_64 */
2466 "mov %5, %%"REG_b" \n\t"
2468 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2473 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Fix up the right edge: every output sample whose source index (i*xInc)>>16
// reaches the last source pixel is overwritten with that pixel scaled by 128.
2478 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2482 #endif /* HAVE_MMX2 */
// Split the 16.16 step into integer and fractional parts for the addw/adc pair below.
2483 long xInc_shr16 = xInc >> 16;
2484 uint16_t xInc_mask = xInc & 0xffff;
2485 //NO MMX just normal asm ...
2487 "xor %%"REG_a", %%"REG_a" \n\t" // i
2488 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2489 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2492 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2493 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2494 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2495 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2496 "shll $16, %%edi \n\t"
2497 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2498 "mov %1, %%"REG_D" \n\t"
2499 "shrl $9, %%esi \n\t"
2500 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2501 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2502 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2504 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2505 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2506 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2507 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2508 "shll $16, %%edi \n\t"
2509 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2510 "mov %1, %%"REG_D" \n\t"
2511 "shrl $9, %%esi \n\t"
2512 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2513 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2514 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2517 "add $2, %%"REG_a" \n\t"
2518 "cmp %2, %%"REG_a" \n\t"
/* operands: %0 src, %1 dst, %2 dstWidth, %3 xInc_shr16, %4 xInc_mask */
2522 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2523 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2526 } //if MMX2 can't be used
// Portable bilinear loop: xpos is 16.16 fixed point; its integer part indexes
// src and the top 7 bits of its fraction weight the right-hand neighbour.
2530 unsigned int xpos=0;
2531 for (i=0;i<dstWidth;i++)
2533 register unsigned int xx=xpos>>16;
2534 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2535 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2538 #endif /* defined(ARCH_X86) */
2541 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2543 //FIXME all pal and rgb srcFormats could do this conversion as well
2544 //FIXME all scalers more complex than bilinear could do half of this transform
// Two alternative in-place linear transforms follow.
// NOTE(review): the condition choosing between them is not visible in this excerpt -- confirm.
2546 for (i=0; i<dstWidth; i++)
2547 dst[i]= (dst[i]*14071 + 33561947)>>14;
2549 for (i=0; i<dstWidth; i++)
2550 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
/*
 * Horizontally scale one chroma line pair (U and V) into the 16-bit
 * temporary buffer `dst`; U goes to dst[0..], V to dst[VOFW..].
 *
 * Step 1: for source formats that are not planar 8-bit chroma, both planes
 *         are first produced in formatConvBuffer (U) and
 *         formatConvBuffer+VOFW (V) by the matching *ToUV helper, and
 *         src1/src2 are redirected there. For the RGB-type formats the
 *         *_half helper is used when c->chrSrcHSubSample is set.
 * Step 2: both planes are scaled to dstWidth samples, either through
 *         hScale() with the hChrFilter tables (when SWS_FAST_BILINEAR is not
 *         set) or through one of the fast bilinear paths.
 * Step 3: if c->srcRange differs from c->dstRange and the destination is
 *         neither RGB nor BGR, a linear range transform is applied in place
 *         to both planes.
 *
 * c                 scaler context (chrSrcHSubSample, srcRange, dstRange,
 *                   dstFormat are read)
 * dst, dstWidth     output buffer and number of samples per plane
 * src1, src2        input U and V lines (or the packed line for packed formats)
 * srcW              input chroma width
 * xInc              horizontal step; used as 16.16 fixed point by the
 *                   bilinear paths
 * flags             SWS_* flags; only SWS_FAST_BILINEAR is tested here
 * canMMX2BeUsed     nonzero if the MMX2 fast-bilinear code may be used
 * hChrFilter, hChrFilterPos, hChrFilterSize  tables passed on to hScale()
 * funnyUVCode       passed to the MMX2 asm as a memory operand
 * srcFormat         pixel format of the source
 * formatConvBuffer  scratch buffer for step 1
 * mmx2Filter, mmx2FilterPos  tables used as memory operands by the MMX2 asm
 * pal               palette, used for the 8-bit/4-bit palettized formats
 */
2555 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2556 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2557 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2558 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2559 int32_t *mmx2FilterPos, uint8_t *pal)
2561 if (srcFormat==PIX_FMT_YUYV422)
2563 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2564 src1= formatConvBuffer;
2565 src2= formatConvBuffer+VOFW;
2567 else if (srcFormat==PIX_FMT_UYVY422)
2569 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2570 src1= formatConvBuffer;
2571 src2= formatConvBuffer+VOFW;
// Note the naming: PIX_FMT_RGB32 is handled by the bgr32* helpers and
// PIX_FMT_BGR32 by the rgb32* helpers; the *_1 variants skip ALT32_CORR bytes first.
2573 else if (srcFormat==PIX_FMT_RGB32)
2575 if(c->chrSrcHSubSample)
2576 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2578 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2579 src1= formatConvBuffer;
2580 src2= formatConvBuffer+VOFW;
2582 else if (srcFormat==PIX_FMT_RGB32_1)
2584 if(c->chrSrcHSubSample)
2585 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2587 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2588 src1= formatConvBuffer;
2589 src2= formatConvBuffer+VOFW;
2591 else if (srcFormat==PIX_FMT_BGR24)
2593 if(c->chrSrcHSubSample)
2594 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2596 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2597 src1= formatConvBuffer;
2598 src2= formatConvBuffer+VOFW;
2600 else if (srcFormat==PIX_FMT_BGR565)
2602 if(c->chrSrcHSubSample)
2603 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2605 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2606 src1= formatConvBuffer;
2607 src2= formatConvBuffer+VOFW;
2609 else if (srcFormat==PIX_FMT_BGR555)
2611 if(c->chrSrcHSubSample)
2612 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2614 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2615 src1= formatConvBuffer;
2616 src2= formatConvBuffer+VOFW;
2618 else if (srcFormat==PIX_FMT_BGR32)
2620 if(c->chrSrcHSubSample)
2621 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2623 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2624 src1= formatConvBuffer;
2625 src2= formatConvBuffer+VOFW;
2627 else if (srcFormat==PIX_FMT_BGR32_1)
2629 if(c->chrSrcHSubSample)
2630 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2632 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2633 src1= formatConvBuffer;
2634 src2= formatConvBuffer+VOFW;
2636 else if (srcFormat==PIX_FMT_RGB24)
2638 if(c->chrSrcHSubSample)
2639 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2641 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2642 src1= formatConvBuffer;
2643 src2= formatConvBuffer+VOFW;
2645 else if (srcFormat==PIX_FMT_RGB565)
2647 if(c->chrSrcHSubSample)
2648 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2650 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2651 src1= formatConvBuffer;
2652 src2= formatConvBuffer+VOFW;
2654 else if (srcFormat==PIX_FMT_RGB555)
2656 if(c->chrSrcHSubSample)
2657 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2659 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2660 src1= formatConvBuffer;
2661 src2= formatConvBuffer+VOFW;
// NOTE(review): the body of the gray branch is not visible in this excerpt;
// presumably there is no chroma to scale and the function leaves early -- confirm.
2663 else if (isGray(srcFormat))
2667 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2669 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2670 src1= formatConvBuffer;
2671 src2= formatConvBuffer+VOFW;
2675 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2676 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2678 if (!(flags&SWS_FAST_BILINEAR))
// Both planes use the same chroma filter tables; V is written VOFW samples after U.
2681 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2682 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2684 else // fast bilinear upscale / crap downscale
2686 #if defined(ARCH_X86)
2690 uint64_t ebxsave __attribute__((aligned(8)));
2696 "mov %%"REG_b", %6 \n\t"
2698 "pxor %%mm7, %%mm7 \n\t"
2699 "mov %0, %%"REG_c" \n\t"
2700 "mov %1, %%"REG_D" \n\t"
2701 "mov %2, %%"REG_d" \n\t"
2702 "mov %3, %%"REG_b" \n\t"
2703 "xor %%"REG_a", %%"REG_a" \n\t" // i
2704 PREFETCH" (%%"REG_c") \n\t"
2705 PREFETCH" 32(%%"REG_c") \n\t"
2706 PREFETCH" 64(%%"REG_c") \n\t"
2710 #define FUNNY_UV_CODE \
2711 "movl (%%"REG_b"), %%esi \n\t"\
2713 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2714 "add %%"REG_S", %%"REG_c" \n\t"\
2715 "add %%"REG_a", %%"REG_D" \n\t"\
2716 "xor %%"REG_a", %%"REG_a" \n\t"\
2720 #define FUNNY_UV_CODE \
2721 "movl (%%"REG_b"), %%esi \n\t"\
2723 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2724 "add %%"REG_a", %%"REG_D" \n\t"\
2725 "xor %%"REG_a", %%"REG_a" \n\t"\
2727 #endif /* ARCH_X86_64 */
2733 "xor %%"REG_a", %%"REG_a" \n\t" // i
2734 "mov %5, %%"REG_c" \n\t" // src
2735 "mov %1, %%"REG_D" \n\t" // buf1
2736 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2737 PREFETCH" (%%"REG_c") \n\t"
2738 PREFETCH" 32(%%"REG_c") \n\t"
2739 PREFETCH" 64(%%"REG_c") \n\t"
2747 "mov %6, %%"REG_b" \n\t"
2749 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2750 "m" (funnyUVCode), "m" (src2)
2754 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Fix up the right edge of both planes: every output sample whose source index
// (i*xInc)>>16 reaches the last source pixel is overwritten with that pixel scaled by 128.
2759 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2761 //printf("%d %d %d\n", dstWidth, i, srcW);
2762 dst[i] = src1[srcW-1]*128;
2763 dst[i+VOFW] = src2[srcW-1]*128;
2768 #endif /* HAVE_MMX2 */
// Split the 16.16 step into integer and fractional parts for the addw/adc pair below.
2769 long xInc_shr16 = (long) (xInc >> 16);
2770 uint16_t xInc_mask = xInc & 0xffff;
2772 "xor %%"REG_a", %%"REG_a" \n\t" // i
2773 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2774 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2777 "mov %0, %%"REG_S" \n\t"
2778 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2779 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2780 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2781 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2782 "shll $16, %%edi \n\t"
2783 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2784 "mov %1, %%"REG_D" \n\t"
2785 "shrl $9, %%esi \n\t"
2786 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2788 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2789 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2790 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2791 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2792 "shll $16, %%edi \n\t"
2793 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2794 "mov %1, %%"REG_D" \n\t"
2795 "shrl $9, %%esi \n\t"
2796 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2798 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2799 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2800 "add $1, %%"REG_a" \n\t"
2801 "cmp %2, %%"REG_a" \n\t"
2804 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2805 which is needed to support GCC 4.0. */
2806 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2807 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2809 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2812 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2815 } //if MMX2 can't be used
// Portable bilinear loop: xpos is 16.16 fixed point; its integer part indexes
// the source and the top 7 bits of its fraction form the weight xalpha.
2819 unsigned int xpos=0;
2820 for (i=0;i<dstWidth;i++)
2822 register unsigned int xx=xpos>>16;
2823 register unsigned int xalpha=(xpos&0xFFFF)>>9;
// NOTE(review): two formulations of the same interpolation appear below
// (xalpha^127 weights, then the <<7 form); presumably only one pair is active
// in the full file -- confirm.
2824 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2825 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2827 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2828 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2832 #endif /* defined(ARCH_X86) */
2834 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2836 //FIXME all pal and rgb srcFormats could do this conversion as well
2837 //FIXME all scalers more complex than bilinear could do half of this transform
// Two alternative in-place linear transforms follow, each applied to U and V alike.
// NOTE(review): the condition choosing between them is not visible in this excerpt -- confirm.
2839 for (i=0; i<dstWidth; i++){
2840 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2841 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2844 for (i=0; i<dstWidth; i++){
2845 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2846 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
/*
 * Scale one horizontal slice of the source picture and emit every
 * destination line that can be completed with the data seen so far.
 *
 * c           scaler context; filter tables, line ring buffers and the
 *             running state are read from it, and lumBufIndex, chrBufIndex,
 *             lastInLumBuf and lastInChrBuf are written back before returning
 * src         source plane pointers for this slice
 * srcStride   source strides; NOTE: entries of this array are modified in
 *             place (srcStride[2] may be set from srcStride[0], and the
 *             chroma strides are shifted left by c->vChrDrop)
 * srcSliceY   index of the first source line contained in the slice
 * srcSliceH   number of source lines in the slice
 * dst         destination plane pointers
 * dstStride   destination strides
 *
 * For each destination line the vertical filter position tables give the
 * first source line needed; the last one follows from the filter size.
 * Source lines are scaled horizontally (hyscale / hcscale) into the
 * lumPixBuf / chrPixBuf ring buffers. If the slice does not yet contain all
 * lines needed for the current output line, the rest of the slice is
 * buffered and the loop stops until the next call.
 *
 * Returns dstY - lastDstY.
 */
2852 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2853 int srcSliceH, uint8_t* dst[], int dstStride[]){
2855 /* load a few things into local vars to make the code more readable? and faster */
2856 const int srcW= c->srcW;
2857 const int dstW= c->dstW;
2858 const int dstH= c->dstH;
2859 const int chrDstW= c->chrDstW;
2860 const int chrSrcW= c->chrSrcW;
2861 const int lumXInc= c->lumXInc;
2862 const int chrXInc= c->chrXInc;
2863 const int dstFormat= c->dstFormat;
2864 const int srcFormat= c->srcFormat;
2865 const int flags= c->flags;
2866 const int canMMX2BeUsed= c->canMMX2BeUsed;
2867 int16_t *vLumFilterPos= c->vLumFilterPos;
2868 int16_t *vChrFilterPos= c->vChrFilterPos;
2869 int16_t *hLumFilterPos= c->hLumFilterPos;
2870 int16_t *hChrFilterPos= c->hChrFilterPos;
2871 int16_t *vLumFilter= c->vLumFilter;
2872 int16_t *vChrFilter= c->vChrFilter;
2873 int16_t *hLumFilter= c->hLumFilter;
2874 int16_t *hChrFilter= c->hChrFilter;
2875 int32_t *lumMmxFilter= c->lumMmxFilter;
2876 int32_t *chrMmxFilter= c->chrMmxFilter;
2877 const int vLumFilterSize= c->vLumFilterSize;
2878 const int vChrFilterSize= c->vChrFilterSize;
2879 const int hLumFilterSize= c->hLumFilterSize;
2880 const int hChrFilterSize= c->hChrFilterSize;
2881 int16_t **lumPixBuf= c->lumPixBuf;
2882 int16_t **chrPixBuf= c->chrPixBuf;
2883 const int vLumBufSize= c->vLumBufSize;
2884 const int vChrBufSize= c->vChrBufSize;
2885 uint8_t *funnyYCode= c->funnyYCode;
2886 uint8_t *funnyUVCode= c->funnyUVCode;
2887 uint8_t *formatConvBuffer= c->formatConvBuffer;
// Chroma slice position/height: the start is rounded down, the height is
// rounded up (negate, arithmetic shift, negate).
2888 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2889 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2893 /* vars which will change and which we need to store back in the context */
2895 int lumBufIndex= c->lumBufIndex;
2896 int chrBufIndex= c->chrBufIndex;
2897 int lastInLumBuf= c->lastInLumBuf;
2898 int lastInChrBuf= c->lastInChrBuf;
2900 if (isPacked(c->srcFormat)){
2907 srcStride[2]= srcStride[0];
// Dropping chroma lines is done by widening the chroma strides.
2909 srcStride[1]<<= c->vChrDrop;
2910 srcStride[2]<<= c->vChrDrop;
2912 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2913 // (int)dst[0], (int)dst[1], (int)dst[2]);
2915 #if 0 //self test FIXME move to a vfilter or something
2917 static volatile int i=0;
2919 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2920 selfTest(src, srcStride, c->srcW, c->srcH);
2925 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2926 //dstStride[0],dstStride[1],dstStride[2]);
2928 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2930 static int firstTime=1; //FIXME move this into the context perhaps
2931 if (flags & SWS_PRINT_INFO && firstTime)
2933 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2934 " ->cannot do aligned memory accesses anymore\n");
2939 /* Note the user might start scaling the picture in the middle so this
2940 will not get executed. This is not really intended but works
2941 currently, so people might do it. */
// Main loop: one iteration per destination line.
2952 for (;dstY < dstH; dstY++){
2953 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2954 const int chrDstY= dstY>>c->chrDstVSubSample;
2955 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2956 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2958 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2959 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2960 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2961 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2963 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2964 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2965 //handle holes (FAST_BILINEAR & weird filters)
2966 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2967 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2968 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2969 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2970 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2972 // Do we have enough lines in this slice to output the dstY line
2973 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2975 //Do horizontal scaling
2976 while(lastInLumBuf < lastLumSrcY)
// Source line addresses are relative to the start of the slice, hence "- srcSliceY".
2978 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2980 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2981 assert(lumBufIndex < 2*vLumBufSize);
2982 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2983 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2984 //printf("%d %d\n", lumBufIndex, vLumBufSize);
2985 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2986 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2987 funnyYCode, c->srcFormat, formatConvBuffer,
2988 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
2991 while(lastInChrBuf < lastChrSrcY)
2993 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2994 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2996 assert(chrBufIndex < 2*vChrBufSize);
2997 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2998 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2999 //FIXME replace parameters through context struct (some at least)
// Chroma is not scaled when either the source or the destination is gray.
3001 if (!(isGray(srcFormat) || isGray(dstFormat)))
3002 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3003 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3004 funnyUVCode, c->srcFormat, formatConvBuffer,
3005 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3008 //wrap buf index around to stay inside the ring buffer
3009 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3010 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3012 else // not enough lines left in this slice -> load the rest in the buffer
3014 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3015 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3016 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3017 vChrBufSize, vLumBufSize);*/
3019 //Do horizontal scaling
3020 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3022 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3024 assert(lumBufIndex < 2*vLumBufSize);
3025 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3026 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3027 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3028 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3029 funnyYCode, c->srcFormat, formatConvBuffer,
3030 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3033 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3035 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3036 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3038 assert(chrBufIndex < 2*vChrBufSize);
3039 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3040 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3042 if (!(isGray(srcFormat) || isGray(dstFormat)))
3043 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3044 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3045 funnyUVCode, c->srcFormat, formatConvBuffer,
3046 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3049 //wrap buf index around to stay inside the ring buffer
3050 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3051 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3052 break; //we can't output a dstY line so let's try with the next slice
// Per-line dither patterns, alternating with the parity of dstY.
3056 b5Dither= ff_dither8[dstY&1];
3057 g6Dither= ff_dither4[dstY&1];
3058 g5Dither= ff_dither8[dstY&1];
3059 r5Dither= ff_dither8[(dstY+1)&1];
// Locate the ring-buffer slot holding source line firstLumSrcY / firstChrSrcY.
// NOTE(review): the "+ v*BufSize" bias together with the asserts against
// "*PixBuf + v*BufSize*2" further down suggests the pointer arrays hold
// 2*v*BufSize entries -- confirm against the allocation code.
3063 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3064 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
// Fill the per-line tables (line pointers plus coefficients) in
// lumMmxFilter / chrMmxFilter; the SWS_ACCURATE_RND layout packs two taps per entry.
3067 if (flags & SWS_ACCURATE_RND){
3068 int s= APCK_SIZE / 8;
3069 for (i=0; i<vLumFilterSize; i+=2){
3070 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
3071 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
3072 lumMmxFilter[s*i+APCK_COEF/4 ]=
3073 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
3074 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3076 for (i=0; i<vChrFilterSize; i+=2){
3077 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
3078 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
3079 chrMmxFilter[s*i+APCK_COEF/4 ]=
3080 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
3081 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
// Default layout: four 32-bit words per tap -- the line pointer split into low
// and high halves, then the 16-bit coefficient replicated (*0x10001) in two words.
3084 for (i=0; i<vLumFilterSize; i++)
3086 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3087 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3088 lumMmxFilter[4*i+2]=
3089 lumMmxFilter[4*i+3]=
3090 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3092 for (i=0; i<vChrFilterSize; i++)
3094 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3095 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3096 chrMmxFilter[4*i+2]=
3097 chrMmxFilter[4*i+3]=
3098 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
// Vertical scaling / output, dispatched on the destination format.
3102 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3103 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3104 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3105 RENAME(yuv2nv12X)(c,
3106 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3107 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3108 dest, uDest, dstW, chrDstW, dstFormat);
3110 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
3112 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3113 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3114 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3116 int16_t *lumBuf = lumPixBuf[0];
3117 int16_t *chrBuf= chrPixBuf[0];
3118 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3123 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3124 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3125 dest, uDest, vDest, dstW, chrDstW);
3130 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3131 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3132 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3134 int chrAlpha= vChrFilter[2*dstY+1];
3135 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3136 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3138 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3140 int lumAlpha= vLumFilter[2*dstY+1];
3141 int chrAlpha= vChrFilter[2*dstY+1];
3143 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3145 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3146 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3147 dest, dstW, lumAlpha, chrAlpha, dstY);
3151 RENAME(yuv2packedX)(c,
3152 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3153 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3158 else // hmm looks like we can't use MMX here without overwriting this array's tail
3160 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3161 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3162 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3163 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3164 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3166 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3167 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3168 dest, uDest, dstW, chrDstW, dstFormat);
3170 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
3172 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3173 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3175 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3176 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3177 dest, uDest, vDest, dstW, chrDstW);
3181 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3182 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3184 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3185 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
// Emit the SFENCE and EMMS macro instructions after the loop (each may expand to a no-op).
3192 asm volatile(SFENCE:::"memory");
3193 asm volatile(EMMS:::"memory");
3195 /* store changed local vars back in the context */
3197 c->lumBufIndex= lumBufIndex;
3198 c->chrBufIndex= chrBufIndex;
3199 c->lastInLumBuf= lastInLumBuf;
3200 c->lastInChrBuf= lastInChrBuf;
3202 return dstY - lastDstY;