2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
/* CPU-dispatched mnemonic strings pasted into the inline-asm templates below:
 * 3DNow! prefetch/prefetchw, MMX2 prefetchnta/prefetcht0, else a no-op comment;
 * SFENCE is a real store fence only where available.
 * NOTE(review): the fused line numbers show the surrounding #ifdef/#else/#endif
 * guard lines are elided from this paste — the bare #elif below is orphaned here. */
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif defined (HAVE_MMX2)
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
51 #define SFENCE "sfence"
53 #define SFENCE " # nop"
/* Byte-wise unsigned average: MMX2 pavgb where available, 3DNow! pavgusb otherwise.
 * NOTE(review): the leading #ifdef guard line is missing from this paste. */
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
/* 64-bit store: non-temporal movntq (bypasses cache) when available, plain movq
 * otherwise. The MOVNTQ wrapper forces macro expansion of its arguments before
 * stringification. NOTE(review): the #ifdef/#else guard lines are elided here. */
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
70 #include "swscale_altivec_template.c"
/* Vertical scale to planar YV12: walks the coefficient/pointer list at
 * `offset`(%0), accumulating pmulhw products of 16-bit source lines into the
 * rounder-seeded mm3/mm4, then >>3, packs to unsigned bytes and stores 8 pixels
 * per iteration via MOVNTQ. The filter list is terminated by a NULL pointer
 * (the `test REG_S` checks it). %0 = &c->redDither (base for context offsets),
 * %1 = dest, %2 = width.
 * NOTE(review): the fused line numbers show the `asm volatile(`, the `1:`/`2:`
 * loop labels and the conditional branches are elided from this paste. */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Higher-precision variant of YSCALEYUV2YV12X: processes coefficient entries
 * in pairs (APCK_* packed layout), interleaving two source lines with
 * punpcklwd/punpckhwd and using pmaddwd into 32-bit accumulators mm4..mm7,
 * then >>16, packssdw, add rounder, >>3, pack and MOVNTQ-store 8 bytes.
 * %0 = &c->redDither, %1 = dest, %2 = width.
 * NOTE(review): `asm volatile(`, loop labels and branches are elided from
 * this paste (visible as gaps in the fused line numbers). */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Unscaled 1-tap vertical pass: load 16-bit samples from %0, >>7 to 8-bit
 * range, pack and MOVNTQ-store 8 bytes to %1; %2 pre-biases the index so the
 * loop counts up through REG_a. No rounding (see _ACCURATE below for that).
 * NOTE(review): the `1:` loop label and the closing branch lines are elided
 * from this paste. */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
/* Rounding variant of YSCALEYUV2YV121: builds the constant 64 (0x0040) per
 * word in mm7 via pcmpeqw/psrlw/psllw, adds it before the >>7 so the shift
 * rounds to nearest instead of truncating.
 * NOTE(review): loop label/branch lines are elided from this paste. */
184 #define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddw %%mm7, %%mm0 \n\t"\
194 "paddw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* Prologue for packed-pixel output: runs the chroma filter list first
 * (U in mm3, V in mm4 — V sourced at +VOF from the U pointer), then the luma
 * filter list (Y1 in mm1, Y2 in mm7), each accumulating pmulhw products over
 * a NULL-terminated coefficient list seeded with the rounder. Paired with a
 * WRITE* macro and YSCALEYUV2PACKEDX_END.
 * NOTE(review): `asm volatile(`, loop labels and branches are elided from
 * this paste. */
209 #define YSCALEYUV2PACKEDX \
211 "xor %%"REG_a", %%"REG_a" \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
233 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
234 "mov (%%"REG_d"), %%"REG_S" \n\t"\
235 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
236 "movq %%mm1, %%mm7 \n\t"\
239 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
240 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
241 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
242 "add $16, %%"REG_d" \n\t"\
243 "mov (%%"REG_d"), %%"REG_S" \n\t"\
244 "pmulhw %%mm0, %%mm2 \n\t"\
245 "pmulhw %%mm0, %%mm5 \n\t"\
246 "paddw %%mm2, %%mm1 \n\t"\
247 "paddw %%mm5, %%mm7 \n\t"\
248 "test %%"REG_S", %%"REG_S" \n\t"\
/* Constraint tail shared by the YSCALEYUV2PACKEDX* asm statements: binds the
 * context base, dest pointer and width, and declares the clobbered registers.
 * The three "dummy" operands keep operand numbering (%1..%3) consistent with
 * the templates above. NOTE(review): the closing `);` line is elided here. */
251 #define YSCALEYUV2PACKEDX_END \
252 :: "r" (&c->redDither), \
253 "m" (dummy), "m" (dummy), "m" (dummy),\
254 "r" (dest), "m" (dstW) \
255 : "%"REG_a, "%"REG_d, "%"REG_S \
/* High-precision packed-output prologue: like YSCALEYUV2PACKEDX but with
 * paired coefficients and pmaddwd into 32-bit accumulators. Chroma results
 * (rounded, still pre-shift) are parked in U_TEMP/V_TEMP context slots while
 * the luma list runs, then reloaded into mm3/mm4 for the colorspace step.
 * NOTE(review): `asm volatile(`, loop labels and branches are elided from
 * this paste. */
258 #define YSCALEYUV2PACKEDX_ACCURATE \
260 "xor %%"REG_a", %%"REG_a" \n\t"\
264 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
265 "mov (%%"REG_d"), %%"REG_S" \n\t"\
266 "pxor %%mm4, %%mm4 \n\t"\
267 "pxor %%mm5, %%mm5 \n\t"\
268 "pxor %%mm6, %%mm6 \n\t"\
269 "pxor %%mm7, %%mm7 \n\t"\
272 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
273 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
274 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
275 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
276 "movq %%mm0, %%mm3 \n\t"\
277 "punpcklwd %%mm1, %%mm0 \n\t"\
278 "punpckhwd %%mm1, %%mm3 \n\t"\
279 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
280 "pmaddwd %%mm1, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm3 \n\t"\
282 "paddd %%mm0, %%mm4 \n\t"\
283 "paddd %%mm3, %%mm5 \n\t"\
284 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
285 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
286 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
287 "test %%"REG_S", %%"REG_S" \n\t"\
288 "movq %%mm2, %%mm0 \n\t"\
289 "punpcklwd %%mm3, %%mm2 \n\t"\
290 "punpckhwd %%mm3, %%mm0 \n\t"\
291 "pmaddwd %%mm1, %%mm2 \n\t"\
292 "pmaddwd %%mm1, %%mm0 \n\t"\
293 "paddd %%mm2, %%mm6 \n\t"\
294 "paddd %%mm0, %%mm7 \n\t"\
296 "psrad $16, %%mm4 \n\t"\
297 "psrad $16, %%mm5 \n\t"\
298 "psrad $16, %%mm6 \n\t"\
299 "psrad $16, %%mm7 \n\t"\
300 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
301 "packssdw %%mm5, %%mm4 \n\t"\
302 "packssdw %%mm7, %%mm6 \n\t"\
303 "paddw %%mm0, %%mm4 \n\t"\
304 "paddw %%mm0, %%mm6 \n\t"\
305 "movq %%mm4, "U_TEMP"(%0) \n\t"\
306 "movq %%mm6, "V_TEMP"(%0) \n\t"\
308 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
309 "mov (%%"REG_d"), %%"REG_S" \n\t"\
310 "pxor %%mm1, %%mm1 \n\t"\
311 "pxor %%mm5, %%mm5 \n\t"\
312 "pxor %%mm7, %%mm7 \n\t"\
313 "pxor %%mm6, %%mm6 \n\t"\
316 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
317 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
318 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
319 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
320 "movq %%mm0, %%mm3 \n\t"\
321 "punpcklwd %%mm4, %%mm0 \n\t"\
322 "punpckhwd %%mm4, %%mm3 \n\t"\
323 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
324 "pmaddwd %%mm4, %%mm0 \n\t"\
325 "pmaddwd %%mm4, %%mm3 \n\t"\
326 "paddd %%mm0, %%mm1 \n\t"\
327 "paddd %%mm3, %%mm5 \n\t"\
328 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
329 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
330 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
331 "test %%"REG_S", %%"REG_S" \n\t"\
332 "movq %%mm2, %%mm0 \n\t"\
333 "punpcklwd %%mm3, %%mm2 \n\t"\
334 "punpckhwd %%mm3, %%mm0 \n\t"\
335 "pmaddwd %%mm4, %%mm2 \n\t"\
336 "pmaddwd %%mm4, %%mm0 \n\t"\
337 "paddd %%mm2, %%mm7 \n\t"\
338 "paddd %%mm0, %%mm6 \n\t"\
340 "psrad $16, %%mm1 \n\t"\
341 "psrad $16, %%mm5 \n\t"\
342 "psrad $16, %%mm7 \n\t"\
343 "psrad $16, %%mm6 \n\t"\
344 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
345 "packssdw %%mm5, %%mm1 \n\t"\
346 "packssdw %%mm6, %%mm7 \n\t"\
347 "paddw %%mm0, %%mm1 \n\t"\
348 "paddw %%mm0, %%mm7 \n\t"\
349 "movq "U_TEMP"(%0), %%mm3 \n\t"\
350 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* YUV -> RGB core: expects Y1/Y2 in mm1/mm7 and U/V in mm3/mm4 (as left by
 * the YSCALEYUV2PACKEDX* prologues). Subtracts the table offsets, applies the
 * per-context coefficients with pmulhw, forms G = ug+vg, interleaves the
 * low/high halves so each register covers 8 pixels, and packs to bytes:
 * B in mm2, G in mm4, R in mm5 (with mm0/mm3/mm6 the odd halves), mm7 zeroed
 * for the WRITE* macros. */
352 #define YSCALEYUV2RGBX \
353 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
354 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
355 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
356 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
357 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
358 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
359 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
361 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
362 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
363 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
364 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
365 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
366 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367 "paddw %%mm3, %%mm4 \n\t"\
368 "movq %%mm2, %%mm0 \n\t"\
369 "movq %%mm5, %%mm6 \n\t"\
370 "movq %%mm4, %%mm3 \n\t"\
371 "punpcklwd %%mm2, %%mm2 \n\t"\
372 "punpcklwd %%mm5, %%mm5 \n\t"\
373 "punpcklwd %%mm4, %%mm4 \n\t"\
374 "paddw %%mm1, %%mm2 \n\t"\
375 "paddw %%mm1, %%mm5 \n\t"\
376 "paddw %%mm1, %%mm4 \n\t"\
377 "punpckhwd %%mm0, %%mm0 \n\t"\
378 "punpckhwd %%mm6, %%mm6 \n\t"\
379 "punpckhwd %%mm3, %%mm3 \n\t"\
380 "paddw %%mm7, %%mm0 \n\t"\
381 "paddw %%mm7, %%mm6 \n\t"\
382 "paddw %%mm7, %%mm3 \n\t"\
383 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
384 "packuswb %%mm0, %%mm2 \n\t"\
385 "packuswb %%mm6, %%mm5 \n\t"\
386 "packuswb %%mm3, %%mm4 \n\t"\
387 "pxor %%mm7, %%mm7 \n\t"
/* Full-chroma-resolution bilinear 2-tap path: blends buf0/buf1 (luma, %6 =
 * yalpha1) and uvbuf0/uvbuf1 (chroma, %7 = uvalpha1) via pmulhw of the
 * difference, then converts with the MANGLE()d global coefficient tables
 * (w80/w400/yCoeff/ubCoeff/...). Produces packed B in mm3, R in mm0, G in mm1.
 * NOTE(review): loop label and some interleaved lines are elided from this
 * paste (gaps at fused numbers 398-399, 418-419, 427-428, 435, 439). */
389 #define FULL_YSCALEYUV2RGB \
390 "pxor %%mm7, %%mm7 \n\t"\
391 "movd %6, %%mm6 \n\t" /*yalpha1*/\
392 "punpcklwd %%mm6, %%mm6 \n\t"\
393 "punpcklwd %%mm6, %%mm6 \n\t"\
394 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
395 "punpcklwd %%mm5, %%mm5 \n\t"\
396 "punpcklwd %%mm5, %%mm5 \n\t"\
397 "xor %%"REG_a", %%"REG_a" \n\t"\
400 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
401 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
402 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
403 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
404 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
405 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
406 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
408 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
409 "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
410 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
411 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
412 "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
413 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
414 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
415 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
416 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
417 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
420 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
421 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
422 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
423 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
424 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
425 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
426 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
429 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
430 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
431 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
432 "paddw %%mm1, %%mm3 \n\t" /* B*/\
433 "paddw %%mm1, %%mm0 \n\t" /* R*/\
434 "packuswb %%mm3, %%mm3 \n\t"\
436 "packuswb %%mm0, %%mm0 \n\t"\
437 "paddw %%mm4, %%mm2 \n\t"\
438 "paddw %%mm2, %%mm1 \n\t" /* G*/\
440 "packuswb %%mm1, %%mm1 \n\t"
/* 2-tap packed-YUV path: pre-shifts the stored luma/chroma filter
 * coefficients right by 3 (in place in the context), then per iteration
 * blends uvbuf0/uvbuf1 into mm3/mm4 and buf0/buf1 into mm1/mm7 with >>7
 * scaling, ready for WRITEYUY2-style output. Operands: %0/%1 = buf0/buf1,
 * %2/%3 = uvbuf0/uvbuf1, "#c" = context pointer.
 * NOTE(review): loop label/ASMALIGN lines are elided from this paste. */
443 #define REAL_YSCALEYUV2PACKED(index, c) \
444 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
445 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
446 "psraw $3, %%mm0 \n\t"\
447 "psraw $3, %%mm1 \n\t"\
448 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
449 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
450 "xor "#index", "#index" \n\t"\
453 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
454 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
455 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
456 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
457 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
458 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
459 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
460 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
461 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
462 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
463 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
464 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
465 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
466 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
467 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
468 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
469 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
470 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
471 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
472 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
473 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
474 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
475 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
476 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
477 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
479 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* 2-tap YUV->RGB: bilinearly blends buf0/buf1 and uvbuf0/uvbuf1 (coefficients
 * from the context's LUM/CHR filter slots), then performs the same
 * offset/coefficient colorspace conversion and byte-pack as YSCALEYUV2RGBX —
 * leaving B/G/R in mm2/mm4/mm5 with mm7 zeroed for the WRITE* macros.
 * NOTE(review): ASMALIGN/loop-label lines are elided from this paste. */
481 #define REAL_YSCALEYUV2RGB(index, c) \
482 "xor "#index", "#index" \n\t"\
485 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
486 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
487 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
488 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
489 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
490 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
491 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
492 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
493 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
494 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
495 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
496 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
497 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
498 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
499 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
500 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
501 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
502 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
503 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
504 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
505 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
506 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
507 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
508 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
509 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
510 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
511 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
512 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
513 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
514 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
515 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
516 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
517 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
518 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
519 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
520 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
521 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
522 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
523 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
524 "paddw %%mm3, %%mm4 \n\t"\
525 "movq %%mm2, %%mm0 \n\t"\
526 "movq %%mm5, %%mm6 \n\t"\
527 "movq %%mm4, %%mm3 \n\t"\
528 "punpcklwd %%mm2, %%mm2 \n\t"\
529 "punpcklwd %%mm5, %%mm5 \n\t"\
530 "punpcklwd %%mm4, %%mm4 \n\t"\
531 "paddw %%mm1, %%mm2 \n\t"\
532 "paddw %%mm1, %%mm5 \n\t"\
533 "paddw %%mm1, %%mm4 \n\t"\
534 "punpckhwd %%mm0, %%mm0 \n\t"\
535 "punpckhwd %%mm6, %%mm6 \n\t"\
536 "punpckhwd %%mm3, %%mm3 \n\t"\
537 "paddw %%mm7, %%mm0 \n\t"\
538 "paddw %%mm7, %%mm6 \n\t"\
539 "paddw %%mm7, %%mm3 \n\t"\
540 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
541 "packuswb %%mm0, %%mm2 \n\t"\
542 "packuswb %%mm6, %%mm5 \n\t"\
543 "packuswb %%mm3, %%mm4 \n\t"\
544 "pxor %%mm7, %%mm7 \n\t"
545 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/* 1-tap (single source line) packed-YUV path: no blending — just loads buf0
 * and uvbuf0 samples and shifts them from 15-bit intermediate down by 7 to
 * 8-bit range (mm3/mm4 = U/V, mm1/mm7 = Y low/high).
 * NOTE(review): ASMALIGN/loop-label lines are elided from this paste. */
547 #define REAL_YSCALEYUV2PACKED1(index, c) \
548 "xor "#index", "#index" \n\t"\
551 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
552 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
553 "psraw $7, %%mm3 \n\t" \
554 "psraw $7, %%mm4 \n\t" \
555 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
556 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
557 "psraw $7, %%mm1 \n\t" \
558 "psraw $7, %%mm7 \n\t" \
560 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* 1-tap YUV->RGB (single source line, no vertical blend): >>4 keeps the extra
 * precision for pmulhw, then identical colorspace/pack sequence as
 * REAL_YSCALEYUV2RGB. Output: B/G/R bytes in mm2/mm4/mm5, mm7 zeroed.
 * NOTE(review): ASMALIGN/loop-label lines are elided from this paste. */
562 #define REAL_YSCALEYUV2RGB1(index, c) \
563 "xor "#index", "#index" \n\t"\
566 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
567 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
568 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
569 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
570 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
571 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
572 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
573 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
574 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
575 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
576 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
577 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
578 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
579 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
580 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
581 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
582 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
583 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
584 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
585 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
586 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
587 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
588 "paddw %%mm3, %%mm4 \n\t"\
589 "movq %%mm2, %%mm0 \n\t"\
590 "movq %%mm5, %%mm6 \n\t"\
591 "movq %%mm4, %%mm3 \n\t"\
592 "punpcklwd %%mm2, %%mm2 \n\t"\
593 "punpcklwd %%mm5, %%mm5 \n\t"\
594 "punpcklwd %%mm4, %%mm4 \n\t"\
595 "paddw %%mm1, %%mm2 \n\t"\
596 "paddw %%mm1, %%mm5 \n\t"\
597 "paddw %%mm1, %%mm4 \n\t"\
598 "punpckhwd %%mm0, %%mm0 \n\t"\
599 "punpckhwd %%mm6, %%mm6 \n\t"\
600 "punpckhwd %%mm3, %%mm3 \n\t"\
601 "paddw %%mm7, %%mm0 \n\t"\
602 "paddw %%mm7, %%mm6 \n\t"\
603 "paddw %%mm7, %%mm3 \n\t"\
604 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
605 "packuswb %%mm0, %%mm2 \n\t"\
606 "packuswb %%mm6, %%mm5 \n\t"\
607 "packuswb %%mm3, %%mm4 \n\t"\
608 "pxor %%mm7, %%mm7 \n\t"
609 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* Packed-YUV path averaging two chroma lines with equal weight:
 * (uvbuf0+uvbuf1)>>8 for U/V, luma from buf0 alone >>7.
 * NOTE(review): ASMALIGN/loop-label lines are elided from this paste. */
611 #define REAL_YSCALEYUV2PACKED1b(index, c) \
612 "xor "#index", "#index" \n\t"\
615 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
616 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
617 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
618 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
619 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
620 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
621 "psrlw $8, %%mm3 \n\t" \
622 "psrlw $8, %%mm4 \n\t" \
623 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
624 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
625 "psraw $7, %%mm1 \n\t" \
626 "psraw $7, %%mm7 \n\t"
627 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
/* YUV->RGB with equal-weight vertical chroma interpolation:
 * U/V = (uvbuf0+uvbuf1)>>5 (see FIXME: unsigned shift may overflow), luma
 * from buf0 alone >>4, then the shared colorspace/pack sequence.
 * Output: B/G/R bytes in mm2/mm4/mm5, mm7 zeroed.
 * NOTE(review): ASMALIGN/loop-label lines are elided from this paste. */
629 // do vertical chrominance interpolation
630 #define REAL_YSCALEYUV2RGB1b(index, c) \
631 "xor "#index", "#index" \n\t"\
634 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
635 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
636 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
637 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
638 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
639 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
640 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
641 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
642 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
643 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
644 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
645 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
646 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
647 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
648 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
649 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
650 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
651 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
652 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
653 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
654 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
655 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
656 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
657 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
658 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
659 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
660 "paddw %%mm3, %%mm4 \n\t"\
661 "movq %%mm2, %%mm0 \n\t"\
662 "movq %%mm5, %%mm6 \n\t"\
663 "movq %%mm4, %%mm3 \n\t"\
664 "punpcklwd %%mm2, %%mm2 \n\t"\
665 "punpcklwd %%mm5, %%mm5 \n\t"\
666 "punpcklwd %%mm4, %%mm4 \n\t"\
667 "paddw %%mm1, %%mm2 \n\t"\
668 "paddw %%mm1, %%mm5 \n\t"\
669 "paddw %%mm1, %%mm4 \n\t"\
670 "punpckhwd %%mm0, %%mm0 \n\t"\
671 "punpckhwd %%mm6, %%mm6 \n\t"\
672 "punpckhwd %%mm3, %%mm3 \n\t"\
673 "paddw %%mm7, %%mm0 \n\t"\
674 "paddw %%mm7, %%mm6 \n\t"\
675 "paddw %%mm7, %%mm3 \n\t"\
676 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
677 "packuswb %%mm0, %%mm2 \n\t"\
678 "packuswb %%mm6, %%mm5 \n\t"\
679 "packuswb %%mm3, %%mm4 \n\t"\
680 "pxor %%mm7, %%mm7 \n\t"
681 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* Store 8 pixels as 32-bit BGRX: interleaves the B/G/R byte registers
 * (mm2/mm4/mm5, mm7 = 0 as padding byte) into four 0RGB quadwords and writes
 * them with MOVNTQ, then advances/compares the index.
 * NOTE(review): the trailing conditional-branch line is elided from this paste. */
683 #define REAL_WRITEBGR32(dst, dstw, index) \
684 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
685 "movq %%mm2, %%mm1 \n\t" /* B */\
686 "movq %%mm5, %%mm6 \n\t" /* R */\
687 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
688 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
689 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
690 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
691 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
692 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
693 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
694 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
695 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
696 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
698 MOVNTQ(%%mm0, (dst, index, 4))\
699 MOVNTQ(%%mm2, 8(dst, index, 4))\
700 MOVNTQ(%%mm1, 16(dst, index, 4))\
701 MOVNTQ(%%mm3, 24(dst, index, 4))\
703 "add $8, "#index" \n\t"\
704 "cmp "#dstw", "#index" \n\t"\
706 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/* Store 8 pixels as RGB565: masks B/R to 5 bits and G to 6 (bF8/bFC tables),
 * shifts fields into place and ORs them into two quadwords stored via MOVNTQ.
 * NOTE(review): the trailing conditional-branch line is elided from this paste. */
708 #define REAL_WRITERGB16(dst, dstw, index) \
709 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
710 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
711 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
712 "psrlq $3, %%mm2 \n\t"\
714 "movq %%mm2, %%mm1 \n\t"\
715 "movq %%mm4, %%mm3 \n\t"\
717 "punpcklbw %%mm7, %%mm3 \n\t"\
718 "punpcklbw %%mm5, %%mm2 \n\t"\
719 "punpckhbw %%mm7, %%mm4 \n\t"\
720 "punpckhbw %%mm5, %%mm1 \n\t"\
722 "psllq $3, %%mm3 \n\t"\
723 "psllq $3, %%mm4 \n\t"\
725 "por %%mm3, %%mm2 \n\t"\
726 "por %%mm4, %%mm1 \n\t"\
728 MOVNTQ(%%mm2, (dst, index, 2))\
729 MOVNTQ(%%mm1, 8(dst, index, 2))\
731 "add $8, "#index" \n\t"\
732 "cmp "#dstw", "#index" \n\t"\
734 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/* Store 8 pixels as RGB555: like WRITERGB16 but all three channels masked to
 * 5 bits (R additionally >>1, field shifts of 2 instead of 3).
 * NOTE(review): the trailing conditional-branch line is elided from this paste. */
736 #define REAL_WRITERGB15(dst, dstw, index) \
737 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
738 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
739 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
740 "psrlq $3, %%mm2 \n\t"\
741 "psrlq $1, %%mm5 \n\t"\
743 "movq %%mm2, %%mm1 \n\t"\
744 "movq %%mm4, %%mm3 \n\t"\
746 "punpcklbw %%mm7, %%mm3 \n\t"\
747 "punpcklbw %%mm5, %%mm2 \n\t"\
748 "punpckhbw %%mm7, %%mm4 \n\t"\
749 "punpckhbw %%mm5, %%mm1 \n\t"\
751 "psllq $2, %%mm3 \n\t"\
752 "psllq $2, %%mm4 \n\t"\
754 "por %%mm3, %%mm2 \n\t"\
755 "por %%mm4, %%mm1 \n\t"\
757 MOVNTQ(%%mm2, (dst, index, 2))\
758 MOVNTQ(%%mm1, 8(dst, index, 2))\
760 "add $8, "#index" \n\t"\
761 "cmp "#dstw", "#index" \n\t"\
763 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/* Legacy 24-bit packer: builds four 0RGB quadwords, then squeezes out the
 * padding bytes with shift/mask/or juggling (bm000* masks) into three 8-byte
 * stores of tightly packed RGB. Superseded by WRITEBGR24MMX/MMX2 below.
 * NOTE(review): the trailing conditional-branch line is elided from this paste. */
765 #define WRITEBGR24OLD(dst, dstw, index) \
766 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
767 "movq %%mm2, %%mm1 \n\t" /* B */\
768 "movq %%mm5, %%mm6 \n\t" /* R */\
769 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
770 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
771 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
772 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
773 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
774 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
775 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
776 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
777 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
778 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
780 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
781 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
782 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
783 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
784 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
785 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
786 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
787 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
789 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
790 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
791 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
792 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
793 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
794 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
795 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
796 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
797 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
798 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
799 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
800 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
801 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
803 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
804 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
805 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
806 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
807 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
808 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
809 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
810 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
812 MOVNTQ(%%mm0, (dst))\
813 MOVNTQ(%%mm2, 8(dst))\
814 MOVNTQ(%%mm3, 16(dst))\
815 "add $24, "#dst" \n\t"\
817 "add $8, "#index" \n\t"\
818 "cmp "#dstw", "#index" \n\t"\
/* Plain-MMX 24-bit packer: interleaves B/G/R into 0RGBRGB0 quadwords via
 * psllq $40 + punpckhdq, then shifts/ORs adjacent quadwords together so three
 * MOVNTQ stores emit 24 tightly packed bytes; dst advances by 24.
 * NOTE(review): the trailing conditional-branch line is elided from this paste. */
821 #define WRITEBGR24MMX(dst, dstw, index) \
822 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
823 "movq %%mm2, %%mm1 \n\t" /* B */\
824 "movq %%mm5, %%mm6 \n\t" /* R */\
825 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
826 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
827 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
828 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
829 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
830 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
831 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
832 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
833 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
834 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
836 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
837 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
838 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
839 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
841 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
842 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
843 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
844 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
846 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
847 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
848 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
849 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
851 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
852 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
853 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
854 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
855 MOVNTQ(%%mm0, (dst))\
857 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
858 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
859 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
860 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
861 MOVNTQ(%%mm6, 8(dst))\
863 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
864 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
865 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
866 MOVNTQ(%%mm5, 16(dst))\
868 "add $24, "#dst" \n\t"\
870 "add $8, "#index" \n\t"\
871 "cmp "#dstw", "#index" \n\t"\
874 #define WRITEBGR24MMX2(dst, dstw, index) \
875 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
876 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
877 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
878 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
879 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
880 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
882 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
883 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
884 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
886 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
887 "por %%mm1, %%mm6 \n\t"\
888 "por %%mm3, %%mm6 \n\t"\
889 MOVNTQ(%%mm6, (dst))\
891 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
892 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
893 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
894 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
896 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
897 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
898 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
900 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
901 "por %%mm3, %%mm6 \n\t"\
902 MOVNTQ(%%mm6, 8(dst))\
904 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
905 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
906 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
908 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
909 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
910 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
912 "por %%mm1, %%mm3 \n\t"\
913 "por %%mm3, %%mm6 \n\t"\
914 MOVNTQ(%%mm6, 16(dst))\
916 "add $24, "#dst" \n\t"\
918 "add $8, "#index" \n\t"\
919 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24 dispatches to the pshufw-based MMX2 packer when available,
 * otherwise to the plain-MMX punpck/shift packer.  The two definitions
 * below are mutually exclusive — presumably selected by an
 * #if HAVE_MMX2 / #else pair that is not visible here; confirm. */
924 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
927 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
930 #define REAL_WRITEYUY2(dst, dstw, index) \
931 "packuswb %%mm3, %%mm3 \n\t"\
932 "packuswb %%mm4, %%mm4 \n\t"\
933 "packuswb %%mm7, %%mm1 \n\t"\
934 "punpcklbw %%mm4, %%mm3 \n\t"\
935 "movq %%mm1, %%mm7 \n\t"\
936 "punpcklbw %%mm3, %%mm1 \n\t"\
937 "punpckhbw %%mm3, %%mm7 \n\t"\
939 MOVNTQ(%%mm1, (dst, index, 2))\
940 MOVNTQ(%%mm7, 8(dst, index, 2))\
942 "add $8, "#index" \n\t"\
943 "cmp "#dstw", "#index" \n\t"\
945 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
948 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
949 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
950 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
953 if (c->flags & SWS_ACCURATE_RND){
955 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
956 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
959 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
962 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
963 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
966 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
970 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
971 chrFilter, chrSrc, chrFilterSize,
972 dest, uDest, vDest, dstW, chrDstW);
974 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
975 chrFilter, chrSrc, chrFilterSize,
976 dest, uDest, vDest, dstW, chrDstW);
977 #endif //!HAVE_ALTIVEC
978 #endif /* HAVE_MMX */
/* Vertical multi-tap filtering of planar YUV into NV12/NV21 (planar luma
 * plus interleaved chroma, selected by dstFormat).  No SIMD fast path:
 * the visible body forwards every parameter unchanged to the portable C
 * implementation yuv2nv12XinC(). */
981 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
982 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
983 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
985 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
986 chrFilter, chrSrc, chrFilterSize,
987 dest, uDest, dstW, chrDstW, dstFormat);
990 static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
991 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
994 long p= uDest ? 3 : 1;
995 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
996 uint8_t *dst[3]= {dest, uDest, vDest};
997 long counter[3] = {dstW, chrDstW, chrDstW};
999 if (c->flags & SWS_ACCURATE_RND){
1002 YSCALEYUV2YV121_ACCURATE
1003 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1012 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1021 for (i=0; i<dstW; i++)
1023 int val= (lumSrc[i]+64)>>7;
1034 for (i=0; i<chrDstW; i++)
1036 int u=(chrSrc[i ]+64)>>7;
1037 int v=(chrSrc[i + VOFW]+64)>>7;
1041 else if (u>255) u=255;
1043 else if (v>255) v=255;
1054 * vertical scale YV12 to RGB
1056 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1057 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1058 uint8_t *dest, long dstW, long dstY)
1062 if (c->flags & SWS_ACCURATE_RND){
1063 switch(c->dstFormat){
1065 YSCALEYUV2PACKEDX_ACCURATE
1067 WRITEBGR32(%4, %5, %%REGa)
1069 YSCALEYUV2PACKEDX_END
1072 YSCALEYUV2PACKEDX_ACCURATE
1074 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1075 "add %4, %%"REG_c" \n\t"
1076 WRITEBGR24(%%REGc, %5, %%REGa)
1079 :: "r" (&c->redDither),
1080 "m" (dummy), "m" (dummy), "m" (dummy),
1081 "r" (dest), "m" (dstW)
1082 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1085 case PIX_FMT_RGB555:
1086 YSCALEYUV2PACKEDX_ACCURATE
1088 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1090 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1091 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1092 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1095 WRITERGB15(%4, %5, %%REGa)
1096 YSCALEYUV2PACKEDX_END
1098 case PIX_FMT_RGB565:
1099 YSCALEYUV2PACKEDX_ACCURATE
1101 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1103 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1104 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1105 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1108 WRITERGB16(%4, %5, %%REGa)
1109 YSCALEYUV2PACKEDX_END
1111 case PIX_FMT_YUYV422:
1112 YSCALEYUV2PACKEDX_ACCURATE
1113 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1115 "psraw $3, %%mm3 \n\t"
1116 "psraw $3, %%mm4 \n\t"
1117 "psraw $3, %%mm1 \n\t"
1118 "psraw $3, %%mm7 \n\t"
1119 WRITEYUY2(%4, %5, %%REGa)
1120 YSCALEYUV2PACKEDX_END
1124 switch(c->dstFormat)
1129 WRITEBGR32(%4, %5, %%REGa)
1130 YSCALEYUV2PACKEDX_END
1135 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1136 "add %4, %%"REG_c" \n\t"
1137 WRITEBGR24(%%REGc, %5, %%REGa)
1139 :: "r" (&c->redDither),
1140 "m" (dummy), "m" (dummy), "m" (dummy),
1141 "r" (dest), "m" (dstW)
1142 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1145 case PIX_FMT_RGB555:
1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1150 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1151 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1152 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1155 WRITERGB15(%4, %5, %%REGa)
1156 YSCALEYUV2PACKEDX_END
1158 case PIX_FMT_RGB565:
1161 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1163 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1164 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1165 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1168 WRITERGB16(%4, %5, %%REGa)
1169 YSCALEYUV2PACKEDX_END
1171 case PIX_FMT_YUYV422:
1173 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1175 "psraw $3, %%mm3 \n\t"
1176 "psraw $3, %%mm4 \n\t"
1177 "psraw $3, %%mm1 \n\t"
1178 "psraw $3, %%mm7 \n\t"
1179 WRITEYUY2(%4, %5, %%REGa)
1180 YSCALEYUV2PACKEDX_END
1184 #endif /* HAVE_MMX */
1186 /* The following list of supported dstFormat values should
1187 match what's found in the body of altivec_yuv2packedX() */
1188 if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1189 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1190 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1191 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1192 chrFilter, chrSrc, chrFilterSize,
1196 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1197 chrFilter, chrSrc, chrFilterSize,
1202 * vertical bilinear scale YV12 to RGB
1204 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1205 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1207 int yalpha1=yalpha^4095;
1208 int uvalpha1=uvalpha^4095;
1212 if (flags&SWS_FULL_CHR_H_INT)
1222 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1223 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1225 "movq %%mm3, %%mm1 \n\t"
1226 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1227 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1229 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1230 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1232 "add $4, %%"REG_a" \n\t"
1233 "cmp %5, %%"REG_a" \n\t"
1236 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1237 "m" (yalpha1), "m" (uvalpha1)
1247 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1248 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1250 "movq %%mm3, %%mm1 \n\t"
1251 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1252 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1254 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1255 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1256 "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1257 "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1258 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1259 "movq %%mm1, %%mm2 \n\t"
1260 "psllq $48, %%mm1 \n\t" // 000000BG
1261 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1263 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1264 "psrld $16, %%mm2 \n\t" // R000R000
1265 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1266 "por %%mm2, %%mm1 \n\t" // RBGRR000
1268 "mov %4, %%"REG_b" \n\t"
1269 "add %%"REG_a", %%"REG_b" \n\t"
1273 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1274 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1276 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1277 "psrlq $32, %%mm3 \n\t"
1278 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1279 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1281 "add $4, %%"REG_a" \n\t"
1282 "cmp %5, %%"REG_a" \n\t"
1285 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1286 "m" (yalpha1), "m" (uvalpha1)
1287 : "%"REG_a, "%"REG_b
1290 case PIX_FMT_BGR555:
1295 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1296 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1297 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1299 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1300 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1301 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1303 "psrlw $3, %%mm3 \n\t"
1304 "psllw $2, %%mm1 \n\t"
1305 "psllw $7, %%mm0 \n\t"
1306 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1307 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1309 "por %%mm3, %%mm1 \n\t"
1310 "por %%mm1, %%mm0 \n\t"
1312 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1314 "add $4, %%"REG_a" \n\t"
1315 "cmp %5, %%"REG_a" \n\t"
1318 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1319 "m" (yalpha1), "m" (uvalpha1)
1323 case PIX_FMT_BGR565:
1328 "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
1329 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1330 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1332 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1333 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1334 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1336 "psrlw $3, %%mm3 \n\t"
1337 "psllw $3, %%mm1 \n\t"
1338 "psllw $8, %%mm0 \n\t"
1339 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1340 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1342 "por %%mm3, %%mm1 \n\t"
1343 "por %%mm1, %%mm0 \n\t"
1345 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1347 "add $4, %%"REG_a" \n\t"
1348 "cmp %5, %%"REG_a" \n\t"
1351 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1352 "m" (yalpha1), "m" (uvalpha1)
1356 #endif /* HAVE_MMX */
1361 if (dstFormat==PIX_FMT_RGB32)
1364 #ifdef WORDS_BIGENDIAN
1367 for (i=0;i<dstW;i++){
1368 // vertical linear interpolation && yuv2rgb in a single step:
1369 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1370 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1371 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1372 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1373 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1374 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1378 else if (dstFormat==PIX_FMT_BGR24)
1381 for (i=0;i<dstW;i++){
1382 // vertical linear interpolation && yuv2rgb in a single step:
1383 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1384 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1385 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1386 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1387 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1388 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1392 else if (dstFormat==PIX_FMT_BGR565)
1395 for (i=0;i<dstW;i++){
1396 // vertical linear interpolation && yuv2rgb in a single step:
1397 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1398 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1399 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1401 ((uint16_t*)dest)[i] =
1402 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1403 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1404 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1407 else if (dstFormat==PIX_FMT_BGR555)
1410 for (i=0;i<dstW;i++){
1411 // vertical linear interpolation && yuv2rgb in a single step:
1412 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1413 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1414 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1416 ((uint16_t*)dest)[i] =
1417 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1418 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1419 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1427 switch(c->dstFormat)
1429 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1432 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1433 "mov %4, %%"REG_b" \n\t"
1434 "push %%"REG_BP" \n\t"
1435 YSCALEYUV2RGB(%%REGBP, %5)
1436 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1437 "pop %%"REG_BP" \n\t"
1438 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1440 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1446 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1447 "mov %4, %%"REG_b" \n\t"
1448 "push %%"REG_BP" \n\t"
1449 YSCALEYUV2RGB(%%REGBP, %5)
1450 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1451 "pop %%"REG_BP" \n\t"
1452 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1453 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1457 case PIX_FMT_RGB555:
1459 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1460 "mov %4, %%"REG_b" \n\t"
1461 "push %%"REG_BP" \n\t"
1462 YSCALEYUV2RGB(%%REGBP, %5)
1463 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1465 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1466 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1467 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1470 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1471 "pop %%"REG_BP" \n\t"
1472 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1474 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1478 case PIX_FMT_RGB565:
1480 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1481 "mov %4, %%"REG_b" \n\t"
1482 "push %%"REG_BP" \n\t"
1483 YSCALEYUV2RGB(%%REGBP, %5)
1484 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1486 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1487 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1488 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1491 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1492 "pop %%"REG_BP" \n\t"
1493 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1494 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1498 case PIX_FMT_YUYV422:
1500 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501 "mov %4, %%"REG_b" \n\t"
1502 "push %%"REG_BP" \n\t"
1503 YSCALEYUV2PACKED(%%REGBP, %5)
1504 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1505 "pop %%"REG_BP" \n\t"
1506 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1507 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C)
1518 * YV12 to RGB without scaling or interpolating
1520 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1521 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1523 const int yalpha1=0;
1526 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1527 const int yalpha= 4096; //FIXME ...
1529 if (flags&SWS_FULL_CHR_H_INT)
1531 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1536 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1542 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1543 "mov %4, %%"REG_b" \n\t"
1544 "push %%"REG_BP" \n\t"
1545 YSCALEYUV2RGB1(%%REGBP, %5)
1546 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1547 "pop %%"REG_BP" \n\t"
1548 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1550 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1556 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1557 "mov %4, %%"REG_b" \n\t"
1558 "push %%"REG_BP" \n\t"
1559 YSCALEYUV2RGB1(%%REGBP, %5)
1560 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1561 "pop %%"REG_BP" \n\t"
1562 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1564 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1568 case PIX_FMT_RGB555:
1570 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1571 "mov %4, %%"REG_b" \n\t"
1572 "push %%"REG_BP" \n\t"
1573 YSCALEYUV2RGB1(%%REGBP, %5)
1574 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1576 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1577 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1578 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1580 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1581 "pop %%"REG_BP" \n\t"
1582 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1584 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1588 case PIX_FMT_RGB565:
1590 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1591 "mov %4, %%"REG_b" \n\t"
1592 "push %%"REG_BP" \n\t"
1593 YSCALEYUV2RGB1(%%REGBP, %5)
1594 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1596 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1597 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1598 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1601 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1602 "pop %%"REG_BP" \n\t"
1603 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1605 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1609 case PIX_FMT_YUYV422:
1611 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1612 "mov %4, %%"REG_b" \n\t"
1613 "push %%"REG_BP" \n\t"
1614 YSCALEYUV2PACKED1(%%REGBP, %5)
1615 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1616 "pop %%"REG_BP" \n\t"
1617 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1619 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1631 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1632 "mov %4, %%"REG_b" \n\t"
1633 "push %%"REG_BP" \n\t"
1634 YSCALEYUV2RGB1b(%%REGBP, %5)
1635 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1636 "pop %%"REG_BP" \n\t"
1637 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1639 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1645 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1646 "mov %4, %%"REG_b" \n\t"
1647 "push %%"REG_BP" \n\t"
1648 YSCALEYUV2RGB1b(%%REGBP, %5)
1649 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1650 "pop %%"REG_BP" \n\t"
1651 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1653 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1657 case PIX_FMT_RGB555:
1659 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1660 "mov %4, %%"REG_b" \n\t"
1661 "push %%"REG_BP" \n\t"
1662 YSCALEYUV2RGB1b(%%REGBP, %5)
1663 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1665 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1666 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1667 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1669 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1670 "pop %%"REG_BP" \n\t"
1671 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1673 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1677 case PIX_FMT_RGB565:
1679 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1680 "mov %4, %%"REG_b" \n\t"
1681 "push %%"REG_BP" \n\t"
1682 YSCALEYUV2RGB1b(%%REGBP, %5)
1683 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1685 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1686 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1687 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1690 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1691 "pop %%"REG_BP" \n\t"
1692 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1694 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1698 case PIX_FMT_YUYV422:
1700 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1701 "mov %4, %%"REG_b" \n\t"
1702 "push %%"REG_BP" \n\t"
1703 YSCALEYUV2PACKED1b(%%REGBP, %5)
1704 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1705 "pop %%"REG_BP" \n\t"
1706 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1708 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1714 #endif /* HAVE_MMX */
1717 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C)
1719 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C)
1723 //FIXME yuy2* can read up to 7 samples too much
/* Extract the luma (Y) bytes of packed YUYV (YUY2) input into a tight
 * plane of `width` bytes.  MMX path: bm01010101 masks the even bytes
 * (Y) of 16 input bytes, packuswb compresses them to 8 output bytes per
 * iteration.  The loop index starts at -width and counts toward zero
 * against biased base pointers (src+width*2, dst+width), so the loop
 * condition is a plain sign test.  A scalar C fallback follows. */
1725 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1729 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1730 "mov %0, %%"REG_a" \n\t"
1732 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1733 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
/* keep even bytes (Y); zero the interleaved chroma bytes */
1734 "pand %%mm2, %%mm0 \n\t"
1735 "pand %%mm2, %%mm1 \n\t"
1736 "packuswb %%mm1, %%mm0 \n\t"
1737 "movq %%mm0, (%2, %%"REG_a") \n\t"
1738 "add $8, %%"REG_a" \n\t"
1740 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1745 for (i=0; i<width; i++)
/* De-interleave the chroma of packed YUYV into separate U and V planes
 * (4 U/V pairs per iteration).  MMX path: psrlw $8 drops the Y bytes
 * leaving UV words, then a second shift/mask split separates V (odd
 * chroma byte) from U (even chroma byte, via bm01010101).
 * src2 is unused; the trailing assert documents that both field
 * pointers are required to alias (src1 == src2). */
1750 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1754 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1755 "mov %0, %%"REG_a" \n\t"
1757 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1758 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
/* discard Y bytes: words now hold the chroma samples */
1759 "psrlw $8, %%mm0 \n\t"
1760 "psrlw $8, %%mm1 \n\t"
1761 "packuswb %%mm1, %%mm0 \n\t"
1762 "movq %%mm0, %%mm1 \n\t"
1763 "psrlw $8, %%mm0 \n\t"
1764 "pand %%mm4, %%mm1 \n\t"
1765 "packuswb %%mm0, %%mm0 \n\t"
1766 "packuswb %%mm1, %%mm1 \n\t"
1767 "movd %%mm0, (%3, %%"REG_a") \n\t"
1768 "movd %%mm1, (%2, %%"REG_a") \n\t"
1769 "add $4, %%"REG_a" \n\t"
1771 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1776 for (i=0; i<width; i++)
1778 dstU[i]= src1[4*i + 1];
1779 dstV[i]= src1[4*i + 3];
1782 assert(src1 == src2);
1785 /* This is almost identical to the previous, and exists only because
1786  * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extract the luma bytes of packed UYVY input (Y sits in the odd bytes,
 * so psrlw $8 replaces the even-byte mask used by yuy2ToY).  Same
 * negative-index loop over biased base pointers; scalar fallback below. */
1787 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1791 "mov %0, %%"REG_a" \n\t"
1793 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1794 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
/* shift the high (odd) bytes down: keeps Y, drops chroma */
1795 "psrlw $8, %%mm0 \n\t"
1796 "psrlw $8, %%mm1 \n\t"
1797 "packuswb %%mm1, %%mm0 \n\t"
1798 "movq %%mm0, (%2, %%"REG_a") \n\t"
1799 "add $8, %%"REG_a" \n\t"
1801 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1806 for (i=0; i<width; i++)
/* De-interleave the chroma of packed UYVY into separate U and V planes.
 * Mirror image of yuy2ToUV: here chroma occupies the EVEN bytes, so the
 * first stage uses pand bm01010101 instead of psrlw $8.  src2 must
 * alias src1 (asserted at the end); scalar fallback reads U at byte 0
 * and V at byte 2 of each 4-byte group. */
1811 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1815 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1816 "mov %0, %%"REG_a" \n\t"
1818 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1819 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
/* keep even bytes: chroma only */
1820 "pand %%mm4, %%mm0 \n\t"
1821 "pand %%mm4, %%mm1 \n\t"
1822 "packuswb %%mm1, %%mm0 \n\t"
1823 "movq %%mm0, %%mm1 \n\t"
1824 "psrlw $8, %%mm0 \n\t"
1825 "pand %%mm4, %%mm1 \n\t"
1826 "packuswb %%mm0, %%mm0 \n\t"
1827 "packuswb %%mm1, %%mm1 \n\t"
1828 "movd %%mm0, (%3, %%"REG_a") \n\t"
1829 "movd %%mm1, (%2, %%"REG_a") \n\t"
1830 "add $4, %%"REG_a" \n\t"
1832 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1837 for (i=0; i<width; i++)
1839 dstU[i]= src1[4*i + 0];
1840 dstV[i]= src1[4*i + 2];
1843 assert(src1 == src2);
/* Template generating scalar packed-RGB -> luma converters.  Each
 * instantiation supplies the pixel word type, per-channel shift/mask
 * extraction, pre-scaled RY/GY/BY coefficients and the final shift S.
 * The constant 33<<((S)-1) is presumably the +16 luma offset folded
 * with +0.5 rounding (33 == 2*16 + 1) in S-bit fixed point — confirm
 * against the RY/GY/BY coefficient scale. */
1846 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1847 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width)\
1850 for (i=0; i<width; i++)\
1852 int b= (((type*)src)[i]>>shb)&maskb;\
1853 int g= (((type*)src)[i]>>shg)&maskg;\
1854 int r= (((type*)src)[i]>>shr)&maskr;\
1856 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
/* 32-bit formats pre-shift the coefficients by 8 (channels are 8 bits);
 * 16/15-bit formats fold the channel position into the coefficient shift. */
1860 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1861 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1862 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
1863 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
1864 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1865 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
/* Scalar BGR32 -> U/V conversion, one chroma sample per input pixel.
 * Channel layout per the masks: B in bits 0-7, G in 8-15, R in 16-23.
 * 257<<(RGB2YUV_SHIFT-1) is presumably the +128 chroma offset plus
 * rounding (257 == 2*128 + 1) — confirm against RU/GU/BU scale.
 * src2 must alias src1 (asserted). */
1868 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1871 assert(src1 == src2);
1872 for (i=0; i<width; i++)
1874 int b= ((uint32_t*)src1)[i]&0xFF;
1875 int g= (((uint32_t*)src1)[i]>>8)&0xFF;
1876 int r= (((uint32_t*)src1)[i]>>16)&0xFF;
1878 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1879 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
/* BGR32 -> U/V with 2:1 horizontal downsampling: each output chroma
 * sample comes from the SUM of two adjacent pixels.  l accumulates the
 * B and R channels (masked 0xFF00FF) and h the G channel, so each
 * summed component fits 9 bits; b takes the low 10 bits of l.  The
 * extra factor of two is compensated by the final >>(RGB2YUV_SHIFT+1),
 * and 257<<RGB2YUV_SHIFT doubles the usual 128.5 offset accordingly.
 * src2 must alias src1 (asserted). */
1883 static inline void RENAME(bgr32ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1886 assert(src1 == src2);
1887 for (i=0; i<width; i++)
1889 const int a= ((uint32_t*)src1)[2*i+0];
1890 const int e= ((uint32_t*)src1)[2*i+1];
1891 const int l= (a&0xFF00FF) + (e&0xFF00FF);
1892 const int h= (a&0x00FF00) + (e&0x00FF00);
1893 const int b= l&0x3FF;
1897 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1898 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
/* MMX BGR24/RGB24 -> luma: handles both byte orders by loading the
 * matching coefficient pair (ff_bgr24toY*/ff_rgb24toY*) into mm5/mm6,
 * then runs one shared inner loop.  Four pixels (12 bytes) are
 * processed per iteration: overlapping movd loads at offsets 0/2/6/8
 * give word-aligned triples, pmaddwd applies the split coefficients,
 * and the 15-bit fixed-point sums are offset, shifted and packed to
 * 4 output bytes.  Negative-index loop over biased dst pointer. */
1903 static inline void bgr24ToY_mmx(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1906 if(srcFormat == PIX_FMT_BGR24){
1908 "movq "MANGLE(ff_bgr24toY1Coeff)", %mm5 \n\t"
1909 "movq "MANGLE(ff_bgr24toY2Coeff)", %mm6 \n\t"
1913 "movq "MANGLE(ff_rgb24toY1Coeff)", %mm5 \n\t"
1914 "movq "MANGLE(ff_rgb24toY2Coeff)", %mm6 \n\t"
1919 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1920 "mov %2, %%"REG_a" \n\t"
1921 "pxor %%mm7, %%mm7 \n\t"
1923 PREFETCH" 64(%0) \n\t"
1924 "movd (%0), %%mm0 \n\t"
1925 "movd 2(%0), %%mm1 \n\t"
1926 "movd 6(%0), %%mm2 \n\t"
1927 "movd 8(%0), %%mm3 \n\t"
/* zero-extend bytes to words so pmaddwd can multiply-accumulate */
1929 "punpcklbw %%mm7, %%mm0 \n\t"
1930 "punpcklbw %%mm7, %%mm1 \n\t"
1931 "punpcklbw %%mm7, %%mm2 \n\t"
1932 "punpcklbw %%mm7, %%mm3 \n\t"
1933 "pmaddwd %%mm5, %%mm0 \n\t"
1934 "pmaddwd %%mm6, %%mm1 \n\t"
1935 "pmaddwd %%mm5, %%mm2 \n\t"
1936 "pmaddwd %%mm6, %%mm3 \n\t"
1937 "paddd %%mm1, %%mm0 \n\t"
1938 "paddd %%mm3, %%mm2 \n\t"
1939 "paddd %%mm4, %%mm0 \n\t"
1940 "paddd %%mm4, %%mm2 \n\t"
1941 "psrad $15, %%mm0 \n\t"
1942 "psrad $15, %%mm2 \n\t"
1943 "packssdw %%mm2, %%mm0 \n\t"
1944 "packuswb %%mm0, %%mm0 \n\t"
1945 "movd %%mm0, (%1, %%"REG_a") \n\t"
1946 "add $4, %%"REG_a" \n\t"
1949 : "r" (dst+width), "g" (-width)
/* MMX BGR24/RGB24 -> U and V planes in one pass.  The coefficient
 * table is the memory operand %4 (ff_bgr24toUV[...], indexed by byte
 * order); offsets 0/8 select U coefficients, 16/24 select V (the 24+%4
 * row is cached in mm6).  Two pixel pairs per iteration: pair one feeds
 * mm0 (U) / mm2 (V), pair two feeds mm1 (U) / mm4 (V); all four sums
 * get the shared ff_bgr24toUVOffset, are reduced from 15-bit fixed
 * point, and packed into one 4-byte movd per chroma plane. */
1954 static inline void bgr24ToUV_mmx(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1957 "movq 24+%4, %%mm6 \n\t"
1958 "mov %3, %%"REG_a" \n\t"
1959 "pxor %%mm7, %%mm7 \n\t"
1961 PREFETCH" 64(%0) \n\t"
1962 "movd (%0), %%mm0 \n\t"
1963 "movd 2(%0), %%mm1 \n\t"
1964 "punpcklbw %%mm7, %%mm0 \n\t"
1965 "punpcklbw %%mm7, %%mm1 \n\t"
1966 "movq %%mm0, %%mm2 \n\t"
1967 "movq %%mm1, %%mm3 \n\t"
1968 "pmaddwd %4, %%mm0 \n\t"
1969 "pmaddwd 8+%4, %%mm1 \n\t"
1970 "pmaddwd 16+%4, %%mm2 \n\t"
1971 "pmaddwd %%mm6, %%mm3 \n\t"
1972 "paddd %%mm1, %%mm0 \n\t"
1973 "paddd %%mm3, %%mm2 \n\t"
/* second pixel pair, same coefficient layout */
1975 "movd 6(%0), %%mm1 \n\t"
1976 "movd 8(%0), %%mm3 \n\t"
1978 "punpcklbw %%mm7, %%mm1 \n\t"
1979 "punpcklbw %%mm7, %%mm3 \n\t"
1980 "movq %%mm1, %%mm4 \n\t"
1981 "movq %%mm3, %%mm5 \n\t"
1982 "pmaddwd %4, %%mm1 \n\t"
1983 "pmaddwd 8+%4, %%mm3 \n\t"
1984 "pmaddwd 16+%4, %%mm4 \n\t"
1985 "pmaddwd %%mm6, %%mm5 \n\t"
1986 "paddd %%mm3, %%mm1 \n\t"
1987 "paddd %%mm5, %%mm4 \n\t"
1989 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1990 "paddd %%mm3, %%mm0 \n\t"
1991 "paddd %%mm3, %%mm2 \n\t"
1992 "paddd %%mm3, %%mm1 \n\t"
1993 "paddd %%mm3, %%mm4 \n\t"
1994 "psrad $15, %%mm0 \n\t"
1995 "psrad $15, %%mm2 \n\t"
1996 "psrad $15, %%mm1 \n\t"
1997 "psrad $15, %%mm4 \n\t"
1998 "packssdw %%mm1, %%mm0 \n\t"
1999 "packssdw %%mm4, %%mm2 \n\t"
2000 "packuswb %%mm0, %%mm0 \n\t"
2001 "packuswb %%mm2, %%mm2 \n\t"
2002 "movd %%mm0, (%1, %%"REG_a") \n\t"
2003 "movd %%mm2, (%2, %%"REG_a") \n\t"
2004 "add $4, %%"REG_a" \n\t"
2007 : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
/* BGR24 -> luma wrapper: dispatches to the MMX kernel when built with
 * HAVE_MMX (the closing #endif is visible below); otherwise falls back
 * to a scalar loop using RY/GY/BY with the 16.5 offset (33<<(SHIFT-1)).
 * The fallback's b/g/r byte extraction is not shown in this chunk. */
2013 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
2016 bgr24ToY_mmx(dst, src, width, PIX_FMT_BGR24);
2019 for (i=0; i<width; i++)
2025 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2027 #endif /* HAVE_MMX */
/* BGR24 -> U/V wrapper: MMX kernel under HAVE_MMX, scalar fallback
 * otherwise.  BGR24 byte order: B, G, R.  src2 must alias src1
 * (asserted after the conditional body). */
2030 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2033 bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_BGR24);
2036 for (i=0; i<width; i++)
2038 int b= src1[3*i + 0];
2039 int g= src1[3*i + 1];
2040 int r= src1[3*i + 2];
2042 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2043 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2045 #endif /* HAVE_MMX */
2046 assert(src1 == src2);
/* BGR24 -> U/V with 2:1 horizontal downsampling: each component is the
 * SUM of the two adjacent pixels of the pair (byte offsets +0/+3, +1/+4,
 * +2/+5), halved by the final >>(RGB2YUV_SHIFT+1); the doubled offset
 * 257<<RGB2YUV_SHIFT matches the doubled samples.  src2 must alias
 * src1 (asserted). */
2049 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2052 for (i=0; i<width; i++)
2054 int b= src1[6*i + 0] + src1[6*i + 3];
2055 int g= src1[6*i + 1] + src1[6*i + 4];
2056 int r= src1[6*i + 2] + src1[6*i + 5];
2058 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2059 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2061 assert(src1 == src2);
/* RGB565 -> U/V.  R is the top 5 bits (d>>11); the 2* factors on the
 * R and B terms bring the 5-bit channels to the same scale as the
 * 6-bit G channel, with the shifts adjusted to match. */
2064 static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2068 for (i=0; i<width; i++)
2070 int d= ((uint16_t*)src1)[i];
2073 int r= (d>>11)&0x1F;
2075 dstU[i]= (2*RU*r + GU*g + 2*BU*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT-2);
2076 dstV[i]= (2*RV*r + GV*g + 2*BV*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT-2);
/* RGB565 -> U/V with 2:1 horizontal downsampling.  Two 16-bit pixels
 * are summed without carries by splitting the 32-bit word into the
 * interleaved field sets 0x07E0F81F / 0x07C0F83F (shifted by 5), then
 * realigning with dh2.  After the sum each field holds up to 6/7 bits,
 * hence the wider 0x7F mask on r. */
2080 static inline void RENAME(rgb16ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2084 for (i=0; i<width; i++)
2086 int d0= ((uint32_t*)src1)[i];
2088 int dl= (d0&0x07E0F81F);
2089 int dh= ((d0>>5)&0x07C0F83F);
2091 int dh2= (dh>>11) + (dh<<21);
2095 int r= (d>>11)&0x7F;
2097 dstU[i]= (2*RU*r + GU*g + 2*BU*b + (257<<(RGB2YUV_SHIFT-2)))>>(RGB2YUV_SHIFT+1-2);
2098 dstV[i]= (2*RV*r + GV*g + 2*BV*b + (257<<(RGB2YUV_SHIFT-2)))>>(RGB2YUV_SHIFT+1-2);
/* RGB555 -> U/V.  All three channels are 5 bits (R at d>>10), so no
 * per-channel 2* rebalancing is needed; only the rounding/shift
 * constants differ from the 565 variant. */
2102 static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2106 for (i=0; i<width; i++)
2108 int d= ((uint16_t*)src1)[i];
2111 int r= (d>>10)&0x1F;
2113 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-4)))>>(RGB2YUV_SHIFT-3);
2114 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-4)))>>(RGB2YUV_SHIFT-3);
/* RGB555 -> U/V with 2:1 horizontal downsampling; same carry-free
 * two-pixel summing trick as rgb16ToUV_half, with 555 field masks
 * (0x03E07C1F / 0x03E0F81F).  r uses the widened 0x7F mask because it
 * now holds the sum of two 5-bit samples. */
2118 static inline void RENAME(rgb15ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2122 for (i=0; i<width; i++)
2124 int d0= ((uint32_t*)src1)[i];
2126 int dl= (d0&0x03E07C1F);
2127 int dh= ((d0>>5)&0x03E0F81F);
2129 int dh2= (dh>>11) + (dh<<21);
2133 int r= (d>>10)&0x7F;
2135 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT+1-3);
2136 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT+1-3);
/* RGB32 -> U/V: identical to bgr32ToUV except the R and B channel
 * positions are swapped (R in bits 0-7, B in 16-23). */
2140 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2144 for (i=0; i<width; i++)
2146 int r= ((uint32_t*)src1)[i]&0xFF;
2147 int g= (((uint32_t*)src1)[i]>>8)&0xFF;
2148 int b= (((uint32_t*)src1)[i]>>16)&0xFF;
2150 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2151 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
/* RGB32 -> U/V with 2:1 horizontal downsampling: mirror of
 * bgr32ToUV_half with R/B swapped, so here the low 10 bits of l hold
 * the summed R channel.  Summed components are halved by the final
 * >>(RGB2YUV_SHIFT+1) with the matching doubled offset. */
2155 static inline void RENAME(rgb32ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2159 for (i=0; i<width; i++)
2161 const int a= ((uint32_t*)src1)[2*i+0];
2162 const int e= ((uint32_t*)src1)[2*i+1];
2163 const int l= (a&0xFF00FF) + (e&0xFF00FF);
2164 const int h= (a&0x00FF00) + (e&0x00FF00);
2165 const int r= l&0x3FF;
2169 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1);
2170 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1);
// Convert one line of 24-bit packed pixels to luma. The bgr24ToY_mmx call
// and the scalar loop are alternative paths selected by preprocessor
// conditionals not visible in this excerpt (MMX build vs. plain C) --
// confirm against the full file.
2174 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width)
2177 bgr24ToY_mmx(dst, src, width, PIX_FMT_RGB24);
2180 for (i=0; i<width; i++)
2186 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
// Convert one line of 24-bit packed pixels to full-resolution U/V.
// The bgr24ToUV_mmx call and the scalar loop are alternative paths behind
// preprocessor conditionals hidden in this excerpt -- confirm in full file.
2191 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2196 bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_RGB24);
2198 for (i=0; i<width; i++)
2200     int r= src1[3*i + 0];
2201     int g= src1[3*i + 1];
2202     int b= src1[3*i + 2];
2204     dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2205     dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
// Convert one line of 24-bit packed pixels to 2:1 horizontally subsampled
// U/V: each chroma sample averages pixel 2*i and pixel 2*i+1. The doubled
// rounding constant (257<<RGB2YUV_SHIFT) and the +1 in the final shift
// account for the component sums being twice a single pixel's value.
// BUG FIX: the second addend previously re-read the SAME bytes
// (src1[6*i+0/1/2]), doubling pixel 2*i instead of averaging in pixel
// 2*i+1 at offsets 6*i+3/4/5 -- which shifted chroma by half a pixel.
2210 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2214 for (i=0; i<width; i++)
2216     int r= src1[6*i + 0] + src1[6*i + 3]; // sum of the two pixels' first components
2217     int g= src1[6*i + 1] + src1[6*i + 4];
2218     int b= src1[6*i + 2] + src1[6*i + 5];
2220     dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2221     dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
// Convert one line of 16-bit (565) pixels to full-resolution U/V.
// The r/g coefficient terms are doubled because the 5-bit components span
// half the range of the 6-bit g; the reduced final shift compensates.
2225 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2228 assert(src1 == src2); // chroma sources must be the same line for packed RGB input
2229 for (i=0; i<width; i++)
2231     int d= ((uint16_t*)src1)[i];
2234     int b= (d>>11)&0x1F; // top 5 bits of the 565 word
2236     dstU[i]= (2*RU*r + GU*g + 2*BU*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT-2);
2237     dstV[i]= (2*RV*r + GV*g + 2*BV*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT-2);
// Convert one line of 16-bit (565) pixels to 2:1 horizontally subsampled
// U/V. One uint32_t load covers two pixels; the 16-bit rotate plus masked
// add sums both pixels' components in separate bit lanes of d.
2241 static inline void RENAME(bgr16ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2244 assert(src1 == src2);
2245 for (i=0; i<width; i++)
2247     int d0= ((uint32_t*)src1)[i]; // two packed 565 pixels
2249     int dl= (d0&0x07E0F81F);
2250     int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F); // add the swapped halves lane-wise
2253     int b= (d>>11)&0x3F; // 6 bits: sum of two 5-bit components
2255     dstU[i]= (2*RU*r + GU*g + 2*BU*b + (257<<(RGB2YUV_SHIFT-2)))>>(RGB2YUV_SHIFT+1-2);
2256     dstV[i]= (2*RV*r + GV*g + 2*BV*b + (257<<(RGB2YUV_SHIFT-2)))>>(RGB2YUV_SHIFT+1-2);
// Convert one line of 15-bit (x555) pixels to full-resolution U/V.
// All three components are 5 bits, so no per-component coefficient
// doubling is needed (contrast with bgr16ToUV above).
2260 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2263 assert(src1 == src2);
2264 for (i=0; i<width; i++)
2266     int d= ((uint16_t*)src1)[i];
2269     int b= (d>>10)&0x1F; // bits 10..14 of the 15-bit word
2271     dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-4)))>>(RGB2YUV_SHIFT-3);
2272     dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-4)))>>(RGB2YUV_SHIFT-3);
// Convert one line of 15-bit pixels to 2:1 horizontally subsampled U/V,
// summing two adjacent pixels lane-wise inside one 32-bit register
// (same trick as bgr16ToUV_half, with 5-5-5 masks).
2276 static inline void RENAME(bgr15ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2279 assert(src1 == src2);
2280 for (i=0; i<width; i++)
2282     int d0= ((uint32_t*)src1)[i]; // two packed 15-bit pixels
2284     int dl= (d0&0x03E07C1F);
2285     int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F); // lane-wise pixel sum
2288     int b= (d>>10)&0x3F; // 6 bits: sum of two 5-bit components
2290     dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT+1-3);
2291     dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT+1-3);
// Paletted/8-bit input to luma: look each source byte up in pal[] and
// keep the low byte of the palette entry as Y (the palette is expected
// to be pre-converted to YUV -- confirm where pal is built).
2295 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2298 for (i=0; i<width; i++)
2302     dst[i]= pal[d] & 0xFF; // low byte of the palette entry
// Paletted/8-bit input to chroma: fetch the palette entry per pixel; the
// U/V extraction from p happens on lines not visible in this excerpt
// (presumably bytes 1 and 2 of the entry -- confirm in full file).
2306 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2309 assert(src1 == src2);
2310 for (i=0; i<width; i++)
2312     int p= pal[src1[i]];
2319 // bilinear / bicubic scaling
// Horizontal scaler: for each output sample i, dst[i] is the dot product
// of filterSize source pixels starting at filterPos[i] with the 16-bit
// coefficients filter[filterSize*i ..], right-shifted and clipped to
// [0, 2^15-1]. Dedicated MMX inner loops exist for filterSize 4 and 8;
// a generic MMX loop and a plain C loop (plus an AltiVec call) cover the
// rest -- the selecting preprocessor conditionals are hidden in this
// excerpt. The asm keeps a negative byte counter in REG_BP/%0 and counts
// up toward zero so the loop condition is a plain sign test.
2320 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2321 int16_t *filter, int16_t *filterPos, long filterSize)
2324 assert(filterSize % 4 == 0 && filterSize>0);
2325 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
// counter runs from -2*dstW up to 0; filterPos is rebased so it can be
// indexed by the (negative) counter directly.
2327 long counter= -2*dstW;
2329 filterPos-= counter/2;
2333 "push %%"REG_b" \n\t"
2335 "pxor %%mm7, %%mm7 \n\t"
2336 "movq "MANGLE(w02)", %%mm6 \n\t"
2337 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2338 "mov %%"REG_a", %%"REG_BP" \n\t"
2341 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2342 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2343 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2344 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2345 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2346 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2347 "punpcklbw %%mm7, %%mm0 \n\t"
2348 "punpcklbw %%mm7, %%mm2 \n\t"
2349 "pmaddwd %%mm1, %%mm0 \n\t"
2350 "pmaddwd %%mm2, %%mm3 \n\t"
2351 "psrad $8, %%mm0 \n\t"
2352 "psrad $8, %%mm3 \n\t"
2353 "packssdw %%mm3, %%mm0 \n\t"
2354 "pmaddwd %%mm6, %%mm0 \n\t"
2355 "packssdw %%mm0, %%mm0 \n\t"
2356 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2357 "add $4, %%"REG_BP" \n\t"
2360 "pop %%"REG_BP" \n\t"
2362 "pop %%"REG_b" \n\t"
2365 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2371 else if (filterSize==8)
2373 long counter= -2*dstW;
2375 filterPos-= counter/2;
2379 "push %%"REG_b" \n\t"
2381 "pxor %%mm7, %%mm7 \n\t"
2382 "movq "MANGLE(w02)", %%mm6 \n\t"
2383 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2384 "mov %%"REG_a", %%"REG_BP" \n\t"
2387 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2388 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2389 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2390 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2391 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2392 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2393 "punpcklbw %%mm7, %%mm0 \n\t"
2394 "punpcklbw %%mm7, %%mm2 \n\t"
2395 "pmaddwd %%mm1, %%mm0 \n\t"
2396 "pmaddwd %%mm2, %%mm3 \n\t"
2398 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2399 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2400 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2401 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2402 "punpcklbw %%mm7, %%mm4 \n\t"
2403 "punpcklbw %%mm7, %%mm2 \n\t"
2404 "pmaddwd %%mm1, %%mm4 \n\t"
2405 "pmaddwd %%mm2, %%mm5 \n\t"
2406 "paddd %%mm4, %%mm0 \n\t"
2407 "paddd %%mm5, %%mm3 \n\t"
2409 "psrad $8, %%mm0 \n\t"
2410 "psrad $8, %%mm3 \n\t"
2411 "packssdw %%mm3, %%mm0 \n\t"
2412 "pmaddwd %%mm6, %%mm0 \n\t"
2413 "packssdw %%mm0, %%mm0 \n\t"
2414 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2415 "add $4, %%"REG_BP" \n\t"
2418 "pop %%"REG_BP" \n\t"
2420 "pop %%"REG_b" \n\t"
2423 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// generic MMX path for any filterSize that is a multiple of 4:
// the inner loop walks src in 4-pixel steps until REG_c reaches
// 'offset' (src+filterSize), accumulating into mm4/mm5.
2431 uint8_t *offset = src+filterSize;
2432 long counter= -2*dstW;
2433 //filter-= counter*filterSize/2;
2434 filterPos-= counter/2;
2437 "pxor %%mm7, %%mm7 \n\t"
2438 "movq "MANGLE(w02)", %%mm6 \n\t"
2441 "mov %2, %%"REG_c" \n\t"
2442 "movzwl (%%"REG_c", %0), %%eax \n\t"
2443 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2444 "mov %5, %%"REG_c" \n\t"
2445 "pxor %%mm4, %%mm4 \n\t"
2446 "pxor %%mm5, %%mm5 \n\t"
2448 "movq (%1), %%mm1 \n\t"
2449 "movq (%1, %6), %%mm3 \n\t"
2450 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2451 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2452 "punpcklbw %%mm7, %%mm0 \n\t"
2453 "punpcklbw %%mm7, %%mm2 \n\t"
2454 "pmaddwd %%mm1, %%mm0 \n\t"
2455 "pmaddwd %%mm2, %%mm3 \n\t"
2456 "paddd %%mm3, %%mm5 \n\t"
2457 "paddd %%mm0, %%mm4 \n\t"
2459 "add $4, %%"REG_c" \n\t"
2460 "cmp %4, %%"REG_c" \n\t"
2463 "psrad $8, %%mm4 \n\t"
2464 "psrad $8, %%mm5 \n\t"
2465 "packssdw %%mm5, %%mm4 \n\t"
2466 "pmaddwd %%mm6, %%mm4 \n\t"
2467 "packssdw %%mm4, %%mm4 \n\t"
2468 "mov %3, %%"REG_a" \n\t"
2469 "movd %%mm4, (%%"REG_a", %0) \n\t"
2473 : "+r" (counter), "+r" (filter)
2474 : "m" (filterPos), "m" (dst), "m"(offset),
2475 "m" (src), "r" (filterSize*2)
2476 : "%"REG_a, "%"REG_c, "%"REG_d
2481 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
// plain C reference path (selected when neither MMX nor AltiVec applies)
2484 for (i=0; i<dstW; i++)
2487     int srcPos= filterPos[i];
2489     //printf("filterPos: %d\n", filterPos[i]);
2490     for (j=0; j<filterSize; j++)
2492         //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2493         val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2495     //filter += hFilterSize;
2496     dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2499 #endif /* HAVE_ALTIVEC */
2500 #endif /* HAVE_MMX */
2499 #endif /* HAVE_ALTIVEC */
2500 #endif /* HAVE_MMX */
2502 // *** horizontal scale Y line to temp buffer
// Scale one luma line horizontally into dst (15-bit fixed point).
// Step 1: if the input is not already a plain 8-bit gray/luma line,
// convert it into formatConvBuffer and point src there. Step 2: scale,
// choosing between the generic hScale filter, the MMX2 "funny code"
// (runtime-generated scaler), a non-MMX x86 asm bilinear loop, or plain
// C bilinear -- the selecting #if/#ifdef lines are hidden in this
// excerpt. Step 3: optional luma range conversion (JPEG <-> MPEG levels).
2503 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2504 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2505 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2506 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2507 int32_t *mmx2FilterPos, uint8_t *pal)
2509 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2511 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2512 src= formatConvBuffer;
2514 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2516 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2517 src= formatConvBuffer;
2519 else if (srcFormat==PIX_FMT_RGB32)
2521 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2522 src= formatConvBuffer;
2524 else if (srcFormat==PIX_FMT_RGB32_1)
2526 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
2527 src= formatConvBuffer;
2529 else if (srcFormat==PIX_FMT_BGR24)
2531 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2532 src= formatConvBuffer;
2534 else if (srcFormat==PIX_FMT_BGR565)
2536 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2537 src= formatConvBuffer;
2539 else if (srcFormat==PIX_FMT_BGR555)
2541 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2542 src= formatConvBuffer;
2544 else if (srcFormat==PIX_FMT_BGR32)
2546 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2547 src= formatConvBuffer;
2549 else if (srcFormat==PIX_FMT_BGR32_1)
2551 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
2552 src= formatConvBuffer;
2554 else if (srcFormat==PIX_FMT_RGB24)
2556 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2557 src= formatConvBuffer;
2559 else if (srcFormat==PIX_FMT_RGB565)
2561 RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2562 src= formatConvBuffer;
2564 else if (srcFormat==PIX_FMT_RGB555)
2566 RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2567 src= formatConvBuffer;
2569 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2571 RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
2572 src= formatConvBuffer;
2576 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2577 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2579 if (!(flags&SWS_FAST_BILINEAR))
2582 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2584 else // fast bilinear upscale / crap downscale
2586 #if defined(ARCH_X86)
2590 uint64_t ebxsave __attribute__((aligned(8)));
2596 "mov %%"REG_b", %5 \n\t"
2598 "pxor %%mm7, %%mm7 \n\t"
2599 "mov %0, %%"REG_c" \n\t"
2600 "mov %1, %%"REG_D" \n\t"
2601 "mov %2, %%"REG_d" \n\t"
2602 "mov %3, %%"REG_b" \n\t"
2603 "xor %%"REG_a", %%"REG_a" \n\t" // i
2604 PREFETCH" (%%"REG_c") \n\t"
2605 PREFETCH" 32(%%"REG_c") \n\t"
2606 PREFETCH" 64(%%"REG_c") \n\t"
2610 #define FUNNY_Y_CODE \
2611 "movl (%%"REG_b"), %%esi \n\t"\
2613 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2614 "add %%"REG_S", %%"REG_c" \n\t"\
2615 "add %%"REG_a", %%"REG_D" \n\t"\
2616 "xor %%"REG_a", %%"REG_a" \n\t"\
2620 #define FUNNY_Y_CODE \
2621 "movl (%%"REG_b"), %%esi \n\t"\
2623 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2624 "add %%"REG_a", %%"REG_D" \n\t"\
2625 "xor %%"REG_a", %%"REG_a" \n\t"\
2627 #endif /* ARCH_X86_64 */
2639 "mov %5, %%"REG_b" \n\t"
2641 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2646 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// patch up the right edge: positions whose source index would read past
// srcW-1 are filled with the last pixel (<<7 matches the 15-bit output)
2651 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2655 #endif /* HAVE_MMX2 */
2656 long xInc_shr16 = xInc >> 16;
2657 uint16_t xInc_mask = xInc & 0xffff;
2658 //NO MMX just normal asm ...
2660 "xor %%"REG_a", %%"REG_a" \n\t" // i
2661 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2662 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2665 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2666 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2667 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2668 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2669 "shll $16, %%edi \n\t"
2670 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2671 "mov %1, %%"REG_D" \n\t"
2672 "shrl $9, %%esi \n\t"
2673 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2674 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2675 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2677 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2678 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2679 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2680 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2681 "shll $16, %%edi \n\t"
2682 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2683 "mov %1, %%"REG_D" \n\t"
2684 "shrl $9, %%esi \n\t"
2685 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2686 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2687 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2690 "add $2, %%"REG_a" \n\t"
2691 "cmp %2, %%"REG_a" \n\t"
2695 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2696 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2699 } //if MMX2 can't be used
// portable C bilinear fallback: 16.16 fixed-point source position
2703 unsigned int xpos=0;
2704 for (i=0;i<dstWidth;i++)
2706     register unsigned int xx=xpos>>16;
2707     register unsigned int xalpha=(xpos&0xFFFF)>>9;
2708     dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2711 #endif /* defined(ARCH_X86) */
2714 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2716 //FIXME all pal and rgb srcFormats could do this conversion as well
2717 //FIXME all scalers more complex than bilinear could do half of this transform
2719 for (i=0; i<dstWidth; i++)
2720     dst[i]= (dst[i]*14071 + 33561947)>>14;
2722 for (i=0; i<dstWidth; i++)
2723     dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
// Scale one pair of chroma lines horizontally: U goes to dst[0..],
// V to dst[VOFW..]. Mirrors hyscale: optional packed->planar conversion
// into formatConvBuffer (using the *_half variants when the chroma is
// horizontally subsampled), then one of the generic hScale filter, MMX2
// funny code, x86 asm bilinear, or C bilinear paths -- the selecting
// preprocessor lines are hidden in this excerpt. Ends with optional
// chroma range conversion (JPEG <-> MPEG levels).
2728 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2729 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2730 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2731 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2732 int32_t *mmx2FilterPos, uint8_t *pal)
2734 if (srcFormat==PIX_FMT_YUYV422)
2736 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2737 src1= formatConvBuffer;
2738 src2= formatConvBuffer+VOFW;
2740 else if (srcFormat==PIX_FMT_UYVY422)
2742 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2743 src1= formatConvBuffer;
2744 src2= formatConvBuffer+VOFW;
2746 else if (srcFormat==PIX_FMT_RGB32)
2748 if(c->chrSrcHSubSample)
2749 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2751 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2752 src1= formatConvBuffer;
2753 src2= formatConvBuffer+VOFW;
2755 else if (srcFormat==PIX_FMT_RGB32_1)
2757 if(c->chrSrcHSubSample)
2758 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2760 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2761 src1= formatConvBuffer;
2762 src2= formatConvBuffer+VOFW;
2764 else if (srcFormat==PIX_FMT_BGR24)
2766 if(c->chrSrcHSubSample)
2767 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2769 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2770 src1= formatConvBuffer;
2771 src2= formatConvBuffer+VOFW;
2773 else if (srcFormat==PIX_FMT_BGR565)
2775 if(c->chrSrcHSubSample)
2776 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2778 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2779 src1= formatConvBuffer;
2780 src2= formatConvBuffer+VOFW;
2782 else if (srcFormat==PIX_FMT_BGR555)
2784 if(c->chrSrcHSubSample)
2785 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2787 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2788 src1= formatConvBuffer;
2789 src2= formatConvBuffer+VOFW;
2791 else if (srcFormat==PIX_FMT_BGR32)
2793 if(c->chrSrcHSubSample)
2794 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2796 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2797 src1= formatConvBuffer;
2798 src2= formatConvBuffer+VOFW;
2800 else if (srcFormat==PIX_FMT_BGR32_1)
2802 if(c->chrSrcHSubSample)
2803 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2805 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2806 src1= formatConvBuffer;
2807 src2= formatConvBuffer+VOFW;
2809 else if (srcFormat==PIX_FMT_RGB24)
2811 if(c->chrSrcHSubSample)
2812 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2814 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2815 src1= formatConvBuffer;
2816 src2= formatConvBuffer+VOFW;
2818 else if (srcFormat==PIX_FMT_RGB565)
2820 if(c->chrSrcHSubSample)
2821 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2823 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2824 src1= formatConvBuffer;
2825 src2= formatConvBuffer+VOFW;
2827 else if (srcFormat==PIX_FMT_RGB555)
2829 if(c->chrSrcHSubSample)
2830 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2832 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2833 src1= formatConvBuffer;
2834 src2= formatConvBuffer+VOFW;
2836 else if (isGray(srcFormat))
2840 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2842 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2843 src1= formatConvBuffer;
2844 src2= formatConvBuffer+VOFW;
2848 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2849 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2851 if (!(flags&SWS_FAST_BILINEAR))
2854 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2855 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2857 else // fast bilinear upscale / crap downscale
2859 #if defined(ARCH_X86)
2863 uint64_t ebxsave __attribute__((aligned(8)));
2869 "mov %%"REG_b", %6 \n\t"
2871 "pxor %%mm7, %%mm7 \n\t"
2872 "mov %0, %%"REG_c" \n\t"
2873 "mov %1, %%"REG_D" \n\t"
2874 "mov %2, %%"REG_d" \n\t"
2875 "mov %3, %%"REG_b" \n\t"
2876 "xor %%"REG_a", %%"REG_a" \n\t" // i
2877 PREFETCH" (%%"REG_c") \n\t"
2878 PREFETCH" 32(%%"REG_c") \n\t"
2879 PREFETCH" 64(%%"REG_c") \n\t"
2883 #define FUNNY_UV_CODE \
2884 "movl (%%"REG_b"), %%esi \n\t"\
2886 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2887 "add %%"REG_S", %%"REG_c" \n\t"\
2888 "add %%"REG_a", %%"REG_D" \n\t"\
2889 "xor %%"REG_a", %%"REG_a" \n\t"\
2893 #define FUNNY_UV_CODE \
2894 "movl (%%"REG_b"), %%esi \n\t"\
2896 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2897 "add %%"REG_a", %%"REG_D" \n\t"\
2898 "xor %%"REG_a", %%"REG_a" \n\t"\
2900 #endif /* ARCH_X86_64 */
2906 "xor %%"REG_a", %%"REG_a" \n\t" // i
2907 "mov %5, %%"REG_c" \n\t" // src
2908 "mov %1, %%"REG_D" \n\t" // buf1
2909 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2910 PREFETCH" (%%"REG_c") \n\t"
2911 PREFETCH" 32(%%"REG_c") \n\t"
2912 PREFETCH" 64(%%"REG_c") \n\t"
2920 "mov %6, %%"REG_b" \n\t"
2922 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2923 "m" (funnyUVCode), "m" (src2)
2927 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// patch up the right edge for both chroma planes with the last pixel
2932 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2934 //printf("%d %d %d\n", dstWidth, i, srcW);
2935 dst[i] = src1[srcW-1]*128;
2936 dst[i+VOFW] = src2[srcW-1]*128;
2941 #endif /* HAVE_MMX2 */
2942 long xInc_shr16 = (long) (xInc >> 16);
2943 uint16_t xInc_mask = xInc & 0xffff;
2945 "xor %%"REG_a", %%"REG_a" \n\t" // i
2946 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2947 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2950 "mov %0, %%"REG_S" \n\t"
2951 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2952 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2953 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2954 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2955 "shll $16, %%edi \n\t"
2956 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2957 "mov %1, %%"REG_D" \n\t"
2958 "shrl $9, %%esi \n\t"
2959 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2961 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2962 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2963 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2964 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2965 "shll $16, %%edi \n\t"
2966 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2967 "mov %1, %%"REG_D" \n\t"
2968 "shrl $9, %%esi \n\t"
2969 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2971 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2972 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2973 "add $1, %%"REG_a" \n\t"
2974 "cmp %2, %%"REG_a" \n\t"
2977 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2978 which is needed to support GCC 4.0. */
2979 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2980 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2982 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2985 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2988 } //if MMX2 can't be used
// portable C bilinear fallback, 16.16 fixed-point position; two variants
// (xalpha^127 form vs. delta form) are selected by a hidden conditional
2992 unsigned int xpos=0;
2993 for (i=0;i<dstWidth;i++)
2995     register unsigned int xx=xpos>>16;
2996     register unsigned int xalpha=(xpos&0xFFFF)>>9;
2997     dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2998     dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
3000     dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
3001     dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
3005 #endif /* defined(ARCH_X86) */
3007 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
3009 //FIXME all pal and rgb srcFormats could do this conversion as well
3010 //FIXME all scalers more complex than bilinear could do half of this transform
3012 for (i=0; i<dstWidth; i++){
3013     dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
3014     dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
3017 for (i=0; i<dstWidth; i++){
3018     dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
3019     dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
3025 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
3026 int srcSliceH, uint8_t* dst[], int dstStride[]){
3028 /* load a few things into local vars to make the code more readable? and faster */
3029 const int srcW= c->srcW;
3030 const int dstW= c->dstW;
3031 const int dstH= c->dstH;
3032 const int chrDstW= c->chrDstW;
3033 const int chrSrcW= c->chrSrcW;
3034 const int lumXInc= c->lumXInc;
3035 const int chrXInc= c->chrXInc;
3036 const int dstFormat= c->dstFormat;
3037 const int srcFormat= c->srcFormat;
3038 const int flags= c->flags;
3039 const int canMMX2BeUsed= c->canMMX2BeUsed;
3040 int16_t *vLumFilterPos= c->vLumFilterPos;
3041 int16_t *vChrFilterPos= c->vChrFilterPos;
3042 int16_t *hLumFilterPos= c->hLumFilterPos;
3043 int16_t *hChrFilterPos= c->hChrFilterPos;
3044 int16_t *vLumFilter= c->vLumFilter;
3045 int16_t *vChrFilter= c->vChrFilter;
3046 int16_t *hLumFilter= c->hLumFilter;
3047 int16_t *hChrFilter= c->hChrFilter;
3048 int32_t *lumMmxFilter= c->lumMmxFilter;
3049 int32_t *chrMmxFilter= c->chrMmxFilter;
3050 const int vLumFilterSize= c->vLumFilterSize;
3051 const int vChrFilterSize= c->vChrFilterSize;
3052 const int hLumFilterSize= c->hLumFilterSize;
3053 const int hChrFilterSize= c->hChrFilterSize;
3054 int16_t **lumPixBuf= c->lumPixBuf;
3055 int16_t **chrPixBuf= c->chrPixBuf;
3056 const int vLumBufSize= c->vLumBufSize;
3057 const int vChrBufSize= c->vChrBufSize;
3058 uint8_t *funnyYCode= c->funnyYCode;
3059 uint8_t *funnyUVCode= c->funnyUVCode;
3060 uint8_t *formatConvBuffer= c->formatConvBuffer;
3061 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
3062 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
3066 /* vars which will change and which we need to store back in the context */
3068 int lumBufIndex= c->lumBufIndex;
3069 int chrBufIndex= c->chrBufIndex;
3070 int lastInLumBuf= c->lastInLumBuf;
3071 int lastInChrBuf= c->lastInChrBuf;
3073 if (isPacked(c->srcFormat)){
3080 srcStride[2]= srcStride[0];
3082 srcStride[1]<<= c->vChrDrop;
3083 srcStride[2]<<= c->vChrDrop;
3085 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
3086 // (int)dst[0], (int)dst[1], (int)dst[2]);
3088 #if 0 //self test FIXME move to a vfilter or something
3090 static volatile int i=0;
3092 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
3093 selfTest(src, srcStride, c->srcW, c->srcH);
3098 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
3099 //dstStride[0],dstStride[1],dstStride[2]);
3101 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3103 static int firstTime=1; //FIXME move this into the context perhaps
3104 if (flags & SWS_PRINT_INFO && firstTime)
3106 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
3107 " ->cannot do aligned memory accesses anymore\n");
3112 /* Note the user might start scaling the picture in the middle so this
3113 will not get executed. This is not really intended but works
3114 currently, so people might do it. */
3125 for (;dstY < dstH; dstY++){
3126 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3127 const int chrDstY= dstY>>c->chrDstVSubSample;
3128 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3129 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3131 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3132 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3133 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3134 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3136 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3137 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3138 //handle holes (FAST_BILINEAR & weird filters)
3139 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3140 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3141 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3142 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
3143 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
3145 // Do we have enough lines in this slice to output the dstY line
3146 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3148 //Do horizontal scaling
3149 while(lastInLumBuf < lastLumSrcY)
3151 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3153 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3154 assert(lumBufIndex < 2*vLumBufSize);
3155 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3156 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3157 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3158 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3159 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3160 funnyYCode, c->srcFormat, formatConvBuffer,
3161 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3164 while(lastInChrBuf < lastChrSrcY)
3166 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3167 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3169 assert(chrBufIndex < 2*vChrBufSize);
3170 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3171 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3172 //FIXME replace parameters through context struct (some at least)
3174 if (!(isGray(srcFormat) || isGray(dstFormat)))
3175 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3176 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3177 funnyUVCode, c->srcFormat, formatConvBuffer,
3178 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3181 //wrap buf index around to stay inside the ring buffer
3182 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3183 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3185 else // not enough lines left in this slice -> load the rest in the buffer
3187 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3188 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3189 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3190 vChrBufSize, vLumBufSize);*/
3192 //Do horizontal scaling
3193 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3195 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3197 assert(lumBufIndex < 2*vLumBufSize);
3198 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3199 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3200 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3201 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3202 funnyYCode, c->srcFormat, formatConvBuffer,
3203 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3206 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3208 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3209 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3211 assert(chrBufIndex < 2*vChrBufSize);
3212 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3213 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3215 if (!(isGray(srcFormat) || isGray(dstFormat)))
3216 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3217 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3218 funnyUVCode, c->srcFormat, formatConvBuffer,
3219 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3222 //wrap buf index around to stay inside the ring buffer
3223 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3224 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3225 break; //we can't output a dstY line so let's try with the next slice
3229 b5Dither= ff_dither8[dstY&1];
3230 g6Dither= ff_dither4[dstY&1];
3231 g5Dither= ff_dither8[dstY&1];
3232 r5Dither= ff_dither8[(dstY+1)&1];
3236 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3237 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3240 if (flags & SWS_ACCURATE_RND){
3241 int s= APCK_SIZE / 8;
3242 for (i=0; i<vLumFilterSize; i+=2){
3243 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
3244 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
3245 lumMmxFilter[s*i+APCK_COEF/4 ]=
3246 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
3247 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3249 for (i=0; i<vChrFilterSize; i+=2){
3250 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
3251 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
3252 chrMmxFilter[s*i+APCK_COEF/4 ]=
3253 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
3254 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3257 for (i=0; i<vLumFilterSize; i++)
3259 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3260 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3261 lumMmxFilter[4*i+2]=
3262 lumMmxFilter[4*i+3]=
3263 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3265 for (i=0; i<vChrFilterSize; i++)
3267 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3268 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3269 chrMmxFilter[4*i+2]=
3270 chrMmxFilter[4*i+3]=
3271 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3275 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3276 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3277 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3278 RENAME(yuv2nv12X)(c,
3279 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3280 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3281 dest, uDest, dstW, chrDstW, dstFormat);
3283 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
3285 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3286 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3287 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3289 int16_t *lumBuf = lumPixBuf[0];
3290 int16_t *chrBuf= chrPixBuf[0];
3291 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3296 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3297 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3298 dest, uDest, vDest, dstW, chrDstW);
3303 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3304 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3305 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3307 int chrAlpha= vChrFilter[2*dstY+1];
3308 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3309 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3311 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3313 int lumAlpha= vLumFilter[2*dstY+1];
3314 int chrAlpha= vChrFilter[2*dstY+1];
3316 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3318 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3319 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3320 dest, dstW, lumAlpha, chrAlpha, dstY);
3324 RENAME(yuv2packedX)(c,
3325 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3326 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3331 else // hmm looks like we can't use MMX here without overwriting this array's tail
3333 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3334 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3335 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3336 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3337 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3339 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3340 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3341 dest, uDest, dstW, chrDstW, dstFormat);
3343 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
3345 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3346 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3348 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3349 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3350 dest, uDest, vDest, dstW, chrDstW);
3354 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3355 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3357 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3358 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3365 asm volatile(SFENCE:::"memory");
3366 asm volatile(EMMS:::"memory");
3368 /* store changed local vars back in the context */
3370 c->lumBufIndex= lumBufIndex;
3371 c->chrBufIndex= chrBufIndex;
3372 c->lastInLumBuf= lastInLumBuf;
3373 c->lastInChrBuf= lastInChrBuf;
3375 return dstY - lastDstY;