2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of Libav.
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "swscale_template.h"
/*
 * Instruction selection by template flavour.
 * PREFETCH expands to "prefetchnta" or to an assembler comment (" # nop");
 * MOVNTQ(a,b) expands to a "movntq" or a plain "movq" store. The extra
 * REAL_MOVNTQ level lets macro arguments expand before being stringified.
 * NOTE(review): the #else / #endif lines separating the two alternatives
 * are not visible in this excerpt.
 */
27 #if COMPILE_TEMPLATE_MMX2
28 #define PREFETCH "prefetchnta"
30 #define PREFETCH " # nop"
33 #if COMPILE_TEMPLATE_MMX2
34 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
36 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
38 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
/*
 * Asm body and operand list for multi-tap vertical scaling into one 8-bit
 * plane.
 * Operands: %0 = &c->redDither (base for VROUNDER_OFFSET and `offset`),
 *           %1 = dest, %2 = end, %3 = pos.
 * mm3/mm4 are seeded from VROUNDER_OFFSET(%0). REG_d walks a filter list
 * starting at offset(%0): each 16-byte entry holds a source pointer at +0
 * (loaded into REG_S) and a coefficient quadword at +8. For every entry
 * eight source words at (REG_S, %3, 2) are multiplied with pmulhw and
 * accumulated. The sums are shifted right by 3, packed to unsigned bytes
 * and stored at (%1, %3); the accumulators and REG_d/REG_S are then reset.
 * NOTE(review): the loop labels, the branch that consumes the `test`
 * result, the update of %3 / comparison with %2 and the __asm__ wrapper are
 * not visible in this excerpt.
 */
40 #define YSCALEYUV2YV12X(offset, dest, end, pos) \
42 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
43 "movq %%mm3, %%mm4 \n\t"\
44 "lea " offset "(%0), %%"REG_d" \n\t"\
45 "mov (%%"REG_d"), %%"REG_S" \n\t"\
46 ".p2align 4 \n\t" /* FIXME Unroll? */\
48 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
49 "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
50 "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
51 "add $16, %%"REG_d" \n\t"\
52 "mov (%%"REG_d"), %%"REG_S" \n\t"\
53 "test %%"REG_S", %%"REG_S" \n\t"\
54 "pmulhw %%mm0, %%mm2 \n\t"\
55 "pmulhw %%mm0, %%mm5 \n\t"\
56 "paddw %%mm2, %%mm3 \n\t"\
57 "paddw %%mm5, %%mm4 \n\t"\
59 "psraw $3, %%mm3 \n\t"\
60 "psraw $3, %%mm4 \n\t"\
61 "packuswb %%mm4, %%mm3 \n\t"\
62 MOVNTQ(%%mm3, (%1, %3))\
65 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
66 "movq %%mm3, %%mm4 \n\t"\
67 "lea " offset "(%0), %%"REG_d" \n\t"\
68 "mov (%%"REG_d"), %%"REG_S" \n\t"\
70 :: "r" (&c->redDither),\
71 "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
/*
 * Vertical multi-tap scaling to planar 8-bit output using YSCALEYUV2YV12X.
 * The macro is applied to the U plane, the V plane, the alpha plane (only
 * when CONFIG_SWSCALE_ALPHA is set and aDest is non-NULL) and the luma
 * plane. For V the destination is biased by -uv_off and both end and start
 * position by +uv_off, where uv_off is read from c->uv_off.
 * The filter and source pointer parameters are not referenced by the
 * visible lines; the asm reads its data relative to &c->redDither.
 * NOTE(review): braces and any guards around the chroma passes are not
 * visible in this excerpt.
 */
75 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
76 const int16_t **lumSrc, int lumFilterSize,
77 const int16_t *chrFilter, const int16_t **chrUSrc,
78 const int16_t **chrVSrc,
79 int chrFilterSize, const int16_t **alpSrc,
80 uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
81 uint8_t *aDest, long dstW, long chrDstW)
84 x86_reg uv_off = c->uv_off;
85 YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
86 YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
88 if (CONFIG_SWSCALE_ALPHA && aDest) {
89 YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
92 YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
/*
 * Higher-precision variant of the vertical multi-tap scaler.
 * Operands: %0 = &c->redDither, %1 = dest, %2 = end, %3 = pos;
 * REG_a, REG_d and REG_S are listed as clobbered.
 * Four 32-bit accumulator registers (mm4..mm7) start at zero. Each step
 * handles two taps: source words from the pointer at (REG_d) and from the
 * pointer at APCK_PTR2(REG_d) are interleaved with punpck{l,h}wd and
 * multiplied/added with pmaddwd against the coefficient quadword at
 * APCK_COEF(REG_d); REG_d then advances by APCK_SIZE. The sums are shifted
 * right by 16, packed to words, the rounder from VROUNDER_OFFSET(%0) is
 * added, the result is shifted right by 3, packed to unsigned bytes and
 * stored at (%1, %3). Accumulators and REG_d/REG_S are then re-initialised.
 * NOTE(review): loop labels, conditional branches, the position update and
 * the __asm__ wrapper are not visible in this excerpt.
 */
95 #define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
97 "lea " offset "(%0), %%"REG_d" \n\t"\
98 "pxor %%mm4, %%mm4 \n\t"\
99 "pxor %%mm5, %%mm5 \n\t"\
100 "pxor %%mm6, %%mm6 \n\t"\
101 "pxor %%mm7, %%mm7 \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
105 "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
106 "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
107 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
108 "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
109 "movq %%mm0, %%mm3 \n\t"\
110 "punpcklwd %%mm1, %%mm0 \n\t"\
111 "punpckhwd %%mm1, %%mm3 \n\t"\
112 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
113 "pmaddwd %%mm1, %%mm0 \n\t"\
114 "pmaddwd %%mm1, %%mm3 \n\t"\
115 "paddd %%mm0, %%mm4 \n\t"\
116 "paddd %%mm3, %%mm5 \n\t"\
117 "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
118 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
119 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
120 "test %%"REG_S", %%"REG_S" \n\t"\
121 "movq %%mm2, %%mm0 \n\t"\
122 "punpcklwd %%mm3, %%mm2 \n\t"\
123 "punpckhwd %%mm3, %%mm0 \n\t"\
124 "pmaddwd %%mm1, %%mm2 \n\t"\
125 "pmaddwd %%mm1, %%mm0 \n\t"\
126 "paddd %%mm2, %%mm6 \n\t"\
127 "paddd %%mm0, %%mm7 \n\t"\
129 "psrad $16, %%mm4 \n\t"\
130 "psrad $16, %%mm5 \n\t"\
131 "psrad $16, %%mm6 \n\t"\
132 "psrad $16, %%mm7 \n\t"\
133 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
134 "packssdw %%mm5, %%mm4 \n\t"\
135 "packssdw %%mm7, %%mm6 \n\t"\
136 "paddw %%mm0, %%mm4 \n\t"\
137 "paddw %%mm0, %%mm6 \n\t"\
138 "psraw $3, %%mm4 \n\t"\
139 "psraw $3, %%mm6 \n\t"\
140 "packuswb %%mm6, %%mm4 \n\t"\
141 MOVNTQ(%%mm4, (%1, %3))\
144 "lea " offset "(%0), %%"REG_d" \n\t"\
145 "pxor %%mm4, %%mm4 \n\t"\
146 "pxor %%mm5, %%mm5 \n\t"\
147 "pxor %%mm6, %%mm6 \n\t"\
148 "pxor %%mm7, %%mm7 \n\t"\
149 "mov (%%"REG_d"), %%"REG_S" \n\t"\
151 :: "r" (&c->redDither),\
152 "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
153 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * Same plane sequence as yuv2yuvX, but built on YSCALEYUV2YV12X_ACCURATE:
 * U, V (destination biased by -uv_off, end and start position by +uv_off),
 * optional alpha (CONFIG_SWSCALE_ALPHA && aDest) and luma.
 * uv_off is read from c->uv_off.
 * NOTE(review): braces and any guards around the chroma passes are not
 * visible in this excerpt.
 */
156 static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
157 const int16_t **lumSrc, int lumFilterSize,
158 const int16_t *chrFilter, const int16_t **chrUSrc,
159 const int16_t **chrVSrc,
160 int chrFilterSize, const int16_t **alpSrc,
161 uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
162 uint8_t *aDest, long dstW, long chrDstW)
165 x86_reg uv_off = c->uv_off;
166 YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
167 YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
169 if (CONFIG_SWSCALE_ALPHA && aDest) {
170 YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
173 YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
/*
 * Asm body converting 16-bit samples to 8-bit without filtering.
 * REG_a is loaded from operand %2 and used as the running index. Each step
 * loads sixteen bytes (eight words) from (%0, REG_a, 2), shifts the words
 * right arithmetically by 7, packs them to unsigned bytes, stores them at
 * (%1, REG_a) and advances REG_a by 8.
 * NOTE(review): the loop label and the closing branch are not visible in
 * this excerpt.
 */
176 #define YSCALEYUV2YV121 \
177 "mov %2, %%"REG_a" \n\t"\
178 ".p2align 4 \n\t" /* FIXME Unroll? */\
180 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
181 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
182 "psraw $7, %%mm0 \n\t"\
183 "psraw $7, %%mm1 \n\t"\
184 "packuswb %%mm1, %%mm0 \n\t"\
185 MOVNTQ(%%mm0, (%1, %%REGa))\
186 "add $8, %%"REG_a" \n\t"\
/*
 * Unfiltered vertical output: converts one 16-bit intermediate line per
 * plane to 8-bit.
 * src[] holds each input line advanced by its width (dstW for alpha and
 * luma, chrDstW for the two chroma lines), dst[] the matching output
 * planes and counter[] the widths. The asm receives src[p] and
 * dst[p] + counter[p] as its first two operands.
 * src[] is declared as an array of const int16_t * so that its element
 * type matches the initialisers, which are all const int16_t * expressions;
 * the asm only consumes the pointer value, so the addresses are unchanged.
 * NOTE(review): the loop over p, the remaining asm operands, the macro
 * invocation (presumably YSCALEYUV2YV121) and the braces are not visible in
 * this excerpt.
 */
189 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
190 const int16_t *chrUSrc, const int16_t *chrVSrc,
191 const int16_t *alpSrc,
192 uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
193 uint8_t *aDest, long dstW, long chrDstW)
196 const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
197 uint8_t *dst[4]= { aDest, dest, uDest, vDest };
198 x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
204 :: "r" (src[p]), "r" (dst[p] + counter[p]),
/*
 * Rounding variant of YSCALEYUV2YV121.
 * mm7 is built as 64 in every word (all-ones, shifted right by 15, then
 * left by 6) and added with signed saturation before the arithmetic shift
 * right by 7, i.e. half of the divisor is added first. The rest matches
 * the plain variant: REG_a is the index loaded from %2, input words come
 * from (%0, REG_a, 2), packed bytes go to (%1, REG_a), REG_a advances by 8.
 * NOTE(review): the loop label and the closing branch are not visible in
 * this excerpt.
 */
212 #define YSCALEYUV2YV121_ACCURATE \
213 "mov %2, %%"REG_a" \n\t"\
214 "pcmpeqw %%mm7, %%mm7 \n\t"\
215 "psrlw $15, %%mm7 \n\t"\
216 "psllw $6, %%mm7 \n\t"\
217 ".p2align 4 \n\t" /* FIXME Unroll? */\
219 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
220 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
221 "paddsw %%mm7, %%mm0 \n\t"\
222 "paddsw %%mm7, %%mm1 \n\t"\
223 "psraw $7, %%mm0 \n\t"\
224 "psraw $7, %%mm1 \n\t"\
225 "packuswb %%mm1, %%mm0 \n\t"\
226 MOVNTQ(%%mm0, (%1, %%REGa))\
227 "add $8, %%"REG_a" \n\t"\
/*
 * Unfiltered vertical output with rounding: converts one 16-bit
 * intermediate line per plane to 8-bit using YSCALEYUV2YV121_ACCURATE.
 * src[] holds each input line advanced by its width (dstW for alpha and
 * luma, chrDstW for the two chroma lines), dst[] the matching output
 * planes and counter[] the widths. The asm receives src[p] and
 * dst[p] + counter[p] as its first two operands.
 * src[] is declared as an array of const int16_t * so that its element
 * type matches the initialisers, which are all const int16_t * expressions;
 * the asm only consumes the pointer value, so the addresses are unchanged.
 * NOTE(review): the loop over p, the remaining asm operands and the braces
 * are not visible in this excerpt.
 */
230 static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
231 const int16_t *chrUSrc, const int16_t *chrVSrc,
232 const int16_t *alpSrc,
233 uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
234 uint8_t *aDest, long dstW, long chrDstW)
237 const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
238 uint8_t *dst[4]= { aDest, dest, uDest, vDest };
239 x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
244 YSCALEYUV2YV121_ACCURATE
245 :: "r" (src[p]), "r" (dst[p] + counter[p]),
/*
 * Chroma half of the packed-output vertical scaler.
 * REG_a is zeroed and serves as the pixel index. mm3/mm4 are seeded from
 * VROUNDER_OFFSET(%0); REG_d walks the 16-byte filter entries starting at
 * CHR_MMX_FILTER_OFFSET(%0) (source pointer at +0, coefficient at +8).
 * U words are read at (REG_S, REG_a); V words from the same pointer after
 * adding operand %6. Products (pmulhw) accumulate into mm3 (U) and mm4 (V).
 * NOTE(review): loop labels and the branch consuming the final `test` are
 * not visible in this excerpt.
 */
253 #define YSCALEYUV2PACKEDX_UV \
255 "xor %%"REG_a", %%"REG_a" \n\t"\
259 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
260 "mov (%%"REG_d"), %%"REG_S" \n\t"\
261 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
262 "movq %%mm3, %%mm4 \n\t"\
265 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
266 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
267 "add %6, %%"REG_S" \n\t" \
268 "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
269 "add $16, %%"REG_d" \n\t"\
270 "mov (%%"REG_d"), %%"REG_S" \n\t"\
271 "pmulhw %%mm0, %%mm2 \n\t"\
272 "pmulhw %%mm0, %%mm5 \n\t"\
273 "paddw %%mm2, %%mm3 \n\t"\
274 "paddw %%mm5, %%mm4 \n\t"\
275 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * Luma/alpha half of the packed-output vertical scaler.
 * offset selects the filter list relative to %0; coeff, src1, src2, dst1
 * and dst2 name the MMX registers to use. dst1/dst2 are seeded from
 * VROUNDER_OFFSET(%0); for each 16-byte filter entry the words at
 * (REG_S, REG_a, 2) and 8(REG_S, REG_a, 2) are multiplied by the
 * coefficient at 8(REG_d) with pmulhw and added to dst1 and dst2.
 * NOTE(review): loop labels and the branch consuming the final `test` are
 * not visible in this excerpt.
 */
278 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
279 "lea "offset"(%0), %%"REG_d" \n\t"\
280 "mov (%%"REG_d"), %%"REG_S" \n\t"\
281 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
282 "movq "#dst1", "#dst2" \n\t"\
285 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
286 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
287 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
288 "add $16, %%"REG_d" \n\t"\
289 "mov (%%"REG_d"), %%"REG_S" \n\t"\
290 "pmulhw "#coeff", "#src1" \n\t"\
291 "pmulhw "#coeff", "#src2" \n\t"\
292 "paddw "#src1", "#dst1" \n\t"\
293 "paddw "#src2", "#dst2" \n\t"\
294 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * Full packed-output scaler body: chroma pass followed by the luma pass,
 * leaving U in mm3, V in mm4 and the two luma halves in mm1 and mm7.
 * NOTE(review): the last visible line ends in a continuation backslash, so
 * the end of this macro is not visible in this excerpt.
 */
297 #define YSCALEYUV2PACKEDX \
298 YSCALEYUV2PACKEDX_UV \
299 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
/*
 * Operand and clobber list shared by the packed-output functions.
 * Inputs: %0 = &c->redDither, %1..%3 = dummy (placeholders that keep the
 * numbering), %4 = dest, %5 = dstW_reg (memory), %6 = uv_off (memory).
 * Clobbers: REG_a, REG_d, REG_S.
 * NOTE(review): the closing of the asm statement is not visible in this
 * excerpt.
 */
301 #define YSCALEYUV2PACKEDX_END \
302 :: "r" (&c->redDither), \
303 "m" (dummy), "m" (dummy), "m" (dummy),\
304 "r" (dest), "m" (dstW_reg), "m"(uv_off) \
305 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * Higher-precision chroma pass for packed output.
 * REG_a is zeroed (pixel index) and mm4..mm7 are cleared as 32-bit
 * accumulators. Each step combines two taps: U words from the pointer at
 * (REG_d) and from the pointer at APCK_PTR2(REG_d), and the V words found
 * %6 bytes after each of those pointers, are interleaved and fed to pmaddwd
 * with the coefficients at APCK_COEF(REG_d); REG_d advances by APCK_SIZE.
 * Afterwards the sums are shifted right by 16, packed to words, the rounder
 * from VROUNDER_OFFSET(%0) is added, and the results are spilled to
 * U_TEMP(%0) (from mm4) and V_TEMP(%0) (from mm6).
 * NOTE(review): loop labels and conditional branches are not visible in
 * this excerpt.
 */
308 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
310 "xor %%"REG_a", %%"REG_a" \n\t"\
314 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
315 "mov (%%"REG_d"), %%"REG_S" \n\t"\
316 "pxor %%mm4, %%mm4 \n\t"\
317 "pxor %%mm5, %%mm5 \n\t"\
318 "pxor %%mm6, %%mm6 \n\t"\
319 "pxor %%mm7, %%mm7 \n\t"\
322 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
323 "add %6, %%"REG_S" \n\t" \
324 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
325 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
326 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
327 "movq %%mm0, %%mm3 \n\t"\
328 "punpcklwd %%mm1, %%mm0 \n\t"\
329 "punpckhwd %%mm1, %%mm3 \n\t"\
330 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
331 "pmaddwd %%mm1, %%mm0 \n\t"\
332 "pmaddwd %%mm1, %%mm3 \n\t"\
333 "paddd %%mm0, %%mm4 \n\t"\
334 "paddd %%mm3, %%mm5 \n\t"\
335 "add %6, %%"REG_S" \n\t" \
336 "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
337 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
338 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
339 "test %%"REG_S", %%"REG_S" \n\t"\
340 "movq %%mm2, %%mm0 \n\t"\
341 "punpcklwd %%mm3, %%mm2 \n\t"\
342 "punpckhwd %%mm3, %%mm0 \n\t"\
343 "pmaddwd %%mm1, %%mm2 \n\t"\
344 "pmaddwd %%mm1, %%mm0 \n\t"\
345 "paddd %%mm2, %%mm6 \n\t"\
346 "paddd %%mm0, %%mm7 \n\t"\
348 "psrad $16, %%mm4 \n\t"\
349 "psrad $16, %%mm5 \n\t"\
350 "psrad $16, %%mm6 \n\t"\
351 "psrad $16, %%mm7 \n\t"\
352 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
353 "packssdw %%mm5, %%mm4 \n\t"\
354 "packssdw %%mm7, %%mm6 \n\t"\
355 "paddw %%mm0, %%mm4 \n\t"\
356 "paddw %%mm0, %%mm6 \n\t"\
357 "movq %%mm4, "U_TEMP"(%0) \n\t"\
358 "movq %%mm6, "V_TEMP"(%0) \n\t"\
/*
 * Higher-precision luma/alpha pass for packed output.
 * offset selects the filter list relative to %0. mm1, mm5, mm7 and mm6 are
 * cleared as 32-bit accumulators; each step combines two taps (pointers at
 * (REG_d) and APCK_PTR2(REG_d), coefficients at APCK_COEF(REG_d)) using
 * punpck + pmaddwd and advances REG_d by APCK_SIZE. The sums are shifted
 * right by 16, packed to words and offset by the rounder from
 * VROUNDER_OFFSET(%0), leaving the first four samples in mm1 and the next
 * four in mm7. Finally mm3 and mm4 are reloaded from U_TEMP(%0) and
 * V_TEMP(%0).
 * NOTE(review): loop labels and conditional branches are not visible in
 * this excerpt.
 */
360 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
361 "lea "offset"(%0), %%"REG_d" \n\t"\
362 "mov (%%"REG_d"), %%"REG_S" \n\t"\
363 "pxor %%mm1, %%mm1 \n\t"\
364 "pxor %%mm5, %%mm5 \n\t"\
365 "pxor %%mm7, %%mm7 \n\t"\
366 "pxor %%mm6, %%mm6 \n\t"\
369 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
370 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
371 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
372 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
373 "movq %%mm0, %%mm3 \n\t"\
374 "punpcklwd %%mm4, %%mm0 \n\t"\
375 "punpckhwd %%mm4, %%mm3 \n\t"\
376 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
377 "pmaddwd %%mm4, %%mm0 \n\t"\
378 "pmaddwd %%mm4, %%mm3 \n\t"\
379 "paddd %%mm0, %%mm1 \n\t"\
380 "paddd %%mm3, %%mm5 \n\t"\
381 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
382 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
383 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
384 "test %%"REG_S", %%"REG_S" \n\t"\
385 "movq %%mm2, %%mm0 \n\t"\
386 "punpcklwd %%mm3, %%mm2 \n\t"\
387 "punpckhwd %%mm3, %%mm0 \n\t"\
388 "pmaddwd %%mm4, %%mm2 \n\t"\
389 "pmaddwd %%mm4, %%mm0 \n\t"\
390 "paddd %%mm2, %%mm7 \n\t"\
391 "paddd %%mm0, %%mm6 \n\t"\
393 "psrad $16, %%mm1 \n\t"\
394 "psrad $16, %%mm5 \n\t"\
395 "psrad $16, %%mm7 \n\t"\
396 "psrad $16, %%mm6 \n\t"\
397 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
398 "packssdw %%mm5, %%mm1 \n\t"\
399 "packssdw %%mm6, %%mm7 \n\t"\
400 "paddw %%mm0, %%mm1 \n\t"\
401 "paddw %%mm0, %%mm7 \n\t"\
402 "movq "U_TEMP"(%0), %%mm3 \n\t"\
403 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/*
 * Full higher-precision packed-output scaler body: the chroma pass (which
 * spills U/V to U_TEMP/V_TEMP) followed by the luma pass on
 * LUM_MMX_FILTER_OFFSET, which ends with luma in mm1/mm7, U in mm3 and V
 * in mm4.
 */
405 #define YSCALEYUV2PACKEDX_ACCURATE \
406 YSCALEYUV2PACKEDX_ACCURATE_UV \
407 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/*
 * YUV -> RGB colour conversion on the registers left by the packed-output
 * scalers (mm1/mm7 = luma, mm3 = U, mm4 = V).
 * The offsets U_OFFSET/V_OFFSET/Y_OFFSET are subtracted and the values are
 * multiplied (pmulhw) by the UG/VG/UB/VR/Y coefficients, all addressed
 * relative to %0. Each chroma term is duplicated with punpck{l,h}wd so one
 * chroma sample is added to two luma samples. The final packuswb
 * instructions leave packed bytes in mm2, mm5 and mm4; the callers in this
 * file treat these as B, R and G respectively.
 */
409 #define YSCALEYUV2RGBX \
410 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
411 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
412 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
413 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
414 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
415 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
416 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
417 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
418 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
419 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
420 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
421 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
422 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
423 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
424 "paddw %%mm3, %%mm4 \n\t"\
425 "movq %%mm2, %%mm0 \n\t"\
426 "movq %%mm5, %%mm6 \n\t"\
427 "movq %%mm4, %%mm3 \n\t"\
428 "punpcklwd %%mm2, %%mm2 \n\t"\
429 "punpcklwd %%mm5, %%mm5 \n\t"\
430 "punpcklwd %%mm4, %%mm4 \n\t"\
431 "paddw %%mm1, %%mm2 \n\t"\
432 "paddw %%mm1, %%mm5 \n\t"\
433 "paddw %%mm1, %%mm4 \n\t"\
434 "punpckhwd %%mm0, %%mm0 \n\t"\
435 "punpckhwd %%mm6, %%mm6 \n\t"\
436 "punpckhwd %%mm3, %%mm3 \n\t"\
437 "paddw %%mm7, %%mm0 \n\t"\
438 "paddw %%mm7, %%mm6 \n\t"\
439 "paddw %%mm7, %%mm3 \n\t"\
440 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
441 "packuswb %%mm0, %%mm2 \n\t"\
442 "packuswb %%mm6, %%mm5 \n\t"\
443 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * Interleaves eight bytes each of b, g, r and a into eight 32-bit pixels
 * and stores the 32 bytes at (dst, index, 4) with MOVNTQ, then adds 8 to
 * index and compares it with dstw. q0, q2, q3 and t are scratch MMX
 * registers; b and r are overwritten as well.
 * WRITEBGR32 is a forwarding wrapper so that macro arguments are expanded
 * before REAL_WRITEBGR32 stringifies them.
 * NOTE(review): the branch that uses the `cmp` result is not visible in
 * this excerpt.
 */
445 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
446 "movq "#b", "#q2" \n\t" /* B */\
447 "movq "#r", "#t" \n\t" /* R */\
448 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
449 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
450 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
451 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
452 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
453 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
454 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
455 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
456 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
457 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
459 MOVNTQ( q0, (dst, index, 4))\
460 MOVNTQ( b, 8(dst, index, 4))\
461 MOVNTQ( q2, 16(dst, index, 4))\
462 MOVNTQ( q3, 24(dst, index, 4))\
464 "add $8, "#index" \n\t"\
465 "cmp "#dstw", "#index" \n\t"\
467 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
/*
 * Higher-precision vertical scaling with 32-bit packed RGB output.
 * uv_off is c->uv_off doubled (shifted left by one); dstW is copied into
 * dstW_reg so it can be passed as a memory operand.
 * With alpha (CONFIG_SWSCALE_ALPHA && c->alpPixBuf): mm2, mm4 and mm5 are
 * spilled to U_TEMP/V_TEMP/Y_TEMP while the alpha plane is filtered through
 * YSCALEYUV2PACKEDX_ACCURATE_YA; that macro reloads mm3/mm4 from
 * U_TEMP/V_TEMP, mm5 is restored from Y_TEMP, and alpha is shifted right by
 * 3 and packed into mm1.
 * Without alpha: mm7 is set to all ones and used as the alpha byte source.
 * NOTE(review): the colour-conversion macro invocation between the scaler
 * and the spills, the asm wrappers, the else and the braces are not visible
 * in this excerpt.
 */
469 static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
470 const int16_t **lumSrc, int lumFilterSize,
471 const int16_t *chrFilter, const int16_t **chrUSrc,
472 const int16_t **chrVSrc,
473 int chrFilterSize, const int16_t **alpSrc,
474 uint8_t *dest, long dstW, long dstY)
477 x86_reg dstW_reg = dstW;
478 x86_reg uv_off = c->uv_off << 1;
480 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
481 YSCALEYUV2PACKEDX_ACCURATE
483 "movq %%mm2, "U_TEMP"(%0) \n\t"
484 "movq %%mm4, "V_TEMP"(%0) \n\t"
485 "movq %%mm5, "Y_TEMP"(%0) \n\t"
486 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
487 "movq "Y_TEMP"(%0), %%mm5 \n\t"
488 "psraw $3, %%mm1 \n\t"
489 "psraw $3, %%mm7 \n\t"
490 "packuswb %%mm7, %%mm1 \n\t"
491 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
492 YSCALEYUV2PACKEDX_END
494 YSCALEYUV2PACKEDX_ACCURATE
496 "pcmpeqd %%mm7, %%mm7 \n\t"
497 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
498 YSCALEYUV2PACKEDX_END
/*
 * Vertical scaling with 32-bit packed RGB output (faster, lower-precision
 * variant). uv_off is c->uv_off doubled; dstW is copied into dstW_reg so it
 * can be passed as a memory operand.
 * With alpha (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) the alpha plane is
 * filtered by YSCALEYUV2PACKEDX_YA into mm1/mm7, shifted right by 3 and
 * packed into mm1. Without alpha mm7 is set to all ones and used as the
 * alpha byte source.
 * NOTE(review): the scaler and colour-conversion macro invocations, the asm
 * wrappers, the else and the braces are not visible in this excerpt.
 */
502 static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
503 const int16_t **lumSrc, int lumFilterSize,
504 const int16_t *chrFilter, const int16_t **chrUSrc,
505 const int16_t **chrVSrc,
506 int chrFilterSize, const int16_t **alpSrc,
507 uint8_t *dest, long dstW, long dstY)
510 x86_reg dstW_reg = dstW;
511 x86_reg uv_off = c->uv_off << 1;
513 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
516 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
517 "psraw $3, %%mm1 \n\t"
518 "psraw $3, %%mm7 \n\t"
519 "packuswb %%mm7, %%mm1 \n\t"
520 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
521 YSCALEYUV2PACKEDX_END
525 "pcmpeqd %%mm7, %%mm7 \n\t"
526 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
527 YSCALEYUV2PACKEDX_END
/*
 * Packs eight B (mm2), G (mm4), R (mm5) byte samples into eight 16-bit
 * pixels and stores 16 bytes at (dst, index, 2), then adds 8 to index and
 * compares it with dstw. mm7 must be zero on entry (it supplies the zero
 * bytes interleaved with G). B and R are masked with bF8, G with bFC; B is
 * shifted right by 3 and the widened G words are shifted left by 3 before
 * being OR-ed with the interleaved B/R words.
 * WRITERGB16 forwards to REAL_WRITERGB16 so arguments are macro-expanded
 * before being stringified.
 * NOTE(review): the branch that uses the `cmp` result is not visible in
 * this excerpt.
 */
531 #define REAL_WRITERGB16(dst, dstw, index) \
532 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
533 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
534 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
535 "psrlq $3, %%mm2 \n\t"\
537 "movq %%mm2, %%mm1 \n\t"\
538 "movq %%mm4, %%mm3 \n\t"\
540 "punpcklbw %%mm7, %%mm3 \n\t"\
541 "punpcklbw %%mm5, %%mm2 \n\t"\
542 "punpckhbw %%mm7, %%mm4 \n\t"\
543 "punpckhbw %%mm5, %%mm1 \n\t"\
545 "psllq $3, %%mm3 \n\t"\
546 "psllq $3, %%mm4 \n\t"\
548 "por %%mm3, %%mm2 \n\t"\
549 "por %%mm4, %%mm1 \n\t"\
551 MOVNTQ(%%mm2, (dst, index, 2))\
552 MOVNTQ(%%mm1, 8(dst, index, 2))\
554 "add $8, "#index" \n\t"\
555 "cmp "#dstw", "#index" \n\t"\
557 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/*
 * Higher-precision vertical scaling with 16-bit packed RGB output via
 * WRITERGB16. After YSCALEYUV2PACKEDX_ACCURATE, mm7 is cleared and the
 * BLUE/GREEN/RED dither quadwords at (%0) are added with unsigned
 * saturation to mm2, mm4 and mm5 before packing.
 * uv_off is c->uv_off doubled; dstW is copied into dstW_reg.
 * NOTE(review): the colour-conversion macro invocation, the asm wrapper and
 * the braces are not visible in this excerpt.
 */
559 static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
560 const int16_t **lumSrc, int lumFilterSize,
561 const int16_t *chrFilter, const int16_t **chrUSrc,
562 const int16_t **chrVSrc,
563 int chrFilterSize, const int16_t **alpSrc,
564 uint8_t *dest, long dstW, long dstY)
567 x86_reg dstW_reg = dstW;
568 x86_reg uv_off = c->uv_off << 1;
570 YSCALEYUV2PACKEDX_ACCURATE
572 "pxor %%mm7, %%mm7 \n\t"
573 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
575 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
576 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
577 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
579 WRITERGB16(%4, %5, %%REGa)
580 YSCALEYUV2PACKEDX_END
/*
 * Vertical scaling with 16-bit packed RGB output via WRITERGB16 (faster,
 * lower-precision variant). mm7 is cleared and the BLUE/GREEN/RED dither
 * quadwords at (%0) are added with unsigned saturation to mm2, mm4 and mm5
 * before packing. uv_off is c->uv_off doubled; dstW is copied into
 * dstW_reg.
 * NOTE(review): the scaler and colour-conversion macro invocations, the asm
 * wrapper and the braces are not visible in this excerpt.
 */
583 static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
584 const int16_t **lumSrc, int lumFilterSize,
585 const int16_t *chrFilter, const int16_t **chrUSrc,
586 const int16_t **chrVSrc,
587 int chrFilterSize, const int16_t **alpSrc,
588 uint8_t *dest, long dstW, long dstY)
591 x86_reg dstW_reg = dstW;
592 x86_reg uv_off = c->uv_off << 1;
596 "pxor %%mm7, %%mm7 \n\t"
597 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
599 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
600 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
601 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
603 WRITERGB16(%4, %5, %%REGa)
604 YSCALEYUV2PACKEDX_END
/*
 * Packs eight B (mm2), G (mm4), R (mm5) byte samples into eight 16-bit
 * pixels with five bits per component and stores 16 bytes at
 * (dst, index, 2), then adds 8 to index and compares it with dstw.
 * mm7 must be zero on entry. All three components are masked with bF8;
 * B is shifted right by 3, R right by 1, and the widened G words are
 * shifted left by 2 before being OR-ed with the interleaved B/R words.
 * WRITERGB15 forwards to REAL_WRITERGB15 so arguments are macro-expanded
 * before being stringified.
 * NOTE(review): the branch that uses the `cmp` result is not visible in
 * this excerpt.
 */
607 #define REAL_WRITERGB15(dst, dstw, index) \
608 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
609 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
610 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
611 "psrlq $3, %%mm2 \n\t"\
612 "psrlq $1, %%mm5 \n\t"\
614 "movq %%mm2, %%mm1 \n\t"\
615 "movq %%mm4, %%mm3 \n\t"\
617 "punpcklbw %%mm7, %%mm3 \n\t"\
618 "punpcklbw %%mm5, %%mm2 \n\t"\
619 "punpckhbw %%mm7, %%mm4 \n\t"\
620 "punpckhbw %%mm5, %%mm1 \n\t"\
622 "psllq $2, %%mm3 \n\t"\
623 "psllq $2, %%mm4 \n\t"\
625 "por %%mm3, %%mm2 \n\t"\
626 "por %%mm4, %%mm1 \n\t"\
628 MOVNTQ(%%mm2, (dst, index, 2))\
629 MOVNTQ(%%mm1, 8(dst, index, 2))\
631 "add $8, "#index" \n\t"\
632 "cmp "#dstw", "#index" \n\t"\
634 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/*
 * Higher-precision vertical scaling with 15-bit packed RGB output via
 * WRITERGB15. After YSCALEYUV2PACKEDX_ACCURATE, mm7 is cleared and the
 * BLUE/GREEN/RED dither quadwords at (%0) are added with unsigned
 * saturation to mm2, mm4 and mm5 before packing.
 * uv_off is c->uv_off doubled; dstW is copied into dstW_reg.
 * NOTE(review): the colour-conversion macro invocation, the asm wrapper and
 * the braces are not visible in this excerpt.
 */
636 static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
637 const int16_t **lumSrc, int lumFilterSize,
638 const int16_t *chrFilter, const int16_t **chrUSrc,
639 const int16_t **chrVSrc,
640 int chrFilterSize, const int16_t **alpSrc,
641 uint8_t *dest, long dstW, long dstY)
644 x86_reg dstW_reg = dstW;
645 x86_reg uv_off = c->uv_off << 1;
647 YSCALEYUV2PACKEDX_ACCURATE
649 "pxor %%mm7, %%mm7 \n\t"
650 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
652 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
653 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
654 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
656 WRITERGB15(%4, %5, %%REGa)
657 YSCALEYUV2PACKEDX_END
/*
 * Vertical scaling with 15-bit packed RGB output via WRITERGB15 (faster,
 * lower-precision variant). mm7 is cleared and the BLUE/GREEN/RED dither
 * quadwords at (%0) are added with unsigned saturation to mm2, mm4 and mm5
 * before packing. uv_off is c->uv_off doubled; dstW is copied into
 * dstW_reg.
 * NOTE(review): the scaler and colour-conversion macro invocations, the asm
 * wrapper and the braces are not visible in this excerpt.
 */
660 static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
661 const int16_t **lumSrc, int lumFilterSize,
662 const int16_t *chrFilter, const int16_t **chrUSrc,
663 const int16_t **chrVSrc,
664 int chrFilterSize, const int16_t **alpSrc,
665 uint8_t *dest, long dstW, long dstY)
668 x86_reg dstW_reg = dstW;
669 x86_reg uv_off = c->uv_off << 1;
673 "pxor %%mm7, %%mm7 \n\t"
674 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
676 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
677 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
678 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
680 WRITERGB15(%4, %5, %%REGa)
681 YSCALEYUV2PACKEDX_END
/*
 * Plain-MMX 24-bit writer: turns eight B (mm2), G (mm4), R (mm5) byte
 * samples (mm7 = 0) into 24 output bytes.
 * The bytes are first interleaved into four quadwords of two 0RGB pixels
 * each, then shifted and merged so the padding bytes disappear. Three
 * quadwords are stored at (dst), 8(dst) and 16(dst); dst is advanced by 24,
 * index by 8, and index is compared with dstw.
 * NOTE(review): the branch that uses the `cmp` result is not visible in
 * this excerpt.
 */
684 #define WRITEBGR24MMX(dst, dstw, index) \
685 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686 "movq %%mm2, %%mm1 \n\t" /* B */\
687 "movq %%mm5, %%mm6 \n\t" /* R */\
688 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
689 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
690 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
691 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
692 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
693 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
694 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
695 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
696 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
697 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
699 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
700 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
701 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
702 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
704 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
705 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
706 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
707 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
709 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
710 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
711 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
712 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
714 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
715 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
716 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
717 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
718 MOVNTQ(%%mm0, (dst))\
720 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
721 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
722 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
723 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
724 MOVNTQ(%%mm6, 8(dst))\
726 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
727 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
728 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
729 MOVNTQ(%%mm5, 16(dst))\
731 "add $24, "#dst" \n\t"\
733 "add $8, "#index" \n\t"\
734 "cmp "#dstw", "#index" \n\t"\
/*
 * MMX2 24-bit writer: same contract as WRITEBGR24MMX (B in mm2, G in mm4,
 * R in mm5; stores 24 bytes at dst, advances dst by 24 and index by 8,
 * then compares index with dstw) but built from pshufw shuffles and the
 * ff_M24A / ff_M24B / ff_M24C byte masks. mm0 and mm7 are overwritten with
 * masks at the start, and mm4 is shifted right by 8 after the first store.
 * NOTE(review): the branch that uses the `cmp` result is not visible in
 * this excerpt.
 */
737 #define WRITEBGR24MMX2(dst, dstw, index) \
738 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
739 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
740 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
741 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
742 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
743 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
745 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
746 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
747 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
749 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
750 "por %%mm1, %%mm6 \n\t"\
751 "por %%mm3, %%mm6 \n\t"\
752 MOVNTQ(%%mm6, (dst))\
754 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
755 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
756 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
757 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
759 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
760 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
761 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
763 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
764 "por %%mm3, %%mm6 \n\t"\
765 MOVNTQ(%%mm6, 8(dst))\
767 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
768 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
769 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
771 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
772 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
773 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
775 "por %%mm1, %%mm3 \n\t"\
776 "por %%mm3, %%mm6 \n\t"\
777 MOVNTQ(%%mm6, 16(dst))\
779 "add $24, "#dst" \n\t"\
781 "add $8, "#index" \n\t"\
782 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24 resolves to the MMX2 writer when COMPILE_TEMPLATE_MMX2 is set
 * and to the plain-MMX writer otherwise.
 * NOTE(review): the #else / #endif lines are not visible in this excerpt.
 */
785 #if COMPILE_TEMPLATE_MMX2
787 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
790 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/*
 * Higher-precision vertical scaling with 24-bit packed output.
 * After YSCALEYUV2PACKEDX_ACCURATE, mm7 is cleared and REG_c is set to
 * dest + 3 * REG_a (lea computes 3 * REG_a, then operand %4 is added) as
 * the byte-addressed output pointer for WRITEBGR24.
 * Operands: %0 = &c->redDither, %1..%3 = dummy, %4 = dest, %5 = dstW_reg,
 * %6 = uv_off (c->uv_off doubled). Clobbers REG_a, REG_c, REG_d, REG_S.
 * NOTE(review): the colour-conversion macro invocation, the asm wrapper and
 * the braces are not visible in this excerpt.
 */
793 static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
794 const int16_t **lumSrc, int lumFilterSize,
795 const int16_t *chrFilter, const int16_t **chrUSrc,
796 const int16_t **chrVSrc,
797 int chrFilterSize, const int16_t **alpSrc,
798 uint8_t *dest, long dstW, long dstY)
801 x86_reg dstW_reg = dstW;
802 x86_reg uv_off = c->uv_off << 1;
804 YSCALEYUV2PACKEDX_ACCURATE
806 "pxor %%mm7, %%mm7 \n\t"
807 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
808 "add %4, %%"REG_c" \n\t"
809 WRITEBGR24(%%REGc, %5, %%REGa)
810 :: "r" (&c->redDither),
811 "m" (dummy), "m" (dummy), "m" (dummy),
812 "r" (dest), "m" (dstW_reg), "m"(uv_off)
813 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
/*
 * Vertical scaling with 24-bit packed output (faster, lower-precision
 * variant). mm7 is cleared and REG_c is set to dest + 3 * REG_a (lea
 * computes 3 * REG_a, then operand %4 is added) as the byte-addressed
 * output pointer for WRITEBGR24.
 * Operands: %0 = &c->redDither, %1..%3 = dummy, %4 = dest, %5 = dstW_reg,
 * %6 = uv_off (c->uv_off doubled). Clobbers REG_a, REG_c, REG_d, REG_S.
 * NOTE(review): the scaler and colour-conversion macro invocations, the asm
 * wrapper and the braces are not visible in this excerpt.
 */
817 static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
818 const int16_t **lumSrc, int lumFilterSize,
819 const int16_t *chrFilter, const int16_t **chrUSrc,
820 const int16_t **chrVSrc,
821 int chrFilterSize, const int16_t **alpSrc,
822 uint8_t *dest, long dstW, long dstY)
825 x86_reg dstW_reg = dstW;
826 x86_reg uv_off = c->uv_off << 1;
830 "pxor %%mm7, %%mm7 \n\t"
831 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
832 "add %4, %%"REG_c" \n\t"
833 WRITEBGR24(%%REGc, %5, %%REGa)
834 :: "r" (&c->redDither),
835 "m" (dummy), "m" (dummy), "m" (dummy),
836 "r" (dest), "m" (dstW_reg), "m"(uv_off)
837 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
/*
 * Packs word samples into interleaved 8-bit output: mm3 and mm4 (chroma)
 * and mm1/mm7 (two halves of luma) are packed to unsigned bytes, the two
 * chroma registers are byte-interleaved, and the result is interleaved with
 * the luma bytes so each luma byte is followed by a chroma byte. 16 bytes
 * are stored at (dst, index, 2); index is advanced by 8 and compared with
 * dstw.
 * WRITEYUY2 forwards to REAL_WRITEYUY2 so arguments are macro-expanded
 * before being stringified.
 * NOTE(review): the branch that uses the `cmp` result is not visible in
 * this excerpt.
 */
841 #define REAL_WRITEYUY2(dst, dstw, index) \
842 "packuswb %%mm3, %%mm3 \n\t"\
843 "packuswb %%mm4, %%mm4 \n\t"\
844 "packuswb %%mm7, %%mm1 \n\t"\
845 "punpcklbw %%mm4, %%mm3 \n\t"\
846 "movq %%mm1, %%mm7 \n\t"\
847 "punpcklbw %%mm3, %%mm1 \n\t"\
848 "punpckhbw %%mm3, %%mm7 \n\t"\
850 MOVNTQ(%%mm1, (dst, index, 2))\
851 MOVNTQ(%%mm7, 8(dst, index, 2))\
853 "add $8, "#index" \n\t"\
854 "cmp "#dstw", "#index" \n\t"\
856 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/*
 * Higher-precision vertical scaling with interleaved luma/chroma 8-bit
 * output via WRITEYUY2. No colour conversion is performed: the filtered
 * chroma (mm3, mm4) and luma (mm1, mm7) words from
 * YSCALEYUV2PACKEDX_ACCURATE are shifted right by 3 and handed to
 * WRITEYUY2. uv_off is c->uv_off doubled; dstW is copied into dstW_reg.
 * NOTE(review): the asm wrapper and the braces are not visible in this
 * excerpt.
 */
858 static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
859 const int16_t **lumSrc, int lumFilterSize,
860 const int16_t *chrFilter, const int16_t **chrUSrc,
861 const int16_t **chrVSrc,
862 int chrFilterSize, const int16_t **alpSrc,
863 uint8_t *dest, long dstW, long dstY)
866 x86_reg dstW_reg = dstW;
867 x86_reg uv_off = c->uv_off << 1;
869 YSCALEYUV2PACKEDX_ACCURATE
870 /* mm1=Y1, mm3=U, mm4=V, mm7=Y2 (word samples consumed by WRITEYUY2) */
871 "psraw $3, %%mm3 \n\t"
872 "psraw $3, %%mm4 \n\t"
873 "psraw $3, %%mm1 \n\t"
874 "psraw $3, %%mm7 \n\t"
875 WRITEYUY2(%4, %5, %%REGa)
876 YSCALEYUV2PACKEDX_END
/*
 * Vertical scaling with interleaved luma/chroma 8-bit output via WRITEYUY2
 * (faster, lower-precision variant). The chroma (mm3, mm4) and luma
 * (mm1, mm7) words are shifted right by 3 and handed to WRITEYUY2.
 * uv_off is c->uv_off doubled; dstW is copied into dstW_reg.
 * NOTE(review): the scaler macro invocation, the asm wrapper and the braces
 * are not visible in this excerpt.
 */
879 static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
880 const int16_t **lumSrc, int lumFilterSize,
881 const int16_t *chrFilter, const int16_t **chrUSrc,
882 const int16_t **chrVSrc,
883 int chrFilterSize, const int16_t **alpSrc,
884 uint8_t *dest, long dstW, long dstY)
887 x86_reg dstW_reg = dstW;
888 x86_reg uv_off = c->uv_off << 1;
891 /* mm1=Y1, mm3=U, mm4=V, mm7=Y2 (word samples consumed by WRITEYUY2) */
892 "psraw $3, %%mm3 \n\t"
893 "psraw $3, %%mm4 \n\t"
894 "psraw $3, %%mm1 \n\t"
895 "psraw $3, %%mm7 \n\t"
896 WRITEYUY2(%4, %5, %%REGa)
897 YSCALEYUV2PACKEDX_END
/*
 * Chroma part of the two-line (bilinear) vertical scaler.
 * index is zeroed and used as the running offset; c is the register
 * holding the context base. Words from the two chroma buffers in operands
 * %2 and %3 are loaded at index and at index + UV_OFFx2(c) (index is
 * restored afterwards). The difference of the two buffers is scaled with
 * pmulhw by the quadword at CHR_MMX_FILTER_OFFSET+8(c) and added to the
 * second buffer shifted right by 4. U_OFFSET/V_OFFSET are then subtracted;
 * copies are kept in mm2/mm5 and mm3/mm4 are multiplied by
 * UG_COEFF/VG_COEFF.
 * NOTE(review): a loop label after the initial xor is not visible in this
 * excerpt.
 */
900 #define REAL_YSCALEYUV2RGB_UV(index, c) \
901 "xor "#index", "#index" \n\t"\
904 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
905 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
906 "add "UV_OFFx2"("#c"), "#index" \n\t" \
907 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
908 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
909 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
910 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
911 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
912 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
913 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
914 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
915 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
916 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
917 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
918 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
919 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
920 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
921 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
922 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
923 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
924 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
925 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
/*
 * Luma/alpha part of the two-line (bilinear) vertical scaler.
 * b1 and b2 are the registers holding the two input lines; sixteen bytes
 * are read from each at (bX, index, 2) and 8(bX, index, 2). The difference
 * b1 - b2 is scaled with pmulhw by the quadword at
 * LUM_MMX_FILTER_OFFSET+8(c) and added to b2 shifted right by 4, leaving
 * the blended samples in mm1 (first four) and mm7 (next four).
 */
927 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
928 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
929 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
930 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
931 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
932 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
933 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
934 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
935 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
936 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
937 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
938 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
939 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/*
 * Final colour-matrix step of the two-line scaler; c is the register
 * holding the context base.
 * Inputs: mm2/mm5 = offset-corrected U/V, mm3/mm4 = green contributions,
 * mm1/mm7 = luma. U and V are multiplied by UB_COEFF/VR_COEFF, luma has
 * Y_OFFSET subtracted and is multiplied by Y_COEFF. Each chroma term is
 * duplicated with punpck{l,h}wd and added to the luma words; the results
 * are packed to unsigned bytes in mm2, mm5 and mm4 (used by the writers in
 * this file as B, R and G respectively).
 */
941 #define REAL_YSCALEYUV2RGB_COEFF(c) \
942 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
943 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
944 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
945 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
946 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
947 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
948 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
949 "paddw %%mm3, %%mm4 \n\t"\
950 "movq %%mm2, %%mm0 \n\t"\
951 "movq %%mm5, %%mm6 \n\t"\
952 "movq %%mm4, %%mm3 \n\t"\
953 "punpcklwd %%mm2, %%mm2 \n\t"\
954 "punpcklwd %%mm5, %%mm5 \n\t"\
955 "punpcklwd %%mm4, %%mm4 \n\t"\
956 "paddw %%mm1, %%mm2 \n\t"\
957 "paddw %%mm1, %%mm5 \n\t"\
958 "paddw %%mm1, %%mm4 \n\t"\
959 "punpckhwd %%mm0, %%mm0 \n\t"\
960 "punpckhwd %%mm6, %%mm6 \n\t"\
961 "punpckhwd %%mm3, %%mm3 \n\t"\
962 "paddw %%mm7, %%mm0 \n\t"\
963 "paddw %%mm7, %%mm6 \n\t"\
964 "paddw %%mm7, %%mm3 \n\t"\
965 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
966 "packuswb %%mm0, %%mm2 \n\t"\
967 "packuswb %%mm6, %%mm5 \n\t"\
968 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA so that its arguments
 * are macro-expanded before being stringified.
 * YSCALEYUV2RGB(index, c) chains the chroma blend, the luma blend (taking
 * the two luma lines from operands %0 and %1) and the colour-matrix step.
 */
970 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
972 #define YSCALEYUV2RGB(index, c) \
973 REAL_YSCALEYUV2RGB_UV(index, c) \
974 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
975 REAL_YSCALEYUV2RGB_COEFF(c)
978 * vertical bilinear scale YV12 to RGB
/*
 * Two-line (bilinear) vertical scale with 32-bit packed RGB output.
 * Three asm variants are visible:
 *  - alpha, using %%r8 as index: abuf0/abuf1 are passed as extra register
 *    operands (%6/%7) and blended with YSCALEYUV2RGB_YA;
 *  - alpha, using REG_BP as index: abuf0/abuf1 are stashed in c->u_temp and
 *    c->v_temp and reloaded into operands %0/%1 inside the asm; REG_b is
 *    saved to ESP_OFFSET(%5), loaded with dest (%4) and restored at the
 *    end; REG_BP is pushed and popped around the loop;
 *  - no alpha: same REG_b/REG_BP handling, mm7 set to all ones as the
 *    alpha byte source.
 * In the alpha variants the blended alpha is shifted right by 3 and packed
 * into mm1. The width is read from 8280(%5).
 * NOTE(review): the preprocessor conditionals selecting between the first
 * two variants, operand %5, the clobber lists, the asm wrappers, else
 * branches and braces are not visible in this excerpt.
 */
980 static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
981 const uint16_t *buf1, const uint16_t *ubuf0,
982 const uint16_t *ubuf1, const uint16_t *vbuf0,
983 const uint16_t *vbuf1, const uint16_t *abuf0,
984 const uint16_t *abuf1, uint8_t *dest,
985 int dstW, int yalpha, int uvalpha, int y)
987 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
990 YSCALEYUV2RGB(%%r8, %5)
991 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
992 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
993 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
994 "packuswb %%mm7, %%mm1 \n\t"
995 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
996 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
998 "r" (abuf0), "r" (abuf1)
1002 *(const uint16_t **)(&c->u_temp)=abuf0;
1003 *(const uint16_t **)(&c->v_temp)=abuf1;
1005 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1006 "mov %4, %%"REG_b" \n\t"
1007 "push %%"REG_BP" \n\t"
1008 YSCALEYUV2RGB(%%REGBP, %5)
1011 "mov "U_TEMP"(%5), %0 \n\t"
1012 "mov "V_TEMP"(%5), %1 \n\t"
1013 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1014 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1015 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1016 "packuswb %%mm7, %%mm1 \n\t"
1019 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1020 "pop %%"REG_BP" \n\t"
1021 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1022 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1028 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1029 "mov %4, %%"REG_b" \n\t"
1030 "push %%"REG_BP" \n\t"
1031 YSCALEYUV2RGB(%%REGBP, %5)
1032 "pcmpeqd %%mm7, %%mm7 \n\t"
1033 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1034 "pop %%"REG_BP" \n\t"
1035 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1036 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Write one line of packed 24-bit pixels computed from two source lines.
 *
 * buf0/buf1 and ubuf0/ubuf1 are asm operands %0-%3; dest is a memory operand
 * that is loaded into REG_b. The colour stage is YSCALEYUV2RGB and the store
 * is done by WRITEBGR24, with mm7 cleared beforehand. vbuf0, vbuf1, abuf0,
 * abuf1, dstW, yalpha, uvalpha and y are not referenced in the visible lines.
 */
1042 static inline void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0,
1043 const uint16_t *buf1, const uint16_t *ubuf0,
1044 const uint16_t *ubuf1, const uint16_t *vbuf0,
1045 const uint16_t *vbuf1, const uint16_t *abuf0,
1046 const uint16_t *abuf1, uint8_t *dest,
1047 int dstW, int yalpha, int uvalpha, int y)
1049 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
/* save REG_b in the context and reuse it for dest; REG_BP is the index */
1051 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1052 "mov %4, %%"REG_b" \n\t"
1053 "push %%"REG_BP" \n\t"
1054 YSCALEYUV2RGB(%%REGBP, %5)
1055 "pxor %%mm7, %%mm7 \n\t"
1056 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1057 "pop %%"REG_BP" \n\t"
1058 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1059 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Write one line of pixels through WRITERGB15, computed from two source lines.
 *
 * After YSCALEYUV2RGB the BLUE_DITHER / GREEN_DITHER / RED_DITHER values from
 * the context are added to B, G and R with unsigned byte saturation before
 * the store macro runs.
 * NOTE(review): any preprocessor guard around the dither adds is not visible
 * in this excerpt.
 */
1064 static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
1065 const uint16_t *buf1, const uint16_t *ubuf0,
1066 const uint16_t *ubuf1, const uint16_t *vbuf0,
1067 const uint16_t *vbuf1, const uint16_t *abuf0,
1068 const uint16_t *abuf1, uint8_t *dest,
1069 int dstW, int yalpha, int uvalpha, int y)
1071 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
/* save REG_b in the context and reuse it for dest; REG_BP is the index */
1073 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1074 "mov %4, %%"REG_b" \n\t"
1075 "push %%"REG_BP" \n\t"
1076 YSCALEYUV2RGB(%%REGBP, %5)
1077 "pxor %%mm7, %%mm7 \n\t"
1078 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1080 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1081 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1082 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1084 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1085 "pop %%"REG_BP" \n\t"
1086 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1087 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Write one line of pixels through WRITERGB16, computed from two source lines.
 *
 * Same sequence as the WRITERGB15 variant: YSCALEYUV2RGB, then the
 * BLUE_DITHER / GREEN_DITHER / RED_DITHER values from the context are added
 * to B, G and R with unsigned byte saturation, then the store macro.
 * NOTE(review): any preprocessor guard around the dither adds is not visible
 * in this excerpt.
 */
1092 static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
1093 const uint16_t *buf1, const uint16_t *ubuf0,
1094 const uint16_t *ubuf1, const uint16_t *vbuf0,
1095 const uint16_t *vbuf1, const uint16_t *abuf0,
1096 const uint16_t *abuf1, uint8_t *dest,
1097 int dstW, int yalpha, int uvalpha, int y)
1099 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
/* save REG_b in the context and reuse it for dest; REG_BP is the index */
1101 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1102 "mov %4, %%"REG_b" \n\t"
1103 "push %%"REG_BP" \n\t"
1104 YSCALEYUV2RGB(%%REGBP, %5)
1105 "pxor %%mm7, %%mm7 \n\t"
1106 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1108 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1109 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1110 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1112 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1113 "pop %%"REG_BP" \n\t"
1114 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1115 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Two-line vertical blend for packed YUV output (no colour conversion).
 *
 * Setup: the words at CHR_MMX_FILTER_OFFSET+8 and LUM_MMX_FILTER_OFFSET+8
 * relative to c are shifted right by 3 and written back in place, then index
 * is zeroed.
 * Per iteration: %2/%3 are the two chroma lines, %0/%1 the two luma lines.
 * Each result is (line0 - line1) * weight >> 16 plus line1 >> 7. The results
 * are mm3 (chroma at index), mm4 (chroma at index + UV_OFFx2(c)) and mm1/mm7
 * (two groups of four luma words).
 * NOTE(review): the loop label between setup and body is not visible in this
 * excerpt. YSCALEYUV2PACKED below only forwards to the REAL_ macro.
 */
1120 #define REAL_YSCALEYUV2PACKED(index, c) \
1121 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
1122 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
1123 "psraw $3, %%mm0 \n\t"\
1124 "psraw $3, %%mm1 \n\t"\
1125 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1126 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1127 "xor "#index", "#index" \n\t"\
1130 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1131 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1132 "add "UV_OFFx2"("#c"), "#index" \n\t" \
1133 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1134 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1135 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
1136 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
1137 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
1138 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
1139 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
1140 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
1141 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
1142 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
1143 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
1144 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
1145 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
1146 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
1147 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax+4]*/\
1148 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax+4]*/\
1149 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
1150 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
1151 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1152 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
1153 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
1154 "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
1155 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
1156 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\
1158 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/*
 * Write one line of packed output through WRITEYUY2, blended from two source
 * lines with YSCALEYUV2PACKED.
 *
 * buf0/buf1 and ubuf0/ubuf1 are asm operands %0-%3; dest is a memory operand
 * that is loaded into REG_b. vbuf0, vbuf1, abuf0, abuf1, dstW, yalpha,
 * uvalpha and y are not referenced in the visible lines.
 */
1160 static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
1161 const uint16_t *buf1, const uint16_t *ubuf0,
1162 const uint16_t *ubuf1, const uint16_t *vbuf0,
1163 const uint16_t *vbuf1, const uint16_t *abuf0,
1164 const uint16_t *abuf1, uint8_t *dest,
1165 int dstW, int yalpha, int uvalpha, int y)
1167 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
/* save REG_b in the context and reuse it for dest; REG_BP is the index */
1169 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1170 "mov %4, %%"REG_b" \n\t"
1171 "push %%"REG_BP" \n\t"
1172 YSCALEYUV2PACKED(%%REGBP, %5)
1173 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1174 "pop %%"REG_BP" \n\t"
1175 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1176 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Single-line YUV->RGB kernel: no vertical blending at all.
 *
 * Luma is read from operand %0 only and chroma from operand %2 only (at index
 * and at index + UV_OFFx2(c)); both are shifted right by 4 before the offsets
 * and coefficients relative to c are applied.
 * On exit mm2, mm4 and mm5 hold B, G and R as eight unsigned bytes each.
 * NOTE(review): the loop label after the initial xor is not visible in this
 * excerpt. YSCALEYUV2RGB1 below only forwards to the REAL_ macro.
 */
1181 #define REAL_YSCALEYUV2RGB1(index, c) \
1182 "xor "#index", "#index" \n\t"\
1185 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1186 "add "UV_OFFx2"("#c"), "#index" \n\t" \
1187 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1188 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
1189 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
1190 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
1191 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1192 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1193 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1194 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1195 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1196 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1197 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1198 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1199 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1200 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
1201 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
1202 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1203 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1204 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1205 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1206 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1207 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1208 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1209 "paddw %%mm3, %%mm4 \n\t"\
1210 "movq %%mm2, %%mm0 \n\t"\
1211 "movq %%mm5, %%mm6 \n\t"\
1212 "movq %%mm4, %%mm3 \n\t"\
1213 "punpcklwd %%mm2, %%mm2 \n\t"\
1214 "punpcklwd %%mm5, %%mm5 \n\t"\
1215 "punpcklwd %%mm4, %%mm4 \n\t"\
1216 "paddw %%mm1, %%mm2 \n\t"\
1217 "paddw %%mm1, %%mm5 \n\t"\
1218 "paddw %%mm1, %%mm4 \n\t"\
1219 "punpckhwd %%mm0, %%mm0 \n\t"\
1220 "punpckhwd %%mm6, %%mm6 \n\t"\
1221 "punpckhwd %%mm3, %%mm3 \n\t"\
1222 "paddw %%mm7, %%mm0 \n\t"\
1223 "paddw %%mm7, %%mm6 \n\t"\
1224 "paddw %%mm7, %%mm3 \n\t"\
1225 /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1: from Y1, 2: from Y2) */\
1226 "packuswb %%mm0, %%mm2 \n\t"\
1227 "packuswb %%mm6, %%mm5 \n\t"\
1228 "packuswb %%mm3, %%mm4 \n\t"\
1230 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
1232 // do vertical chrominance interpolation
/*
 * Single-luma-line YUV->RGB kernel that combines two chroma lines.
 *
 * Chroma is read from operands %2 and %3, summed and logically shifted right
 * by 5; luma is read from operand %0 only and shifted right by 4. The
 * remaining steps match REAL_YSCALEYUV2RGB1.
 * On exit mm2, mm4 and mm5 hold B, G and R as eight unsigned bytes each.
 * NOTE(review): the loop label after the initial xor is not visible in this
 * excerpt. YSCALEYUV2RGB1b below only forwards to the REAL_ macro.
 */
1233 #define REAL_YSCALEYUV2RGB1b(index, c) \
1234 "xor "#index", "#index" \n\t"\
1237 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1238 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1239 "add "UV_OFFx2"("#c"), "#index" \n\t" \
1240 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1241 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1242 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
1243 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1244 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1245 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
1246 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
1247 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1248 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1249 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1250 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1251 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1252 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1253 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1254 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1255 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1256 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
1257 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
1258 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1259 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1260 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1261 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1262 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1263 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1264 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1265 "paddw %%mm3, %%mm4 \n\t"\
1266 "movq %%mm2, %%mm0 \n\t"\
1267 "movq %%mm5, %%mm6 \n\t"\
1268 "movq %%mm4, %%mm3 \n\t"\
1269 "punpcklwd %%mm2, %%mm2 \n\t"\
1270 "punpcklwd %%mm5, %%mm5 \n\t"\
1271 "punpcklwd %%mm4, %%mm4 \n\t"\
1272 "paddw %%mm1, %%mm2 \n\t"\
1273 "paddw %%mm1, %%mm5 \n\t"\
1274 "paddw %%mm1, %%mm4 \n\t"\
1275 "punpckhwd %%mm0, %%mm0 \n\t"\
1276 "punpckhwd %%mm6, %%mm6 \n\t"\
1277 "punpckhwd %%mm3, %%mm3 \n\t"\
1278 "paddw %%mm7, %%mm0 \n\t"\
1279 "paddw %%mm7, %%mm6 \n\t"\
1280 "paddw %%mm7, %%mm3 \n\t"\
1281 /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1: from Y1, 2: from Y2) */\
1282 "packuswb %%mm0, %%mm2 \n\t"\
1283 "packuswb %%mm6, %%mm5 \n\t"\
1284 "packuswb %%mm3, %%mm4 \n\t"\
1286 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * Load eight 16-bit alpha samples from operand %1 at element offset index,
 * shift each right by 7 and pack them with unsigned saturation into the eight
 * bytes of mm7. mm1 is clobbered.
 * YSCALEYUV2RGB1_ALPHA below only forwards to the REAL_ macro.
 */
1288 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1289 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
1290 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
1291 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
1292 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
1293 "packuswb %%mm1, %%mm7 \n\t"
1294 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1297 * YV12 to RGB without scaling or interpolating
/*
 * Write one line of 32-bit pixels from a single luma line.
 *
 * Four asm variants are visible, chosen by two tests:
 *  - uvalpha < 2048 uses YSCALEYUV2RGB1 (chroma from ubuf0 only); the later
 *    pair uses YSCALEYUV2RGB1b (chroma from ubuf0 and ubuf1 combined);
 *  - CONFIG_SWSCALE_ALPHA && c->alpPixBuf passes abuf0 as operand %1 so that
 *    YSCALEYUV2RGB1_ALPHA can read it; otherwise mm7 is set to all ones.
 * vbuf0, vbuf1, dstW and dstFormat are not referenced in the visible lines.
 * NOTE(review): the else branches and the operand that %5 refers to are not
 * visible in this excerpt.
 */
1299 static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
1300 const uint16_t *ubuf0, const uint16_t *ubuf1,
1301 const uint16_t *vbuf0, const uint16_t *vbuf1,
1302 const uint16_t *abuf0, uint8_t *dest,
1303 int dstW, int uvalpha, enum PixelFormat dstFormat,
1306 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1308 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1309 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
/* save REG_b in the context and reuse it for dest; REG_BP is the index */
1311 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1312 "mov %4, %%"REG_b" \n\t"
1313 "push %%"REG_BP" \n\t"
1314 YSCALEYUV2RGB1(%%REGBP, %5)
1315 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1316 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1317 "pop %%"REG_BP" \n\t"
1318 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1319 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1324 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1325 "mov %4, %%"REG_b" \n\t"
1326 "push %%"REG_BP" \n\t"
1327 YSCALEYUV2RGB1(%%REGBP, %5)
1328 "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones */
1329 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1330 "pop %%"REG_BP" \n\t"
1331 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1332 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1337 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1339 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1340 "mov %4, %%"REG_b" \n\t"
1341 "push %%"REG_BP" \n\t"
1342 YSCALEYUV2RGB1b(%%REGBP, %5)
1343 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1344 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1345 "pop %%"REG_BP" \n\t"
1346 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1347 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1352 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1353 "mov %4, %%"REG_b" \n\t"
1354 "push %%"REG_BP" \n\t"
1355 YSCALEYUV2RGB1b(%%REGBP, %5)
1356 "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones */
1357 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1358 "pop %%"REG_BP" \n\t"
1359 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1360 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Write one line of packed 24-bit pixels from a single luma line.
 *
 * Under uvalpha < 2048 the kernel is YSCALEYUV2RGB1 (chroma from ubuf0 only);
 * the second asm statement uses YSCALEYUV2RGB1b (ubuf0 and ubuf1 combined).
 * Both clear mm7 and store through WRITEBGR24. vbuf0, vbuf1, abuf0, dstW and
 * dstFormat are not referenced in the visible lines.
 */
1367 static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
1368 const uint16_t *ubuf0, const uint16_t *ubuf1,
1369 const uint16_t *vbuf0, const uint16_t *vbuf1,
1370 const uint16_t *abuf0, uint8_t *dest,
1371 int dstW, int uvalpha, enum PixelFormat dstFormat,
1374 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1376 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
/* save REG_b in the context and reuse it for dest; REG_BP is the index */
1378 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1379 "mov %4, %%"REG_b" \n\t"
1380 "push %%"REG_BP" \n\t"
1381 YSCALEYUV2RGB1(%%REGBP, %5)
1382 "pxor %%mm7, %%mm7 \n\t"
1383 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1384 "pop %%"REG_BP" \n\t"
1385 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1386 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1391 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1392 "mov %4, %%"REG_b" \n\t"
1393 "push %%"REG_BP" \n\t"
1394 YSCALEYUV2RGB1b(%%REGBP, %5)
1395 "pxor %%mm7, %%mm7 \n\t"
1396 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1397 "pop %%"REG_BP" \n\t"
1398 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1399 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Write one line of pixels through WRITERGB15 from a single luma line.
 *
 * Under uvalpha < 2048 the kernel is YSCALEYUV2RGB1 (chroma from ubuf0 only);
 * the second asm statement uses YSCALEYUV2RGB1b (ubuf0 and ubuf1 combined).
 * Both add the BLUE_DITHER / GREEN_DITHER / RED_DITHER values from the
 * context to B, G and R with unsigned byte saturation before storing.
 * NOTE(review): any preprocessor guard around the dither adds is not visible
 * in this excerpt.
 */
1405 static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
1406 const uint16_t *ubuf0, const uint16_t *ubuf1,
1407 const uint16_t *vbuf0, const uint16_t *vbuf1,
1408 const uint16_t *abuf0, uint8_t *dest,
1409 int dstW, int uvalpha, enum PixelFormat dstFormat,
1412 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1414 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
/* save REG_b in the context and reuse it for dest; REG_BP is the index */
1416 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1417 "mov %4, %%"REG_b" \n\t"
1418 "push %%"REG_BP" \n\t"
1419 YSCALEYUV2RGB1(%%REGBP, %5)
1420 "pxor %%mm7, %%mm7 \n\t"
1421 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1423 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1424 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1425 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1427 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1428 "pop %%"REG_BP" \n\t"
1429 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1430 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1435 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1436 "mov %4, %%"REG_b" \n\t"
1437 "push %%"REG_BP" \n\t"
1438 YSCALEYUV2RGB1b(%%REGBP, %5)
1439 "pxor %%mm7, %%mm7 \n\t"
1440 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1442 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1443 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1444 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1446 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1447 "pop %%"REG_BP" \n\t"
1448 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1449 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Write one line of pixels through WRITERGB16 from a single luma line.
 *
 * Under uvalpha < 2048 the kernel is YSCALEYUV2RGB1 (chroma from ubuf0 only);
 * the second asm statement uses YSCALEYUV2RGB1b (ubuf0 and ubuf1 combined).
 * Both add the BLUE_DITHER / GREEN_DITHER / RED_DITHER values from the
 * context to B, G and R with unsigned byte saturation before storing.
 * NOTE(review): any preprocessor guard around the dither adds is not visible
 * in this excerpt.
 */
1455 static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
1456 const uint16_t *ubuf0, const uint16_t *ubuf1,
1457 const uint16_t *vbuf0, const uint16_t *vbuf1,
1458 const uint16_t *abuf0, uint8_t *dest,
1459 int dstW, int uvalpha, enum PixelFormat dstFormat,
1462 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1464 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
/* save REG_b in the context and reuse it for dest; REG_BP is the index */
1466 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1467 "mov %4, %%"REG_b" \n\t"
1468 "push %%"REG_BP" \n\t"
1469 YSCALEYUV2RGB1(%%REGBP, %5)
1470 "pxor %%mm7, %%mm7 \n\t"
1471 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1473 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1474 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1475 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1477 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1478 "pop %%"REG_BP" \n\t"
1479 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1480 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1485 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1486 "mov %4, %%"REG_b" \n\t"
1487 "push %%"REG_BP" \n\t"
1488 YSCALEYUV2RGB1b(%%REGBP, %5)
1489 "pxor %%mm7, %%mm7 \n\t"
1490 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1492 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1493 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1494 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1496 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1497 "pop %%"REG_BP" \n\t"
1498 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1499 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Single-line loader for packed YUV output: no blending, no colour
 * conversion.
 *
 * Chroma is read from operand %2 (at index and at index + UV_OFFx2(c)) into
 * mm3/mm4 and luma from operand %0 into mm1/mm7; every word is arithmetically
 * shifted right by 7.
 * NOTE(review): the loop label after the initial xor is not visible in this
 * excerpt. YSCALEYUV2PACKED1 below only forwards to the REAL_ macro.
 */
1505 #define REAL_YSCALEYUV2PACKED1(index, c) \
1506 "xor "#index", "#index" \n\t"\
1509 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1510 "add "UV_OFFx2"("#c"), "#index" \n\t" \
1511 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1512 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
1513 "psraw $7, %%mm3 \n\t" \
1514 "psraw $7, %%mm4 \n\t" \
1515 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1516 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1517 "psraw $7, %%mm1 \n\t" \
1518 "psraw $7, %%mm7 \n\t" \
1520 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * Single-luma-line loader for packed YUV output that combines two chroma
 * lines.
 *
 * Chroma words from operands %2 and %3 are summed and logically shifted right
 * by 8 into mm3/mm4; luma from operand %0 is arithmetically shifted right by
 * 7 into mm1/mm7.
 * NOTE(review): the loop label after the initial xor is not visible in this
 * excerpt. YSCALEYUV2PACKED1b below only forwards to the REAL_ macro.
 */
1522 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1523 "xor "#index", "#index" \n\t"\
1526 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1527 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1528 "add "UV_OFFx2"("#c"), "#index" \n\t" \
1529 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1530 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1531 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
1532 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1533 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1534 "psrlw $8, %%mm3 \n\t" \
1535 "psrlw $8, %%mm4 \n\t" \
1536 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1537 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1538 "psraw $7, %%mm1 \n\t" \
1539 "psraw $7, %%mm7 \n\t"
1540 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
/*
 * Write one line of packed output through WRITEYUY2 from a single luma line.
 *
 * Under uvalpha < 2048 the loader is YSCALEYUV2PACKED1 (chroma from ubuf0
 * only); the second asm statement uses YSCALEYUV2PACKED1b (ubuf0 and ubuf1
 * combined). vbuf0, vbuf1, abuf0, dstW and dstFormat are not referenced in
 * the visible lines.
 */
1542 static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
1543 const uint16_t *ubuf0, const uint16_t *ubuf1,
1544 const uint16_t *vbuf0, const uint16_t *vbuf1,
1545 const uint16_t *abuf0, uint8_t *dest,
1546 int dstW, int uvalpha, enum PixelFormat dstFormat,
1549 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1551 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
/* save REG_b in the context and reuse it for dest; REG_BP is the index */
1553 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1554 "mov %4, %%"REG_b" \n\t"
1555 "push %%"REG_BP" \n\t"
1556 YSCALEYUV2PACKED1(%%REGBP, %5)
1557 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1558 "pop %%"REG_BP" \n\t"
1559 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1560 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1565 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1566 "mov %4, %%"REG_b" \n\t"
1567 "push %%"REG_BP" \n\t"
1568 YSCALEYUV2PACKED1b(%%REGBP, %5)
1569 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1570 "pop %%"REG_BP" \n\t"
1571 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1572 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1578 #if !COMPILE_TEMPLATE_MMX2
1579 //FIXME yuy2* can read up to 7 samples too much
/*
 * Copy every other byte of src to dst, eight output bytes per step.
 *
 * The pointers are biased by width so that REG_a can run upwards from -width:
 * 16 bytes are read at src + 2*(width + REG_a), masked with bm01010101,
 * packed to 8 bytes and stored at dst + width + REG_a.
 * NOTE(review): the mask presumably keeps the low byte of each 16-bit word;
 * its value, the loop label and the loop branch are not visible here.
 */
1581 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1584 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1585 "mov %0, %%"REG_a" \n\t"
1587 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1588 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1589 "pand %%mm2, %%mm0 \n\t"
1590 "pand %%mm2, %%mm1 \n\t"
1591 "packuswb %%mm1, %%mm0 \n\t"
1592 "movq %%mm0, (%2, %%"REG_a") \n\t"
1593 "add $8, %%"REG_a" \n\t"
1595 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
/*
 * Split the high bytes of src1's 16-bit words into two planes, four output
 * bytes per plane per step.
 *
 * The high byte of every word is extracted and packed. Of that packed
 * stream, the bytes selected by the bm01010101 mask go to dstU and the other
 * bytes go to dstV. src2 is only checked to equal src1.
 * NOTE(review): the mask presumably keeps the low byte of each 16-bit word;
 * its value, the loop label and the loop branch are not visible here.
 */
1600 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1603 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1604 "mov %0, %%"REG_a" \n\t"
1606 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1607 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1608 "psrlw $8, %%mm0 \n\t"
1609 "psrlw $8, %%mm1 \n\t"
1610 "packuswb %%mm1, %%mm0 \n\t"
1611 "movq %%mm0, %%mm1 \n\t"
1612 "psrlw $8, %%mm0 \n\t"
1613 "pand %%mm4, %%mm1 \n\t"
1614 "packuswb %%mm0, %%mm0 \n\t"
1615 "packuswb %%mm1, %%mm1 \n\t"
1616 "movd %%mm0, (%3, %%"REG_a") \n\t" /* -> dstV */
1617 "movd %%mm1, (%2, %%"REG_a") \n\t" /* -> dstU */
1618 "add $4, %%"REG_a" \n\t"
1620 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1623 assert(src1 == src2);
/*
 * Keep the high byte of each 16-bit word: src1 feeds dstU and src2 feeds
 * dstV, eight output bytes per plane per step.
 *
 * The pointers are biased by width so that REG_a can run upwards from -width.
 * NOTE(review): the loop label and the loop branch are not visible here.
 */
1626 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1629 "mov %0, %%"REG_a" \n\t"
1631 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1632 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1633 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1634 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1635 "psrlw $8, %%mm0 \n\t"
1636 "psrlw $8, %%mm1 \n\t"
1637 "psrlw $8, %%mm2 \n\t"
1638 "psrlw $8, %%mm3 \n\t"
1639 "packuswb %%mm1, %%mm0 \n\t"
1640 "packuswb %%mm3, %%mm2 \n\t"
1641 "movq %%mm0, (%3, %%"REG_a") \n\t"
1642 "movq %%mm2, (%4, %%"REG_a") \n\t"
1643 "add $8, %%"REG_a" \n\t"
1645 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1650 /* This is almost identical to the previous, and exists only because
1651 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/*
 * Copy the high byte of each 16-bit word of src to dst, eight output bytes
 * per step.
 *
 * The pointers are biased by width so that REG_a can run upwards from -width.
 * NOTE(review): the loop label and the loop branch are not visible here.
 */
1652 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1655 "mov %0, %%"REG_a" \n\t"
1657 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1658 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1659 "psrlw $8, %%mm0 \n\t"
1660 "psrlw $8, %%mm1 \n\t"
1661 "packuswb %%mm1, %%mm0 \n\t"
1662 "movq %%mm0, (%2, %%"REG_a") \n\t"
1663 "add $8, %%"REG_a" \n\t"
1665 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
/*
 * Split the bytes of src1 selected by the bm01010101 mask into two planes,
 * four output bytes per plane per step.
 *
 * The masked bytes are packed. Of that packed stream, the bytes selected by
 * the mask again go to dstU and the other bytes go to dstV. src2 is only
 * checked to equal src1.
 * NOTE(review): the mask presumably keeps the low byte of each 16-bit word;
 * its value, the loop label and the loop branch are not visible here.
 */
1670 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1673 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1674 "mov %0, %%"REG_a" \n\t"
1676 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1677 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1678 "pand %%mm4, %%mm0 \n\t"
1679 "pand %%mm4, %%mm1 \n\t"
1680 "packuswb %%mm1, %%mm0 \n\t"
1681 "movq %%mm0, %%mm1 \n\t"
1682 "psrlw $8, %%mm0 \n\t"
1683 "pand %%mm4, %%mm1 \n\t"
1684 "packuswb %%mm0, %%mm0 \n\t"
1685 "packuswb %%mm1, %%mm1 \n\t"
1686 "movd %%mm0, (%3, %%"REG_a") \n\t" /* -> dstV */
1687 "movd %%mm1, (%2, %%"REG_a") \n\t" /* -> dstU */
1688 "add $4, %%"REG_a" \n\t"
1690 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1693 assert(src1 == src2);
/*
 * Keep the bytes selected by the bm01010101 mask from each 16-bit word: src1
 * feeds dstU and src2 feeds dstV, eight output bytes per plane per step.
 *
 * The pointers are biased by width so that REG_a can run upwards from -width.
 * NOTE(review): the mask presumably keeps the low byte of each 16-bit word;
 * its value, the loop label and the loop branch are not visible here.
 */
1696 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1699 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1700 "mov %0, %%"REG_a" \n\t"
1702 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1703 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1704 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1705 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1706 "pand %%mm4, %%mm0 \n\t"
1707 "pand %%mm4, %%mm1 \n\t"
1708 "pand %%mm4, %%mm2 \n\t"
1709 "pand %%mm4, %%mm3 \n\t"
1710 "packuswb %%mm1, %%mm0 \n\t"
1711 "packuswb %%mm3, %%mm2 \n\t"
1712 "movq %%mm0, (%3, %%"REG_a") \n\t"
1713 "movq %%mm2, (%4, %%"REG_a") \n\t"
1714 "add $8, %%"REG_a" \n\t"
1716 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
/*
 * De-interleave src into two planes, eight output bytes per plane per step.
 *
 * Of each 16-bit word, the byte selected by the bm01010101 mask goes to dst1
 * and the high byte (word >> 8) goes to dst2. The pointers are biased by
 * width so that REG_a can run upwards from -width.
 * NOTE(review): the mask presumably keeps the low byte of each 16-bit word;
 * its value, the loop label and the loop branch are not visible here.
 */
1721 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1722 const uint8_t *src, long width)
1725 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1726 "mov %0, %%"REG_a" \n\t"
1728 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1729 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1730 "movq %%mm0, %%mm2 \n\t"
1731 "movq %%mm1, %%mm3 \n\t"
1732 "pand %%mm4, %%mm0 \n\t"
1733 "pand %%mm4, %%mm1 \n\t"
1734 "psrlw $8, %%mm2 \n\t"
1735 "psrlw $8, %%mm3 \n\t"
1736 "packuswb %%mm1, %%mm0 \n\t"
1737 "packuswb %%mm3, %%mm2 \n\t"
1738 "movq %%mm0, (%2, %%"REG_a") \n\t"
1739 "movq %%mm2, (%3, %%"REG_a") \n\t"
1740 "add $8, %%"REG_a" \n\t"
1742 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
/*
 * De-interleave src1 with nvXXtoUV, passing dstU as the first plane and dstV
 * as the second. src2 and unused are ignored.
 */
1747 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1748 const uint8_t *src1, const uint8_t *src2,
1749 long width, uint32_t *unused)
1751 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
/*
 * De-interleave src1 with nvXXtoUV, with the plane order swapped relative to
 * nv12ToUV: dstV is the first plane and dstU the second. src2 and unused are
 * ignored.
 */
1754 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1755 const uint8_t *src1, const uint8_t *src2,
1756 long width, uint32_t *unused)
1758 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1760 #endif /* !COMPILE_TEMPLATE_MMX2 */
/*
 * Convert packed 24-bit pixels to one byte each, four output bytes per step.
 *
 * srcFormat == PIX_FMT_BGR24 loads the ff_bgr24toY1Coeff / ff_bgr24toY2Coeff
 * pair into mm5/mm6; the other visible pair is ff_rgb24toY1Coeff /
 * ff_rgb24toY2Coeff. Each step widens source bytes to words, multiplies and
 * accumulates them against the coefficients, adds ff_bgr24toYOffset, shifts
 * right by 15 and packs to unsigned bytes stored at dst + width + REG_a.
 * NOTE(review): the loop label, the source-pointer advance, the loop branch
 * and the operand that %0 refers to are not visible in this excerpt.
 */
1762 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1765 if(srcFormat == PIX_FMT_BGR24) {
1767 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1768 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1773 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1774 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1780 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1781 "mov %2, %%"REG_a" \n\t"
1782 "pxor %%mm7, %%mm7 \n\t"
1784 PREFETCH" 64(%0) \n\t"
1785 "movd (%0), %%mm0 \n\t"
1786 "movd 2(%0), %%mm1 \n\t"
1787 "movd 6(%0), %%mm2 \n\t"
1788 "movd 8(%0), %%mm3 \n\t"
1790 "punpcklbw %%mm7, %%mm0 \n\t"
1791 "punpcklbw %%mm7, %%mm1 \n\t"
1792 "punpcklbw %%mm7, %%mm2 \n\t"
1793 "punpcklbw %%mm7, %%mm3 \n\t"
1794 "pmaddwd %%mm5, %%mm0 \n\t"
1795 "pmaddwd %%mm6, %%mm1 \n\t"
1796 "pmaddwd %%mm5, %%mm2 \n\t"
1797 "pmaddwd %%mm6, %%mm3 \n\t"
1798 "paddd %%mm1, %%mm0 \n\t"
1799 "paddd %%mm3, %%mm2 \n\t"
1800 "paddd %%mm4, %%mm0 \n\t"
1801 "paddd %%mm4, %%mm2 \n\t"
1802 "psrad $15, %%mm0 \n\t"
1803 "psrad $15, %%mm2 \n\t"
1804 "packssdw %%mm2, %%mm0 \n\t"
1805 "packuswb %%mm0, %%mm0 \n\t"
1806 "movd %%mm0, (%1, %%"REG_a") \n\t"
1807 "add $4, %%"REG_a" \n\t"
1810 : "r" (dst+width), "g" ((x86_reg)-width)
/*
 * Convert packed 24-bit pixels to two planes, four output bytes per plane per
 * step.
 *
 * Operand %4 points at the coefficient table
 * ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]. Its quadwords at offsets 0 and 8
 * produce the values stored to dstU; those at 16 and 24 (the latter cached in
 * mm6) produce the values stored to dstV. ff_bgr24toUVOffset is added before
 * the shift right by 15 and the pack to unsigned bytes.
 * NOTE(review): the loop label, the source-pointer advance, the loop branch
 * and the operand that %0 refers to are not visible in this excerpt.
 */
1815 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1818 "movq 24(%4), %%mm6 \n\t"
1819 "mov %3, %%"REG_a" \n\t"
1820 "pxor %%mm7, %%mm7 \n\t"
1822 PREFETCH" 64(%0) \n\t"
1823 "movd (%0), %%mm0 \n\t"
1824 "movd 2(%0), %%mm1 \n\t"
1825 "punpcklbw %%mm7, %%mm0 \n\t"
1826 "punpcklbw %%mm7, %%mm1 \n\t"
1827 "movq %%mm0, %%mm2 \n\t"
1828 "movq %%mm1, %%mm3 \n\t"
1829 "pmaddwd (%4), %%mm0 \n\t"
1830 "pmaddwd 8(%4), %%mm1 \n\t"
1831 "pmaddwd 16(%4), %%mm2 \n\t"
1832 "pmaddwd %%mm6, %%mm3 \n\t"
1833 "paddd %%mm1, %%mm0 \n\t"
1834 "paddd %%mm3, %%mm2 \n\t"
1836 "movd 6(%0), %%mm1 \n\t"
1837 "movd 8(%0), %%mm3 \n\t"
1839 "punpcklbw %%mm7, %%mm1 \n\t"
1840 "punpcklbw %%mm7, %%mm3 \n\t"
1841 "movq %%mm1, %%mm4 \n\t"
1842 "movq %%mm3, %%mm5 \n\t"
1843 "pmaddwd (%4), %%mm1 \n\t"
1844 "pmaddwd 8(%4), %%mm3 \n\t"
1845 "pmaddwd 16(%4), %%mm4 \n\t"
1846 "pmaddwd %%mm6, %%mm5 \n\t"
1847 "paddd %%mm3, %%mm1 \n\t"
1848 "paddd %%mm5, %%mm4 \n\t"
1850 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1851 "paddd %%mm3, %%mm0 \n\t"
1852 "paddd %%mm3, %%mm2 \n\t"
1853 "paddd %%mm3, %%mm1 \n\t"
1854 "paddd %%mm3, %%mm4 \n\t"
1855 "psrad $15, %%mm0 \n\t"
1856 "psrad $15, %%mm2 \n\t"
1857 "psrad $15, %%mm1 \n\t"
1858 "psrad $15, %%mm4 \n\t"
1859 "packssdw %%mm1, %%mm0 \n\t"
1860 "packssdw %%mm4, %%mm2 \n\t"
1861 "packuswb %%mm0, %%mm0 \n\t"
1862 "packuswb %%mm2, %%mm2 \n\t"
1863 "movd %%mm0, (%1, %%"REG_a") \n\t" /* -> dstU */
1864 "movd %%mm2, (%2, %%"REG_a") \n\t" /* -> dstV */
1865 "add $4, %%"REG_a" \n\t"
1868 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
/* Forward to bgr24ToY_mmx with PIX_FMT_BGR24; unused is ignored. */
1873 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1875 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
/*
 * Forward to bgr24ToUV_mmx with PIX_FMT_BGR24, reading from src1 only; src2
 * is only checked to equal src1. unused is ignored.
 */
1878 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1880 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1881 assert(src1 == src2);
/* Forward to bgr24ToY_mmx with PIX_FMT_RGB24; unused is ignored. */
1884 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1886 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
/*
 * Forward to bgr24ToUV_mmx with PIX_FMT_RGB24, reading from src1 only. src2
 * and unused are not referenced in the visible lines.
 */
1889 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1892 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1895 #if !COMPILE_TEMPLATE_MMX2
1896 // bilinear / bicubic scaling
/*
 * Horizontal FIR scaling of 8-bit samples to 16-bit output, two outputs per
 * step.
 *
 * filterSize must be a positive multiple of 4 (asserted). Three code paths
 * are visible: specialised loops for filterSize 4 and 8, and a generic path
 * that walks the taps four at a time. In every path the source bytes are
 * widened to words, multiplied and accumulated against the coefficients,
 * summed horizontally, shifted right by 7 and packed with signed saturation.
 * srcW and xInc are not referenced in the visible lines.
 * NOTE(review): several pointer adjustments, loop labels, loop branches and
 * the output-operand lines of the first two paths are not visible in this
 * excerpt.
 */
1897 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
1898 const int16_t *filter, const int16_t *filterPos, long filterSize)
1900 assert(filterSize % 4 == 0 && filterSize>0);
1901 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
1902 x86_reg counter= -2*dstW;
1904 filterPos-= counter/2;
1908 "push %%"REG_b" \n\t"
1910 "pxor %%mm7, %%mm7 \n\t"
1911 "push %%"REG_BP" \n\t" // we use 7 regs here ...
1912 "mov %%"REG_a", %%"REG_BP" \n\t"
/* eax/ebx = source positions of two consecutive outputs */
1915 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
1916 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1917 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
1918 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
1919 "movd (%3, %%"REG_a"), %%mm0 \n\t"
1920 "movd (%3, %%"REG_b"), %%mm2 \n\t"
1921 "punpcklbw %%mm7, %%mm0 \n\t"
1922 "punpcklbw %%mm7, %%mm2 \n\t"
1923 "pmaddwd %%mm1, %%mm0 \n\t"
1924 "pmaddwd %%mm2, %%mm3 \n\t"
1925 "movq %%mm0, %%mm4 \n\t"
1926 "punpckldq %%mm3, %%mm0 \n\t"
1927 "punpckhdq %%mm3, %%mm4 \n\t"
1928 "paddd %%mm4, %%mm0 \n\t"
1929 "psrad $7, %%mm0 \n\t"
1930 "packssdw %%mm0, %%mm0 \n\t"
1931 "movd %%mm0, (%4, %%"REG_BP") \n\t"
1932 "add $4, %%"REG_BP" \n\t"
1935 "pop %%"REG_BP" \n\t"
1937 "pop %%"REG_b" \n\t"
1940 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1945 } else if (filterSize==8) {
1946 x86_reg counter= -2*dstW;
1948 filterPos-= counter/2;
1952 "push %%"REG_b" \n\t"
1954 "pxor %%mm7, %%mm7 \n\t"
1955 "push %%"REG_BP" \n\t" // we use 7 regs here ...
1956 "mov %%"REG_a", %%"REG_BP" \n\t"
/* eax/ebx = source positions of two consecutive outputs */
1959 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
1960 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
/* taps 0-3 of both outputs */
1961 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
1962 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
1963 "movd (%3, %%"REG_a"), %%mm0 \n\t"
1964 "movd (%3, %%"REG_b"), %%mm2 \n\t"
1965 "punpcklbw %%mm7, %%mm0 \n\t"
1966 "punpcklbw %%mm7, %%mm2 \n\t"
1967 "pmaddwd %%mm1, %%mm0 \n\t"
1968 "pmaddwd %%mm2, %%mm3 \n\t"
/* taps 4-7 of both outputs */
1970 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
1971 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
1972 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
1973 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
1974 "punpcklbw %%mm7, %%mm4 \n\t"
1975 "punpcklbw %%mm7, %%mm2 \n\t"
1976 "pmaddwd %%mm1, %%mm4 \n\t"
1977 "pmaddwd %%mm2, %%mm5 \n\t"
1978 "paddd %%mm4, %%mm0 \n\t"
1979 "paddd %%mm5, %%mm3 \n\t"
1980 "movq %%mm0, %%mm4 \n\t"
1981 "punpckldq %%mm3, %%mm0 \n\t"
1982 "punpckhdq %%mm3, %%mm4 \n\t"
1983 "paddd %%mm4, %%mm0 \n\t"
1984 "psrad $7, %%mm0 \n\t"
1985 "packssdw %%mm0, %%mm0 \n\t"
1986 "movd %%mm0, (%4, %%"REG_BP") \n\t"
1987 "add $4, %%"REG_BP" \n\t"
1990 "pop %%"REG_BP" \n\t"
1992 "pop %%"REG_b" \n\t"
1995 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* generic path: offset marks where the inner tap loop stops (src + filterSize) */
2001 const uint8_t *offset = src+filterSize;
2002 x86_reg counter= -2*dstW;
2003 //filter-= counter*filterSize/2;
2004 filterPos-= counter/2;
2007 "pxor %%mm7, %%mm7 \n\t"
2010 "mov %2, %%"REG_c" \n\t"
2011 "movzwl (%%"REG_c", %0), %%eax \n\t"
2012 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2013 "mov %5, %%"REG_c" \n\t"
2014 "pxor %%mm4, %%mm4 \n\t"
2015 "pxor %%mm5, %%mm5 \n\t"
/* mm4/mm5 accumulate the two outputs; %6 is filterSize*2 bytes */
2017 "movq (%1), %%mm1 \n\t"
2018 "movq (%1, %6), %%mm3 \n\t"
2019 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2020 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2021 "punpcklbw %%mm7, %%mm0 \n\t"
2022 "punpcklbw %%mm7, %%mm2 \n\t"
2023 "pmaddwd %%mm1, %%mm0 \n\t"
2024 "pmaddwd %%mm2, %%mm3 \n\t"
2025 "paddd %%mm3, %%mm5 \n\t"
2026 "paddd %%mm0, %%mm4 \n\t"
2028 "add $4, %%"REG_c" \n\t"
2029 "cmp %4, %%"REG_c" \n\t"
2032 "movq %%mm4, %%mm0 \n\t"
2033 "punpckldq %%mm5, %%mm4 \n\t"
2034 "punpckhdq %%mm5, %%mm0 \n\t"
2035 "paddd %%mm0, %%mm4 \n\t"
2036 "psrad $7, %%mm4 \n\t"
2037 "packssdw %%mm4, %%mm4 \n\t"
2038 "mov %3, %%"REG_a" \n\t"
2039 "movd %%mm4, (%%"REG_a", %0) \n\t"
2043 : "+r" (counter), "+r" (filter)
2044 : "m" (filterPos), "m" (dst), "m"(offset),
2045 "m" (src), "r" ((x86_reg)filterSize*2)
2046 : "%"REG_a, "%"REG_c, "%"REG_d
2050 #endif /* !COMPILE_TEMPLATE_MMX2 */
2052 #if COMPILE_TEMPLATE_MMX2
/*
 * Horizontal scaling for one plane that runs the code stored in
 * c->lumMmx2FilterCode through the CALL_MMX2_FILTER_CODE macro, then
 * patches the right-hand edge of the output in C.
 *
 * c         context supplying hLumFilterPos, hLumFilter and lumMmx2FilterCode
 * dst       output samples (int16_t)
 * dstWidth  number of output samples
 * src       8-bit input samples
 * srcW      number of input samples
 * (xInc is used by the edge loop at the end; its parameter line is not
 *  visible in this listing.)
 */
2053 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2054 long dstWidth, const uint8_t *src, int srcW,
2057 int32_t *filterPos = c->hLumFilterPos;
2058 int16_t *filter = c->hLumFilter;
2059 void *mmx2FilterCode= c->lumMmx2FilterCode;
// 8-byte aligned scratch slot; presumably the memory operand %5 that
// REG_b is saved to and restored from below - confirm.
2062 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2067 "mov %%"REG_b", %5 \n\t"
// Register setup: mm7 = 0, REG_c = src, REG_D = dst, REG_d = filter,
// REG_b = filterPos, REG_a = 0.
2069 "pxor %%mm7, %%mm7 \n\t"
2070 "mov %0, %%"REG_c" \n\t"
2071 "mov %1, %%"REG_D" \n\t"
2072 "mov %2, %%"REG_d" \n\t"
2073 "mov %3, %%"REG_b" \n\t"
2074 "xor %%"REG_a", %%"REG_a" \n\t" // i
// Prefetch the start of the source line (PREFETCH expands to a nop comment
// when the template is not built for MMX2).
2075 PREFETCH" (%%"REG_c") \n\t"
2076 PREFETCH" 32(%%"REG_c") \n\t"
2077 PREFETCH" 64(%%"REG_c") \n\t"
/*
 * CALL_MMX2_FILTER_CODE is defined twice below; the trailing
 * "#endif / * ARCH_X86_64 * /" marker indicates the two definitions are
 * alternatives selected on ARCH_X86_64. The visible lines of both variants
 * load a 32-bit value from (REG_b) into esi, add the 32-bit value at
 * (REG_b, REG_a) to the source pointer in REG_c, add REG_a to the
 * destination pointer in REG_D and clear REG_a. The first variant goes
 * through REG_S for the addition, the second adds straight from memory.
 * NOTE(review): the instruction that transfers control to the filter code
 * is not visible in this listing.
 */
2080 #define CALL_MMX2_FILTER_CODE \
2081 "movl (%%"REG_b"), %%esi \n\t"\
2083 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2084 "add %%"REG_S", %%"REG_c" \n\t"\
2085 "add %%"REG_a", %%"REG_D" \n\t"\
2086 "xor %%"REG_a", %%"REG_a" \n\t"\
2089 #define CALL_MMX2_FILTER_CODE \
2090 "movl (%%"REG_b"), %%esi \n\t"\
2092 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2093 "add %%"REG_a", %%"REG_D" \n\t"\
2094 "xor %%"REG_a", %%"REG_a" \n\t"\
2096 #endif /* ARCH_X86_64 */
// The macro is expanded eight times for this plane.
2098 CALL_MMX2_FILTER_CODE
2099 CALL_MMX2_FILTER_CODE
2100 CALL_MMX2_FILTER_CODE
2101 CALL_MMX2_FILTER_CODE
2102 CALL_MMX2_FILTER_CODE
2103 CALL_MMX2_FILTER_CODE
2104 CALL_MMX2_FILTER_CODE
2105 CALL_MMX2_FILTER_CODE
// Restore REG_b from the slot it was saved to at the top.
2108 "mov %5, %%"REG_b" \n\t"
// Operands: %0 src, %1 dst, %2 filter, %3 filterPos, %4 mmx2FilterCode.
2110 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
2111 "m" (mmx2FilterCode)
2115 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Walking back from the last output sample: every sample whose source
// position (i*xInc)>>16 is at or beyond the last input sample is
// overwritten with that last input sample scaled by 128.
2121 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2122 dst[i] = src[srcW-1]*128;
/*
 * Two-plane counterpart of the fast horizontal scaler: runs the code stored
 * in c->chrMmx2FilterCode through CALL_MMX2_FILTER_CODE, first for
 * src1 -> dst1 and then for src2 -> dst2, and patches the right-hand edge
 * of both outputs in C.
 *
 * c           context supplying hChrFilterPos, hChrFilter, chrMmx2FilterCode
 * dst1, dst2  output samples (int16_t) of the two planes
 * dstWidth    number of output samples per plane
 * src1, src2  8-bit input samples of the two planes
 * srcW        number of input samples per plane
 * xInc        step used by the edge loop as (i*xInc)>>16
 */
2125 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
2126 long dstWidth, const uint8_t *src1,
2127 const uint8_t *src2, int srcW, int xInc)
2129 int32_t *filterPos = c->hChrFilterPos;
2130 int16_t *filter = c->hChrFilter;
2131 void *mmx2FilterCode= c->chrMmx2FilterCode;
// 8-byte aligned scratch slot; presumably the memory operand %7 that
// REG_b is saved to and restored from below - confirm.
2134 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2139 "mov %%"REG_b", %7 \n\t"
// First plane: mm7 = 0, REG_c = src1, REG_D = dst1, REG_d = filter,
// REG_b = filterPos, REG_a = 0.
2141 "pxor %%mm7, %%mm7 \n\t"
2142 "mov %0, %%"REG_c" \n\t"
2143 "mov %1, %%"REG_D" \n\t"
2144 "mov %2, %%"REG_d" \n\t"
2145 "mov %3, %%"REG_b" \n\t"
2146 "xor %%"REG_a", %%"REG_a" \n\t" // i
2147 PREFETCH" (%%"REG_c") \n\t"
2148 PREFETCH" 32(%%"REG_c") \n\t"
2149 PREFETCH" 64(%%"REG_c") \n\t"
// Four macro expansions for the first plane.
2151 CALL_MMX2_FILTER_CODE
2152 CALL_MMX2_FILTER_CODE
2153 CALL_MMX2_FILTER_CODE
2154 CALL_MMX2_FILTER_CODE
// Second plane: reset the index and switch to src2 / dst2; REG_d and REG_b
// are not reloaded in the lines shown here.
2155 "xor %%"REG_a", %%"REG_a" \n\t" // i
2156 "mov %5, %%"REG_c" \n\t" // src
2157 "mov %6, %%"REG_D" \n\t" // buf2
2158 PREFETCH" (%%"REG_c") \n\t"
2159 PREFETCH" 32(%%"REG_c") \n\t"
2160 PREFETCH" 64(%%"REG_c") \n\t"
// Four macro expansions for the second plane.
2162 CALL_MMX2_FILTER_CODE
2163 CALL_MMX2_FILTER_CODE
2164 CALL_MMX2_FILTER_CODE
2165 CALL_MMX2_FILTER_CODE
// Restore REG_b from the slot it was saved to at the top.
2168 "mov %7, %%"REG_b" \n\t"
// Operands: %0 src1, %1 dst1, %2 filter, %3 filterPos, %4 mmx2FilterCode,
// %5 src2, %6 dst2.
2170 :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
2171 "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
2175 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Walking back from the last output sample: every sample whose source
// position (i*xInc)>>16 is at or beyond the last input sample is
// overwritten, in both planes, with that plane's last input sample
// scaled by 128.
2181 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2182 dst1[i] = src1[srcW-1]*128;
2183 dst2[i] = src2[srcW-1]*128;
2186 #endif /* COMPILE_TEMPLATE_MMX2 */
2188 #if !COMPILE_TEMPLATE_MMX2
/*
 * Refresh, for output line dstY, the dither values and the
 * lumMmxFilter / chrMmxFilter / alpMmxFilter tables held in c.
 *
 * c             scaler context that is read and updated
 * dstY          output line being prepared
 * lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf
 *               used only to locate the first needed entry inside
 *               lumPixBuf / chrUPixBuf / alpPixBuf (see the *SrcPtr
 *               computations below)
 */
2189 static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
2190 int lastInLumBuf, int lastInChrBuf)
2192 const int dstH= c->dstH;
2193 const int flags= c->flags;
2194 int16_t **lumPixBuf= c->lumPixBuf;
2195 int16_t **chrUPixBuf= c->chrUPixBuf;
2196 int16_t **alpPixBuf= c->alpPixBuf;
2197 const int vLumBufSize= c->vLumBufSize;
2198 const int vChrBufSize= c->vChrBufSize;
2199 int16_t *vLumFilterPos= c->vLumFilterPos;
2200 int16_t *vChrFilterPos= c->vChrFilterPos;
2201 int16_t *vLumFilter= c->vLumFilter;
2202 int16_t *vChrFilter= c->vChrFilter;
2203 int32_t *lumMmxFilter= c->lumMmxFilter;
2204 int32_t *chrMmxFilter= c->chrMmxFilter;
2205 int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2206 const int vLumFilterSize= c->vLumFilterSize;
2207 const int vChrFilterSize= c->vChrFilterSize;
// Chroma output line corresponding to dstY.
2208 const int chrDstY= dstY>>c->chrDstVSubSample;
2209 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2210 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
// Dither entries alternate with the parity of dstY; red uses the opposite
// parity from blue.
2212 c->blueDither= ff_dither8[dstY&1];
2213 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2214 c->greenDither= ff_dither8[dstY&1];
// NOTE(review): presumably the else branch of the test above (the else
// line is not visible in this listing): other formats take ff_dither4.
2216 c->greenDither= ff_dither4[dstY&1];
2217 c->redDither= ff_dither8[(dstY+1)&1];
// The filter tables are only rebuilt while dstY < dstH - 2.
2218 if (dstY < dstH - 2) {
// Entry in each pointer array for the first needed source line, located
// relative to the newest buffered line (index *BufIndex, line lastIn*Buf).
// NOTE(review): the added v*BufSize presumably relies on the pointer arrays
// holding their entries twice so the result never wraps - confirm.
2219 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2220 const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
// Alpha follows the luma geometry; NULL when alpha is disabled or absent.
2221 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2223 if (flags & SWS_ACCURATE_RND) {
// Packed layout: taps are consumed in pairs (i += 2) and every pair
// occupies s*2 int32 slots, i.e. APCK_SIZE bytes.
2224 int s= APCK_SIZE / 8;
2225 for (i=0; i<vLumFilterSize; i+=2) {
// Two source-line pointers per entry; with a single tap the second pointer
// repeats the first.
2226 *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2227 *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
// Both coefficients of the pair are combined into one 32-bit word (second
// tap shifted up by 16) and that word is stored twice.
// NOTE(review): the coefficients are int16_t, so a negative second tap is
// left-shifted as a negative int here - confirm this is acceptable.
2228 lumMmxFilter[s*i+APCK_COEF/4 ]=
2229 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2230 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
// Alpha reuses the luma coefficients with its own source pointers.
2231 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2232 *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2233 *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2234 alpMmxFilter[s*i+APCK_COEF/4 ]=
2235 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
// Same packed layout for the chroma table.
2238 for (i=0; i<vChrFilterSize; i+=2) {
2239 *(const void**)&chrMmxFilter[s*i ]= chrUSrcPtr[i ];
2240 *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrUSrcPtr[i+(vChrFilterSize>1)];
2241 chrMmxFilter[s*i+APCK_COEF/4 ]=
2242 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2243 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
// NOTE(review): presumably the non-SWS_ACCURATE_RND layout (the else line is
// not visible in this listing). Each tap takes four int32 slots: the source
// pointer is written at slot 0, and slots 2 and 3 hold the coefficient
// replicated into both 16-bit halves (multiplication by 0x10001).
2246 for (i=0; i<vLumFilterSize; i++) {
2247 *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
2248 lumMmxFilter[4*i+2]=
2249 lumMmxFilter[4*i+3]=
2250 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
// Alpha reuses the luma coefficients with its own source pointers.
2251 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2252 *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
2253 alpMmxFilter[4*i+2]=
2254 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
// Same four-slot layout for the chroma table.
2257 for (i=0; i<vChrFilterSize; i++) {
2258 *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
2259 chrMmxFilter[4*i+2]=
2260 chrMmxFilter[4*i+3]=
2261 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2266 #endif /* !COMPILE_TEMPLATE_MMX2 */
/*
 * Install this template's routines into the function pointers of c,
 * selected by c->flags, c->dstFormat, c->srcFormat and the
 * COMPILE_TEMPLATE_MMX2 build switch.
 *
 * NOTE(review): several case labels, else lines and closing braces of this
 * function are not visible in this listing; comments that depend on them
 * are marked as assumptions.
 */
2268 static void RENAME(sws_init_swScale)(SwsContext *c)
2270 enum PixelFormat srcFormat = c->srcFormat;
// The vertical scaler / packed output routines are only installed when
// SWS_BITEXACT is not requested.
2272 if (!(c->flags & SWS_BITEXACT)) {
// SWS_ACCURATE_RND selects the _ar variants.
2273 if (c->flags & SWS_ACCURATE_RND) {
2274 c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
2275 c->yuv2yuvX = RENAME(yuv2yuvX_ar );
2276 switch (c->dstFormat) {
2277 case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
2278 case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
2279 case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
2280 case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
2281 case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
// NOTE(review): presumably the branch without SWS_ACCURATE_RND - confirm.
2285 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2286 c->yuv2yuvX = RENAME(yuv2yuvX );
2287 switch (c->dstFormat) {
2288 case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
2289 case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
2290 case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
2291 case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
2292 case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
// yuv2packed1 / yuv2packed2 routines per destination format.
// NOTE(review): the case labels of the rgb32 and bgr24 entries are not
// visible in this listing.
2296 switch (c->dstFormat) {
2298 c->yuv2packed1 = RENAME(yuv2rgb32_1);
2299 c->yuv2packed2 = RENAME(yuv2rgb32_2);
2302 c->yuv2packed1 = RENAME(yuv2bgr24_1);
2303 c->yuv2packed2 = RENAME(yuv2bgr24_2);
2305 case PIX_FMT_RGB555:
2306 c->yuv2packed1 = RENAME(yuv2rgb555_1);
2307 c->yuv2packed2 = RENAME(yuv2rgb555_2);
2309 case PIX_FMT_RGB565:
2310 c->yuv2packed1 = RENAME(yuv2rgb565_1);
2311 c->yuv2packed2 = RENAME(yuv2rgb565_2);
2313 case PIX_FMT_YUYV422:
2314 c->yuv2packed1 = RENAME(yuv2yuyv422_1);
2315 c->yuv2packed2 = RENAME(yuv2yuyv422_2);
// hScale is only installed by the non-MMX2 build of this template.
2322 #if !COMPILE_TEMPLATE_MMX2
2323 c->hScale = RENAME(hScale );
2324 #endif /* !COMPILE_TEMPLATE_MMX2 */
2326 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
// In the MMX2 build the fast bilinear scalers are installed when
// SWS_FAST_BILINEAR is set and c->canMMX2BeUsed is non-zero.
2327 #if COMPILE_TEMPLATE_MMX2
2328 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2330 c->hyscale_fast = RENAME(hyscale_fast);
2331 c->hcscale_fast = RENAME(hcscale_fast);
2333 #endif /* COMPILE_TEMPLATE_MMX2 */
// NOTE(review): presumably reached in the non-MMX2 build and as the else
// branch of the test above in the MMX2 build - confirm.
2334 c->hyscale_fast = NULL;
2335 c->hcscale_fast = NULL;
2336 #if COMPILE_TEMPLATE_MMX2
2338 #endif /* COMPILE_TEMPLATE_MMX2 */
// Chroma input converters, installed by the non-MMX2 build only.
2340 #if !COMPILE_TEMPLATE_MMX2
2342 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
2343 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
2344 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
2345 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
2346 case PIX_FMT_YUV420P16BE:
2347 case PIX_FMT_YUV422P16BE:
2348 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2349 case PIX_FMT_YUV420P16LE:
2350 case PIX_FMT_YUV422P16LE:
2351 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2354 #endif /* !COMPILE_TEMPLATE_MMX2 */
// The 24-bit RGB chroma converters are installed only when
// c->chrSrcHSubSample is zero.
2355 if (!c->chrSrcHSubSample) {
2357 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
2358 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
// Luma input converters. The 16-bit big-endian planar / gray formats and
// Y400A share yuy2ToY with YUYV422; the little-endian ones share uyvyToY
// with UYVY422.
2363 switch (srcFormat) {
2364 #if !COMPILE_TEMPLATE_MMX2
2365 case PIX_FMT_YUYV422 :
2366 case PIX_FMT_YUV420P16BE:
2367 case PIX_FMT_YUV422P16BE:
2368 case PIX_FMT_YUV444P16BE:
2369 case PIX_FMT_Y400A :
2370 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
2371 case PIX_FMT_UYVY422 :
2372 case PIX_FMT_YUV420P16LE:
2373 case PIX_FMT_YUV422P16LE:
2374 case PIX_FMT_YUV444P16LE:
2375 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
2376 #endif /* !COMPILE_TEMPLATE_MMX2 */
2377 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
2378 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
// Alpha input converter (non-MMX2 build only): Y400A uses yuy2ToY.
2381 #if !COMPILE_TEMPLATE_MMX2
2383 switch (srcFormat) {
2384 case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
2388 #endif /* !COMPILE_TEMPLATE_MMX2 */