2 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/* Prefetch mnemonic used inside the asm strings of this template:
 * "prefetchnta" when COMPILE_TEMPLATE_MMX2 is set; the other definition
 * expands to an assembler comment (" # nop"), i.e. it emits no instruction. */
26 #if COMPILE_TEMPLATE_MMX2
27 #define PREFETCH "prefetchnta"
29 #define PREFETCH " # nop"
/* 64-bit store helpers. MOVNTQ(a,b) emits "movntq a, b" in MMX2 builds and a
 * plain "movq a, b" with the other definition. MOVNTQ2 is only the bare
 * mnemonic, for callers that write the operand string themselves.
 * MOVNTQ forwards to REAL_MOVNTQ so that macro arguments are expanded before
 * the # operator stringifies them. */
32 #if COMPILE_TEMPLATE_MMX2
33 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
34 #define MOVNTQ2 "movntq "
36 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
37 #define MOVNTQ2 "movq "
39 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
/*
 * Load eight 8-bit dither values from srcDither and zero-extend them to
 * 16-bit words: mm3 receives the low four bytes, mm4 the high four, and mm0
 * is left zeroed. Nothing is written to memory; the result stays in the MMX
 * registers for the asm statement that follows in the caller.
 *
 * The first asm statement rotates the 64-bit value right by 24 bits
 * (psrlq 24 combined with psllq 40) before widening; the second widens the
 * value unchanged.
 * NOTE(review): which statement runs is presumably selected by `rot`; the
 * selecting condition is not visible here -- confirm.
 */
41 #if !COMPILE_TEMPLATE_MMX2
42 static av_always_inline void
43 dither_8to16(const uint8_t *srcDither, int rot)
/* variant with rotation: (x >> 24) | (x << 40), then unpack bytes to words */
46 __asm__ volatile("pxor %%mm0, %%mm0\n\t"
47 "movq (%0), %%mm3\n\t"
48 "movq %%mm3, %%mm4\n\t"
49 "psrlq $24, %%mm3\n\t"
50 "psllq $40, %%mm4\n\t"
51 "por %%mm4, %%mm3\n\t"
52 "movq %%mm3, %%mm4\n\t"
53 "punpcklbw %%mm0, %%mm3\n\t"
54 "punpckhbw %%mm0, %%mm4\n\t"
/* variant without rotation: unpack bytes to words directly */
58 __asm__ volatile("pxor %%mm0, %%mm0\n\t"
59 "movq (%0), %%mm3\n\t"
60 "movq %%mm3, %%mm4\n\t"
61 "punpcklbw %%mm0, %%mm3\n\t"
62 "punpckhbw %%mm0, %%mm4\n\t"
/*
 * Vertical filter for one 8-bit output plane, 8 pixels per iteration.
 *
 * dither_8to16() leaves the widened dither words in mm3/mm4. They are copied
 * to mm6/mm7 and reloaded into mm3/mm4 as the accumulator start value for
 * every group of 8 pixels.
 * The tap list addressed through operand %0 is walked with REG_d: each
 * 16-byte entry holds a source-line pointer at offset 0 (loaded into REG_S)
 * and four 16-bit coefficients at offset 8. REG_S is tested for zero after
 * each advance. Every tap adds pmulhw(src, coeff) to the accumulators, which
 * are then shifted right by 3, packed to unsigned bytes with saturation and
 * stored with MOVNTQ2 at (%1 + REG_c). REG_c advances by 8 and is compared
 * against %2.
 *
 * The visible operands are dest - offset, dstW + offset and offset.
 * NOTE(review): the operand bound to %0, the initialisation of REG_c and the
 * loop labels/branches are on lines not shown here; REG_c presumably starts
 * at `offset` -- confirm.
 */
69 static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
70 const int16_t **src, uint8_t *dest, int dstW,
71 const uint8_t *dither, int offset)
73 dither_8to16(dither, offset);
/* keep a copy of the dither words; mm3/mm4 are consumed per pixel group */
77 "movq %%mm3, %%mm6\n\t"
78 "movq %%mm4, %%mm7\n\t"
80 "mov %0, %%"REG_d" \n\t"\
81 "mov (%%"REG_d"), %%"REG_S" \n\t"\
82 ".p2align 4 \n\t" /* FIXME Unroll? */\
84 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
85 "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\
86 "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\
87 "add $16, %%"REG_d" \n\t"\
88 "mov (%%"REG_d"), %%"REG_S" \n\t"\
89 "test %%"REG_S", %%"REG_S" \n\t"\
90 "pmulhw %%mm0, %%mm2 \n\t"\
91 "pmulhw %%mm0, %%mm5 \n\t"\
92 "paddw %%mm2, %%mm3 \n\t"\
93 "paddw %%mm5, %%mm4 \n\t"\
95 "psraw $3, %%mm3 \n\t"\
96 "psraw $3, %%mm4 \n\t"\
97 "packuswb %%mm4, %%mm3 \n\t"
98 MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
99 "add $8, %%"REG_c" \n\t"\
100 "cmp %2, %%"REG_c" \n\t"\
/* restart: reload dither start values and rewind to the first tap */
101 "movq %%mm6, %%mm3\n\t"
102 "movq %%mm7, %%mm4\n\t"
103 "mov %0, %%"REG_d" \n\t"\
104 "mov (%%"REG_d"), %%"REG_S" \n\t"\
107 "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
108 : "%"REG_d, "%"REG_S, "%"REG_c
/*
 * Fast (pmulhw) vertical chroma filter for the packed-output functions.
 * Zeroes the pixel index REG_a, then for the tap list located at
 * CHR_MMX_FILTER_OFFSET(%0) accumulates pmulhw(src, coeff) on top of the
 * rounder loaded from VROUNDER_OFFSET(%0): U sums end up in mm3, V sums in
 * mm4. Tap entries are 16 bytes: source pointer at offset 0, coefficients at
 * offset 8. V samples are read from the U line pointer plus operand %6.
 * The `test` of the next source pointer sets the flags for the loop branch,
 * which is on a line not shown here.
 */
112 #define YSCALEYUV2PACKEDX_UV \
114 "xor %%"REG_a", %%"REG_a" \n\t"\
118 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
119 "mov (%%"REG_d"), %%"REG_S" \n\t"\
120 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
121 "movq %%mm3, %%mm4 \n\t"\
124 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
125 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
126 "add %6, %%"REG_S" \n\t" \
127 "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
128 "add $16, %%"REG_d" \n\t"\
129 "mov (%%"REG_d"), %%"REG_S" \n\t"\
130 "pmulhw %%mm0, %%mm2 \n\t"\
131 "pmulhw %%mm0, %%mm5 \n\t"\
132 "paddw %%mm2, %%mm3 \n\t"\
133 "paddw %%mm5, %%mm4 \n\t"\
134 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * Fast (pmulhw) vertical filter for a full-resolution plane (luma or alpha).
 *   offset      - offset of the tap list relative to %0
 *   coeff       - scratch register for the coefficients of the current tap
 *   src1, src2  - scratch registers for the two groups of four 16-bit samples
 *   dst1, dst2  - accumulators; start from VROUNDER_OFFSET(%0) and receive
 *                 the sums for the samples at REG_a*2 and REG_a*2 + 8 bytes
 * Uses REG_d as tap cursor and REG_S as current source-line pointer; the
 * pixel index REG_a is read but not modified.
 */
137 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
138 "lea "offset"(%0), %%"REG_d" \n\t"\
139 "mov (%%"REG_d"), %%"REG_S" \n\t"\
140 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
141 "movq "#dst1", "#dst2" \n\t"\
144 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
145 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
146 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
147 "add $16, %%"REG_d" \n\t"\
148 "mov (%%"REG_d"), %%"REG_S" \n\t"\
149 "pmulhw "#coeff", "#src1" \n\t"\
150 "pmulhw "#coeff", "#src2" \n\t"\
151 "paddw "#src1", "#dst1" \n\t"\
152 "paddw "#src2", "#dst2" \n\t"\
153 "test %%"REG_S", %%"REG_S" \n\t"\
/* Chroma pass followed by luma pass of the fast vertical filter.
 * Result registers: mm3 = U, mm4 = V, mm1 = Y (first four pixels),
 * mm7 = Y (next four pixels). */
156 #define YSCALEYUV2PACKEDX \
157 YSCALEYUV2PACKEDX_UV \
158 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
/*
 * Common operand/clobber tail for the YSCALEYUV2PACKEDX* asm statements.
 * Inputs: %0 = &c->redDither (base register for all *_OFFSET accesses in the
 * macros above), %1-%3 = `dummy` placeholders that keep the operand
 * numbering, %4 = dest, %5 = dstW_reg, %6 = uv_off.
 * Clobbers: REG_a, REG_d, REG_S.
 * Expects `c`, `dummy`, `dest`, `dstW_reg` and `uv_off` in the caller's scope.
 */
160 #define YSCALEYUV2PACKEDX_END \
161 :: "r" (&c->redDither), \
162 "m" (dummy), "m" (dummy), "m" (dummy),\
163 "r" (dest), "m" (dstW_reg), "m"(uv_off) \
164 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * Higher-precision vertical chroma filter. Taps are consumed in pairs:
 * samples of two source lines are interleaved word-wise and multiplied with
 * a packed coefficient pair by pmaddwd, accumulating 32-bit sums
 * (mm4/mm5 for U, mm6/mm7 for V).
 * Each tap-list entry provides the first line pointer at offset 0, the second
 * at APCK_PTR2 and the coefficient pair at APCK_COEF; entries are APCK_SIZE
 * bytes apart. V samples are read from the U line pointer plus operand %6.
 * After the loop the sums are shifted right by 16, packed to signed words,
 * the rounder from VROUNDER_OFFSET(%0) is added, and the results are stored
 * to U_TEMP(%0) and V_TEMP(%0).
 * Zeroes REG_a (pixel index) on entry; uses REG_d and REG_S.
 */
167 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
169 "xor %%"REG_a", %%"REG_a" \n\t"\
173 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
174 "mov (%%"REG_d"), %%"REG_S" \n\t"\
175 "pxor %%mm4, %%mm4 \n\t"\
176 "pxor %%mm5, %%mm5 \n\t"\
177 "pxor %%mm6, %%mm6 \n\t"\
178 "pxor %%mm7, %%mm7 \n\t"\
181 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
182 "add %6, %%"REG_S" \n\t" \
183 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
184 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
185 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
186 "movq %%mm0, %%mm3 \n\t"\
187 "punpcklwd %%mm1, %%mm0 \n\t"\
188 "punpckhwd %%mm1, %%mm3 \n\t"\
189 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
190 "pmaddwd %%mm1, %%mm0 \n\t"\
191 "pmaddwd %%mm1, %%mm3 \n\t"\
192 "paddd %%mm0, %%mm4 \n\t"\
193 "paddd %%mm3, %%mm5 \n\t"\
194 "add %6, %%"REG_S" \n\t" \
195 "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
196 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
197 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
198 "test %%"REG_S", %%"REG_S" \n\t"\
199 "movq %%mm2, %%mm0 \n\t"\
200 "punpcklwd %%mm3, %%mm2 \n\t"\
201 "punpckhwd %%mm3, %%mm0 \n\t"\
202 "pmaddwd %%mm1, %%mm2 \n\t"\
203 "pmaddwd %%mm1, %%mm0 \n\t"\
204 "paddd %%mm2, %%mm6 \n\t"\
205 "paddd %%mm0, %%mm7 \n\t"\
207 "psrad $16, %%mm4 \n\t"\
208 "psrad $16, %%mm5 \n\t"\
209 "psrad $16, %%mm6 \n\t"\
210 "psrad $16, %%mm7 \n\t"\
211 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
212 "packssdw %%mm5, %%mm4 \n\t"\
213 "packssdw %%mm7, %%mm6 \n\t"\
214 "paddw %%mm0, %%mm4 \n\t"\
215 "paddw %%mm0, %%mm6 \n\t"\
216 "movq %%mm4, "U_TEMP"(%0) \n\t"\
217 "movq %%mm6, "V_TEMP"(%0) \n\t"\
/*
 * Higher-precision vertical filter for a full-resolution plane (luma or
 * alpha); `offset` selects the tap list relative to %0.
 * Works like the accurate chroma macro (pairs of taps, pmaddwd, 32-bit
 * accumulators mm1/mm5 and mm7/mm6) on the samples at REG_a*2 and
 * REG_a*2 + 8 bytes of each source line.
 * On exit: mm1 and mm7 hold the two groups of four filtered words (sum >> 16,
 * packed with signed saturation, plus the rounder), and mm3/mm4 are reloaded
 * from U_TEMP(%0)/V_TEMP(%0). mm0, mm2, mm5 and mm6 are overwritten.
 */
219 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
220 "lea "offset"(%0), %%"REG_d" \n\t"\
221 "mov (%%"REG_d"), %%"REG_S" \n\t"\
222 "pxor %%mm1, %%mm1 \n\t"\
223 "pxor %%mm5, %%mm5 \n\t"\
224 "pxor %%mm7, %%mm7 \n\t"\
225 "pxor %%mm6, %%mm6 \n\t"\
228 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
229 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
230 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
231 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
232 "movq %%mm0, %%mm3 \n\t"\
233 "punpcklwd %%mm4, %%mm0 \n\t"\
234 "punpckhwd %%mm4, %%mm3 \n\t"\
235 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
236 "pmaddwd %%mm4, %%mm0 \n\t"\
237 "pmaddwd %%mm4, %%mm3 \n\t"\
238 "paddd %%mm0, %%mm1 \n\t"\
239 "paddd %%mm3, %%mm5 \n\t"\
240 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
241 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
242 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
243 "test %%"REG_S", %%"REG_S" \n\t"\
244 "movq %%mm2, %%mm0 \n\t"\
245 "punpcklwd %%mm3, %%mm2 \n\t"\
246 "punpckhwd %%mm3, %%mm0 \n\t"\
247 "pmaddwd %%mm4, %%mm2 \n\t"\
248 "pmaddwd %%mm4, %%mm0 \n\t"\
249 "paddd %%mm2, %%mm7 \n\t"\
250 "paddd %%mm0, %%mm6 \n\t"\
252 "psrad $16, %%mm1 \n\t"\
253 "psrad $16, %%mm5 \n\t"\
254 "psrad $16, %%mm7 \n\t"\
255 "psrad $16, %%mm6 \n\t"\
256 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
257 "packssdw %%mm5, %%mm1 \n\t"\
258 "packssdw %%mm6, %%mm7 \n\t"\
259 "paddw %%mm0, %%mm1 \n\t"\
260 "paddw %%mm0, %%mm7 \n\t"\
261 "movq "U_TEMP"(%0), %%mm3 \n\t"\
262 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* Accurate chroma pass followed by accurate luma pass.
 * Result registers match YSCALEYUV2PACKEDX: mm3 = U, mm4 = V (reloaded from
 * U_TEMP/V_TEMP by the luma pass), mm1/mm7 = Y. */
264 #define YSCALEYUV2PACKEDX_ACCURATE \
265 YSCALEYUV2PACKEDX_ACCURATE_UV \
266 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/*
 * YUV -> RGB conversion of 8 pixels using the coefficient fields at %0.
 * Input:  mm1 = Y (first four pixels), mm7 = Y (next four), mm3 = U, mm4 = V,
 *         all as 16-bit words.
 * Output: mm2 = B, mm4 = G, mm5 = R, each 8 unsigned bytes (packuswb
 *         saturates). mm0, mm1, mm3, mm6 and mm7 are overwritten.
 * Every chroma word is duplicated (punpck?wd reg, reg) so that one chroma
 * sample is applied to two neighbouring luma samples.
 */
268 #define YSCALEYUV2RGBX \
269 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
270 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
271 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
272 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
273 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
274 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
275 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
276 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
277 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
278 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
279 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
280 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
281 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
282 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
283 "paddw %%mm3, %%mm4 \n\t"\
284 "movq %%mm2, %%mm0 \n\t"\
285 "movq %%mm5, %%mm6 \n\t"\
286 "movq %%mm4, %%mm3 \n\t"\
287 "punpcklwd %%mm2, %%mm2 \n\t"\
288 "punpcklwd %%mm5, %%mm5 \n\t"\
289 "punpcklwd %%mm4, %%mm4 \n\t"\
290 "paddw %%mm1, %%mm2 \n\t"\
291 "paddw %%mm1, %%mm5 \n\t"\
292 "paddw %%mm1, %%mm4 \n\t"\
293 "punpckhwd %%mm0, %%mm0 \n\t"\
294 "punpckhwd %%mm6, %%mm6 \n\t"\
295 "punpckhwd %%mm3, %%mm3 \n\t"\
296 "paddw %%mm7, %%mm0 \n\t"\
297 "paddw %%mm7, %%mm6 \n\t"\
298 "paddw %%mm7, %%mm3 \n\t"\
299 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
300 "packuswb %%mm0, %%mm2 \n\t"\
301 "packuswb %%mm6, %%mm5 \n\t"\
302 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * Interleave 8 pixels given as separate byte planes into 32-bit pixels and
 * store them (32 bytes) at dst + index*4 with MOVNTQ.
 *   b, g, r, a      - registers holding 8 bytes each; the in-memory byte
 *                     order of each pixel is b, g, r, a. b and r are
 *                     overwritten.
 *   q0, q2, q3, t   - scratch registers
 *   dst, dstw, index - destination base, width to compare against, and the
 *                     pixel index register
 * Afterwards index is advanced by 8 and compared with dstw; the conditional
 * branch that uses the flags is not part of the lines shown here.
 * WRITEBGR32 is the argument-expanding wrapper around REAL_WRITEBGR32.
 */
304 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
305 "movq "#b", "#q2" \n\t" /* B */\
306 "movq "#r", "#t" \n\t" /* R */\
307 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
308 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
309 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
310 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
311 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
312 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
313 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
314 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
315 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
316 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
318 MOVNTQ( q0, (dst, index, 4))\
319 MOVNTQ( b, 8(dst, index, 4))\
320 MOVNTQ( q2, 16(dst, index, 4))\
321 MOVNTQ( q3, 24(dst, index, 4))\
323 "add $8, "#index" \n\t"\
324 "cmp "#dstw", "#index" \n\t"\
326 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
/*
 * Multi-tap vertical scaling with conversion to 32-bit packed output,
 * 8 pixels per iteration, using the accurate (pmaddwd) filter macros.
 *
 * With alpha (CONFIG_SWSCALE_ALPHA && c->alpPixBuf): mm2, mm4 and mm5 are
 * parked in U_TEMP, V_TEMP and Y_TEMP while
 * YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) filters the alpha
 * plane. That macro ends by reloading U_TEMP/V_TEMP into mm3/mm4, and Y_TEMP
 * is reloaded into mm5, which is why WRITEBGR32 is given mm3/mm4/mm5 as
 * b/g/r. The filtered alpha (mm1/mm7) is shifted right by 3 and packed to
 * bytes in mm1.
 * Without alpha: pcmpeqd sets mm7 to all ones and it is used as the alpha
 * byte source.
 *
 * NOTE(review): mm2/mm4/mm5 are presumably the B/G/R output of
 * YSCALEYUV2RGBX, invoked on a line not shown here -- confirm.
 * Filter data is addressed relative to &c->redDither (operand %0 of
 * YSCALEYUV2PACKEDX_END); %4 = dest, %5 = dstW_reg.
 */
328 static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
329 const int16_t **lumSrc, int lumFilterSize,
330 const int16_t *chrFilter, const int16_t **chrUSrc,
331 const int16_t **chrVSrc,
332 int chrFilterSize, const int16_t **alpSrc,
333 uint8_t *dest, int dstW, int dstY)
336 x86_reg dstW_reg = dstW;
337 x86_reg uv_off = c->uv_offx2;
339 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
340 YSCALEYUV2PACKEDX_ACCURATE
/* save the colour registers; the alpha pass below overwrites them */
342 "movq %%mm2, "U_TEMP"(%0) \n\t"
343 "movq %%mm4, "V_TEMP"(%0) \n\t"
344 "movq %%mm5, "Y_TEMP"(%0) \n\t"
345 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
346 "movq "Y_TEMP"(%0), %%mm5 \n\t"
347 "psraw $3, %%mm1 \n\t"
348 "psraw $3, %%mm7 \n\t"
349 "packuswb %%mm7, %%mm1 \n\t"
350 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
351 YSCALEYUV2PACKEDX_END
353 YSCALEYUV2PACKEDX_ACCURATE
/* no alpha plane: all-ones alpha bytes */
355 "pcmpeqd %%mm7, %%mm7 \n\t"
356 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
357 YSCALEYUV2PACKEDX_END
/*
 * Multi-tap vertical scaling with conversion to 32-bit packed output,
 * 8 pixels per iteration; counterpart of the "_ar" function above that uses
 * the fast YSCALEYUV2PACKEDX_YA macro for the alpha plane.
 *
 * With alpha (CONFIG_SWSCALE_ALPHA && c->alpPixBuf): the alpha plane is
 * filtered with mm0/mm3/mm6 as scratch so that mm2, mm4 and mm5 (passed to
 * WRITEBGR32 as b/g/r) are left untouched; the result in mm1/mm7 is shifted
 * right by 3 and packed to bytes in mm1.
 * Without alpha: pcmpeqd sets mm7 to all ones for the alpha bytes.
 *
 * NOTE(review): the macro invocations that produce mm2/mm4/mm5 are on lines
 * not shown here (presumably YSCALEYUV2PACKEDX and YSCALEYUV2RGBX) -- confirm.
 */
361 static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
362 const int16_t **lumSrc, int lumFilterSize,
363 const int16_t *chrFilter, const int16_t **chrUSrc,
364 const int16_t **chrVSrc,
365 int chrFilterSize, const int16_t **alpSrc,
366 uint8_t *dest, int dstW, int dstY)
369 x86_reg dstW_reg = dstW;
370 x86_reg uv_off = c->uv_offx2;
372 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
375 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
376 "psraw $3, %%mm1 \n\t"
377 "psraw $3, %%mm7 \n\t"
378 "packuswb %%mm7, %%mm1 \n\t"
379 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
380 YSCALEYUV2PACKEDX_END
384 "pcmpeqd %%mm7, %%mm7 \n\t"
385 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
386 YSCALEYUV2PACKEDX_END
/*
 * Pack 8 pixels (mm2 = B, mm4 = G, mm5 = R as bytes; mm7 must be zero) into
 * 16-bit pixels and store 16 bytes at dst + index*2 with MOVNTQ.
 * Per pixel: R & 0xF8 forms the high byte, (G & 0xFC) << 3 the middle bits
 * and (B & 0xF8) >> 3 the low five bits, i.e. a 5-6-5 layout.
 * Overwrites mm1..mm5. Afterwards index += 8 and index is compared with
 * dstw; the branch consuming the flags is not part of the lines shown here.
 * WRITERGB16 is the argument-expanding wrapper.
 */
390 #define REAL_WRITERGB16(dst, dstw, index) \
391 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
392 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
393 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
394 "psrlq $3, %%mm2 \n\t"\
396 "movq %%mm2, %%mm1 \n\t"\
397 "movq %%mm4, %%mm3 \n\t"\
399 "punpcklbw %%mm7, %%mm3 \n\t"\
400 "punpcklbw %%mm5, %%mm2 \n\t"\
401 "punpckhbw %%mm7, %%mm4 \n\t"\
402 "punpckhbw %%mm5, %%mm1 \n\t"\
404 "psllq $3, %%mm3 \n\t"\
405 "psllq $3, %%mm4 \n\t"\
407 "por %%mm3, %%mm2 \n\t"\
408 "por %%mm4, %%mm1 \n\t"\
410 MOVNTQ(%%mm2, (dst, index, 2))\
411 MOVNTQ(%%mm1, 8(dst, index, 2))\
413 "add $8, "#index" \n\t"\
414 "cmp "#dstw", "#index" \n\t"\
416 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/*
 * Multi-tap vertical scaling with conversion to 16-bit 5-6-5 output,
 * 8 pixels per iteration, using the accurate (pmaddwd) filter macros.
 * mm7 is zeroed as required by WRITERGB16; the paddusb lines add the
 * per-channel dither bytes stored at BLUE_DITHER/GREEN_DITHER/RED_DITHER(%0)
 * with unsigned saturation before the channels are truncated and packed.
 * NOTE(review): the YUV->RGB step and any preprocessor guard around the
 * dither adds are on lines not shown here -- confirm.
 */
418 static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
419 const int16_t **lumSrc, int lumFilterSize,
420 const int16_t *chrFilter, const int16_t **chrUSrc,
421 const int16_t **chrVSrc,
422 int chrFilterSize, const int16_t **alpSrc,
423 uint8_t *dest, int dstW, int dstY)
426 x86_reg dstW_reg = dstW;
427 x86_reg uv_off = c->uv_offx2;
429 YSCALEYUV2PACKEDX_ACCURATE
431 "pxor %%mm7, %%mm7 \n\t"
432 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
434 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
435 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
436 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
438 WRITERGB16(%4, %5, %%REGa)
439 YSCALEYUV2PACKEDX_END
/*
 * Multi-tap vertical scaling with conversion to 16-bit 5-6-5 output,
 * 8 pixels per iteration; counterpart of the "_ar" function above.
 * mm7 is zeroed as required by WRITERGB16; the paddusb lines add the
 * per-channel dither bytes stored at BLUE_DITHER/GREEN_DITHER/RED_DITHER(%0)
 * with unsigned saturation before packing.
 * NOTE(review): the filter/conversion macro invocations and any guard around
 * the dither adds are on lines not shown here -- confirm.
 */
442 static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
443 const int16_t **lumSrc, int lumFilterSize,
444 const int16_t *chrFilter, const int16_t **chrUSrc,
445 const int16_t **chrVSrc,
446 int chrFilterSize, const int16_t **alpSrc,
447 uint8_t *dest, int dstW, int dstY)
450 x86_reg dstW_reg = dstW;
451 x86_reg uv_off = c->uv_offx2;
455 "pxor %%mm7, %%mm7 \n\t"
456 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
458 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
459 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
460 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
462 WRITERGB16(%4, %5, %%REGa)
463 YSCALEYUV2PACKEDX_END
/*
 * Pack 8 pixels (mm2 = B, mm4 = G, mm5 = R as bytes; mm7 must be zero) into
 * 16-bit pixels and store 16 bytes at dst + index*2 with MOVNTQ.
 * Per pixel: all channels are masked to their top five bits; R is shifted
 * right by 1 and forms the high byte, G is shifted left by 2 into the middle
 * bits and B right by 3 into the low five bits, i.e. a 5-5-5 layout with the
 * top bit clear.
 * Overwrites mm1..mm5. Afterwards index += 8 and index is compared with
 * dstw; the branch consuming the flags is not part of the lines shown here.
 * WRITERGB15 is the argument-expanding wrapper.
 */
466 #define REAL_WRITERGB15(dst, dstw, index) \
467 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
468 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
469 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
470 "psrlq $3, %%mm2 \n\t"\
471 "psrlq $1, %%mm5 \n\t"\
473 "movq %%mm2, %%mm1 \n\t"\
474 "movq %%mm4, %%mm3 \n\t"\
476 "punpcklbw %%mm7, %%mm3 \n\t"\
477 "punpcklbw %%mm5, %%mm2 \n\t"\
478 "punpckhbw %%mm7, %%mm4 \n\t"\
479 "punpckhbw %%mm5, %%mm1 \n\t"\
481 "psllq $2, %%mm3 \n\t"\
482 "psllq $2, %%mm4 \n\t"\
484 "por %%mm3, %%mm2 \n\t"\
485 "por %%mm4, %%mm1 \n\t"\
487 MOVNTQ(%%mm2, (dst, index, 2))\
488 MOVNTQ(%%mm1, 8(dst, index, 2))\
490 "add $8, "#index" \n\t"\
491 "cmp "#dstw", "#index" \n\t"\
493 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/*
 * Multi-tap vertical scaling with conversion to 16-bit 5-5-5 output,
 * 8 pixels per iteration, using the accurate (pmaddwd) filter macros.
 * mm7 is zeroed as required by WRITERGB15; the paddusb lines add the
 * per-channel dither bytes stored at BLUE_DITHER/GREEN_DITHER/RED_DITHER(%0)
 * with unsigned saturation before the channels are truncated and packed.
 * NOTE(review): the YUV->RGB step and any preprocessor guard around the
 * dither adds are on lines not shown here -- confirm.
 */
495 static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
496 const int16_t **lumSrc, int lumFilterSize,
497 const int16_t *chrFilter, const int16_t **chrUSrc,
498 const int16_t **chrVSrc,
499 int chrFilterSize, const int16_t **alpSrc,
500 uint8_t *dest, int dstW, int dstY)
503 x86_reg dstW_reg = dstW;
504 x86_reg uv_off = c->uv_offx2;
506 YSCALEYUV2PACKEDX_ACCURATE
508 "pxor %%mm7, %%mm7 \n\t"
509 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
511 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
512 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
513 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
515 WRITERGB15(%4, %5, %%REGa)
516 YSCALEYUV2PACKEDX_END
/*
 * Multi-tap vertical scaling with conversion to 16-bit 5-5-5 output,
 * 8 pixels per iteration; counterpart of the "_ar" function above.
 * mm7 is zeroed as required by WRITERGB15; the paddusb lines add the
 * per-channel dither bytes stored at BLUE_DITHER/GREEN_DITHER/RED_DITHER(%0)
 * with unsigned saturation before packing.
 * NOTE(review): the filter/conversion macro invocations and any guard around
 * the dither adds are on lines not shown here -- confirm.
 */
519 static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
520 const int16_t **lumSrc, int lumFilterSize,
521 const int16_t *chrFilter, const int16_t **chrUSrc,
522 const int16_t **chrVSrc,
523 int chrFilterSize, const int16_t **alpSrc,
524 uint8_t *dest, int dstW, int dstY)
527 x86_reg dstW_reg = dstW;
528 x86_reg uv_off = c->uv_offx2;
532 "pxor %%mm7, %%mm7 \n\t"
533 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
535 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
536 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
537 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
539 WRITERGB15(%4, %5, %%REGa)
540 YSCALEYUV2PACKEDX_END
/*
 * Plain-MMX writer for 24-bit packed output: converts 8 pixels
 * (mm2 = B, mm4 = G, mm5 = R as bytes, mm7 = 0) into 24 contiguous bytes.
 * The pixels are first expanded to four registers of two "0RGB" dwords, each
 * register is squeezed to six payload bytes, and the pieces are shifted and
 * OR-ed into three 8-byte stores at dst, dst+8 and dst+16.
 * Unlike the 16/32-bit writers, dst is a moving pointer: it is advanced by
 * 24 here, and index is advanced by 8 and compared with dstw (the branch
 * consuming the flags is not part of the lines shown here).
 * Overwrites mm0..mm7.
 */
543 #define WRITEBGR24MMX(dst, dstw, index) \
544 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
545 "movq %%mm2, %%mm1 \n\t" /* B */\
546 "movq %%mm5, %%mm6 \n\t" /* R */\
547 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
548 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
549 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
550 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
551 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
552 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
553 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
554 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
555 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
556 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
558 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
559 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
560 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
561 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
563 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
564 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
565 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
566 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
568 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
569 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
570 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
571 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
573 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
574 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
575 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
576 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
577 MOVNTQ(%%mm0, (dst))\
579 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
580 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
581 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
582 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
583 MOVNTQ(%%mm6, 8(dst))\
585 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
586 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
587 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
588 MOVNTQ(%%mm5, 16(dst))\
590 "add $24, "#dst" \n\t"\
592 "add $8, "#index" \n\t"\
593 "cmp "#dstw", "#index" \n\t"\
/*
 * MMX2 writer for 24-bit packed output: converts 8 pixels
 * (mm2 = B, mm4 = G, mm5 = R as bytes) into 24 contiguous bytes using pshufw
 * to replicate the needed source words and the ff_M24A/ff_M24B/ff_M24C byte
 * masks to select, per output quadword, which bytes come from B, G and R.
 * Three 8-byte stores go to dst, dst+8 and dst+16; dst is then advanced by
 * 24, index by 8, and index is compared with dstw (the branch consuming the
 * flags is not part of the lines shown here).
 * Overwrites mm0, mm1, mm3, mm6, mm7 and shifts mm4 right by 8 bits; mm2 and
 * mm5 are only read.
 */
596 #define WRITEBGR24MMX2(dst, dstw, index) \
597 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
598 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
599 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
600 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
601 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
602 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
604 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
605 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
606 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
608 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
609 "por %%mm1, %%mm6 \n\t"\
610 "por %%mm3, %%mm6 \n\t"\
611 MOVNTQ(%%mm6, (dst))\
613 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
614 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
615 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
616 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
618 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
619 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
620 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
622 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
623 "por %%mm3, %%mm6 \n\t"\
624 MOVNTQ(%%mm6, 8(dst))\
626 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
627 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
628 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
630 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
631 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
632 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
634 "por %%mm1, %%mm3 \n\t"\
635 "por %%mm3, %%mm6 \n\t"\
636 MOVNTQ(%%mm6, 16(dst))\
638 "add $24, "#dst" \n\t"\
640 "add $8, "#index" \n\t"\
641 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24 selects the 24-bit writer for this template instantiation:
 * the pshufw-based WRITEBGR24MMX2 when COMPILE_TEMPLATE_MMX2 is set, the
 * plain-MMX WRITEBGR24MMX with the other definition. */
644 #if COMPILE_TEMPLATE_MMX2
646 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
649 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/*
 * Multi-tap vertical scaling with conversion to 24-bit packed output,
 * 8 pixels per iteration, using the accurate (pmaddwd) filter macros.
 * WRITEBGR24 needs a byte pointer rather than an index: REG_c is computed as
 * 3 * REG_a (lea) plus dest (%4) before each write. Because REG_c is used,
 * the function spells out its own operand list with REG_c added to the
 * clobbers instead of using YSCALEYUV2PACKEDX_END.
 * mm7 is zeroed as expected by the plain-MMX writer.
 * NOTE(review): the YUV->RGB step is on a line not shown here -- confirm.
 */
652 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
653 const int16_t **lumSrc, int lumFilterSize,
654 const int16_t *chrFilter, const int16_t **chrUSrc,
655 const int16_t **chrVSrc,
656 int chrFilterSize, const int16_t **alpSrc,
657 uint8_t *dest, int dstW, int dstY)
660 x86_reg dstW_reg = dstW;
661 x86_reg uv_off = c->uv_offx2;
663 YSCALEYUV2PACKEDX_ACCURATE
665 "pxor %%mm7, %%mm7 \n\t"
666 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
667 "add %4, %%"REG_c" \n\t"
668 WRITEBGR24(%%REGc, %5, %%REGa)
669 :: "r" (&c->redDither),
670 "m" (dummy), "m" (dummy), "m" (dummy),
671 "r" (dest), "m" (dstW_reg), "m"(uv_off)
672 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
/*
 * Multi-tap vertical scaling with conversion to 24-bit packed output,
 * 8 pixels per iteration; counterpart of the "_ar" function above.
 * REG_c is computed as 3 * REG_a (lea) plus dest (%4) to give WRITEBGR24 its
 * destination pointer, so the operand list is written out here with REG_c
 * added to the clobbers. mm7 is zeroed as expected by the plain-MMX writer.
 * NOTE(review): the filter/conversion macro invocations are on lines not
 * shown here -- confirm.
 */
676 static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
677 const int16_t **lumSrc, int lumFilterSize,
678 const int16_t *chrFilter, const int16_t **chrUSrc,
679 const int16_t **chrVSrc,
680 int chrFilterSize, const int16_t **alpSrc,
681 uint8_t *dest, int dstW, int dstY)
684 x86_reg dstW_reg = dstW;
685 x86_reg uv_off = c->uv_offx2;
689 "pxor %%mm7, %%mm7 \n\t"
690 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
691 "add %4, %%"REG_c" \n\t"
692 WRITEBGR24(%%REGc, %5, %%REGa)
693 :: "r" (&c->redDither),
694 "m" (dummy), "m" (dummy), "m" (dummy),
695 "r" (dest), "m" (dstW_reg), "m"(uv_off)
696 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
/*
 * Pack 8 luma and 4+4 chroma samples into 16 bytes of Y U Y V ... and store
 * them at dst + index*2 with MOVNTQ.
 * Input (16-bit words, already scaled to byte range): mm1 = Y (first four),
 * mm7 = Y (next four), mm3 = U, mm4 = V. All are packed to unsigned bytes
 * with saturation, U and V are interleaved, and the result is interleaved
 * with the luma bytes.
 * Overwrites mm1, mm3, mm4, mm7. Afterwards index += 8 and index is compared
 * with dstw; the branch consuming the flags is not part of the lines shown.
 * WRITEYUY2 is the argument-expanding wrapper.
 */
700 #define REAL_WRITEYUY2(dst, dstw, index) \
701 "packuswb %%mm3, %%mm3 \n\t"\
702 "packuswb %%mm4, %%mm4 \n\t"\
703 "packuswb %%mm7, %%mm1 \n\t"\
704 "punpcklbw %%mm4, %%mm3 \n\t"\
705 "movq %%mm1, %%mm7 \n\t"\
706 "punpcklbw %%mm3, %%mm1 \n\t"\
707 "punpckhbw %%mm3, %%mm7 \n\t"\
709 MOVNTQ(%%mm1, (dst, index, 2))\
710 MOVNTQ(%%mm7, 8(dst, index, 2))\
712 "add $8, "#index" \n\t"\
713 "cmp "#dstw", "#index" \n\t"\
715 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/*
 * Multi-tap vertical scaling to packed YUYV 4:2:2 output, 8 pixels per
 * iteration, using the accurate (pmaddwd) filter macros. No colour
 * conversion takes place: the filtered Y/U/V words are shifted right by 3
 * and handed to WRITEYUY2, which packs and stores them.
 */
717 static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
718 const int16_t **lumSrc, int lumFilterSize,
719 const int16_t *chrFilter, const int16_t **chrUSrc,
720 const int16_t **chrVSrc,
721 int chrFilterSize, const int16_t **alpSrc,
722 uint8_t *dest, int dstW, int dstY)
725 x86_reg dstW_reg = dstW;
726 x86_reg uv_off = c->uv_offx2;
728 YSCALEYUV2PACKEDX_ACCURATE
729 /* mm1=Y1, %%mm7=Y2, %%mm3=U, %%mm4=V (16-bit words, not yet shifted) */
730 "psraw $3, %%mm3 \n\t"
731 "psraw $3, %%mm4 \n\t"
732 "psraw $3, %%mm1 \n\t"
733 "psraw $3, %%mm7 \n\t"
734 WRITEYUY2(%4, %5, %%REGa)
735 YSCALEYUV2PACKEDX_END
/*
 * Multi-tap vertical scaling to packed YUYV 4:2:2 output, 8 pixels per
 * iteration; counterpart of the "_ar" function above. The filtered Y/U/V
 * words are shifted right by 3 and handed to WRITEYUY2, which packs and
 * stores them.
 * NOTE(review): the filter macro invocation is on a line not shown here
 * (presumably YSCALEYUV2PACKEDX) -- confirm.
 */
738 static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
739 const int16_t **lumSrc, int lumFilterSize,
740 const int16_t *chrFilter, const int16_t **chrUSrc,
741 const int16_t **chrVSrc,
742 int chrFilterSize, const int16_t **alpSrc,
743 uint8_t *dest, int dstW, int dstY)
746 x86_reg dstW_reg = dstW;
747 x86_reg uv_off = c->uv_offx2;
750 /* WRITEYUY2 consumes mm1/%%mm7 as Y and %%mm3/%%mm4 as U/V */
751 "psraw $3, %%mm3 \n\t"
752 "psraw $3, %%mm4 \n\t"
753 "psraw $3, %%mm1 \n\t"
754 "psraw $3, %%mm7 \n\t"
755 WRITEYUY2(%4, %5, %%REGa)
756 YSCALEYUV2PACKEDX_END
/*
 * Chroma part of the two-line vertical blend followed by the start of the
 * YUV -> RGB conversion.
 *   index - pixel index register (zeroed here)
 *   c     - register holding the base for the *_OFFSET context fields
 * Operands %2 and %3 are the two chroma line pointers (uvbuf0, uvbuf1). V is
 * read from the same lines at index + UV_OFF_BYTE(c); the "+2048" in the
 * comments below stands for that offset. index is restored afterwards.
 * The blend is computed as ((uvbuf0 - uvbuf1) * coeff >> 16) + (uvbuf1 >> 4)
 * with the coefficient words taken from CHR_MMX_FILTER_OFFSET+8(c).
 * On exit: mm2 = U - U_OFFSET, mm5 = V - V_OFFSET, mm3 = U * UG_COEFF,
 * mm4 = V * VG_COEFF (pmulhw results).
 */
759 #define REAL_YSCALEYUV2RGB_UV(index, c) \
760 "xor "#index", "#index" \n\t"\
763 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
764 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
765 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
766 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
767 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
768 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
769 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
770 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
771 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
772 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
773 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
774 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
775 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
776 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
777 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
778 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
779 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
780 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
781 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
782 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
783 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
784 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
/*
 * Two-line vertical blend of a full-resolution plane (luma, or alpha when
 * called with the alpha line pointers).
 *   index  - pixel index register (scaled by 2 for 16-bit samples)
 *   c      - register holding the base for the *_OFFSET context fields
 *   b1, b2 - the two source line pointers
 * Computes ((b1 - b2) * coeff >> 16) + (b2 >> 4) for 8 samples, with the
 * coefficient words taken from LUM_MMX_FILTER_OFFSET+8(c).
 * On exit: mm1 = first four results, mm7 = next four; mm0 and mm6 are
 * overwritten.
 */
786 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
787 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
788 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
789 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
790 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
791 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
792 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
793 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
794 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
795 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
796 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
797 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
798 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/*
 * Final stage of the YUV -> RGB conversion, using the coefficient fields
 * addressed through register `c`.
 * Input:  mm1/mm7 = Y (two groups of four words), mm2 = U - U_OFFSET,
 *         mm5 = V - V_OFFSET, mm3 = ug, mm4 = vg.
 * Output: mm2 = B, mm4 = G, mm5 = R, each 8 unsigned bytes (packuswb
 *         saturates). mm0, mm1, mm3, mm6 and mm7 are overwritten.
 * Every chroma word is duplicated (punpck?wd reg, reg) so that one chroma
 * sample is applied to two neighbouring luma samples.
 */
800 #define REAL_YSCALEYUV2RGB_COEFF(c) \
801 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
802 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
803 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
804 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
805 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
806 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
807 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
808 "paddw %%mm3, %%mm4 \n\t"\
809 "movq %%mm2, %%mm0 \n\t"\
810 "movq %%mm5, %%mm6 \n\t"\
811 "movq %%mm4, %%mm3 \n\t"\
812 "punpcklwd %%mm2, %%mm2 \n\t"\
813 "punpcklwd %%mm5, %%mm5 \n\t"\
814 "punpcklwd %%mm4, %%mm4 \n\t"\
815 "paddw %%mm1, %%mm2 \n\t"\
816 "paddw %%mm1, %%mm5 \n\t"\
817 "paddw %%mm1, %%mm4 \n\t"\
818 "punpckhwd %%mm0, %%mm0 \n\t"\
819 "punpckhwd %%mm6, %%mm6 \n\t"\
820 "punpckhwd %%mm3, %%mm3 \n\t"\
821 "paddw %%mm7, %%mm0 \n\t"\
822 "paddw %%mm7, %%mm6 \n\t"\
823 "paddw %%mm7, %%mm3 \n\t"\
824 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
825 "packuswb %%mm0, %%mm2 \n\t"\
826 "packuswb %%mm6, %%mm5 \n\t"\
827 "packuswb %%mm3, %%mm4 \n\t"\
/* YSCALEYUV2RGB_YA: argument-expanding wrapper around the two-line blend.
 * YSCALEYUV2RGB: full two-line blend + conversion for 8 pixels -- chroma
 * blend, luma blend from operands %0/%1, then the coefficient stage.
 * Result: mm2 = B, mm4 = G, mm5 = R as packed bytes. */
829 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
831 #define YSCALEYUV2RGB(index, c) \
832 REAL_YSCALEYUV2RGB_UV(index, c) \
833 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
834 REAL_YSCALEYUV2RGB_COEFF(c)
837 * vertical bilinear scale YV12 to RGB
/*
 * Blend two input lines vertically and convert to 32-bit packed output,
 * 8 pixels per iteration. Three asm variants are visible:
 *
 *  1. Alpha, with %%r8 as pixel index and the alpha line pointers passed as
 *     register operands %6/%7 (NOTE(review): presumably the x86-64 build;
 *     the selecting preprocessor lines are not shown -- confirm).
 *  2. Alpha, without spare registers: abuf0/abuf1 are stashed in c->u_temp
 *     and c->v_temp, then loaded over operands %0/%1 (the luma pointers,
 *     which are no longer needed at that point) for the alpha blend. REG_b
 *     is saved to ESP_OFFSET(%5) and used for dest; REG_BP is pushed and
 *     used as the pixel index.
 *  3. No alpha: same register scheme as 2, with pcmpeqd producing all-ones
 *     alpha bytes in mm7.
 *
 * In the alpha variants the blended alpha (mm1/mm7) is shifted right by 3
 * and packed to bytes in mm1. V samples are addressed as ubuf +
 * UV_OFF_BYTE (see REAL_YSCALEYUV2RGB_UV), so vbuf is not referenced in the
 * visible code. 8280(%5) is the dstW field of the context (DSTW_OFFSET).
 * NOTE(review): the operand bound to %5 is declared on lines not shown here;
 * it is used as the base for all context offsets -- confirm.
 */
839 static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
840 const int16_t *ubuf[2], const int16_t *vbuf[2],
841 const int16_t *abuf[2], uint8_t *dest,
842 int dstW, int yalpha, int uvalpha, int y)
844 const int16_t *buf0 = buf[0], *buf1 = buf[1],
845 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
847 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
848 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
851 YSCALEYUV2RGB(%%r8, %5)
852 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
853 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
854 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
855 "packuswb %%mm7, %%mm1 \n\t"
856 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
857 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
859 "r" (abuf0), "r" (abuf1)
/* pass the alpha line pointers through the context: no free registers */
863 c->u_temp=(intptr_t)abuf0;
864 c->v_temp=(intptr_t)abuf1;
866 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
867 "mov %4, %%"REG_b" \n\t"
868 "push %%"REG_BP" \n\t"
869 YSCALEYUV2RGB(%%REGBP, %5)
872 "mov "U_TEMP"(%5), %0 \n\t"
873 "mov "V_TEMP"(%5), %1 \n\t"
874 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
875 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
876 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
877 "packuswb %%mm7, %%mm1 \n\t"
880 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
881 "pop %%"REG_BP" \n\t"
882 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
883 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
889 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
890 "mov %4, %%"REG_b" \n\t"
891 "push %%"REG_BP" \n\t"
892 YSCALEYUV2RGB(%%REGBP, %5)
893 "pcmpeqd %%mm7, %%mm7 \n\t"
894 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
895 "pop %%"REG_BP" \n\t"
896 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
897 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Blend two input lines vertically and convert to 24-bit packed output,
 * 8 pixels per iteration.
 * REG_b is saved to ESP_OFFSET(%5), loaded with dest (%4) and used as the
 * moving destination pointer of WRITEBGR24; REG_BP is pushed and used as the
 * pixel index. Both are restored after the loop. mm7 is zeroed as expected
 * by the plain-MMX writer.
 * V samples are addressed as ubuf + UV_OFF_BYTE (see REAL_YSCALEYUV2RGB_UV),
 * so vbuf is not referenced in the visible code.
 * NOTE(review): the operand bound to %5 is declared on a line not shown
 * here; it is used as the base for all context offsets -- confirm.
 */
903 static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
904 const int16_t *ubuf[2], const int16_t *vbuf[2],
905 const int16_t *abuf[2], uint8_t *dest,
906 int dstW, int yalpha, int uvalpha, int y)
908 const int16_t *buf0 = buf[0], *buf1 = buf[1],
909 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
911 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
913 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
914 "mov %4, %%"REG_b" \n\t"
915 "push %%"REG_BP" \n\t"
916 YSCALEYUV2RGB(%%REGBP, %5)
917 "pxor %%mm7, %%mm7 \n\t"
918 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
919 "pop %%"REG_BP" \n\t"
920 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
921 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Blend two input lines vertically and convert to 16-bit 5-5-5 output,
 * 8 pixels per iteration.
 * REG_b is saved to ESP_OFFSET(%5) and loaded with dest (%4); REG_BP is
 * pushed and used as the pixel index. Both are restored after the loop.
 * mm7 is zeroed as required by WRITERGB15; the paddusb lines add the
 * per-channel dither bytes from the context with unsigned saturation.
 * NOTE(review): the operand bound to %5 and any preprocessor guard around
 * the dither adds are on lines not shown here -- confirm.
 */
926 static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
927 const int16_t *ubuf[2], const int16_t *vbuf[2],
928 const int16_t *abuf[2], uint8_t *dest,
929 int dstW, int yalpha, int uvalpha, int y)
931 const int16_t *buf0 = buf[0], *buf1 = buf[1],
932 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
934 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
936 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
937 "mov %4, %%"REG_b" \n\t"
938 "push %%"REG_BP" \n\t"
939 YSCALEYUV2RGB(%%REGBP, %5)
940 "pxor %%mm7, %%mm7 \n\t"
941 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
943 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
944 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
945 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
947 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
948 "pop %%"REG_BP" \n\t"
949 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
950 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Blend two input lines vertically and convert to 16-bit 5-6-5 output,
 * 8 pixels per iteration.
 * REG_b is saved to ESP_OFFSET(%5) and loaded with dest (%4); REG_BP is
 * pushed and used as the pixel index. Both are restored after the loop.
 * mm7 is zeroed as required by WRITERGB16; the paddusb lines add the
 * per-channel dither bytes from the context with unsigned saturation.
 * NOTE(review): the operand bound to %5 and any preprocessor guard around
 * the dither adds are on lines not shown here -- confirm.
 */
955 static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
956 const int16_t *ubuf[2], const int16_t *vbuf[2],
957 const int16_t *abuf[2], uint8_t *dest,
958 int dstW, int yalpha, int uvalpha, int y)
960 const int16_t *buf0 = buf[0], *buf1 = buf[1],
961 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
963 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
965 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
966 "mov %4, %%"REG_b" \n\t"
967 "push %%"REG_BP" \n\t"
968 YSCALEYUV2RGB(%%REGBP, %5)
969 "pxor %%mm7, %%mm7 \n\t"
970 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
972 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
973 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
974 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
976 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
977 "pop %%"REG_BP" \n\t"
978 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
979 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Two-line vertical blend without colour conversion, for packed YUV output.
 *   index - pixel index register (zeroed here)
 *   c     - register holding the base for the *_OFFSET context fields
 * Operands %0/%1 are the two luma line pointers and %2/%3 the two chroma
 * line pointers. V is read from the chroma lines at index + UV_OFF_BYTE(c);
 * the "+2048" in the comments below stands for that offset.
 *
 * Side effect: the coefficient words at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back to the
 * context before the loop, so the stored values are modified by every
 * expansion of this macro that executes.
 *
 * Each sample is computed as ((line0 - line1) * coeff >> 16) + (line1 >> 7).
 * On exit: mm3 = U, mm4 = V, mm1 = Y (first four), mm7 = Y (next four), as
 * 16-bit words in byte range. mm0, mm2, mm5, mm6 are overwritten.
 * YSCALEYUV2PACKED is the argument-expanding wrapper.
 */
984 #define REAL_YSCALEYUV2PACKED(index, c) \
985 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
986 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
987 "psraw $3, %%mm0 \n\t"\
988 "psraw $3, %%mm1 \n\t"\
989 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
990 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
991 "xor "#index", "#index" \n\t"\
994 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
995 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
996 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
997 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
998 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
999 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1000 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
1001 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
1002 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
1003 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
1004 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
1005 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
1006 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
1007 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
1008 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
1009 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
1010 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
1011 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
1012 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
1013 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
1014 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
1015 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1016 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1017 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
1018 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
1019 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
1020 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
1022 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/**
 * Two-input-line ("_2") packed writer producing YUYV422 output.
 *
 * Both luma lines (buf[0], buf[1]) and both chroma lines (ubuf[0], ubuf[1])
 * are blended by YSCALEYUV2PACKED and the result is stored by WRITEYUY2
 * (defined elsewhere in this file).
 *
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced by the statements
 * shown here; the width is read from offset 8280 of asm operand %5.
 * NOTE(review): YSCALEYUV2PACKED rewrites the filter coefficients stored behind
 * operand %5 (shifts them right by 3) on every call - confirm callers refresh
 * them per line.
 */
1024 static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
1025 const int16_t *ubuf[2], const int16_t *vbuf[2],
1026 const int16_t *abuf[2], uint8_t *dest,
1027 int dstW, int yalpha, int uvalpha, int y)
1029 const int16_t *buf0 = buf[0], *buf1 = buf[1],
1030 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1032 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
/* park REG_b at ESP_OFFSET(%5), then use it as the destination pointer */
1034 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1035 "mov %4, %%"REG_b" \n\t"
/* REG_BP is the loop index for the macros below */
1036 "push %%"REG_BP" \n\t"
1037 YSCALEYUV2PACKED(%%REGBP, %5)
1038 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1039 "pop %%"REG_BP" \n\t"
1040 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1041 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Asm fragment: YUV -> RGB conversion from a single luma line (%0) and a
 * single chroma line (%2); operands %1 and %3 are not read here.
 *
 * Chroma is fetched at (%2, index) and again after adding UV_OFF_BYTE(c) to
 * the index; the first value has U_OFFSET subtracted, the second V_OFFSET.
 * Inputs are shifted right by 4, offset, and multiplied (pmulhw) by the
 * coefficients stored at the *_COEFF offsets of "c".
 *
 * Result after the final packuswb instructions: mm2 = packed B, mm4 = packed G,
 * mm5 = packed R (eight unsigned bytes each).
 */
1046 #define REAL_YSCALEYUV2RGB1(index, c) \
1047 "xor "#index", "#index" \n\t"\
1050 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1051 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1052 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1053 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1054 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
1055 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
1056 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1057 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1058 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1059 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1060 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1061 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1062 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1063 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1064 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1065 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
1066 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
1067 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1068 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1069 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1070 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1071 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1072 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1073 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1074 "paddw %%mm3, %%mm4 \n\t"\
1075 "movq %%mm2, %%mm0 \n\t"\
1076 "movq %%mm5, %%mm6 \n\t"\
1077 "movq %%mm4, %%mm3 \n\t"\
1078 "punpcklwd %%mm2, %%mm2 \n\t"\
1079 "punpcklwd %%mm5, %%mm5 \n\t"\
1080 "punpcklwd %%mm4, %%mm4 \n\t"\
1081 "paddw %%mm1, %%mm2 \n\t"\
1082 "paddw %%mm1, %%mm5 \n\t"\
1083 "paddw %%mm1, %%mm4 \n\t"\
1084 "punpckhwd %%mm0, %%mm0 \n\t"\
1085 "punpckhwd %%mm6, %%mm6 \n\t"\
1086 "punpckhwd %%mm3, %%mm3 \n\t"\
1087 "paddw %%mm7, %%mm0 \n\t"\
1088 "paddw %%mm7, %%mm6 \n\t"\
1089 "paddw %%mm7, %%mm3 \n\t"\
1090 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1091 "packuswb %%mm0, %%mm2 \n\t"\
1092 "packuswb %%mm6, %%mm5 \n\t"\
1093 "packuswb %%mm3, %%mm4 \n\t"\
1095 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * Asm fragment: same YUV -> RGB conversion as YSCALEYUV2RGB1, but the chroma
 * input is the sum of two chroma lines (%2 and %3) shifted right by 5 with a
 * logical shift (psrlw), i.e. their average at the >>4 scale used by the
 * single-line variant. Luma still comes from one line (%0); %1 is not read.
 *
 * Result after the final packuswb instructions: mm2 = packed B, mm4 = packed G,
 * mm5 = packed R.
 */
1097 // do vertical chrominance interpolation
1098 #define REAL_YSCALEYUV2RGB1b(index, c) \
1099 "xor "#index", "#index" \n\t"\
1102 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1103 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1104 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1105 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1106 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1107 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1108 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1109 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1110 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
1111 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
1112 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1113 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1114 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1115 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1116 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1117 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1118 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1119 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1120 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1121 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
1122 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
1123 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1124 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1125 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1126 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1127 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1128 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1129 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1130 "paddw %%mm3, %%mm4 \n\t"\
1131 "movq %%mm2, %%mm0 \n\t"\
1132 "movq %%mm5, %%mm6 \n\t"\
1133 "movq %%mm4, %%mm3 \n\t"\
1134 "punpcklwd %%mm2, %%mm2 \n\t"\
1135 "punpcklwd %%mm5, %%mm5 \n\t"\
1136 "punpcklwd %%mm4, %%mm4 \n\t"\
1137 "paddw %%mm1, %%mm2 \n\t"\
1138 "paddw %%mm1, %%mm5 \n\t"\
1139 "paddw %%mm1, %%mm4 \n\t"\
1140 "punpckhwd %%mm0, %%mm0 \n\t"\
1141 "punpckhwd %%mm6, %%mm6 \n\t"\
1142 "punpckhwd %%mm3, %%mm3 \n\t"\
1143 "paddw %%mm7, %%mm0 \n\t"\
1144 "paddw %%mm7, %%mm6 \n\t"\
1145 "paddw %%mm7, %%mm3 \n\t"\
1146 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1147 "packuswb %%mm0, %%mm2 \n\t"\
1148 "packuswb %%mm6, %%mm5 \n\t"\
1149 "packuswb %%mm3, %%mm4 \n\t"\
1151 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * Asm fragment: load eight 16-bit alpha samples from operand %1 at the given
 * index, shift each right by 7 and pack them with unsigned saturation into
 * eight bytes in mm7. mm1 is used as scratch.
 */
1153 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1154 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
1155 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
1156 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
1157 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
1158 "packuswb %%mm1, %%mm7 \n\t"
1159 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1162 * YV12 to RGB without scaling or interpolating
/**
 * Single-input-line ("_1") packed writer producing 32-bit RGB output.
 *
 * Four asm variants are visible, selected by two conditions:
 *  - uvalpha < 2048: chroma is taken from ubuf[0] only (YSCALEYUV2RGB1);
 *    the other variants use ubuf[0] and ubuf[1] averaged (YSCALEYUV2RGB1b).
 *  - CONFIG_SWSCALE_ALPHA && c->alpPixBuf: the alpha byte is computed from
 *    abuf0 (passed as operand %1); in the other variants mm7 is set to all
 *    ones with pcmpeqd, so every alpha byte written is 0xFF.
 *
 * vbuf, dstW and y are not referenced by the statements shown here; the width
 * is read from offset 8280 of asm operand %5.
 */
1164 static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
1165 const int16_t *ubuf[2], const int16_t *vbuf[2],
1166 const int16_t *abuf0, uint8_t *dest,
1167 int dstW, int uvalpha, int y)
1169 const int16_t *ubuf0 = ubuf[0];
1170 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1172 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1173 const int16_t *ubuf1 = ubuf[0];
1174 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
/* single chroma line, alpha taken from abuf0 (operand %1) */
1176 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1177 "mov %4, %%"REG_b" \n\t"
1178 "push %%"REG_BP" \n\t"
1179 YSCALEYUV2RGB1(%%REGBP, %5)
1180 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1181 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1182 "pop %%"REG_BP" \n\t"
1183 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1184 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/* single chroma line, constant alpha (mm7 = all ones) */
1189 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1190 "mov %4, %%"REG_b" \n\t"
1191 "push %%"REG_BP" \n\t"
1192 YSCALEYUV2RGB1(%%REGBP, %5)
1193 "pcmpeqd %%mm7, %%mm7 \n\t"
1194 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1195 "pop %%"REG_BP" \n\t"
1196 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1197 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1202 const int16_t *ubuf1 = ubuf[1];
1203 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
/* two chroma lines averaged, alpha taken from abuf0 (operand %1) */
1205 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1206 "mov %4, %%"REG_b" \n\t"
1207 "push %%"REG_BP" \n\t"
1208 YSCALEYUV2RGB1b(%%REGBP, %5)
1209 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1210 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1211 "pop %%"REG_BP" \n\t"
1212 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1213 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/* two chroma lines averaged, constant alpha (mm7 = all ones) */
1218 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1219 "mov %4, %%"REG_b" \n\t"
1220 "push %%"REG_BP" \n\t"
1221 YSCALEYUV2RGB1b(%%REGBP, %5)
1222 "pcmpeqd %%mm7, %%mm7 \n\t"
1223 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1224 "pop %%"REG_BP" \n\t"
1225 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1226 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/**
 * Single-input-line ("_1") packed writer producing 24-bit BGR output.
 *
 * When uvalpha < 2048 only ubuf[0] is used for chroma (YSCALEYUV2RGB1);
 * the second asm variant averages ubuf[0] and ubuf[1] (YSCALEYUV2RGB1b).
 * In both variants mm7 is cleared before WRITEBGR24 (defined elsewhere in
 * this file) stores the pixels.
 *
 * vbuf, abuf0, dstW and y are not referenced by the statements shown here;
 * the width is read from offset 8280 of asm operand %5.
 */
1233 static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
1234 const int16_t *ubuf[2], const int16_t *vbuf[2],
1235 const int16_t *abuf0, uint8_t *dest,
1236 int dstW, int uvalpha, int y)
1238 const int16_t *ubuf0 = ubuf[0];
1239 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1241 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1242 const int16_t *ubuf1 = ubuf[0];
/* single chroma line */
1244 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1245 "mov %4, %%"REG_b" \n\t"
1246 "push %%"REG_BP" \n\t"
1247 YSCALEYUV2RGB1(%%REGBP, %5)
1248 "pxor %%mm7, %%mm7 \n\t"
1249 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1250 "pop %%"REG_BP" \n\t"
1251 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1252 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1256 const int16_t *ubuf1 = ubuf[1];
/* two chroma lines averaged */
1258 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1259 "mov %4, %%"REG_b" \n\t"
1260 "push %%"REG_BP" \n\t"
1261 YSCALEYUV2RGB1b(%%REGBP, %5)
1262 "pxor %%mm7, %%mm7 \n\t"
1263 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1264 "pop %%"REG_BP" \n\t"
1265 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1266 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/**
 * Single-input-line ("_1") packed writer producing RGB555 output.
 *
 * When uvalpha < 2048 only ubuf[0] is used for chroma (YSCALEYUV2RGB1);
 * the second asm variant averages ubuf[0] and ubuf[1] (YSCALEYUV2RGB1b).
 * In both variants the blue/green/red dither values are added with unsigned
 * byte saturation before WRITERGB15 (defined elsewhere in this file) stores
 * the pixels.
 *
 * vbuf, abuf0, dstW and y are not referenced by the statements shown here;
 * the width is read from offset 8280 of asm operand %5.
 */
1272 static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
1273 const int16_t *ubuf[2], const int16_t *vbuf[2],
1274 const int16_t *abuf0, uint8_t *dest,
1275 int dstW, int uvalpha, int y)
1277 const int16_t *ubuf0 = ubuf[0];
1278 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1280 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1281 const int16_t *ubuf1 = ubuf[0];
/* single chroma line */
1283 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1284 "mov %4, %%"REG_b" \n\t"
1285 "push %%"REG_BP" \n\t"
1286 YSCALEYUV2RGB1(%%REGBP, %5)
1287 "pxor %%mm7, %%mm7 \n\t"
1288 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1290 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1291 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1292 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1294 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1295 "pop %%"REG_BP" \n\t"
1296 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1297 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1301 const int16_t *ubuf1 = ubuf[1];
/* two chroma lines averaged */
1303 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1304 "mov %4, %%"REG_b" \n\t"
1305 "push %%"REG_BP" \n\t"
1306 YSCALEYUV2RGB1b(%%REGBP, %5)
1307 "pxor %%mm7, %%mm7 \n\t"
1308 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1310 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1311 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1312 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1314 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1315 "pop %%"REG_BP" \n\t"
1316 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1317 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/**
 * Single-input-line ("_1") packed writer producing RGB565 output.
 *
 * When uvalpha < 2048 only ubuf[0] is used for chroma (YSCALEYUV2RGB1);
 * the second asm variant averages ubuf[0] and ubuf[1] (YSCALEYUV2RGB1b).
 * In both variants the blue/green/red dither values are added with unsigned
 * byte saturation before WRITERGB16 (defined elsewhere in this file) stores
 * the pixels.
 *
 * vbuf, abuf0, dstW and y are not referenced by the statements shown here;
 * the width is read from offset 8280 of asm operand %5.
 */
1323 static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
1324 const int16_t *ubuf[2], const int16_t *vbuf[2],
1325 const int16_t *abuf0, uint8_t *dest,
1326 int dstW, int uvalpha, int y)
1328 const int16_t *ubuf0 = ubuf[0];
1329 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1331 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1332 const int16_t *ubuf1 = ubuf[0];
/* single chroma line */
1334 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1335 "mov %4, %%"REG_b" \n\t"
1336 "push %%"REG_BP" \n\t"
1337 YSCALEYUV2RGB1(%%REGBP, %5)
1338 "pxor %%mm7, %%mm7 \n\t"
1339 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1341 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1342 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1343 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1345 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1346 "pop %%"REG_BP" \n\t"
1347 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1348 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1352 const int16_t *ubuf1 = ubuf[1];
/* two chroma lines averaged */
1354 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1355 "mov %4, %%"REG_b" \n\t"
1356 "push %%"REG_BP" \n\t"
1357 YSCALEYUV2RGB1b(%%REGBP, %5)
1358 "pxor %%mm7, %%mm7 \n\t"
1359 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1361 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1362 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1363 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1365 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1366 "pop %%"REG_BP" \n\t"
1367 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1368 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
/*
 * Asm fragment: fetch samples for packed YUV output from a single luma line
 * (%0) and a single chroma line (%2), without any blending.
 *
 * Chroma is read at (%2, index) and again after adding UV_OFF_BYTE(c) to the
 * index; luma is read as two quadwords. All four registers are arithmetically
 * shifted right by 7. Result: mm3 / mm4 = chroma, mm1 / mm7 = luma.
 */
1374 #define REAL_YSCALEYUV2PACKED1(index, c) \
1375 "xor "#index", "#index" \n\t"\
1378 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1379 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1380 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1381 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1382 "psraw $7, %%mm3 \n\t" \
1383 "psraw $7, %%mm4 \n\t" \
1384 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1385 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1386 "psraw $7, %%mm1 \n\t" \
1387 "psraw $7, %%mm7 \n\t" \
1389 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * Asm fragment: like YSCALEYUV2PACKED1, but chroma is the sum of two chroma
 * lines (%2 and %3) shifted right by 8 with a logical shift (psrlw), i.e.
 * their average at the >>7 scale. Luma comes from one line (%0) shifted
 * right by 7. Result: mm3 / mm4 = chroma, mm1 / mm7 = luma.
 */
1391 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1392 "xor "#index", "#index" \n\t"\
1395 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1396 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1397 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1398 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1399 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1400 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1401 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1402 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1403 "psrlw $8, %%mm3 \n\t" \
1404 "psrlw $8, %%mm4 \n\t" \
1405 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1406 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1407 "psraw $7, %%mm1 \n\t" \
1408 "psraw $7, %%mm7 \n\t"
1409 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
/**
 * Single-input-line ("_1") packed writer producing YUYV422 output.
 *
 * When uvalpha < 2048 only ubuf[0] is used for chroma (YSCALEYUV2PACKED1);
 * the second asm variant averages ubuf[0] and ubuf[1] (YSCALEYUV2PACKED1b).
 * WRITEYUY2 (defined elsewhere in this file) stores the result.
 *
 * vbuf, abuf0, dstW and y are not referenced by the statements shown here;
 * the width is read from offset 8280 of asm operand %5.
 */
1411 static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
1412 const int16_t *ubuf[2], const int16_t *vbuf[2],
1413 const int16_t *abuf0, uint8_t *dest,
1414 int dstW, int uvalpha, int y)
1416 const int16_t *ubuf0 = ubuf[0];
1417 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1419 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1420 const int16_t *ubuf1 = ubuf[0];
/* single chroma line */
1422 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1423 "mov %4, %%"REG_b" \n\t"
1424 "push %%"REG_BP" \n\t"
1425 YSCALEYUV2PACKED1(%%REGBP, %5)
1426 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1427 "pop %%"REG_BP" \n\t"
1428 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1429 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1433 const int16_t *ubuf1 = ubuf[1];
/* two chroma lines averaged */
1435 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1436 "mov %4, %%"REG_b" \n\t"
1437 "push %%"REG_BP" \n\t"
1438 YSCALEYUV2PACKED1b(%%REGBP, %5)
1439 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1440 "pop %%"REG_BP" \n\t"
1441 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1442 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1448 #if COMPILE_TEMPLATE_MMX2
/**
 * Fast horizontal luma scaler (MMX2 build only).
 *
 * Reads the luma filter tables and the code pointer c->lumMmx2FilterCode
 * from the context and hands them to one asm statement as memory operands:
 * %0 = src, %1 = dst, %2 = filter, %3 = filterPos, %4 = mmx2FilterCode.
 * The asm body repeats CALL_MMX2_FILTER_CODE eight times.
 *
 * After the asm, the trailing C loop overwrites the rightmost outputs whose
 * source position (i*xInc)>>16 is at or beyond srcW-1 with the last source
 * pixel multiplied by 128.
 */
1449 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
1450 int dstWidth, const uint8_t *src,
1453 int32_t *filterPos = c->hLumFilterPos;
1454 int16_t *filter = c->hLumFilter;
1455 void *mmx2FilterCode= c->lumMmx2FilterCode;
/*
 * Save state the asm body overwrites: REG_b, and the 8 bytes just below the
 * stack pointer. NOTE(review): the latter is presumably needed because the
 * invoked filter code pushes a return address there - confirm.
 */
1466 "mov %%"REG_b", %5 \n\t"
1468 "mov -8(%%rsp), %%"REG_a" \n\t"
1469 "mov %%"REG_a", %6 \n\t"
1473 "mov -8(%%rsp), %%"REG_a" \n\t"
1474 "mov %%"REG_a", %5 \n\t"
/* register setup: REG_c = src, REG_D = dst, REG_d = filter, REG_b = filterPos, REG_a = 0 */
1477 "pxor %%mm7, %%mm7 \n\t"
1478 "mov %0, %%"REG_c" \n\t"
1479 "mov %1, %%"REG_D" \n\t"
1480 "mov %2, %%"REG_d" \n\t"
1481 "mov %3, %%"REG_b" \n\t"
1482 "xor %%"REG_a", %%"REG_a" \n\t" // i
1483 PREFETCH" (%%"REG_c") \n\t"
1484 PREFETCH" 32(%%"REG_c") \n\t"
1485 PREFETCH" 64(%%"REG_c") \n\t"
/*
 * One step of the scaler: load the first filterPos entry into esi, then
 * advance the source pointer (REG_c) by the filterPos entry at byte offset
 * REG_a, advance the destination pointer (REG_D) by REG_a and reset REG_a.
 * Two variants exist; the #endif below shows they are selected by ARCH_X86_64.
 */
1488 #define CALL_MMX2_FILTER_CODE \
1489 "movl (%%"REG_b"), %%esi \n\t"\
1491 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
1492 "add %%"REG_S", %%"REG_c" \n\t"\
1493 "add %%"REG_a", %%"REG_D" \n\t"\
1494 "xor %%"REG_a", %%"REG_a" \n\t"\
1497 #define CALL_MMX2_FILTER_CODE \
1498 "movl (%%"REG_b"), %%esi \n\t"\
1500 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
1501 "add %%"REG_a", %%"REG_D" \n\t"\
1502 "xor %%"REG_a", %%"REG_a" \n\t"\
1504 #endif /* ARCH_X86_64 */
1506 CALL_MMX2_FILTER_CODE
1507 CALL_MMX2_FILTER_CODE
1508 CALL_MMX2_FILTER_CODE
1509 CALL_MMX2_FILTER_CODE
1510 CALL_MMX2_FILTER_CODE
1511 CALL_MMX2_FILTER_CODE
1512 CALL_MMX2_FILTER_CODE
1513 CALL_MMX2_FILTER_CODE
/* restore what was saved at the top of the asm body */
1516 "mov %5, %%"REG_b" \n\t"
1518 "mov %6, %%"REG_a" \n\t"
1519 "mov %%"REG_a", -8(%%rsp) \n\t"
1523 "mov %5, %%"REG_a" \n\t"
1524 "mov %%"REG_a", -8(%%rsp) \n\t"
1527 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
1528 "m" (mmx2FilterCode)
1535 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* right edge: positions that map to the last source pixel or beyond get src[srcW-1]*128 */
1541 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
1542 dst[i] = src[srcW-1]*128;
/**
 * Fast horizontal chroma scaler (MMX2 build only).
 *
 * Reads the chroma filter tables and the code pointer c->chrMmx2FilterCode
 * from the context and runs two passes inside one asm statement, each made of
 * four CALL_MMX2_FILTER_CODE steps: first src1 -> dst1 (operands %0 / %1),
 * then src2 -> dst2 (operands %5 / %6). %2 = filter, %3 = filterPos and
 * %4 = mmx2FilterCode are shared by both passes.
 *
 * After the asm, the trailing C loop overwrites the rightmost outputs of both
 * planes whose source position (i*xInc)>>16 is at or beyond srcW-1 with the
 * last source pixel multiplied by 128.
 */
1545 static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
1546 int dstWidth, const uint8_t *src1,
1547 const uint8_t *src2, int srcW, int xInc)
1549 int32_t *filterPos = c->hChrFilterPos;
1550 int16_t *filter = c->hChrFilter;
1551 void *mmx2FilterCode= c->chrMmx2FilterCode;
/* scratch slots for the values saved and restored around the asm body */
1554 DECLARE_ALIGNED(8, uint64_t, ebxsave);
1557 DECLARE_ALIGNED(8, uint64_t, retsave);
/* save REG_b and the 8 bytes just below the stack pointer */
1562 "mov %%"REG_b", %7 \n\t"
1564 "mov -8(%%rsp), %%"REG_a" \n\t"
1565 "mov %%"REG_a", %8 \n\t"
1569 "mov -8(%%rsp), %%"REG_a" \n\t"
1570 "mov %%"REG_a", %7 \n\t"
/* first pass: REG_c = src1, REG_D = dst1, REG_d = filter, REG_b = filterPos */
1573 "pxor %%mm7, %%mm7 \n\t"
1574 "mov %0, %%"REG_c" \n\t"
1575 "mov %1, %%"REG_D" \n\t"
1576 "mov %2, %%"REG_d" \n\t"
1577 "mov %3, %%"REG_b" \n\t"
1578 "xor %%"REG_a", %%"REG_a" \n\t" // i
1579 PREFETCH" (%%"REG_c") \n\t"
1580 PREFETCH" 32(%%"REG_c") \n\t"
1581 PREFETCH" 64(%%"REG_c") \n\t"
1583 CALL_MMX2_FILTER_CODE
1584 CALL_MMX2_FILTER_CODE
1585 CALL_MMX2_FILTER_CODE
1586 CALL_MMX2_FILTER_CODE
/* second pass: same filter registers, source and destination switched to src2 / dst2 */
1587 "xor %%"REG_a", %%"REG_a" \n\t" // i
1588 "mov %5, %%"REG_c" \n\t" // src
1589 "mov %6, %%"REG_D" \n\t" // buf2
1590 PREFETCH" (%%"REG_c") \n\t"
1591 PREFETCH" 32(%%"REG_c") \n\t"
1592 PREFETCH" 64(%%"REG_c") \n\t"
1594 CALL_MMX2_FILTER_CODE
1595 CALL_MMX2_FILTER_CODE
1596 CALL_MMX2_FILTER_CODE
1597 CALL_MMX2_FILTER_CODE
/* restore what was saved at the top of the asm body */
1600 "mov %7, %%"REG_b" \n\t"
1602 "mov %8, %%"REG_a" \n\t"
1603 "mov %%"REG_a", -8(%%rsp) \n\t"
1607 "mov %7, %%"REG_a" \n\t"
1608 "mov %%"REG_a", -8(%%rsp) \n\t"
1611 :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
1612 "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
1619 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* right edge: positions that map to the last source pixel or beyond get the last pixel * 128 */
1625 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
1626 dst1[i] = src1[srcW-1]*128;
1627 dst2[i] = src2[srcW-1]*128;
1630 #endif /* COMPILE_TEMPLATE_MMX2 */
/**
 * Install this template's x86 routines into the function pointers of the
 * given SwsContext.
 *
 * The vertical/packed writers are only installed when the destination format
 * is not 16 bits per sample, not 9/10 bits per sample, not NV12/NV21, and
 * SWS_BITEXACT is not set. The fast horizontal scalers are only considered
 * when c->srcBpc == 8 and c->dstBpc <= 10.
 */
1632 static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
1634 enum PixelFormat dstFormat = c->dstFormat;
/* MMX vertical filter is off unless enabled below */
1636 c->use_mmx_vfilter= 0;
1637 if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12
1638 && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
/* accurate rounding requested: use the "_ar" multi-tap packed writers */
1639 if (c->flags & SWS_ACCURATE_RND) {
1640 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1641 switch (c->dstFormat) {
1642 case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
1643 case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
1644 case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
1645 case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
1646 case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
/* here the MMX vertical filter is switched on and the non-"_ar" writers are installed */
1651 c->use_mmx_vfilter= 1;
1652 c->yuv2planeX = RENAME(yuv2yuvX );
1653 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1654 switch (c->dstFormat) {
1655 case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
1656 case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
1657 case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
1658 case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
1659 case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
/* one-line (yuv2packed1) and two-line (yuv2packed2) packed writers, chosen per destination format */
1664 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1665 switch (c->dstFormat) {
1667 c->yuv2packed1 = RENAME(yuv2rgb32_1);
1668 c->yuv2packed2 = RENAME(yuv2rgb32_2);
1671 c->yuv2packed1 = RENAME(yuv2bgr24_1);
1672 c->yuv2packed2 = RENAME(yuv2bgr24_2);
1674 case PIX_FMT_RGB555:
1675 c->yuv2packed1 = RENAME(yuv2rgb555_1);
1676 c->yuv2packed2 = RENAME(yuv2rgb555_2);
1678 case PIX_FMT_RGB565:
1679 c->yuv2packed1 = RENAME(yuv2rgb565_1);
1680 c->yuv2packed2 = RENAME(yuv2rgb565_2);
1682 case PIX_FMT_YUYV422:
1683 c->yuv2packed1 = RENAME(yuv2yuyv422_1);
1684 c->yuv2packed2 = RENAME(yuv2yuyv422_2);
/* horizontal scalers: the fast versions exist only in the MMX2 build; the pointers are set to NULL in the other visible path */
1692 if (c->srcBpc == 8 && c->dstBpc <= 10) {
1693 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
1694 #if COMPILE_TEMPLATE_MMX2
1695 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
1697 c->hyscale_fast = RENAME(hyscale_fast);
1698 c->hcscale_fast = RENAME(hcscale_fast);
1700 #endif /* COMPILE_TEMPLATE_MMX2 */
1701 c->hyscale_fast = NULL;
1702 c->hcscale_fast = NULL;
1703 #if COMPILE_TEMPLATE_MMX2
1705 #endif /* COMPILE_TEMPLATE_MMX2 */