]> git.sesse.net Git - ffmpeg/blob - libswscale/x86/swscale_template.c
Merge commit 'b396bbad100a7493691d09b8dceba91e3cd28e2e'
[ffmpeg] / libswscale / x86 / swscale_template.c
1 /*
2  * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include <stdint.h>
22
23 #include "libavutil/x86/asm.h"
24 #include "libswscale/swscale_internal.h"
25
/*
 * Store-instruction selection for this instantiation of the template.
 *
 * Any previous definitions are dropped first, then the 64-bit store used
 * by the pixel writers below is chosen: "movntq" when
 * COMPILE_TEMPLATE_MMXEXT is non-zero, plain "movq" otherwise.
 *
 *   REAL_MOVNTQ(a,b)  stringifies its two operands into one complete
 *                     instruction line.
 *   MOVNTQ2           is only the mnemonic; the user appends the operands.
 *   MOVNTQ(a,b)       forwards to REAL_MOVNTQ so that macro arguments are
 *                     expanded before they are stringified.
 *
 * PREFETCH is only #undef'd here; no new definition is visible in this
 * part of the file.
 */
26 #undef REAL_MOVNTQ
27 #undef MOVNTQ
28 #undef MOVNTQ2
29 #undef PREFETCH
30
31
32 #if COMPILE_TEMPLATE_MMXEXT
33 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
34 #define MOVNTQ2 "movntq "
35 #else
36 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
37 #define MOVNTQ2 "movq "
38 #endif
39 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
40
41 #if !COMPILE_TEMPLATE_MMXEXT
/*
 * Load 8 dither bytes and widen them to eight 16-bit words in MMX registers.
 *
 * srcDither  address of the 8 bytes to load.
 * rot        when non-zero, the 64-bit value is rotated right by 24 bits
 *            (psrlq 24 | psllq 40) before being widened.
 *
 * Nothing is returned in C terms: the result is left in %mm3 (bytes 0-3,
 * zero-extended to words) and %mm4 (bytes 4-7, zero-extended to words),
 * and %mm0 is left zeroed. The asm statements declare no outputs and no
 * clobbers.
 * NOTE(review): the caller therefore depends on the compiler leaving the
 * MMX registers untouched between this asm statement and its own.
 */
42 static av_always_inline void
43 dither_8to16(const uint8_t *srcDither, int rot)
44 {
45     if (rot) {
46         __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
47                          "movq       (%0), %%mm3\n\t"
48                          "movq      %%mm3, %%mm4\n\t"
                         /* mm3 = (v >> 24) | (v << 40): rotate right by three bytes */
49                          "psrlq       $24, %%mm3\n\t"
50                          "psllq       $40, %%mm4\n\t"
51                          "por       %%mm4, %%mm3\n\t"
52                          "movq      %%mm3, %%mm4\n\t"
                         /* interleave with the zero register: bytes become words */
53                          "punpcklbw %%mm0, %%mm3\n\t"
54                          "punpckhbw %%mm0, %%mm4\n\t"
55                          :: "r"(srcDither)
56                          );
57     } else {
        /* Same widening as above, without the rotation. */
58         __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
59                          "movq       (%0), %%mm3\n\t"
60                          "movq      %%mm3, %%mm4\n\t"
61                          "punpcklbw %%mm0, %%mm3\n\t"
62                          "punpckhbw %%mm0, %%mm4\n\t"
63                          :: "r"(srcDither)
64                          );
65     }
66 }
67 #endif
68
/*
 * Vertical multi-tap filter producing 8-bit output, 8 pixels per iteration.
 *
 * filter      start of a list of 16-byte entries: a source row pointer at
 *             byte 0 and a qword of coefficients at byte 8. The list is
 *             walked until an entry whose pointer is NULL.
 * filterSize  number of taps; only used (after being decremented) to bias
 *             the dither accumulators.
 * src         not referenced in this body; the row pointers are read from
 *             the filter list itself.
 * dest        output bytes; the first store goes to dest[0].
 * dstW        output width; the loop runs while the index is below
 *             dstW + offset (unsigned compare).
 * dither      8 dither bytes, widened by dither_8to16().
 * offset      starting index into the source rows, and the "rot" flag
 *             passed to dither_8to16().
 *
 * Register use across the asm statements: %mm3/%mm4 hold the running sums
 * for pixels 0-3 / 4-7, %mm6/%mm7 hold their initial (dither) values, and
 * REG_c is the pixel index.
 */
69 static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
70                            const int16_t **src, uint8_t *dest, int dstW,
71                            const uint8_t *dither, int offset)
72 {
    /* Leaves the widened dither words in %mm3/%mm4. */
73     dither_8to16(dither, offset);
74     filterSize--;
    /*
     * Broadcast (filterSize - 1) to all four words of %mm1, multiply by 8,
     * add it to the dither words and shift the sums right by 4:
     * start value = (dither + 8 * (filterSize - 1)) >> 4.
     */
75     __asm__ volatile(
76         "movd %0, %%mm1\n\t"
77         "punpcklwd %%mm1, %%mm1\n\t"
78         "punpckldq %%mm1, %%mm1\n\t"
79         "psllw        $3, %%mm1\n\t"
80         "paddw     %%mm1, %%mm3\n\t"
81         "paddw     %%mm1, %%mm4\n\t"
82         "psraw        $4, %%mm3\n\t"
83         "psraw        $4, %%mm4\n\t"
84         ::"m"(filterSize)
85      );
86
    /*
     * Main loop. Operands: %0 = filter, %1 = dest - offset,
     * %2 = dstW + offset, %3 = offset.
     * Label 1 is both the per-tap and the per-8-pixel loop head:
     *  - "jnz 1b" repeats while the next entry's row pointer is non-NULL,
     *    accumulating pmulhw(coeff, src) into %mm3/%mm4;
     *  - after the last tap the sums are shifted right by 3, packed to
     *    bytes with unsigned saturation and stored at (%1 + REG_c);
     *  - REG_c advances by 8, the accumulators and the filter pointer are
     *    reset, and "jb 1b" continues while REG_c < %2 (the mov/movq placed
     *    between the cmp and the jb are relied on not to change the flags).
     */
87     __asm__ volatile(\
88         "movq    %%mm3, %%mm6\n\t"
89         "movq    %%mm4, %%mm7\n\t"
90         "movl %3, %%ecx\n\t"
91         "mov                                 %0, %%"REG_d"  \n\t"\
92         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
93         ".p2align                             4             \n\t" /* FIXME Unroll? */\
94         "1:                                                 \n\t"\
95         "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
96         "movq                (%%"REG_S", %%"REG_c", 2), %%mm2      \n\t" /* srcData */\
97         "movq               8(%%"REG_S", %%"REG_c", 2), %%mm5      \n\t" /* srcData */\
98         "add                                $16, %%"REG_d"  \n\t"\
99         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
100         "test                         %%"REG_S", %%"REG_S"  \n\t"\
101         "pmulhw                           %%mm0, %%mm2      \n\t"\
102         "pmulhw                           %%mm0, %%mm5      \n\t"\
103         "paddw                            %%mm2, %%mm3      \n\t"\
104         "paddw                            %%mm5, %%mm4      \n\t"\
105         " jnz                                1b             \n\t"\
106         "psraw                               $3, %%mm3      \n\t"\
107         "psraw                               $3, %%mm4      \n\t"\
108         "packuswb                         %%mm4, %%mm3      \n\t"
109         MOVNTQ2 "                         %%mm3, (%1, %%"REG_c")\n\t"
110         "add                          $8, %%"REG_c"         \n\t"\
111         "cmp                          %2, %%"REG_c"         \n\t"\
112         "movq    %%mm6, %%mm3\n\t"
113         "movq    %%mm7, %%mm4\n\t"
114         "mov                                 %0, %%"REG_d"  \n\t"\
115         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
116         "jb                                  1b             \n\t"\
117         :: "g" (filter),
118            "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
119         : "%"REG_d, "%"REG_S, "%"REG_c
120     );
121 }
122
/*
 * YSCALEYUV2PACKEDX_UV - opens an "__asm__ volatile(" statement and emits
 * the chroma half of the fast vertical filter. The statement is left open;
 * further instruction strings follow and the operand list is supplied by
 * the user (see YSCALEYUV2PACKEDX_END).
 *
 * Emitted code:
 *  - zeroes REG_a (the pixel index) and places label "1", the head of the
 *    per-8-pixel loop that the WRITE* macros jump back to;
 *  - points REG_d at CHR_MMX_FILTER_OFFSET(%0) and initialises the
 *    accumulators %mm3 (U) and %mm4 (V) from VROUNDER_OFFSET(%0);
 *  - inner loop "2": each 16-byte filter entry holds a source pointer at
 *    byte 0 and coefficients at byte 8. U is read at (pointer + REG_a),
 *    V at (pointer + %6 + REG_a); both are multiplied with pmulhw and
 *    summed. The loop ends at an entry whose pointer is NULL.
 *
 * Uses REG_a, REG_d, REG_S, %mm0, %mm2-%mm5 and operands %0 and %6.
 */
123 #define YSCALEYUV2PACKEDX_UV \
124     __asm__ volatile(\
125         "xor                   %%"REG_a", %%"REG_a"     \n\t"\
126         ".p2align                      4                \n\t"\
127         "nop                                            \n\t"\
128         "1:                                             \n\t"\
129         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
130         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
131         "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
132         "movq                      %%mm3, %%mm4         \n\t"\
133         ".p2align                      4                \n\t"\
134         "2:                                             \n\t"\
135         "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
136         "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
137         "add                          %6, %%"REG_S"     \n\t" \
138         "movq     (%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
139         "add                         $16, %%"REG_d"     \n\t"\
140         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
141         "pmulhw                    %%mm0, %%mm2         \n\t"\
142         "pmulhw                    %%mm0, %%mm5         \n\t"\
143         "paddw                     %%mm2, %%mm3         \n\t"\
144         "paddw                     %%mm5, %%mm4         \n\t"\
145         "test                  %%"REG_S", %%"REG_S"     \n\t"\
146         " jnz                         2b                \n\t"\
147
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2) - instruction
 * strings for the fast vertical filter over one full-resolution plane
 * (used below for luma and for alpha). Must appear inside an already open
 * asm statement.
 *
 *   offset      string constant: byte offset from %0 of the filter list.
 *   coeff       MMX register that receives each entry's coefficients.
 *   src1, src2  MMX scratch registers for the two source qwords, read at
 *               (pointer + 2*REG_a) and (pointer + 2*REG_a + 8).
 *   dst1, dst2  MMX accumulators; both start from VROUNDER_OFFSET(%0) and
 *               receive the pmulhw products of src1 / src2.
 *
 * Filter entries are 16 bytes (pointer at 0, coefficients at 8) and the
 * loop on local label "2" ends at an entry whose pointer is NULL.
 * Also uses REG_d and REG_S.
 */
148 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
149     "lea                "offset"(%0), %%"REG_d"     \n\t"\
150     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
151     "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
152     "movq                    "#dst1", "#dst2"       \n\t"\
153     ".p2align                      4                \n\t"\
154     "2:                                             \n\t"\
155     "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
156     "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
157     "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
158     "add                         $16, %%"REG_d"            \n\t"\
159     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
160     "pmulhw                 "#coeff", "#src1"       \n\t"\
161     "pmulhw                 "#coeff", "#src2"       \n\t"\
162     "paddw                   "#src1", "#dst1"       \n\t"\
163     "paddw                   "#src2", "#dst2"       \n\t"\
164     "test                  %%"REG_S", %%"REG_S"     \n\t"\
165     " jnz                         2b                \n\t"\
166
/*
 * YSCALEYUV2PACKEDX - fast vertical filter for chroma and luma: the chroma
 * pass (which also opens the asm statement) followed by the luma pass over
 * LUM_MMX_FILTER_OFFSET. On completion %mm3 = U, %mm4 = V, %mm1 = Y1 and
 * %mm7 = Y2; %mm0, %mm2 and %mm5 were used as scratch.
 */
167 #define YSCALEYUV2PACKEDX \
168     YSCALEYUV2PACKEDX_UV \
169     YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
170
/*
 * YSCALEYUV2PACKEDX_END - operand list, clobber list and closing ");" for
 * an asm statement opened by YSCALEYUV2PACKEDX_UV or
 * YSCALEYUV2PACKEDX_ACCURATE_UV.
 *
 * Input operands (there are no outputs):
 *   %0     &c->redDither, the base for every NAME_OFFSET(%0) access
 *   %1-%3  dummy (placeholders that keep the later operand numbers fixed)
 *   %4     dest
 *   %5     dstW_reg
 *   %6     uv_off
 * plus the named constraints for bF8 and bFC.
 *
 * Only REG_a, REG_d and REG_S are declared as clobbered; no MMX register
 * and no "memory" clobber is listed.
 *
 * The expanding function must have c, dummy, dest, dstW_reg and uv_off in
 * scope.
 */
171 #define YSCALEYUV2PACKEDX_END                     \
172         :: "r" (&c->redDither),                   \
173             "m" (dummy), "m" (dummy), "m" (dummy),\
174             "r" (dest), "m" (dstW_reg), "m"(uv_off) \
175             NAMED_CONSTRAINTS_ADD(bF8,bFC) \
176         : "%"REG_a, "%"REG_d, "%"REG_S            \
177     );
178
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV - opens an "__asm__ volatile(" statement
 * and emits the chroma half of the accurate-rounding vertical filter. The
 * statement is left open for further strings and the operand list.
 *
 * Differences from YSCALEYUV2PACKEDX_UV visible here:
 *  - two taps are consumed per inner iteration: the first row pointer is
 *    at byte 0 of the entry, the second at APCK_PTR2, the coefficient
 *    qword at APCK_COEF, and entries are APCK_SIZE bytes apart;
 *  - samples from the two rows are interleaved (punpck[lh]wd) and combined
 *    with pmaddwd into 32-bit sums: %mm4/%mm5 for U, %mm6/%mm7 for V;
 *  - after the loop the sums are shifted right by 16, packed to words with
 *    signed saturation, VROUNDER_OFFSET(%0) is added, and the results are
 *    stored to U_TEMP(%0) and V_TEMP(%0).
 *
 * V is read at (row pointer + %6 + REG_a). Label "1" is the per-8-pixel
 * loop head, label "2" the tap loop, which ends at a NULL row pointer.
 * Uses REG_a, REG_d, REG_S and %mm0-%mm7.
 */
179 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
180     __asm__ volatile(\
181         "xor %%"REG_a", %%"REG_a"                       \n\t"\
182         ".p2align                      4                \n\t"\
183         "nop                                            \n\t"\
184         "1:                                             \n\t"\
185         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
186         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
187         "pxor                      %%mm4, %%mm4         \n\t"\
188         "pxor                      %%mm5, %%mm5         \n\t"\
189         "pxor                      %%mm6, %%mm6         \n\t"\
190         "pxor                      %%mm7, %%mm7         \n\t"\
191         ".p2align                      4                \n\t"\
192         "2:                                             \n\t"\
193         "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
194         "add                          %6, %%"REG_S"      \n\t" \
195         "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
196         "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
197         "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
198         "movq                      %%mm0, %%mm3         \n\t"\
199         "punpcklwd                 %%mm1, %%mm0         \n\t"\
200         "punpckhwd                 %%mm1, %%mm3         \n\t"\
201         "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
202         "pmaddwd                   %%mm1, %%mm0         \n\t"\
203         "pmaddwd                   %%mm1, %%mm3         \n\t"\
204         "paddd                     %%mm0, %%mm4         \n\t"\
205         "paddd                     %%mm3, %%mm5         \n\t"\
206         "add                          %6, %%"REG_S"      \n\t" \
207         "movq     (%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
208         "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
209         "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
210         "test                  %%"REG_S", %%"REG_S"     \n\t"\
211         "movq                      %%mm2, %%mm0         \n\t"\
212         "punpcklwd                 %%mm3, %%mm2         \n\t"\
213         "punpckhwd                 %%mm3, %%mm0         \n\t"\
214         "pmaddwd                   %%mm1, %%mm2         \n\t"\
215         "pmaddwd                   %%mm1, %%mm0         \n\t"\
216         "paddd                     %%mm2, %%mm6         \n\t"\
217         "paddd                     %%mm0, %%mm7         \n\t"\
218         " jnz                         2b                \n\t"\
219         "psrad                       $16, %%mm4         \n\t"\
220         "psrad                       $16, %%mm5         \n\t"\
221         "psrad                       $16, %%mm6         \n\t"\
222         "psrad                       $16, %%mm7         \n\t"\
223         "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
224         "packssdw                  %%mm5, %%mm4         \n\t"\
225         "packssdw                  %%mm7, %%mm6         \n\t"\
226         "paddw                     %%mm0, %%mm4         \n\t"\
227         "paddw                     %%mm0, %%mm6         \n\t"\
228         "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
229         "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
230
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset) - instruction strings for the
 * accurate-rounding vertical filter over one full-resolution plane (used
 * below for luma and for alpha). Must appear inside an already open asm
 * statement.
 *
 *   offset  string constant: byte offset from %0 of the filter list.
 *
 * Two taps are consumed per iteration, using the same entry layout as the
 * chroma variant (row pointers at byte 0 and at APCK_PTR2, coefficients at
 * APCK_COEF, stride APCK_SIZE). Rows are read at (pointer + 2*REG_a) and
 * (pointer + 2*REG_a + 8). The 32-bit sums live in %mm1/%mm5 (first four
 * samples) and %mm7/%mm6 (next four); afterwards they are shifted right by
 * 16, packed with signed saturation and offset by VROUNDER_OFFSET(%0).
 *
 * On completion %mm1 and %mm7 hold the two filtered qwords, and %mm3 and
 * %mm4 have been reloaded from U_TEMP(%0) and V_TEMP(%0). All eight MMX
 * registers are written.
 */
231 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
232     "lea                "offset"(%0), %%"REG_d"     \n\t"\
233     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
234     "pxor                      %%mm1, %%mm1         \n\t"\
235     "pxor                      %%mm5, %%mm5         \n\t"\
236     "pxor                      %%mm7, %%mm7         \n\t"\
237     "pxor                      %%mm6, %%mm6         \n\t"\
238     ".p2align                      4                \n\t"\
239     "2:                                             \n\t"\
240     "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
241     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
242     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
243     "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
244     "movq                      %%mm0, %%mm3         \n\t"\
245     "punpcklwd                 %%mm4, %%mm0         \n\t"\
246     "punpckhwd                 %%mm4, %%mm3         \n\t"\
247     "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
248     "pmaddwd                   %%mm4, %%mm0         \n\t"\
249     "pmaddwd                   %%mm4, %%mm3         \n\t"\
250     "paddd                     %%mm0, %%mm1         \n\t"\
251     "paddd                     %%mm3, %%mm5         \n\t"\
252     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
253     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
254     "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
255     "test                  %%"REG_S", %%"REG_S"     \n\t"\
256     "movq                      %%mm2, %%mm0         \n\t"\
257     "punpcklwd                 %%mm3, %%mm2         \n\t"\
258     "punpckhwd                 %%mm3, %%mm0         \n\t"\
259     "pmaddwd                   %%mm4, %%mm2         \n\t"\
260     "pmaddwd                   %%mm4, %%mm0         \n\t"\
261     "paddd                     %%mm2, %%mm7         \n\t"\
262     "paddd                     %%mm0, %%mm6         \n\t"\
263     " jnz                         2b                \n\t"\
264     "psrad                       $16, %%mm1         \n\t"\
265     "psrad                       $16, %%mm5         \n\t"\
266     "psrad                       $16, %%mm7         \n\t"\
267     "psrad                       $16, %%mm6         \n\t"\
268     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
269     "packssdw                  %%mm5, %%mm1         \n\t"\
270     "packssdw                  %%mm6, %%mm7         \n\t"\
271     "paddw                     %%mm0, %%mm1         \n\t"\
272     "paddw                     %%mm0, %%mm7         \n\t"\
273     "movq               "U_TEMP"(%0), %%mm3         \n\t"\
274     "movq               "V_TEMP"(%0), %%mm4         \n\t"\
275
/*
 * YSCALEYUV2PACKEDX_ACCURATE - accurate-rounding vertical filter for chroma
 * and luma: the chroma pass (which opens the asm statement and parks its
 * results in U_TEMP/V_TEMP) followed by the luma pass over
 * LUM_MMX_FILTER_OFFSET. On completion %mm3 = U, %mm4 = V, %mm1 = Y1 and
 * %mm7 = Y2, the same register layout YSCALEYUV2PACKEDX produces.
 */
276 #define YSCALEYUV2PACKEDX_ACCURATE \
277     YSCALEYUV2PACKEDX_ACCURATE_UV \
278     YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
279
/*
 * YSCALEYUV2RGBX - instruction strings converting 8 filtered pixels from
 * YUV to packed 8-bit B, G and R. Must appear inside an open asm statement.
 *
 * Input:  %mm1 = Y1, %mm7 = Y2, %mm3 = U, %mm4 = V (16-bit words), with the
 *         offsets and coefficients read from U_OFFSET, V_OFFSET, Y_OFFSET,
 *         UG_COEFF, VG_COEFF, UB_COEFF, VR_COEFF and Y_COEFF relative to %0.
 * Output: %mm2 = B, %mm4 = G, %mm5 = R, each packed to 8 bytes with
 *         unsigned saturation.
 * %mm0, %mm3 and %mm6 are overwritten; %mm1 and %mm7 still hold the scaled
 * luma words afterwards.
 *
 * Each chroma word is duplicated (punpck[lh]wd of a register with itself)
 * before the luma is added, so one U/V sample serves two adjacent pixels.
 */
280 #define YSCALEYUV2RGBX \
281     "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
282     "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
283     "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
284     "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
285     "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
286     "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
287     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
288     "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
289     "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
290     "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
291     "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
292     "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
293     "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
294     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
295     "paddw           %%mm3, %%mm4       \n\t"\
296     "movq            %%mm2, %%mm0       \n\t"\
297     "movq            %%mm5, %%mm6       \n\t"\
298     "movq            %%mm4, %%mm3       \n\t"\
299     "punpcklwd       %%mm2, %%mm2       \n\t"\
300     "punpcklwd       %%mm5, %%mm5       \n\t"\
301     "punpcklwd       %%mm4, %%mm4       \n\t"\
302     "paddw           %%mm1, %%mm2       \n\t"\
303     "paddw           %%mm1, %%mm5       \n\t"\
304     "paddw           %%mm1, %%mm4       \n\t"\
305     "punpckhwd       %%mm0, %%mm0       \n\t"\
306     "punpckhwd       %%mm6, %%mm6       \n\t"\
307     "punpckhwd       %%mm3, %%mm3       \n\t"\
308     "paddw           %%mm7, %%mm0       \n\t"\
309     "paddw           %%mm7, %%mm6       \n\t"\
310     "paddw           %%mm7, %%mm3       \n\t"\
311     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
312     "packuswb        %%mm0, %%mm2       \n\t"\
313     "packuswb        %%mm6, %%mm5       \n\t"\
314     "packuswb        %%mm3, %%mm4       \n\t"\
315
/*
 * REAL_WRITEBGR32 / WRITEBGR32 - interleave four packed byte planes into
 * 8 four-byte pixels, store them, and close the per-8-pixel loop.
 *
 *   dst             register holding the output base address
 *   dstw            string naming the width operand (e.g. "%5")
 *   index           pixel index register; stores go to dst + index*4
 *   b, g, r, a      MMX registers holding the four byte planes; b and r are
 *                   overwritten, g and a are only read
 *   q0, q2, q3, t   MMX scratch registers
 *
 * In memory each pixel is laid out as b, g, r, a (lowest address first).
 * After the four 8-byte stores the index is advanced by 8 and control
 * returns to label "1" while index < dstw (unsigned compare).
 *
 * WRITEBGR32 forwards to REAL_WRITEBGR32 so that macro arguments are
 * expanded before being stringified.
 */
316 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
317     "movq       "#b", "#q2"     \n\t" /* B */\
318     "movq       "#r", "#t"      \n\t" /* R */\
319     "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
320     "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
321     "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
322     "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
323     "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
324     "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
325     "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
326     "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
327     "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
328     "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
329 \
330     MOVNTQ(   q0,   (dst, index, 4))\
331     MOVNTQ(    b,  8(dst, index, 4))\
332     MOVNTQ(   q2, 16(dst, index, 4))\
333     MOVNTQ(   q3, 24(dst, index, 4))\
334 \
335     "add      $8, "#index"      \n\t"\
336     "cmp  "dstw", "#index"      \n\t"\
337     " jb      1b                \n\t"
338 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
339
/*
 * Vertical filter + YUV->RGB conversion writing 32-bit pixels, using the
 * accurate-rounding filter macros.
 *
 * c     scaler context; every table the asm touches is addressed relative
 *       to &c->redDither, and c->alpPixBuf / c->uv_offx2 are read here.
 * dest  output buffer, 4 bytes per pixel.
 * dstW  output width in pixels; 8 pixels are produced per iteration and the
 *       loop tests the index only after each store.
 *
 * lumFilter, lumSrc, lumFilterSize, chrFilter, chrUSrc, chrVSrc,
 * chrFilterSize, alpSrc and dstY are not referenced in this body.
 * The locals dummy, dstW_reg and uv_off exist to be picked up by
 * YSCALEYUV2PACKEDX_END.
 */
340 static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
341                                    const int16_t **lumSrc, int lumFilterSize,
342                                    const int16_t *chrFilter, const int16_t **chrUSrc,
343                                    const int16_t **chrVSrc,
344                                    int chrFilterSize, const int16_t **alpSrc,
345                                    uint8_t *dest, int dstW, int dstY)
346 {
347     x86_reg dummy=0;
348     x86_reg dstW_reg = dstW;
349     x86_reg uv_off = c->uv_offx2;
350
351     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        /*
         * With alpha: the alpha filter pass overwrites every MMX register,
         * so B (mm2), G (mm4) and R (mm5) are parked in U_TEMP, V_TEMP and
         * Y_TEMP first. The tail of the _YA macro reloads U_TEMP/V_TEMP
         * into mm3/mm4 (now B and G); R is reloaded from Y_TEMP explicitly.
         * The filtered alpha words are shifted right by 3 and packed to
         * bytes in mm1.
         */
352         YSCALEYUV2PACKEDX_ACCURATE
353         YSCALEYUV2RGBX
354         "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
355         "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
356         "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
357         YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
358         "movq               "Y_TEMP"(%0), %%mm5         \n\t"
359         "psraw                        $3, %%mm1         \n\t"
360         "psraw                        $3, %%mm7         \n\t"
361         "packuswb                  %%mm7, %%mm1         \n\t"
362         WRITEBGR32(%4, "%5", %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
363         YSCALEYUV2PACKEDX_END
364     } else {
        /* Without alpha: pcmpeqd sets mm7 to all ones, used as the alpha plane. */
365         YSCALEYUV2PACKEDX_ACCURATE
366         YSCALEYUV2RGBX
367         "pcmpeqd %%mm7, %%mm7 \n\t"
368         WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
369         YSCALEYUV2PACKEDX_END
370     }
371 }
372
/*
 * Vertical filter + YUV->RGB conversion writing 32-bit pixels, using the
 * fast (pmulhw-based) filter macros.
 *
 * c     scaler context; every table the asm touches is addressed relative
 *       to &c->redDither, and c->alpPixBuf / c->uv_offx2 are read here.
 * dest  output buffer, 4 bytes per pixel.
 * dstW  output width in pixels; 8 pixels are produced per iteration and the
 *       loop tests the index only after each store.
 *
 * lumFilter, lumSrc, lumFilterSize, chrFilter, chrUSrc, chrVSrc,
 * chrFilterSize, alpSrc and dstY are not referenced in this body.
 * The locals dummy, dstW_reg and uv_off exist to be picked up by
 * YSCALEYUV2PACKEDX_END.
 */
373 static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
374                                 const int16_t **lumSrc, int lumFilterSize,
375                                 const int16_t *chrFilter, const int16_t **chrUSrc,
376                                 const int16_t **chrVSrc,
377                                 int chrFilterSize, const int16_t **alpSrc,
378                                 uint8_t *dest, int dstW, int dstY)
379 {
380     x86_reg dummy=0;
381     x86_reg dstW_reg = dstW;
382     x86_reg uv_off = c->uv_offx2;
383
384     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        /*
         * With alpha: the alpha pass is given mm0/mm3/mm6 as scratch and
         * mm1/mm7 as accumulators, so B (mm2), G (mm4) and R (mm5) stay in
         * registers. The alpha words are shifted right by 3 and packed to
         * bytes in mm1.
         */
385         YSCALEYUV2PACKEDX
386         YSCALEYUV2RGBX
387         YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
388         "psraw                        $3, %%mm1         \n\t"
389         "psraw                        $3, %%mm7         \n\t"
390         "packuswb                  %%mm7, %%mm1         \n\t"
391         WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
392         YSCALEYUV2PACKEDX_END
393     } else {
        /* Without alpha: pcmpeqd sets mm7 to all ones, used as the alpha plane. */
394         YSCALEYUV2PACKEDX
395         YSCALEYUV2RGBX
396         "pcmpeqd %%mm7, %%mm7 \n\t"
397         WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
398         YSCALEYUV2PACKEDX_END
399     }
400 }
401
/*
 * REAL_WRITERGB16 / WRITERGB16 - pack 8 pixels into 16-bit words, store
 * them, and close the per-8-pixel loop.
 *
 *   dst    register holding the output base address
 *   dstw   string naming the width operand (e.g. "%5")
 *   index  pixel index register; stores go to dst + index*2
 *
 * Expects %mm2 = B, %mm4 = G, %mm5 = R (packed bytes) and %mm7 = 0.
 * B and R are masked with bF8, G with bFC; B is shifted right by 3, R is
 * placed in the high byte of each word, and G (widened to words) is shifted
 * left by 3 and OR-ed in.
 * NOTE(review): assuming bF8/bFC are per-byte 0xF8/0xFC masks as their
 * names suggest (they are defined outside this view), each word ends up
 * with R in bits 11-15, G in bits 5-10 and B in bits 0-4.
 *
 * %mm1 and %mm3 are used as scratch. After the two 8-byte stores the index
 * is advanced by 8 and control returns to label "1" while index < dstw
 * (unsigned compare).
 */
402 #define REAL_WRITERGB16(dst, dstw, index) \
403     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
404     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
405     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
406     "psrlq           $3, %%mm2  \n\t"\
407 \
408     "movq         %%mm2, %%mm1  \n\t"\
409     "movq         %%mm4, %%mm3  \n\t"\
410 \
411     "punpcklbw    %%mm7, %%mm3  \n\t"\
412     "punpcklbw    %%mm5, %%mm2  \n\t"\
413     "punpckhbw    %%mm7, %%mm4  \n\t"\
414     "punpckhbw    %%mm5, %%mm1  \n\t"\
415 \
416     "psllq           $3, %%mm3  \n\t"\
417     "psllq           $3, %%mm4  \n\t"\
418 \
419     "por          %%mm3, %%mm2  \n\t"\
420     "por          %%mm4, %%mm1  \n\t"\
421 \
422     MOVNTQ(%%mm2,  (dst, index, 2))\
423     MOVNTQ(%%mm1, 8(dst, index, 2))\
424 \
425     "add             $8, "#index"   \n\t"\
426     "cmp         "dstw", "#index"   \n\t"\
427     " jb             1b             \n\t"
428 #define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
429
/*
 * Vertical filter + YUV->RGB conversion writing 16-bit pixels through
 * WRITERGB16, using the accurate-rounding filter macros.
 *
 * c     scaler context; tables are addressed relative to &c->redDither and
 *       c->uv_offx2 is read here.
 * dest  output buffer, 2 bytes per pixel.
 * dstW  output width in pixels; 8 pixels are produced per iteration and the
 *       loop tests the index only after each store.
 *
 * When DITHER1XBPP is defined, the BLUE_DITHER, GREEN_DITHER and RED_DITHER
 * qwords at %0 are added to B, G and R with paddusb before packing.
 *
 * The filter/source pointer parameters, alpSrc and dstY are not referenced
 * in this body. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END.
 */
430 static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
431                                     const int16_t **lumSrc, int lumFilterSize,
432                                     const int16_t *chrFilter, const int16_t **chrUSrc,
433                                     const int16_t **chrVSrc,
434                                     int chrFilterSize, const int16_t **alpSrc,
435                                     uint8_t *dest, int dstW, int dstY)
436 {
437     x86_reg dummy=0;
438     x86_reg dstW_reg = dstW;
439     x86_reg uv_off = c->uv_offx2;
440
441     YSCALEYUV2PACKEDX_ACCURATE
442     YSCALEYUV2RGBX
    /* WRITERGB16 needs a zero register for widening G. */
443     "pxor %%mm7, %%mm7 \n\t"
444     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
445 #ifdef DITHER1XBPP
446     "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
447     "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
448     "paddusb "RED_DITHER"(%0), %%mm5\n\t"
449 #endif
450     WRITERGB16(%4, "%5", %%REGa)
451     YSCALEYUV2PACKEDX_END
452 }
453
/*
 * Vertical filter + YUV->RGB conversion writing 16-bit pixels through
 * WRITERGB16, using the fast (pmulhw-based) filter macros.
 *
 * c     scaler context; tables are addressed relative to &c->redDither and
 *       c->uv_offx2 is read here.
 * dest  output buffer, 2 bytes per pixel.
 * dstW  output width in pixels; 8 pixels are produced per iteration and the
 *       loop tests the index only after each store.
 *
 * When DITHER1XBPP is defined, the BLUE_DITHER, GREEN_DITHER and RED_DITHER
 * qwords at %0 are added to B, G and R with paddusb before packing.
 *
 * The filter/source pointer parameters, alpSrc and dstY are not referenced
 * in this body. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END.
 */
454 static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
455                                  const int16_t **lumSrc, int lumFilterSize,
456                                  const int16_t *chrFilter, const int16_t **chrUSrc,
457                                  const int16_t **chrVSrc,
458                                  int chrFilterSize, const int16_t **alpSrc,
459                                  uint8_t *dest, int dstW, int dstY)
460 {
461     x86_reg dummy=0;
462     x86_reg dstW_reg = dstW;
463     x86_reg uv_off = c->uv_offx2;
464
465     YSCALEYUV2PACKEDX
466     YSCALEYUV2RGBX
    /* WRITERGB16 needs a zero register for widening G. */
467     "pxor %%mm7, %%mm7 \n\t"
468     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
469 #ifdef DITHER1XBPP
470     "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
471     "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
472     "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
473 #endif
474     WRITERGB16(%4, "%5", %%REGa)
475     YSCALEYUV2PACKEDX_END
476 }
477
/*
 * REAL_WRITERGB15 / WRITERGB15 - pack 8 pixels into 16-bit words with five
 * bits per component, store them, and close the per-8-pixel loop.
 *
 *   dst    register holding the output base address
 *   dstw   string naming the width operand (e.g. "%5")
 *   index  pixel index register; stores go to dst + index*2
 *
 * Expects %mm2 = B, %mm4 = G, %mm5 = R (packed bytes) and %mm7 = 0.
 * All three components are masked with bF8; B is shifted right by 3, R is
 * shifted right by 1 and placed in the high byte of each word, and G
 * (widened to words) is shifted left by 2 and OR-ed in.
 * NOTE(review): assuming bF8 is a per-byte 0xF8 mask as its name suggests
 * (it is defined outside this view), each word ends up with R in bits
 * 10-14, G in bits 5-9 and B in bits 0-4.
 *
 * %mm1 and %mm3 are used as scratch. After the two 8-byte stores the index
 * is advanced by 8 and control returns to label "1" while index < dstw
 * (unsigned compare).
 */
478 #define REAL_WRITERGB15(dst, dstw, index) \
479     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
480     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
481     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
482     "psrlq           $3, %%mm2  \n\t"\
483     "psrlq           $1, %%mm5  \n\t"\
484 \
485     "movq         %%mm2, %%mm1  \n\t"\
486     "movq         %%mm4, %%mm3  \n\t"\
487 \
488     "punpcklbw    %%mm7, %%mm3  \n\t"\
489     "punpcklbw    %%mm5, %%mm2  \n\t"\
490     "punpckhbw    %%mm7, %%mm4  \n\t"\
491     "punpckhbw    %%mm5, %%mm1  \n\t"\
492 \
493     "psllq           $2, %%mm3  \n\t"\
494     "psllq           $2, %%mm4  \n\t"\
495 \
496     "por          %%mm3, %%mm2  \n\t"\
497     "por          %%mm4, %%mm1  \n\t"\
498 \
499     MOVNTQ(%%mm2,  (dst, index, 2))\
500     MOVNTQ(%%mm1, 8(dst, index, 2))\
501 \
502     "add             $8, "#index"   \n\t"\
503     "cmp         "dstw", "#index"   \n\t"\
504     " jb             1b             \n\t"
505 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
506
/*
 * Vertical filter + YUV->RGB conversion writing 16-bit pixels through
 * WRITERGB15, using the accurate-rounding filter macros.
 *
 * c     scaler context; tables are addressed relative to &c->redDither and
 *       c->uv_offx2 is read here.
 * dest  output buffer, 2 bytes per pixel.
 * dstW  output width in pixels; 8 pixels are produced per iteration and the
 *       loop tests the index only after each store.
 *
 * When DITHER1XBPP is defined, the BLUE_DITHER, GREEN_DITHER and RED_DITHER
 * qwords at %0 are added to B, G and R with paddusb before packing.
 *
 * The filter/source pointer parameters, alpSrc and dstY are not referenced
 * in this body. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END.
 */
507 static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
508                                     const int16_t **lumSrc, int lumFilterSize,
509                                     const int16_t *chrFilter, const int16_t **chrUSrc,
510                                     const int16_t **chrVSrc,
511                                     int chrFilterSize, const int16_t **alpSrc,
512                                     uint8_t *dest, int dstW, int dstY)
513 {
514     x86_reg dummy=0;
515     x86_reg dstW_reg = dstW;
516     x86_reg uv_off = c->uv_offx2;
517
518     YSCALEYUV2PACKEDX_ACCURATE
519     YSCALEYUV2RGBX
    /* WRITERGB15 needs a zero register for widening G. */
520     "pxor %%mm7, %%mm7 \n\t"
521     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
522 #ifdef DITHER1XBPP
523     "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
524     "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
525     "paddusb "RED_DITHER"(%0), %%mm5\n\t"
526 #endif
527     WRITERGB15(%4, "%5", %%REGa)
528     YSCALEYUV2PACKEDX_END
529 }
530
/*
 * Vertical filter + YUV->RGB conversion writing 16-bit pixels through
 * WRITERGB15, using the fast (pmulhw-based) filter macros.
 *
 * c     scaler context; tables are addressed relative to &c->redDither and
 *       c->uv_offx2 is read here.
 * dest  output buffer, 2 bytes per pixel.
 * dstW  output width in pixels; 8 pixels are produced per iteration and the
 *       loop tests the index only after each store.
 *
 * When DITHER1XBPP is defined, the BLUE_DITHER, GREEN_DITHER and RED_DITHER
 * qwords at %0 are added to B, G and R with paddusb before packing.
 *
 * The filter/source pointer parameters, alpSrc and dstY are not referenced
 * in this body. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END.
 */
531 static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
532                                  const int16_t **lumSrc, int lumFilterSize,
533                                  const int16_t *chrFilter, const int16_t **chrUSrc,
534                                  const int16_t **chrVSrc,
535                                  int chrFilterSize, const int16_t **alpSrc,
536                                  uint8_t *dest, int dstW, int dstY)
537 {
538     x86_reg dummy=0;
539     x86_reg dstW_reg = dstW;
540     x86_reg uv_off = c->uv_offx2;
541
542     YSCALEYUV2PACKEDX
543     YSCALEYUV2RGBX
    /* WRITERGB15 needs a zero register for widening G. */
544     "pxor %%mm7, %%mm7 \n\t"
545     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
546 #ifdef DITHER1XBPP
547     "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
548     "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
549     "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
550 #endif
551     WRITERGB15(%4, "%5", %%REGa)
552     YSCALEYUV2PACKEDX_END
553 }
554
/*
 * WRITEBGR24MMX - pack 8 pixels into 24 bytes using only shifts, unpacks
 * and ORs, store them, and close the per-8-pixel loop.
 *
 *   dst    register holding the current output address; it is used without
 *          an index and advanced by 24 after the three 8-byte stores
 *   dstw   string naming the width operand (e.g. "%5")
 *   index  pixel index register, advanced by 8
 *
 * Expects %mm2 = B, %mm4 = G, %mm5 = R (packed bytes) and %mm7 = 0.
 * The planes are first interleaved into four qwords of two 4-byte pixels
 * each (with a zero fourth byte), then the zero bytes are squeezed out.
 * All eight MMX registers are overwritten. Control returns to label "1"
 * while index < dstw (unsigned compare).
 */
555 #define WRITEBGR24MMX(dst, dstw, index) \
556     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
557     "movq      %%mm2, %%mm1     \n\t" /* B */\
558     "movq      %%mm5, %%mm6     \n\t" /* R */\
559     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
560     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
561     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
562     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
563     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
564     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
565     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
566     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
567     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
568     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
569 \
570     "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
571     "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
572     "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
573     "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
574 \
575     "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
576     "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
577     "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
578     "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
579 \
580     "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
581     "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
582     "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
583     "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
584 \
585     "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
586     "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
587     "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
588     "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
589     MOVNTQ(%%mm0, (dst))\
590 \
591     "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
592     "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
593     "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
594     "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
595     MOVNTQ(%%mm6, 8(dst))\
596 \
597     "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
598     "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
599     "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
600     MOVNTQ(%%mm5, 16(dst))\
601 \
602     "add         $24, "#dst"    \n\t"\
603 \
604     "add          $8, "#index"  \n\t"\
605     "cmp      "dstw", "#index"  \n\t"\
606     " jb          1b            \n\t"
607
/*
 * WRITEBGR24MMXEXT - pack 8 pixels into 24 bytes using pshufw and the
 * ff_M24A / ff_M24B / ff_M24C byte masks, store them, and close the
 * per-8-pixel loop.
 *
 *   dst    register holding the current output address; it is used without
 *          an index and advanced by 24 after the three 8-byte stores
 *   dstw   string naming the width operand (e.g. "%5")
 *   index  pixel index register, advanced by 8
 *
 * Expects %mm2 = B, %mm4 = G, %mm5 = R (packed bytes). Unlike the plain MMX
 * variant, the incoming %mm7 is not used: it is overwritten with ff_M24C
 * and %mm0 with ff_M24A up front. %mm4 is shifted right by 8 after the
 * first store; %mm1, %mm3 and %mm6 are scratch. %mm2 and %mm5 are only
 * read. Control returns to label "1" while index < dstw (unsigned compare).
 */
608 #define WRITEBGR24MMXEXT(dst, dstw, index) \
609     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
610     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
611     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
612     "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
613     "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
614     "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
615 \
616     "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
617     "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
618     "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
619 \
620     "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
621     "por    %%mm1, %%mm6        \n\t"\
622     "por    %%mm3, %%mm6        \n\t"\
623     MOVNTQ(%%mm6, (dst))\
624 \
625     "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
626     "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
627     "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
628     "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
629 \
630     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
631     "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
632     "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
633 \
634     "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
635     "por    %%mm3, %%mm6        \n\t"\
636     MOVNTQ(%%mm6, 8(dst))\
637 \
638     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
639     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
640     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
641 \
642     "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
643     "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
644     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
645 \
646     "por    %%mm1, %%mm3        \n\t"\
647     "por    %%mm3, %%mm6        \n\t"\
648     MOVNTQ(%%mm6, 16(dst))\
649 \
650     "add      $24, "#dst"       \n\t"\
651 \
652     "add       $8, "#index"     \n\t"\
653     "cmp   "dstw", "#index"     \n\t"\
654     " jb       1b               \n\t"
655
/*
 * WRITEBGR24 - the 24-bit writer used by this instantiation of the
 * template: the pshufw-based variant when COMPILE_TEMPLATE_MMXEXT is
 * non-zero, the plain MMX variant otherwise. Any earlier definition is
 * dropped first.
 */
656 #if COMPILE_TEMPLATE_MMXEXT
657 #undef WRITEBGR24
658 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMXEXT(dst, dstw, index)
659 #else
660 #undef WRITEBGR24
661 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
662 #endif
663
664 #if HAVE_6REGS
/**
 * Packed 24-bit BGR output built from YSCALEYUV2PACKEDX_ACCURATE followed by
 * YSCALEYUV2RGBX and WRITEBGR24.
 *
 * No asm opener is visible in this body, so the asm statement is apparently
 * opened inside YSCALEYUV2PACKEDX_ACCURATE (defined outside this excerpt).
 *
 * Asm operands: %0 = &c->redDither, %1-%3 = dummy, %4 = dest,
 * %5 = dstW_reg, %6 = uv_off. REG_a is the index handed to WRITEBGR24 and
 * REG_c (= dest + 3 * REG_a) is the store pointer. REG_a, REG_c, REG_d and
 * REG_S are declared clobbered.
 *
 * The filter/source parameters, alpSrc and dstY are not referenced directly
 * in this body; presumably the macros reach the filter data through the
 * context pointer - TODO confirm against the macro definitions.
 */
665 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
666                                    const int16_t **lumSrc, int lumFilterSize,
667                                    const int16_t *chrFilter, const int16_t **chrUSrc,
668                                    const int16_t **chrVSrc,
669                                    int chrFilterSize, const int16_t **alpSrc,
670                                    uint8_t *dest, int dstW, int dstY)
671 {
672     x86_reg dummy=0;
673     x86_reg dstW_reg = dstW;
674     x86_reg uv_off = c->uv_offx2;
675
676     YSCALEYUV2PACKEDX_ACCURATE
677     YSCALEYUV2RGBX
678     "pxor %%mm7, %%mm7 \n\t"
    /* REG_c = 3 * REG_a (three output bytes per index step), then + dest (%4) */
679     "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
680     "add %4, %%"REG_c"                        \n\t"
681     WRITEBGR24(%%REGc, "%5", %%REGa)
682     :: "r" (&c->redDither),
683        "m" (dummy), "m" (dummy), "m" (dummy),
684        "r" (dest), "m" (dstW_reg), "m"(uv_off)
685        NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
686     : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
687     );
688 }
689
/**
 * Packed 24-bit BGR output built from YSCALEYUV2PACKEDX followed by
 * YSCALEYUV2RGBX and WRITEBGR24. The body matches yuv2bgr24_X_ar except for
 * the first macro (YSCALEYUV2PACKEDX instead of YSCALEYUV2PACKEDX_ACCURATE).
 *
 * No asm opener is visible in this body, so the asm statement is apparently
 * opened inside YSCALEYUV2PACKEDX (defined outside this excerpt).
 *
 * Asm operands: %0 = &c->redDither, %1-%3 = dummy, %4 = dest,
 * %5 = dstW_reg, %6 = uv_off. REG_a is the index handed to WRITEBGR24 and
 * REG_c (= dest + 3 * REG_a) is the store pointer. REG_a, REG_c, REG_d and
 * REG_S are declared clobbered.
 *
 * The filter/source parameters, alpSrc and dstY are not referenced directly
 * in this body; presumably the macros reach the filter data through the
 * context pointer - TODO confirm against the macro definitions.
 */
690 static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
691                                 const int16_t **lumSrc, int lumFilterSize,
692                                 const int16_t *chrFilter, const int16_t **chrUSrc,
693                                 const int16_t **chrVSrc,
694                                 int chrFilterSize, const int16_t **alpSrc,
695                                 uint8_t *dest, int dstW, int dstY)
696 {
697     x86_reg dummy=0;
698     x86_reg dstW_reg = dstW;
699     x86_reg uv_off = c->uv_offx2;
700
701     YSCALEYUV2PACKEDX
702     YSCALEYUV2RGBX
703     "pxor                    %%mm7, %%mm7       \n\t"
    /* REG_c = 3 * REG_a (three output bytes per index step), then + dest (%4) */
704     "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
705     "add                        %4, %%"REG_c"   \n\t"
706     WRITEBGR24(%%REGc, "%5", %%REGa)
707     :: "r" (&c->redDither),
708        "m" (dummy), "m" (dummy), "m" (dummy),
709        "r" (dest),  "m" (dstW_reg), "m"(uv_off)
710        NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
711     : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
712     );
713 }
714 #endif /* HAVE_6REGS */
715
/*
 * Pack and store 8 pixels of interleaved 4:2:2 output, then close the loop.
 *
 * Inputs (16-bit words): mm1 and mm7 hold the first and second group of four
 * luma values, mm3 and mm4 hold four values of each chroma component (mm3
 * lands in the byte right after the first luma byte, mm4 after the second).
 * Everything is packed to bytes with unsigned saturation and interleaved as
 * L C3 L C4 ..., and 16 bytes are written at dst + index * 2 using MOVNTQ.
 * index is then advanced by 8 and the macro jumps back to local label 1
 * (which must be defined by the preceding code) while index is below dstw
 * (unsigned compare). mm1, mm3, mm4 and mm7 are overwritten.
 *
 * WRITEYUY2 is the extra expansion level so that macro arguments are expanded
 * before being stringified in REAL_WRITEYUY2.
 */
716 #define REAL_WRITEYUY2(dst, dstw, index) \
717     "packuswb  %%mm3, %%mm3     \n\t"\
718     "packuswb  %%mm4, %%mm4     \n\t"\
719     "packuswb  %%mm7, %%mm1     \n\t"\
720     "punpcklbw %%mm4, %%mm3     \n\t" /* mm3 = chroma bytes interleaved (mm3, mm4, mm3, mm4, ...) */\
721     "movq      %%mm1, %%mm7     \n\t"\
722     "punpcklbw %%mm3, %%mm1     \n\t" /* first 4 pixels: luma/chroma interleaved */\
723     "punpckhbw %%mm3, %%mm7     \n\t" /* last 4 pixels: luma/chroma interleaved */\
724 \
725     MOVNTQ(%%mm1, (dst, index, 2))\
726     MOVNTQ(%%mm7, 8(dst, index, 2))\
727 \
728     "add          $8, "#index"  \n\t"\
729     "cmp      "dstw", "#index"  \n\t"\
730     " jb          1b            \n\t"
731 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
732
/**
 * Interleaved 4:2:2 output built from YSCALEYUV2PACKEDX_ACCURATE and
 * WRITEYUY2.
 *
 * The asm statement is opened by YSCALEYUV2PACKEDX_ACCURATE and closed
 * (operands and clobbers included) by YSCALEYUV2PACKEDX_END; both are defined
 * outside this excerpt. dummy, dstW_reg and uv_off are presumably the
 * operands consumed there - TODO confirm. %4 and %5 are passed to WRITEYUY2
 * as destination and width, REG_a as the index.
 *
 * The filter/source parameters, alpSrc and dstY are not referenced directly
 * in this body.
 */
733 static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
734                                      const int16_t **lumSrc, int lumFilterSize,
735                                      const int16_t *chrFilter, const int16_t **chrUSrc,
736                                      const int16_t **chrVSrc,
737                                      int chrFilterSize, const int16_t **alpSrc,
738                                      uint8_t *dest, int dstW, int dstY)
739 {
740     x86_reg dummy=0;
741     x86_reg dstW_reg = dstW;
742     x86_reg uv_off = c->uv_offx2;
743
744     YSCALEYUV2PACKEDX_ACCURATE
745     /* mm1/mm7 = luma words, mm3/mm4 = chroma words (as consumed by WRITEYUY2); shift each right by 3 before packing */
746     "psraw $3, %%mm3    \n\t"
747     "psraw $3, %%mm4    \n\t"
748     "psraw $3, %%mm1    \n\t"
749     "psraw $3, %%mm7    \n\t"
750     WRITEYUY2(%4, "%5", %%REGa)
751     YSCALEYUV2PACKEDX_END
752 }
753
/**
 * Interleaved 4:2:2 output built from YSCALEYUV2PACKEDX and WRITEYUY2. The
 * body matches yuv2yuyv422_X_ar except for the first macro.
 *
 * The asm statement is opened by YSCALEYUV2PACKEDX and closed (operands and
 * clobbers included) by YSCALEYUV2PACKEDX_END; both are defined outside this
 * excerpt. dummy, dstW_reg and uv_off are presumably the operands consumed
 * there - TODO confirm. %4 and %5 are passed to WRITEYUY2 as destination and
 * width, REG_a as the index.
 *
 * The filter/source parameters, alpSrc and dstY are not referenced directly
 * in this body.
 */
754 static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
755                                   const int16_t **lumSrc, int lumFilterSize,
756                                   const int16_t *chrFilter, const int16_t **chrUSrc,
757                                   const int16_t **chrVSrc,
758                                   int chrFilterSize, const int16_t **alpSrc,
759                                   uint8_t *dest, int dstW, int dstY)
760 {
761     x86_reg dummy=0;
762     x86_reg dstW_reg = dstW;
763     x86_reg uv_off = c->uv_offx2;
764
765     YSCALEYUV2PACKEDX
766     /* mm1/mm7 = luma words, mm3/mm4 = chroma words (as consumed by WRITEYUY2); shift each right by 3 before packing */
767     "psraw $3, %%mm3    \n\t"
768     "psraw $3, %%mm4    \n\t"
769     "psraw $3, %%mm1    \n\t"
770     "psraw $3, %%mm7    \n\t"
771     WRITEYUY2(%4, "%5", %%REGa)
772     YSCALEYUV2PACKEDX_END
773 }
774
/*
 * Chroma half of the two-line blend; also opens the pixel loop.
 *
 * Zeroes `index`, defines local label 1, then for four chroma samples:
 *  - loads the first chroma plane from (%2) and (%3) at `index`, and the
 *    second one from the same pointers at index + UV_OFF_BYTE(c);
 *  - blends the two lines as ((line0 - line1) * coeff >> 16) + (line1 >> 4),
 *    coeff being the quadword at CHR_MMX_FILTER_OFFSET+8(c);
 *  - subtracts U_OFFSET(c) / V_OFFSET(c);
 *  - leaves copies in mm2 / mm5 and multiplies mm3 / mm4 by UG_COEFF(c) /
 *    VG_COEFF(c).
 * `c` is a register holding the base address for the *_OFFSET accesses.
 * Uses asm operands %2 and %3 of the enclosing statement; overwrites mm0 and
 * mm2-mm5.
 */
775 #define REAL_YSCALEYUV2RGB_UV(index, c) \
776     "xor            "#index", "#index"  \n\t"\
777     ".p2align              4            \n\t"\
778     "1:                                 \n\t"\
779     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
780     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
781     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
782     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
783     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
784     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
785     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
786     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
787     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
788     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
789     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
790     "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
791     "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+2048] >>4*/\
792     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
793     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
794     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
795     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
796     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
797     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
798     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
799     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
800     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
801
/*
 * Two-line blend of eight 16-bit samples (used for luma and, with other
 * buffer operands, for alpha).
 *
 * Loads two quadwords from b1 + index * 2 and two from b2 + index * 2, and
 * computes ((b1 - b2) * coeff >> 16) + (b2 >> 4) per word, coeff being the
 * quadword at LUM_MMX_FILTER_OFFSET+8(c). Results: mm1 = samples 0-3,
 * mm7 = samples 4-7. Overwrites mm0, mm1, mm6 and mm7.
 */
802 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
803     "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
804     "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
805     "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax+4]*/\
806     "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax+4]*/\
807     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
808     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
809     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
810     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
811     "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
812     "psraw                $4, %%mm7     \n\t" /* buf1[eax+4] >>4*/\
813     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
814     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
815
/*
 * Final colour-matrix step: combine the luma and chroma terms into packed
 * 8-bit B, G and R.
 *
 * Expects mm1 / mm7 = blended luma (samples 0-3 / 4-7), mm2 / mm5 = offset
 * chroma, mm3 / mm4 = chroma already multiplied by UG_COEFF / VG_COEFF.
 * Multiplies mm2 by UB_COEFF(c) and mm5 by VR_COEFF(c), subtracts Y_OFFSET(c)
 * from the luma and multiplies it by Y_COEFF(c), duplicates each chroma term
 * for two adjacent pixels (punpcklwd / punpckhwd with itself), adds the luma
 * and packs with unsigned saturation.
 * Result: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes.
 * mm0, mm1, mm3, mm6 and mm7 are used as scratch.
 */
816 #define REAL_YSCALEYUV2RGB_COEFF(c) \
817     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
818     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
819     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
820     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
821     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
822     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
823     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
824     "paddw             %%mm3, %%mm4     \n\t"\
825     "movq              %%mm2, %%mm0     \n\t"\
826     "movq              %%mm5, %%mm6     \n\t"\
827     "movq              %%mm4, %%mm3     \n\t"\
828     "punpcklwd         %%mm2, %%mm2     \n\t"\
829     "punpcklwd         %%mm5, %%mm5     \n\t"\
830     "punpcklwd         %%mm4, %%mm4     \n\t"\
831     "paddw             %%mm1, %%mm2     \n\t"\
832     "paddw             %%mm1, %%mm5     \n\t"\
833     "paddw             %%mm1, %%mm4     \n\t"\
834     "punpckhwd         %%mm0, %%mm0     \n\t"\
835     "punpckhwd         %%mm6, %%mm6     \n\t"\
836     "punpckhwd         %%mm3, %%mm3     \n\t"\
837     "paddw             %%mm7, %%mm0     \n\t"\
838     "paddw             %%mm7, %%mm6     \n\t"\
839     "paddw             %%mm7, %%mm3     \n\t"\
840     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
841     "packuswb          %%mm0, %%mm2     \n\t"\
842     "packuswb          %%mm6, %%mm5     \n\t"\
843     "packuswb          %%mm3, %%mm4     \n\t"\
844
/* Expanding wrapper: lets macro arguments (e.g. REGBP) be expanded before
 * REAL_YSCALEYUV2RGB_YA stringifies them. */
845 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
846
/* Full two-line blend to packed B/G/R: chroma blend (opens the loop at label
 * 1), luma blend of asm operands %0 and %1, then the colour-matrix step.
 * Leaves mm2 = B, mm4 = G, mm5 = R. */
847 #define YSCALEYUV2RGB(index, c) \
848     REAL_YSCALEYUV2RGB_UV(index, c) \
849     REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
850     REAL_YSCALEYUV2RGB_COEFF(c)
851
852 /**
853  * vertical bilinear scale YV12 to RGB
854  */
/*
 * Blends buf[0]/buf[1] and ubuf[0]/ubuf[1] with YSCALEYUV2RGB and stores
 * 32-bit pixels with WRITEBGR32. When CONFIG_SWSCALE_ALPHA is enabled and
 * c->alpPixBuf is set, abuf[0]/abuf[1] are blended with YSCALEYUV2RGB_YA,
 * shifted right by a further 3 bits, packed to bytes and passed to WRITEBGR32
 * as the fourth channel; otherwise mm7 is set to all ones (pcmpeqd) and
 * passed instead.
 *
 * Asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither, the base register for the *_OFFSET context accesses
 * (the loop bound is read from DSTW_OFFSET(%5)).
 * vbuf, dstW, yalpha, uvalpha and y are not referenced directly in this body.
 */
855 static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
856                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
857                                 const int16_t *abuf[2], uint8_t *dest,
858                                 int dstW, int yalpha, int uvalpha, int y)
859 {
860     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
861                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
862
863     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
864         const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
865 #if ARCH_X86_64
        /* x86-64: r8 is the loop index; the alpha lines are extra operands %6 / %7 */
866         __asm__ volatile(
867             YSCALEYUV2RGB(%%r8, %5)
868             YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
869             "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
870             "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
871             "packuswb            %%mm7, %%mm1       \n\t"
872             WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
873             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
874                "a" (&c->redDither),
875                "r" (abuf0), "r" (abuf1)
876             : "%r8"
877         );
878 #else
        /* Otherwise the alpha line pointers are stashed in the context and
         * temporarily swapped into %0 / %1 (saved on the stack meanwhile). */
879         c->u_temp=(intptr_t)abuf0;
880         c->v_temp=(intptr_t)abuf1;
881         __asm__ volatile(
            /* save REG_b in the context and reuse it as the destination pointer;
             * REG_BP is the loop index until the matching pop */
882             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
883             "mov        %4, %%"REG_b"               \n\t"
884             "push %%"REG_BP"                        \n\t"
885             YSCALEYUV2RGB(%%REGBP, %5)
886             "push                   %0              \n\t"
887             "push                   %1              \n\t"
888             "mov          "U_TEMP"(%5), %0          \n\t"
889             "mov          "V_TEMP"(%5), %1          \n\t"
890             YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
891             "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
892             "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
893             "packuswb            %%mm7, %%mm1       \n\t"
894             "pop                    %1              \n\t"
895             "pop                    %0              \n\t"
896             WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
897             "pop %%"REG_BP"                         \n\t"
898             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
899             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
900                "a" (&c->redDither)
901         );
902 #endif
903     } else {
904         __asm__ volatile(
            /* save REG_b in the context and reuse it as the destination pointer;
             * REG_BP is the loop index until the matching pop */
905             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
906             "mov        %4, %%"REG_b"               \n\t"
907             "push %%"REG_BP"                        \n\t"
908             YSCALEYUV2RGB(%%REGBP, %5)
909             "pcmpeqd %%mm7, %%mm7                   \n\t"
910             WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
911             "pop %%"REG_BP"                         \n\t"
912             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
913             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
914                "a" (&c->redDither)
915         );
916     }
917 }
918
/**
 * Two-line blend (YSCALEYUV2RGB) written out as packed 24-bit BGR
 * (WRITEBGR24).
 *
 * Asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither, the base register for the *_OFFSET context accesses.
 * REG_b is saved at ESP_OFFSET(%5), reused as the destination pointer and
 * restored at the end; REG_BP is pushed, used as the loop index and popped.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced directly in this
 * body; the loop bound is read from DSTW_OFFSET(%5).
 */
919 static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
920                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
921                                 const int16_t *abuf[2], uint8_t *dest,
922                                 int dstW, int yalpha, int uvalpha, int y)
923 {
924     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
925                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
926
927     __asm__ volatile(
928         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
929         "mov        %4, %%"REG_b"               \n\t"
930         "push %%"REG_BP"                        \n\t"
931         YSCALEYUV2RGB(%%REGBP, %5)
932         "pxor    %%mm7, %%mm7                   \n\t"
933         WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
934         "pop %%"REG_BP"                         \n\t"
935         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
936         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
937            "a" (&c->redDither)
938            NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
939     );
940 }
941
/**
 * Two-line blend (YSCALEYUV2RGB) written out with WRITERGB15.
 *
 * When DITHER1XBPP is defined, the BLUE_DITHER / GREEN_DITHER / RED_DITHER
 * quadwords from the context are added (unsigned saturating bytes) to the
 * B / G / R registers before the store.
 *
 * Asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither, the base register for the *_OFFSET context accesses.
 * REG_b is saved at ESP_OFFSET(%5), reused as the destination pointer and
 * restored at the end; REG_BP is pushed, used as the loop index and popped.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced directly in this
 * body; the loop bound is read from DSTW_OFFSET(%5).
 */
942 static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
943                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
944                                  const int16_t *abuf[2], uint8_t *dest,
945                                  int dstW, int yalpha, int uvalpha, int y)
946 {
947     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
948                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
949
950     __asm__ volatile(
951         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
952         "mov        %4, %%"REG_b"               \n\t"
953         "push %%"REG_BP"                        \n\t"
954         YSCALEYUV2RGB(%%REGBP, %5)
955         "pxor    %%mm7, %%mm7                   \n\t"
956         /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
957 #ifdef DITHER1XBPP
958         "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
959         "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
960         "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
961 #endif
962         WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
963         "pop %%"REG_BP"                         \n\t"
964         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
965         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
966            "a" (&c->redDither)
967            NAMED_CONSTRAINTS_ADD(bF8)
968     );
969 }
970
/**
 * Two-line blend (YSCALEYUV2RGB) written out with WRITERGB16.
 *
 * When DITHER1XBPP is defined, the BLUE_DITHER / GREEN_DITHER / RED_DITHER
 * quadwords from the context are added (unsigned saturating bytes) to the
 * B / G / R registers before the store.
 *
 * Asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither, the base register for the *_OFFSET context accesses.
 * REG_b is saved at ESP_OFFSET(%5), reused as the destination pointer and
 * restored at the end; REG_BP is pushed, used as the loop index and popped.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced directly in this
 * body; the loop bound is read from DSTW_OFFSET(%5).
 */
971 static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
972                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
973                                  const int16_t *abuf[2], uint8_t *dest,
974                                  int dstW, int yalpha, int uvalpha, int y)
975 {
976     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
977                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
978
979     __asm__ volatile(
980         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
981         "mov        %4, %%"REG_b"               \n\t"
982         "push %%"REG_BP"                        \n\t"
983         YSCALEYUV2RGB(%%REGBP, %5)
984         "pxor    %%mm7, %%mm7                   \n\t"
985         /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
986 #ifdef DITHER1XBPP
987         "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
988         "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
989         "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
990 #endif
991         WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
992         "pop %%"REG_BP"                         \n\t"
993         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
994         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
995            "a" (&c->redDither)
996            NAMED_CONSTRAINTS_ADD(bF8,bFC)
997     );
998 }
999
/*
 * Two-line blend for packed YUV output (no colour conversion); also opens
 * the pixel loop.
 *
 * Before the loop, the quadwords at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back to the
 * same locations, i.e. the stored coefficients are modified in place.
 * NOTE(review): presumably the caller refreshes these coefficients before
 * every call, otherwise they would shrink on each invocation - confirm.
 *
 * Inside the loop (local label 1) each component is computed as
 * ((line0 - line1) * coeff >> 16) + (line1 >> 7), reading chroma from asm
 * operands %2 / %3 (second plane at index + UV_OFF_BYTE(c)) and luma from
 * %0 / %1. Results: mm3 / mm4 = the two chroma components (4 words each),
 * mm1 / mm7 = luma samples 0-3 / 4-7. mm0, mm2, mm5 and mm6 are scratch.
 */
1000 #define REAL_YSCALEYUV2PACKED(index, c) \
1001     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
1002     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
1003     "psraw                $3, %%mm0                           \n\t"\
1004     "psraw                $3, %%mm1                           \n\t"\
1005     "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1006     "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1007     "xor            "#index", "#index"                        \n\t"\
1008     ".p2align              4            \n\t"\
1009     "1:                                 \n\t"\
1010     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
1011     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
1012     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1013     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
1014     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
1015     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1016     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
1017     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
1018     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
1019     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
1020     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
1021     "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
1022     "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
1023     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
1024     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
1025     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
1026     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
1027     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
1028     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
1029     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
1030     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
1031     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1032     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1033     "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
1034     "psraw                $7, %%mm7     \n\t" /* buf1[eax+4] >>7*/\
1035     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
1036     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
1037
/* Expanding wrapper so that macro arguments are expanded before being stringified. */
1038 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
1039
/**
 * Two-line blend (YSCALEYUV2PACKED) written out as interleaved 4:2:2 with
 * WRITEYUY2.
 *
 * Asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither, the base register for the *_OFFSET context accesses.
 * REG_b is saved at ESP_OFFSET(%5), reused as the destination pointer and
 * restored at the end; REG_BP is pushed, used as the loop index and popped.
 * Note that YSCALEYUV2PACKED rewrites the blend coefficients stored in the
 * context. vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced
 * directly in this body; the loop bound is read from DSTW_OFFSET(%5).
 */
1040 static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
1041                                   const int16_t *ubuf[2], const int16_t *vbuf[2],
1042                                   const int16_t *abuf[2], uint8_t *dest,
1043                                   int dstW, int yalpha, int uvalpha, int y)
1044 {
1045     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1046                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1047
1048     __asm__ volatile(
1049         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1050         "mov %4, %%"REG_b"                        \n\t"
1051         "push %%"REG_BP"                        \n\t"
1052         YSCALEYUV2PACKED(%%REGBP, %5)
1053         WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
1054         "pop %%"REG_BP"                         \n\t"
1055         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1056         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1057            "a" (&c->redDither)
1058     );
1059 }
1060
/*
 * Single-line YUV to packed B/G/R conversion; also opens the pixel loop.
 *
 * Reads chroma only from asm operand %2 (second plane at
 * index + UV_OFF_BYTE(c)) and luma only from operand %0; operands %1 and %3
 * are not touched. Each sample is shifted right by 4, then the offsets and
 * coefficients from the context (`c` is the base register) are applied
 * exactly as in the two-line variant.
 * Result: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes;
 * mm0, mm1, mm3, mm6 and mm7 are scratch.
 */
1061 #define REAL_YSCALEYUV2RGB1(index, c) \
1062     "xor            "#index", "#index"  \n\t"\
1063     ".p2align              4            \n\t"\
1064     "1:                                 \n\t"\
1065     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
1066     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1067     "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
1068     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1069     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
1070     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] >>4*/\
1071     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
1072     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
1073     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
1074     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
1075     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
1076     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
1077     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1078     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
1079     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
1080     "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
1081     "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
1082     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
1083     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
1084     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
1085     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
1086     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
1087     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
1088     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1089     "paddw             %%mm3, %%mm4     \n\t"\
1090     "movq              %%mm2, %%mm0     \n\t"\
1091     "movq              %%mm5, %%mm6     \n\t"\
1092     "movq              %%mm4, %%mm3     \n\t"\
1093     "punpcklwd         %%mm2, %%mm2     \n\t"\
1094     "punpcklwd         %%mm5, %%mm5     \n\t"\
1095     "punpcklwd         %%mm4, %%mm4     \n\t"\
1096     "paddw             %%mm1, %%mm2     \n\t"\
1097     "paddw             %%mm1, %%mm5     \n\t"\
1098     "paddw             %%mm1, %%mm4     \n\t"\
1099     "punpckhwd         %%mm0, %%mm0     \n\t"\
1100     "punpckhwd         %%mm6, %%mm6     \n\t"\
1101     "punpckhwd         %%mm3, %%mm3     \n\t"\
1102     "paddw             %%mm7, %%mm0     \n\t"\
1103     "paddw             %%mm7, %%mm6     \n\t"\
1104     "paddw             %%mm7, %%mm3     \n\t"\
1105     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1106     "packuswb          %%mm0, %%mm2     \n\t"\
1107     "packuswb          %%mm6, %%mm5     \n\t"\
1108     "packuswb          %%mm3, %%mm4     \n\t"\
1109
/* Expanding wrapper so that macro arguments are expanded before being stringified. */
1110 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
1111
1112 // do vertical chrominance interpolation
/*
 * Same as the single-line conversion, except that chroma is taken from two
 * lines: the samples at asm operands %2 and %3 are added and the sum is
 * shifted right (logically) by 5, i.e. averaged and then scaled like the
 * single-line >>4 path. Luma still comes from operand %0 only. Opens the
 * pixel loop (local label 1).
 * Result: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes;
 * mm0, mm1, mm3, mm6 and mm7 are scratch.
 */
1113 #define REAL_YSCALEYUV2RGB1b(index, c) \
1114     "xor            "#index", "#index"  \n\t"\
1115     ".p2align              4            \n\t"\
1116     "1:                                 \n\t"\
1117     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
1118     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
1119     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1120     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
1121     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
1122     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1123     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1124     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1125     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
1126     "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
1127     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
1128     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
1129     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
1130     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
1131     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
1132     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
1133     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1134     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
1135     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
1136     "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
1137     "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
1138     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
1139     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
1140     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
1141     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
1142     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
1143     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
1144     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1145     "paddw             %%mm3, %%mm4     \n\t"\
1146     "movq              %%mm2, %%mm0     \n\t"\
1147     "movq              %%mm5, %%mm6     \n\t"\
1148     "movq              %%mm4, %%mm3     \n\t"\
1149     "punpcklwd         %%mm2, %%mm2     \n\t"\
1150     "punpcklwd         %%mm5, %%mm5     \n\t"\
1151     "punpcklwd         %%mm4, %%mm4     \n\t"\
1152     "paddw             %%mm1, %%mm2     \n\t"\
1153     "paddw             %%mm1, %%mm5     \n\t"\
1154     "paddw             %%mm1, %%mm4     \n\t"\
1155     "punpckhwd         %%mm0, %%mm0     \n\t"\
1156     "punpckhwd         %%mm6, %%mm6     \n\t"\
1157     "punpckhwd         %%mm3, %%mm3     \n\t"\
1158     "paddw             %%mm7, %%mm0     \n\t"\
1159     "paddw             %%mm7, %%mm6     \n\t"\
1160     "paddw             %%mm7, %%mm3     \n\t"\
1161     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1162     "packuswb          %%mm0, %%mm2     \n\t"\
1163     "packuswb          %%mm6, %%mm5     \n\t"\
1164     "packuswb          %%mm3, %%mm4     \n\t"\
1165
/* Expanding wrapper so that macro arguments are expanded before being stringified. */
1166 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
1167
/*
 * Single-line alpha fetch: loads eight 16-bit samples from asm operand %1 at
 * index * 2, shifts them right by 7 and packs them (unsigned saturation) into
 * the eight bytes of mm7. mm1 is overwritten as scratch. The enclosing asm
 * statement must therefore bind the alpha line to operand %1.
 */
1168 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1169     "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
1170     "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
1171     "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
1172     "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
1173     "packuswb          %%mm1, %%mm7     \n\t"
/* Expanding wrapper so that the index argument is expanded before being stringified. */
1174 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1175
1176 /**
1177  * YV12 to RGB without scaling or interpolating
1178  */
/*
 * Converts one luma line (buf0) to 32-bit pixels with WRITEBGR32.
 *
 * uvalpha < 2048: only ubuf[0] is used for chroma (YSCALEYUV2RGB1 reads
 * operand %2 only; ubuf1 is bound to ubuf[0] as well).
 * otherwise:      ubuf[0] and ubuf[1] are averaged (YSCALEYUV2RGB1b).
 *
 * When CONFIG_SWSCALE_ALPHA is enabled and c->alpPixBuf is set, operand %1 is
 * bound to abuf0 and YSCALEYUV2RGB1_ALPHA loads the fourth channel into mm7;
 * otherwise operand %1 is buf1 (an alias of buf0) and mm7 is set to all ones.
 *
 * Asm operands: %0 = buf0, %1 = abuf0 or buf1, %2 = ubuf0, %3 = ubuf1,
 * %4 = dest, %5 = &c->redDither (base register for the *_OFFSET accesses).
 * REG_b is saved at ESP_OFFSET(%5), reused as the destination pointer and
 * restored at the end; REG_BP is pushed, used as the loop index and popped.
 * vbuf, dstW and y are not referenced directly in this body; the loop bound
 * is read from DSTW_OFFSET(%5).
 */
1179 static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
1180                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
1181                                 const int16_t *abuf0, uint8_t *dest,
1182                                 int dstW, int uvalpha, int y)
1183 {
1184     const int16_t *ubuf0 = ubuf[0];
1185     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1186
1187     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1188         const int16_t *ubuf1 = ubuf[0];
1189         if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1190             __asm__ volatile(
1191                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1192                 "mov        %4, %%"REG_b"               \n\t"
1193                 "push %%"REG_BP"                        \n\t"
1194                 YSCALEYUV2RGB1(%%REGBP, %5)
1195                 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1196                 WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1197                 "pop %%"REG_BP"                         \n\t"
1198                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1199                 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1200                    "a" (&c->redDither)
1201             );
1202         } else {
1203             __asm__ volatile(
1204                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1205                 "mov        %4, %%"REG_b"               \n\t"
1206                 "push %%"REG_BP"                        \n\t"
1207                 YSCALEYUV2RGB1(%%REGBP, %5)
1208                 "pcmpeqd %%mm7, %%mm7                   \n\t"
1209                 WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1210                 "pop %%"REG_BP"                         \n\t"
1211                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1212                 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1213                    "a" (&c->redDither)
1214             );
1215         }
1216     } else {
1217         const int16_t *ubuf1 = ubuf[1];
1218         if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1219             __asm__ volatile(
1220                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1221                 "mov        %4, %%"REG_b"               \n\t"
1222                 "push %%"REG_BP"                        \n\t"
1223                 YSCALEYUV2RGB1b(%%REGBP, %5)
1224                 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1225                 WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1226                 "pop %%"REG_BP"                         \n\t"
1227                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1228                 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1229                    "a" (&c->redDither)
1230             );
1231         } else {
1232             __asm__ volatile(
1233                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1234                 "mov        %4, %%"REG_b"               \n\t"
1235                 "push %%"REG_BP"                        \n\t"
1236                 YSCALEYUV2RGB1b(%%REGBP, %5)
1237                 "pcmpeqd %%mm7, %%mm7                   \n\t"
1238                 WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1239                 "pop %%"REG_BP"                         \n\t"
1240                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1241                 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1242                    "a" (&c->redDither)
1243             );
1244         }
1245     }
1246 }
1247
/**
 * Converts one luma line (buf0) to packed 24-bit BGR with WRITEBGR24.
 *
 * uvalpha < 2048: only ubuf[0] is used for chroma (YSCALEYUV2RGB1 reads
 * operand %2 only; ubuf1 is bound to ubuf[0] as well).
 * otherwise:      ubuf[0] and ubuf[1] are averaged (YSCALEYUV2RGB1b).
 *
 * Asm operands: %0 = buf0, %1 = buf1 (an alias of buf0), %2 = ubuf0,
 * %3 = ubuf1, %4 = dest, %5 = &c->redDither (base register for the *_OFFSET
 * accesses). REG_b is saved at ESP_OFFSET(%5), reused as the destination
 * pointer and restored at the end; REG_BP is pushed, used as the loop index
 * and popped. vbuf, abuf0, dstW and y are not referenced directly in this
 * body; the loop bound is read from DSTW_OFFSET(%5).
 */
1248 static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
1249                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
1250                                 const int16_t *abuf0, uint8_t *dest,
1251                                 int dstW, int uvalpha, int y)
1252 {
1253     const int16_t *ubuf0 = ubuf[0];
1254     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1255
1256     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1257         const int16_t *ubuf1 = ubuf[0];
1258         __asm__ volatile(
1259             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1260             "mov        %4, %%"REG_b"               \n\t"
1261             "push %%"REG_BP"                        \n\t"
1262             YSCALEYUV2RGB1(%%REGBP, %5)
1263             "pxor    %%mm7, %%mm7                   \n\t"
1264             WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
1265             "pop %%"REG_BP"                         \n\t"
1266             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1267             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1268                "a" (&c->redDither)
1269                NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
1270         );
1271     } else {
1272         const int16_t *ubuf1 = ubuf[1];
1273         __asm__ volatile(
1274             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1275             "mov        %4, %%"REG_b"               \n\t"
1276             "push %%"REG_BP"                        \n\t"
1277             YSCALEYUV2RGB1b(%%REGBP, %5)
1278             "pxor    %%mm7, %%mm7                   \n\t"
1279             WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
1280             "pop %%"REG_BP"                         \n\t"
1281             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1282             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1283                "a" (&c->redDither)
1284                NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
1285         );
1286     }
1287 }
1288
/**
 * MMX inline-asm output routine that sws_init_swscale installs as
 * c->yuv2packed1 when the destination format is AV_PIX_FMT_RGB555.
 *
 * Asm operand map (identical in both branches):
 *   %0 = buf0, %1 = buf1 (alias of buf0), %2 = ubuf0, %3 = ubuf1,
 *   %4 = dest (memory), %5 = &c->redDither (base for ESP_OFFSET,
 *   DSTW_OFFSET and the *_DITHER displacements).
 *
 * @param c        scaler context; only &c->redDither is passed to the asm
 * @param buf0     luma input line
 * @param ubuf     chroma input lines; ubuf[1] is only used when
 *                 uvalpha >= 2048
 * @param vbuf     not referenced by this function body
 * @param abuf0    not referenced by this function body
 * @param dest     output pointer, loaded into REG_b for the write macro
 * @param dstW     not referenced directly; DSTW_OFFSET"(%5)" is passed to
 *                 the write macro instead (presumably the stored width --
 *                 confirm against the SwsContext layout)
 * @param uvalpha  selects YSCALEYUV2RGB1 (< 2048) or YSCALEYUV2RGB1b
 * @param y        not referenced by this function body
 *
 * YSCALEYUV2RGB1, YSCALEYUV2RGB1b and WRITERGB15 are defined elsewhere in
 * the file.
 */
1289 static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
1290                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
1291                                  const int16_t *abuf0, uint8_t *dest,
1292                                  int dstW, int uvalpha, int y)
1293 {
1294     const int16_t *ubuf0 = ubuf[0];
1295     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1296
1297     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
         /* Fast path: both chroma operands point at the same line, ubuf[0]. */
1298         const int16_t *ubuf1 = ubuf[0];
1299         __asm__ volatile(
             /* Save REG_b at ESP_OFFSET(%5) and reuse it for dest; preserve
              * REG_BP, which the macros use as their index register. */
1300             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1301             "mov        %4, %%"REG_b"               \n\t"
1302             "push %%"REG_BP"                        \n\t"
1303             YSCALEYUV2RGB1(%%REGBP, %5)
1304             "pxor    %%mm7, %%mm7                   \n\t"
1305             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1306 #ifdef DITHER1XBPP
             /* Add the per-channel dither values with unsigned byte saturation. */
1307             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1308             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1309             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1310 #endif
1311             WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
             /* Restore REG_BP and REG_b. */
1312             "pop %%"REG_BP"                         \n\t"
1313             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1314             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1315                "a" (&c->redDither)
1316                NAMED_CONSTRAINTS_ADD(bF8)
1317         );
1318     } else {
         /* Two-line chroma path: ubuf[0] and ubuf[1] feed the "b" macro. */
1319         const int16_t *ubuf1 = ubuf[1];
1320         __asm__ volatile(
             /* Same save / convert / dither / store / restore sequence as above. */
1321             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1322             "mov        %4, %%"REG_b"               \n\t"
1323             "push %%"REG_BP"                        \n\t"
1324             YSCALEYUV2RGB1b(%%REGBP, %5)
1325             "pxor    %%mm7, %%mm7                   \n\t"
1326             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1327 #ifdef DITHER1XBPP
1328             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1329             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1330             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1331 #endif
1332             WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
1333             "pop %%"REG_BP"                         \n\t"
1334             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1335             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1336                "a" (&c->redDither)
1337                NAMED_CONSTRAINTS_ADD(bF8)
1338         );
1339     }
1340 }
1341
/**
 * MMX inline-asm output routine that sws_init_swscale installs as
 * c->yuv2packed1 when the destination format is AV_PIX_FMT_RGB565.
 *
 * Structurally the same as the RGB555 variant in this file, except that the
 * pixels are stored with WRITERGB16 and the asm additionally names the bFC
 * constant.
 *
 * Asm operand map (identical in both branches):
 *   %0 = buf0, %1 = buf1 (alias of buf0), %2 = ubuf0, %3 = ubuf1,
 *   %4 = dest (memory), %5 = &c->redDither (base for ESP_OFFSET,
 *   DSTW_OFFSET and the *_DITHER displacements).
 *
 * @param c        scaler context; only &c->redDither is passed to the asm
 * @param buf0     luma input line
 * @param ubuf     chroma input lines; ubuf[1] is only used when
 *                 uvalpha >= 2048
 * @param vbuf     not referenced by this function body
 * @param abuf0    not referenced by this function body
 * @param dest     output pointer, loaded into REG_b for the write macro
 * @param dstW     not referenced directly; DSTW_OFFSET"(%5)" is passed to
 *                 the write macro instead (presumably the stored width --
 *                 confirm against the SwsContext layout)
 * @param uvalpha  selects YSCALEYUV2RGB1 (< 2048) or YSCALEYUV2RGB1b
 * @param y        not referenced by this function body
 */
1342 static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
1343                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
1344                                  const int16_t *abuf0, uint8_t *dest,
1345                                  int dstW, int uvalpha, int y)
1346 {
1347     const int16_t *ubuf0 = ubuf[0];
1348     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1349
1350     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
         /* Fast path: both chroma operands point at the same line, ubuf[0]. */
1351         const int16_t *ubuf1 = ubuf[0];
1352         __asm__ volatile(
             /* Save REG_b at ESP_OFFSET(%5) and reuse it for dest; preserve
              * REG_BP, which the macros use as their index register. */
1353             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1354             "mov        %4, %%"REG_b"               \n\t"
1355             "push %%"REG_BP"                        \n\t"
1356             YSCALEYUV2RGB1(%%REGBP, %5)
1357             "pxor    %%mm7, %%mm7                   \n\t"
1358             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1359 #ifdef DITHER1XBPP
             /* Add the per-channel dither values with unsigned byte saturation. */
1360             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1361             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1362             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1363 #endif
1364             WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
             /* Restore REG_BP and REG_b. */
1365             "pop %%"REG_BP"                         \n\t"
1366             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1367             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1368                "a" (&c->redDither)
1369                NAMED_CONSTRAINTS_ADD(bF8,bFC)
1370         );
1371     } else {
         /* Two-line chroma path: ubuf[0] and ubuf[1] feed the "b" macro. */
1372         const int16_t *ubuf1 = ubuf[1];
1373         __asm__ volatile(
             /* Same save / convert / dither / store / restore sequence as above. */
1374             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1375             "mov        %4, %%"REG_b"               \n\t"
1376             "push %%"REG_BP"                        \n\t"
1377             YSCALEYUV2RGB1b(%%REGBP, %5)
1378             "pxor    %%mm7, %%mm7                   \n\t"
1379             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1380 #ifdef DITHER1XBPP
1381             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1382             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1383             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1384 #endif
1385             WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
1386             "pop %%"REG_BP"                         \n\t"
1387             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1388             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1389                "a" (&c->redDither)
1390                NAMED_CONSTRAINTS_ADD(bF8,bFC)
1391         );
1392     }
1393 }
1394
/*
 * Asm fragment that opens the per-block loop for the single-chroma-line
 * packed output path.
 *
 *   index  register used as the running offset; zeroed here
 *   c      base register for the UV_OFF_BYTE displacement
 *
 * Expects asm operand %0 to address the luma line and %2 the chroma line.
 * After the fragment:
 *   mm3 = 4 words from (%2 + index),               arithmetic >> 7
 *   mm4 = 4 words from (%2 + index + chroma offset), arithmetic >> 7
 *   mm1 = 4 words from (%0 + 2*index),             arithmetic >> 7
 *   mm7 = 4 words from (%0 + 2*index + 8),         arithmetic >> 7
 * The chroma offset is the value read from UV_OFF_BYTE(c); index is restored
 * after the second chroma load. The fragment defines local label "1:" but
 * contains no branch back to it -- presumably the write macro that follows
 * closes the loop (confirm at its definition).
 *
 * YSCALEYUV2PACKED1 is the usual extra expansion layer so that macro
 * arguments are expanded before being stringified.
 */
1395 #define REAL_YSCALEYUV2PACKED1(index, c) \
1396     "xor            "#index", "#index"  \n\t"\
1397     ".p2align              4            \n\t"\
1398     "1:                                 \n\t"\
1399     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
1400     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1401     "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax + offset read from UV_OFF_BYTE(c)] */\
1402     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1403     "psraw                $7, %%mm3     \n\t" \
1404     "psraw                $7, %%mm4     \n\t" \
1405     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
1406     "movq 8(%0, "#index", 2), %%mm7     \n\t" /* following 8 bytes of buf0 */\
1407     "psraw                $7, %%mm1     \n\t" \
1408     "psraw                $7, %%mm7     \n\t" \
1409
1410 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
1411
/*
 * Asm fragment that opens the per-block loop for the two-chroma-line packed
 * output path.
 *
 *   index  register used as the running offset; zeroed here
 *   c      base register for the UV_OFF_BYTE displacement
 *
 * Expects asm operand %0 to address the luma line and %2 / %3 the two chroma
 * lines. After the fragment:
 *   mm3 = ((%2 + index) + (%3 + index)) word-wise sum, logical >> 8
 *   mm4 = same sum taken at index + chroma offset,   logical >> 8
 *   mm1 = 4 words from (%0 + 2*index),       arithmetic >> 7
 *   mm7 = 4 words from (%0 + 2*index + 8),   arithmetic >> 7
 * mm2 and mm5 are used as temporaries. The chroma offset is the value read
 * from UV_OFF_BYTE(c); index is restored after the offset loads. The
 * fragment defines local label "1:" but contains no branch back to it --
 * presumably the write macro that follows closes the loop (confirm at its
 * definition).
 *
 * YSCALEYUV2PACKED1b is the usual extra expansion layer so that macro
 * arguments are expanded before being stringified.
 */
1412 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1413     "xor "#index", "#index"             \n\t"\
1414     ".p2align              4            \n\t"\
1415     "1:                                 \n\t"\
1416     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
1417     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
1418     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1419     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax + offset read from UV_OFF_BYTE(c)] */\
1420     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax + offset read from UV_OFF_BYTE(c)] */\
1421     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1422     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1423     "paddw             %%mm5, %%mm4     \n\t" /* same sum at the offset position */\
1424     "psrlw                $8, %%mm3     \n\t" \
1425     "psrlw                $8, %%mm4     \n\t" \
1426     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
1427     "movq 8(%0, "#index", 2), %%mm7     \n\t" /* following 8 bytes of buf0 */\
1428     "psraw                $7, %%mm1     \n\t" \
1429     "psraw                $7, %%mm7     \n\t"
1430 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
1431
/**
 * MMX inline-asm output routine that sws_init_swscale installs as
 * c->yuv2packed1 when the destination format is AV_PIX_FMT_YUYV422.
 *
 * Asm operand map (identical in both branches):
 *   %0 = buf0, %1 = buf1 (alias of buf0), %2 = ubuf0, %3 = ubuf1,
 *   %4 = dest (memory), %5 = &c->redDither (base for the ESP_OFFSET,
 *   DSTW_OFFSET and UV_OFF_BYTE displacements).
 *
 * @param c        scaler context; only &c->redDither is passed to the asm
 * @param buf0     luma input line
 * @param ubuf     chroma input lines; ubuf[1] is only used when
 *                 uvalpha >= 2048
 * @param vbuf     not referenced by this function body; the load macros
 *                 reach a second chroma position through the offset read
 *                 from UV_OFF_BYTE(%5) instead
 * @param abuf0    not referenced by this function body
 * @param dest     output pointer, loaded into REG_b for the write macro
 * @param dstW     not referenced directly; DSTW_OFFSET"(%5)" is passed to
 *                 the write macro instead (presumably the stored width --
 *                 confirm against the SwsContext layout)
 * @param uvalpha  selects YSCALEYUV2PACKED1 (< 2048) or YSCALEYUV2PACKED1b
 * @param y        not referenced by this function body
 *
 * WRITEYUY2 is defined elsewhere in the file.
 */
1432 static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
1433                                   const int16_t *ubuf[2], const int16_t *vbuf[2],
1434                                   const int16_t *abuf0, uint8_t *dest,
1435                                   int dstW, int uvalpha, int y)
1436 {
1437     const int16_t *ubuf0 = ubuf[0];
1438     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1439
1440     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
         /* Fast path: chroma is read from ubuf[0] only. */
1441         const int16_t *ubuf1 = ubuf[0];
1442         __asm__ volatile(
             /* Save REG_b at ESP_OFFSET(%5) and reuse it for dest; preserve
              * REG_BP, which the macros use as their index register. */
1443             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1444             "mov        %4, %%"REG_b"               \n\t"
1445             "push %%"REG_BP"                        \n\t"
1446             YSCALEYUV2PACKED1(%%REGBP, %5)
1447             WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
             /* Restore REG_BP and REG_b. */
1448             "pop %%"REG_BP"                         \n\t"
1449             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1450             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1451                "a" (&c->redDither)
1452         );
1453     } else {
         /* Two-line chroma path: ubuf[0] and ubuf[1] are summed by the "b" macro. */
1454         const int16_t *ubuf1 = ubuf[1];
1455         __asm__ volatile(
             /* Same save / load / store / restore sequence as above. */
1456             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1457             "mov        %4, %%"REG_b"               \n\t"
1458             "push %%"REG_BP"                        \n\t"
1459             YSCALEYUV2PACKED1b(%%REGBP, %5)
1460             WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
1461             "pop %%"REG_BP"                         \n\t"
1462             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1463             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1464                "a" (&c->redDither)
1465         );
1466     }
1467 }
/**
 * Install the template's x86 function pointers into a scaler context.
 *
 * Vertical / packed output stage (skipped entirely for 16-bit and 9/10-bit
 * destinations, NV12, NV21, or when SWS_BITEXACT is set):
 *  - SWS_ACCURATE_RND set: only yuv2packedX is replaced, with the "_ar"
 *    variants, and only when SWS_FULL_CHR_H_INT is clear.
 *  - SWS_ACCURATE_RND clear: use_mmx_vfilter is enabled, yuv2planeX is set
 *    to the yuv2yuvX routine, and yuv2packedX gets the plain "_X" variants
 *    when SWS_FULL_CHR_H_INT is clear.
 *  - In both cases, when SWS_FULL_CHR_H_INT is clear, yuv2packed1 and
 *    yuv2packed2 are set for RGB32, BGR24, RGB555, RGB565 and YUYV422.
 * The BGR24 yuv2packedX variants are only available when HAVE_6REGS is set.
 * Formats not listed leave the corresponding pointers untouched.
 *
 * Horizontal fast-bilinear stage (only for 8-bit sources with dstBpc <= 14):
 * the MMXEXT build selects ff_hyscale_fast_mmxext / ff_hcscale_fast_mmxext
 * when SWS_FAST_BILINEAR is requested and c->canMMXEXTBeUsed is non-zero;
 * in every other case hyscale_fast and hcscale_fast are set to NULL.
 *
 * @param c scaler context to update in place
 */
1468 static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
1469 {
1470     enum AVPixelFormat dstFormat = c->dstFormat;
1471
     /* Default to off; re-enabled below only on the non-accurate-rounding path. */
1472     c->use_mmx_vfilter= 0;
1473     if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
1474         && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
1475             if (c->flags & SWS_ACCURATE_RND) {
                 /* Accurate rounding: pick the "_ar" multi-tap packed writers. */
1476                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1477                     switch (c->dstFormat) {
1478                     case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
1479 #if HAVE_6REGS
1480                     case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
1481 #endif
1482                     case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
1483                     case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
1484                     case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
1485                     default: break;
1486                     }
1487                 }
1488             } else {
                 /* Default rounding: enable the MMX vertical filter and the
                  * plain multi-tap planar / packed writers. */
1489                 c->use_mmx_vfilter= 1;
1490                 c->yuv2planeX = RENAME(yuv2yuvX    );
1491                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1492                     switch (c->dstFormat) {
1493                     case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
1494 #if HAVE_6REGS
1495                     case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
1496 #endif
1497                     case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
1498                     case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
1499                     case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
1500                     default: break;
1501                     }
1502                 }
1503             }
         /* One- and two-line packed writers, chosen independently of
          * SWS_ACCURATE_RND. */
1504         if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1505             switch (c->dstFormat) {
1506             case AV_PIX_FMT_RGB32:
1507                 c->yuv2packed1 = RENAME(yuv2rgb32_1);
1508                 c->yuv2packed2 = RENAME(yuv2rgb32_2);
1509                 break;
1510             case AV_PIX_FMT_BGR24:
1511                 c->yuv2packed1 = RENAME(yuv2bgr24_1);
1512                 c->yuv2packed2 = RENAME(yuv2bgr24_2);
1513                 break;
1514             case AV_PIX_FMT_RGB555:
1515                 c->yuv2packed1 = RENAME(yuv2rgb555_1);
1516                 c->yuv2packed2 = RENAME(yuv2rgb555_2);
1517                 break;
1518             case AV_PIX_FMT_RGB565:
1519                 c->yuv2packed1 = RENAME(yuv2rgb565_1);
1520                 c->yuv2packed2 = RENAME(yuv2rgb565_2);
1521                 break;
1522             case AV_PIX_FMT_YUYV422:
1523                 c->yuv2packed1 = RENAME(yuv2yuyv422_1);
1524                 c->yuv2packed2 = RENAME(yuv2yuyv422_2);
1525                 break;
1526             default:
1527                 break;
1528             }
1529         }
1530     }
1531
     /* Horizontal fast-bilinear scalers: 8-bit input with at most 14-bit
      * intermediate depth only. */
1532     if (c->srcBpc == 8 && c->dstBpc <= 14) {
1533     // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
1534 #if COMPILE_TEMPLATE_MMXEXT
1535     if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
1536         c->hyscale_fast = ff_hyscale_fast_mmxext;
1537         c->hcscale_fast = ff_hcscale_fast_mmxext;
1538     } else {
1539 #endif /* COMPILE_TEMPLATE_MMXEXT */
         /* Executed unconditionally in non-MMXEXT builds, and as the else
          * branch in MMXEXT builds. */
1540         c->hyscale_fast = NULL;
1541         c->hcscale_fast = NULL;
1542 #if COMPILE_TEMPLATE_MMXEXT
1543     }
1544 #endif /* COMPILE_TEMPLATE_MMXEXT */
1545     }
1546 }