/*
 * (Scrape residue removed: gitweb blob header "git.sesse.net Git - ffmpeg/blob
 * - libswscale/swscale_template.c", commit subject "sws-PPC: fix after VOFW
 * change.", breadcrumb "[ffmpeg] / libswscale / swscale_template.c".)
 */
1 /*
2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
/* This template is included multiple times, once per CPU-specific variant
 * (plain MMX, MMX2/SSE, 3DNow!, AltiVec), so clear any definitions left
 * over from a previous inclusion before redefining them. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH

/* PREFETCH: cache-prefetch hint instruction; a commented no-op when neither
 * 3DNow! nor MMX2 is being compiled. */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH  " # nop"
#endif

/* PAVGB: packed byte average — pavgb on MMX2/SSE, pavgusb on 3DNow!.
 * Deliberately left undefined otherwise so misuse fails at build time. */
#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* MOVNTQ: non-temporal (cache-bypassing) store where available, plain movq
 * otherwise.  The REAL_/wrapper pair forces argument macro expansion before
 * stringification. */
#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif
50
/* YSCALEYUV2YV12X(x, offset, dest, width)
 * Vertical multi-tap scale to a planar 8-bit line: walks the filter list at
 * " offset "(%0) — (srcPtr, coeff) entries, terminated by a NULL pointer —
 * accumulating pmulhw products for two word quads into mm3/mm4 (pre-seeded
 * with the rounder at VROUNDER_OFFSET), then shifts, packs and stores 8
 * output bytes per outer-loop iteration.  %0 = &c->redDither, the base for
 * all context offsets; "x" is a byte offset into the source line. */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t" /* FIXME Unroll? */\
        "1:                                                 \n\t"\
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
        "add                                $16, %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "pmulhw                           %%mm0, %%mm2      \n\t"\
        "pmulhw                           %%mm0, %%mm5      \n\t"\
        "paddw                            %%mm2, %%mm3      \n\t"\
        "paddw                            %%mm5, %%mm4      \n\t"\
        " jnz                                1b             \n\t"\
        "psraw                               $3, %%mm3      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "packuswb                         %%mm4, %%mm3      \n\t"\
        MOVNTQ(%%mm3, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" ((x86_reg)width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
86
/* YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 * Higher-precision variant of YSCALEYUV2YV12X: processes filter entries in
 * pairs (APCK_PTR2/APCK_COEF/APCK_SIZE layout), interleaving the two source
 * quads with punpck{l,h}wd and accumulating 32-bit pmaddwd products in
 * mm4-mm7, so the rounder is only added after the >>16 narrowing instead of
 * being carried through 16-bit accumulation. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t"\
        "1:                                                 \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
        "movq                             %%mm0, %%mm3      \n\t"\
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
        "paddd                            %%mm0, %%mm4      \n\t"\
        "paddd                            %%mm3, %%mm5      \n\t"\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "movq                             %%mm2, %%mm0      \n\t"\
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "paddd                            %%mm2, %%mm6      \n\t"\
        "paddd                            %%mm0, %%mm7      \n\t"\
        " jnz                                1b             \n\t"\
        "psrad                              $16, %%mm4      \n\t"\
        "psrad                              $16, %%mm5      \n\t"\
        "psrad                              $16, %%mm6      \n\t"\
        "psrad                              $16, %%mm7      \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
        "packssdw                         %%mm5, %%mm4      \n\t"\
        "packssdw                         %%mm7, %%mm6      \n\t"\
        "paddw                            %%mm0, %%mm4      \n\t"\
        "paddw                            %%mm0, %%mm6      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "psraw                               $3, %%mm6      \n\t"\
        "packuswb                         %%mm6, %%mm4      \n\t"\
        MOVNTQ(%%mm4, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" ((x86_reg)width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
148
/* YSCALEYUV2YV121: unscaled (single-source) plane output — load 8 words
 * from %0, shift >>7 down to 8 bits, pack and store to %1.  %2 holds the
 * (negative) byte count so "add $8 / jnc" terminates at zero. */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ".p2align               4             \n\t" /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

/* YSCALEYUV2YV121_ACCURATE: same as above but adds a rounding bias before
 * the >>7 shift; mm7 is built as 0x0040 per word (pcmpeqw/psrlw/psllw),
 * i.e. half of the discarded precision, for round-to-nearest. */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ".p2align                4            \n\t" /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
179
180 /*
181     :: "m" (-lumFilterSize), "m" (-chrFilterSize),
182        "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
183        "r" (dest), "m" (dstW_reg),
184        "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
185     : "%eax", "%ebx", "%ecx", "%edx", "%esi"
186 */
/* YSCALEYUV2PACKEDX_UV: opens the packed-output asm statement (closed by
 * YSCALEYUV2PACKEDX_END) and runs the chroma filter loop: U accumulates in
 * mm3, V in mm4 (both pre-seeded with the rounder); the V samples live VOF
 * bytes past the U samples in each source line.  The filter list at
 * CHR_MMX_FILTER_OFFSET is NULL-pointer terminated. */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
        "movq                      %%mm3, %%mm4         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
        "add                         $16, %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pmulhw                    %%mm0, %%mm2         \n\t"\
        "pmulhw                    %%mm0, %%mm5         \n\t"\
        "paddw                     %%mm2, %%mm3         \n\t"\
        "paddw                     %%mm5, %%mm4         \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        " jnz                         2b                \n\t"\

/* YSCALEYUV2PACKEDX_YA: the matching luma filter loop — accumulates two Y
 * quads (src1/src2) into dst1/dst2; register names are passed in so the
 * alpha plane can reuse the same code with a different filter offset. */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\

/* Chroma loop followed by the luma loop with the standard register set:
 * Y1 in mm1, Y2 in mm7 (U/V already in mm3/mm4). */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

/* Closes the asm statement opened by YSCALEYUV2PACKEDX_UV: operand list
 * and clobbers shared by all packed-output writers. */
#define YSCALEYUV2PACKEDX_END                     \
        :: "r" (&c->redDither),                   \
            "m" (dummy), "m" (dummy), "m" (dummy),\
            "r" (dest), "m" (dstW_reg)            \
        : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
240
/* YSCALEYUV2PACKEDX_ACCURATE_UV: high-precision chroma loop for packed
 * output — paired filter entries (APCK_* layout) and 32-bit pmaddwd
 * accumulation in mm4-mm7, as in YSCALEYUV2YV12X_ACCURATE.  The rounded
 * U/V results are parked in the context at U_TEMP/V_TEMP because the luma
 * pass below needs all eight MMX registers. */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pxor                      %%mm4, %%mm4         \n\t"\
        "pxor                      %%mm5, %%mm5         \n\t"\
        "pxor                      %%mm6, %%mm6         \n\t"\
        "pxor                      %%mm7, %%mm7         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
        "movq                      %%mm0, %%mm3         \n\t"\
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
        "paddd                     %%mm0, %%mm4         \n\t"\
        "paddd                     %%mm3, %%mm5         \n\t"\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        "movq                      %%mm2, %%mm0         \n\t"\
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "paddd                     %%mm2, %%mm6         \n\t"\
        "paddd                     %%mm0, %%mm7         \n\t"\
        " jnz                         2b                \n\t"\
        "psrad                       $16, %%mm4         \n\t"\
        "psrad                       $16, %%mm5         \n\t"\
        "psrad                       $16, %%mm6         \n\t"\
        "psrad                       $16, %%mm7         \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
        "packssdw                  %%mm5, %%mm4         \n\t"\
        "packssdw                  %%mm7, %%mm6         \n\t"\
        "paddw                     %%mm0, %%mm4         \n\t"\
        "paddw                     %%mm0, %%mm6         \n\t"\
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\

/* YSCALEYUV2PACKEDX_ACCURATE_YA(offset): high-precision luma loop; Y1 ends
 * up in mm1, Y2 in mm7, then the parked U/V are reloaded into mm3/mm4 so
 * the register layout matches what YSCALEYUV2RGBX expects. */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
339
/* YSCALEYUV2RGBX: YUV -> RGB matrix step for the filtered (X) path.
 * Input: Y1 in mm1, Y2 in mm7, U in mm3, V in mm4 (word-precision).
 * Applies the per-context offsets/coefficients (U_OFFSET..Y_COEFF at %0),
 * duplicates the half-rate chroma terms with punpck{l,h}wd to match luma
 * resolution, and leaves packed unsigned-byte B in mm2, R in mm5 and G in
 * mm4 ready for the format-specific interleave that follows. */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\

/* REAL_YSCALEYUV2PACKED(index, c): two-line vertical interpolation for
 * packed YUV output.  Pre-scales the stored blend weights by >>3 (writing
 * them back into the context), then per iteration blends uvbuf0/uvbuf1
 * (operands %2/%3, V at +VOF) and buf0/buf1 (%0/%1) as
 * b1 + ((b0-b1)*alpha >> 16), keeping extra precision (>>7 only) since no
 * RGB matrix follows.  Results: U mm3, V mm4, Y1 mm1, Y2 mm7. */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
413
/* REAL_YSCALEYUV2RGB_UV(index, c): chroma half of the two-line RGB blend.
 * Blends uvbuf0/uvbuf1 (%2/%3, V at +VOF) with the >>4 precision the RGB
 * matrix needs, subtracts the 128 offset and applies the green coefficients;
 * leaves (U-128) in mm2, ug in mm3, vg in mm4, (V-128) in mm5. */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

/* REAL_YSCALEYUV2RGB_YA(index, c, b1, b2): luma half of the two-line RGB
 * blend — same b2 + ((b1-b2)*yalpha >> 16) scheme at >>4 precision; leaves
 * Y1 in mm1 and Y2 in mm7 for REAL_YSCALEYUV2RGB_COEFF. */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

/* REAL_YSCALEYUV2RGB_COEFF(c): finishing step of the two-line RGB path —
 * same coefficient/duplicate/pack sequence as YSCALEYUV2RGBX, but the
 * context offsets are taken relative to "c" instead of %0.  Consumes
 * Y1/Y2 in mm1/mm7 and the chroma products from REAL_YSCALEYUV2RGB_UV;
 * yields packed B in mm2, R in mm5, G in mm4. */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/* Full two-line YUV->RGB pipeline: chroma blend, luma blend on buffers
 * %0/%1, then the coefficient/pack step. */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
488
/* REAL_YSCALEYUV2PACKED1(index, c): single-source fast path for packed
 * YUV output — no vertical blend, just load uvbuf0 (%2, V at +VOF) and
 * buf0 (%0) and shift >>7 down to output precision.
 * Results: U mm3, V mm4, Y1 mm1, Y2 mm7. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
503
504 #define REAL_YSCALEYUV2RGB1(index, c) \
505     "xor            "#index", "#index"  \n\t"\
506     ".p2align              4            \n\t"\
507     "1:                                 \n\t"\
508     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
509     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
510     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
511     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
512     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
513     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
514     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
515     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
516     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
517     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
518     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
519     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
520     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
521     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
522     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
523     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
524     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
525     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
526     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
527     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
528     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
529     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
530     "paddw             %%mm3, %%mm4     \n\t"\
531     "movq              %%mm2, %%mm0     \n\t"\
532     "movq              %%mm5, %%mm6     \n\t"\
533     "movq              %%mm4, %%mm3     \n\t"\
534     "punpcklwd         %%mm2, %%mm2     \n\t"\
535     "punpcklwd         %%mm5, %%mm5     \n\t"\
536     "punpcklwd         %%mm4, %%mm4     \n\t"\
537     "paddw             %%mm1, %%mm2     \n\t"\
538     "paddw             %%mm1, %%mm5     \n\t"\
539     "paddw             %%mm1, %%mm4     \n\t"\
540     "punpckhwd         %%mm0, %%mm0     \n\t"\
541     "punpckhwd         %%mm6, %%mm6     \n\t"\
542     "punpckhwd         %%mm3, %%mm3     \n\t"\
543     "paddw             %%mm7, %%mm0     \n\t"\
544     "paddw             %%mm7, %%mm6     \n\t"\
545     "paddw             %%mm7, %%mm3     \n\t"\
546     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
547     "packuswb          %%mm0, %%mm2     \n\t"\
548     "packuswb          %%mm6, %%mm5     \n\t"\
549     "packuswb          %%mm3, %%mm4     \n\t"\
550
/* Indirection layer: forces 'index' and 'c' to be macro-expanded before they
 * are stringified/pasted inside the REAL_ variant above (standard two-level
 * macro idiom, used for every REAL_*/wrapper pair in this file). */
551 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
552
/* Loop header for unscaled packed-YUV output with chroma taken equally from
 * both chroma buffers: sums uvbuf0/uvbuf1 words and shifts right by 8
 * (the 7-bit descale plus the /2 of the two-line sum), loads luma from buf0
 * only and descales it by 7.  NOTE(review): presumably selected by callers
 * when uvalpha == 2048 (exact mid-point) — confirm at the call sites.
 * Leaves U in mm3, V in mm4, luma in mm1/mm7; falls through to a WRITE* macro
 * that closes the "1:" loop. */
553 #define REAL_YSCALEYUV2PACKED1b(index, c) \
554     "xor "#index", "#index"             \n\t"\
555     ".p2align              4            \n\t"\
556     "1:                                 \n\t"\
557     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
558     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
559     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
560     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
561     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
562     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
563     "psrlw                $8, %%mm3     \n\t" \
564     "psrlw                $8, %%mm4     \n\t" \
565     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
566     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
567     "psraw                $7, %%mm1     \n\t" \
568     "psraw                $7, %%mm7     \n\t"
569 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
570
571 // do vertical chrominance interpolation
/* Like REAL_YSCALEYUV2RGB1 but chroma comes equally from uvbuf0 and uvbuf1:
 * the two chroma lines are summed and shifted by 5 (carries an upstream FIXME
 * about possible overflow), then the standard YUV->RGB matrix is applied
 * using the per-context coefficient/offset tables at U_OFFSET..Y_COEFF(c).
 * On exit: packed bytes B in mm2, R in mm5, G in mm4 (low/high halves built
 * from the even/odd word lanes), ready for a WRITE* macro that closes the
 * "1:" loop opened here. */
572 #define REAL_YSCALEYUV2RGB1b(index, c) \
573     "xor            "#index", "#index"  \n\t"\
574     ".p2align              4            \n\t"\
575     "1:                                 \n\t"\
576     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
577     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
578     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
579     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
580     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
581     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
582     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
583     "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
584     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
585     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
586     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
587     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
588     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
589     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
590     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
591     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
592     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
593     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
594     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
595     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
596     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
597     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
598     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
599     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
600     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
601     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
602     "paddw             %%mm3, %%mm4     \n\t"\
603     "movq              %%mm2, %%mm0     \n\t"\
604     "movq              %%mm5, %%mm6     \n\t"\
605     "movq              %%mm4, %%mm3     \n\t"\
606     "punpcklwd         %%mm2, %%mm2     \n\t"\
607     "punpcklwd         %%mm5, %%mm5     \n\t"\
608     "punpcklwd         %%mm4, %%mm4     \n\t"\
609     "paddw             %%mm1, %%mm2     \n\t"\
610     "paddw             %%mm1, %%mm5     \n\t"\
611     "paddw             %%mm1, %%mm4     \n\t"\
612     "punpckhwd         %%mm0, %%mm0     \n\t"\
613     "punpckhwd         %%mm6, %%mm6     \n\t"\
614     "punpckhwd         %%mm3, %%mm3     \n\t"\
615     "paddw             %%mm7, %%mm0     \n\t"\
616     "paddw             %%mm7, %%mm6     \n\t"\
617     "paddw             %%mm7, %%mm3     \n\t"\
618     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
619     "packuswb          %%mm0, %%mm2     \n\t"\
620     "packuswb          %%mm6, %%mm5     \n\t"\
621     "packuswb          %%mm3, %%mm4     \n\t"\
622
623 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
624
/* Alpha companion for the single-buffer RGB path: loads 8 alpha words from
 * abuf0 (asm operand %1), descales by 7 and packs them to 8 unsigned bytes
 * in mm7 for the WRITEBGR32 store. */
625 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
626     "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
627     "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
628     "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
629     "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
630     "packuswb          %%mm1, %%mm7     \n\t"
631 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
632
/* Interleave the packed byte planes b/g/r/a into four ARGB quadwords and
 * store 8 pixels (32 bytes) at dst + index*4 via MOVNTQ, then advance index
 * by 8 and loop back to label "1:" (opened by the preceding YSCALEYUV2*
 * macro) while index < dstw.  Register arguments are caller-chosen mm regs;
 * q0/q2/q3/t are scratch. */
633 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
634     "movq       "#b", "#q2"     \n\t" /* B */\
635     "movq       "#r", "#t"      \n\t" /* R */\
636     "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
637     "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
638     "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
639     "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
640     "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
641     "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
642     "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
643     "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
644     "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
645     "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
646 \
647     MOVNTQ(   q0,   (dst, index, 4))\
648     MOVNTQ(    b,  8(dst, index, 4))\
649     MOVNTQ(   q2, 16(dst, index, 4))\
650     MOVNTQ(   q3, 24(dst, index, 4))\
651 \
652     "add      $8, "#index"      \n\t"\
653     "cmp "#dstw", "#index"      \n\t"\
654     " jb      1b                \n\t"
655 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
656
/* Pack byte planes mm2=B, mm4=G, mm5=R (mm7 must be 0) into 8 RGB565 pixels:
 * masks each channel to its top 5/6/5 bits (bF8/bFC tables), shifts into
 * position, and stores 16 bytes at dst + index*2 via MOVNTQ; then advances
 * index by 8 and loops back to "1:" while index < dstw. */
657 #define REAL_WRITERGB16(dst, dstw, index) \
658     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
659     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
660     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
661     "psrlq           $3, %%mm2  \n\t"\
662 \
663     "movq         %%mm2, %%mm1  \n\t"\
664     "movq         %%mm4, %%mm3  \n\t"\
665 \
666     "punpcklbw    %%mm7, %%mm3  \n\t"\
667     "punpcklbw    %%mm5, %%mm2  \n\t"\
668     "punpckhbw    %%mm7, %%mm4  \n\t"\
669     "punpckhbw    %%mm5, %%mm1  \n\t"\
670 \
671     "psllq           $3, %%mm3  \n\t"\
672     "psllq           $3, %%mm4  \n\t"\
673 \
674     "por          %%mm3, %%mm2  \n\t"\
675     "por          %%mm4, %%mm1  \n\t"\
676 \
677     MOVNTQ(%%mm2,  (dst, index, 2))\
678     MOVNTQ(%%mm1, 8(dst, index, 2))\
679 \
680     "add             $8, "#index"   \n\t"\
681     "cmp        "#dstw", "#index"   \n\t"\
682     " jb             1b             \n\t"
683 #define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
684
/* RGB555 variant of WRITERGB16: all three channels masked to 5 bits (bF8),
 * R pre-shifted right by 1 so the result has a zero top bit; otherwise the
 * same interleave/shift/or sequence and store loop (16 bytes per iteration,
 * loop back to "1:" while index < dstw).  Expects mm2=B, mm4=G, mm5=R,
 * mm7=0. */
685 #define REAL_WRITERGB15(dst, dstw, index) \
686     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
687     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
688     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
689     "psrlq           $3, %%mm2  \n\t"\
690     "psrlq           $1, %%mm5  \n\t"\
691 \
692     "movq         %%mm2, %%mm1  \n\t"\
693     "movq         %%mm4, %%mm3  \n\t"\
694 \
695     "punpcklbw    %%mm7, %%mm3  \n\t"\
696     "punpcklbw    %%mm5, %%mm2  \n\t"\
697     "punpckhbw    %%mm7, %%mm4  \n\t"\
698     "punpckhbw    %%mm5, %%mm1  \n\t"\
699 \
700     "psllq           $2, %%mm3  \n\t"\
701     "psllq           $2, %%mm4  \n\t"\
702 \
703     "por          %%mm3, %%mm2  \n\t"\
704     "por          %%mm4, %%mm1  \n\t"\
705 \
706     MOVNTQ(%%mm2,  (dst, index, 2))\
707     MOVNTQ(%%mm1, 8(dst, index, 2))\
708 \
709     "add             $8, "#index"   \n\t"\
710     "cmp        "#dstw", "#index"   \n\t"\
711     " jb             1b             \n\t"
712 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
713
/* Legacy 24-bit packer: builds four 0RGB dwords, then uses shift/mask tables
 * (bm000xxx constants) to squeeze them into three 24-byte-packed quadwords
 * stored at dst (advanced by 24 each iteration).  Kept for reference; the
 * WRITEBGR24MMX/MMX2 variants below replace it.  Expects mm2=B, mm4=G,
 * mm5=R, mm7=0; loops back to "1:" while index < dstw. */
714 #define WRITEBGR24OLD(dst, dstw, index) \
715     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
716     "movq      %%mm2, %%mm1             \n\t" /* B */\
717     "movq      %%mm5, %%mm6             \n\t" /* R */\
718     "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
719     "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
720     "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
721     "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
722     "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
723     "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
724     "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
725     "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
726     "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
727     "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
728 \
729     "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
730     "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
731     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
732     "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
733     "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
734     "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
735     "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
736     "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
737 \
738     "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
739     "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
740     "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
741     "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
742     "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
743     "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
744     "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
745     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
746     "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
747     "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
748     "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
749     "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
750     "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
751 \
752     "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
753     "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
754     "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
755     "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
756     "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
757     "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
758     "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
759     "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
760 \
761     MOVNTQ(%%mm0,   (dst))\
762     MOVNTQ(%%mm2,  8(dst))\
763     MOVNTQ(%%mm3, 16(dst))\
764     "add         $24, "#dst"            \n\t"\
765 \
766     "add          $8, "#index"          \n\t"\
767     "cmp     "#dstw", "#index"          \n\t"\
768     " jb          1b                    \n\t"
769
/* Plain-MMX 24-bit packer: builds four 0RGB dwords, realigns each into an
 * 0RGBRGB0 quadword via psllq/punpckhdq, then splices three contiguous
 * 24-bit-packed quadwords and stores them at dst (advanced by 24 per
 * iteration).  Expects mm2=B, mm4=G, mm5=R, mm7=0; clobbers mm0-mm7; loops
 * back to "1:" while index < dstw. */
770 #define WRITEBGR24MMX(dst, dstw, index) \
771     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
772     "movq      %%mm2, %%mm1     \n\t" /* B */\
773     "movq      %%mm5, %%mm6     \n\t" /* R */\
774     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
775     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
776     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
777     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
778     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
779     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
780     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
781     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
782     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
783     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
784 \
785     "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
786     "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
787     "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
788     "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
789 \
790     "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
791     "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
792     "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
793     "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
794 \
795     "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
796     "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
797     "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
798     "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
799 \
800     "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
801     "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
802     "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
803     "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
804     MOVNTQ(%%mm0, (dst))\
805 \
806     "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
807     "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
808     "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
809     "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
810     MOVNTQ(%%mm6, 8(dst))\
811 \
812     "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
813     "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
814     "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
815     MOVNTQ(%%mm5, 16(dst))\
816 \
817     "add         $24, "#dst"    \n\t"\
818 \
819     "add          $8, "#index"  \n\t"\
820     "cmp     "#dstw", "#index"  \n\t"\
821     " jb          1b            \n\t"
822
/* MMX2 24-bit packer: uses pshufw to replicate channel bytes into position
 * and the ff_M24A/B/C selection masks to merge B/G/R directly into three
 * packed quadwords — shorter dependency chains than WRITEBGR24MMX.  Expects
 * mm2=B, mm4=G, mm5=R, mm7=0 on entry; stores 24 bytes at dst per iteration
 * and loops back to "1:" while index < dstw. */
823 #define WRITEBGR24MMX2(dst, dstw, index) \
824     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
825     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
826     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
827     "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
828     "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
829     "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
830 \
831     "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
832     "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
833     "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
834 \
835     "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
836     "por    %%mm1, %%mm6        \n\t"\
837     "por    %%mm3, %%mm6        \n\t"\
838     MOVNTQ(%%mm6, (dst))\
839 \
840     "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
841     "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
842     "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
843     "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
844 \
845     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
846     "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
847     "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
848 \
849     "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
850     "por    %%mm3, %%mm6        \n\t"\
851     MOVNTQ(%%mm6, 8(dst))\
852 \
853     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
854     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
855     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
856 \
857     "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
858     "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
859     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
860 \
861     "por    %%mm1, %%mm3        \n\t"\
862     "por    %%mm3, %%mm6        \n\t"\
863     MOVNTQ(%%mm6, 16(dst))\
864 \
865     "add      $24, "#dst"       \n\t"\
866 \
867     "add       $8, "#index"     \n\t"\
868     "cmp  "#dstw", "#index"     \n\t"\
869     " jb       1b               \n\t"
870
/* Select the 24-bit writer for this template instantiation: the pshufw-based
 * variant when targeting MMX2, otherwise the plain-MMX variant. */
871 #if COMPILE_TEMPLATE_MMX2
872 #undef WRITEBGR24
873 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
874 #else
875 #undef WRITEBGR24
876 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
877 #endif
878
/* Interleave luma (mm1 low / mm7 high words) with packed U (mm3) and V (mm4)
 * into YUYV order and store 16 bytes at dst + index*2 via MOVNTQ; then
 * advance index by 8 and loop back to "1:" while index < dstw. */
879 #define REAL_WRITEYUY2(dst, dstw, index) \
880     "packuswb  %%mm3, %%mm3     \n\t"\
881     "packuswb  %%mm4, %%mm4     \n\t"\
882     "packuswb  %%mm7, %%mm1     \n\t"\
883     "punpcklbw %%mm4, %%mm3     \n\t"\
884     "movq      %%mm1, %%mm7     \n\t"\
885     "punpcklbw %%mm3, %%mm1     \n\t"\
886     "punpckhbw %%mm3, %%mm7     \n\t"\
887 \
888     MOVNTQ(%%mm1, (dst, index, 2))\
889     MOVNTQ(%%mm7, 8(dst, index, 2))\
890 \
891     "add          $8, "#index"  \n\t"\
892     "cmp     "#dstw", "#index"  \n\t"\
893     " jb          1b            \n\t"
894 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
895
896
/**
 * Multi-tap vertical scaling to planar YV12 (plus optional alpha plane).
 * MMX builds take the asm fast path (accurate-rounding variant when
 * SWS_ACCURATE_RND is set) unless SWS_BITEXACT forces the C reference;
 * the V plane is produced by running the chroma filter again at offset VOF.
 * NOTE(review): the AltiVec path passes neither alpSrc nor aDest to
 * yuv2yuvX_altivec_real, so alpha output is dropped there — confirm this is
 * intentional for that platform.
 */
897 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
898                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
899                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
900 {
901 #if COMPILE_TEMPLATE_MMX
902     if(!(c->flags & SWS_BITEXACT)) {
903         if (c->flags & SWS_ACCURATE_RND) {
904             if (uDest) {
905                 YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
906                 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
907             }
908             if (CONFIG_SWSCALE_ALPHA && aDest) {
909                 YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
910             }
911
912             YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
913         } else {
914             if (uDest) {
915                 YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
916                 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
917             }
918             if (CONFIG_SWSCALE_ALPHA && aDest) {
919                 YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
920             }
921
922             YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
923         }
924         return;
925     }
926 #endif
927 #if COMPILE_TEMPLATE_ALTIVEC
928     yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
929                           chrFilter, chrSrc, chrFilterSize,
930                           dest, uDest, vDest, dstW, chrDstW);
931 #else //COMPILE_TEMPLATE_ALTIVEC
932     yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
933                 chrFilter, chrSrc, chrFilterSize,
934                 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
935 #endif //!COMPILE_TEMPLATE_ALTIVEC
936 }
937
/**
 * Multi-tap vertical scaling to NV12/NV21 (interleaved-chroma) output.
 * No SIMD fast path here: always delegates to the C implementation.
 * The SwsContext parameter 'c' is unused; it is kept so this function's
 * signature matches the other yuv2* template entry points.
 */
938 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
939                                      const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
940                                      uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
941 {
942     yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
943                  chrFilter, chrSrc, chrFilterSize,
944                  dest, uDest, dstW, chrDstW, dstFormat);
945 }
946
/**
 * "1-tap" vertical path: convert one line of 15-bit intermediate samples
 * straight to 8-bit planar output (round with +64, shift right 7, clip to
 * [0,255]).  The V plane data lives at chrSrc + VOFW.  The MMX path handles
 * the up to four planes (alpha, luma, U, V) in one loop, feeding each to the
 * YSCALEYUV2YV121[_ACCURATE] asm with a negative counter so the pointer
 * arithmetic can count up to zero; skipped when SWS_BITEXACT requests the
 * C reference below.
 */
947 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
948                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
949 {
950     int i;
951 #if COMPILE_TEMPLATE_MMX
952     if(!(c->flags & SWS_BITEXACT)) {
953         long p= 4;
954         const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
955         uint8_t *dst[4]= {aDest, dest, uDest, vDest};
956         x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
957
958         if (c->flags & SWS_ACCURATE_RND) {
959             while(p--) {
960                 if (dst[p]) {
961                     __asm__ volatile(
962                         YSCALEYUV2YV121_ACCURATE
963                         :: "r" (src[p]), "r" (dst[p] + counter[p]),
964                         "g" (-counter[p])
965                         : "%"REG_a
966                     );
967                 }
968             }
969         } else {
970             while(p--) {
971                 if (dst[p]) {
972                     __asm__ volatile(
973                         YSCALEYUV2YV121
974                         :: "r" (src[p]), "r" (dst[p] + counter[p]),
975                         "g" (-counter[p])
976                         : "%"REG_a
977                     );
978                 }
979             }
980         }
981         return;
982     }
983 #endif
984     /* C reference.  Since lumSrc is int16_t, (x+64)>>7 lies in [-256,256];
985      * within that range (val&256) is non-zero exactly when val is outside
986      * [0,255], so the branch below is a cheap out-of-range test. */
987     for (i=0; i<dstW; i++) {
988         int val= (lumSrc[i]+64)>>7;
989
990         if (val&256) {
991             if (val<0) val=0;
992             else       val=255;
993         }
994
995         dest[i]= val;
996     }
997
998     if (uDest)
999         for (i=0; i<chrDstW; i++) {
1000             int u=(chrSrc[i       ]+64)>>7;
1001             int v=(chrSrc[i + VOFW]+64)>>7;
1002
1003             /* same range trick, testing both channels at once */
1004             if ((u|v)&256) {
1005                 if (u<0)        u=0;
1006                 else if (u>255) u=255;
1007                 if (v<0)        v=0;
1008                 else if (v>255) v=255;
1009             }
1010
1011             uDest[i]= u;
1012             vDest[i]= v;
1013         }
1014
1015     if (CONFIG_SWSCALE_ALPHA && aDest)
1016         for (i=0; i<dstW; i++) {
1017             int val= (alpSrc[i]+64)>>7;
1018             aDest[i]= av_clip_uint8(val);
1019         }
1020 }
1017
1018
1019 /**
1020  * vertical scale YV12 to RGB
1021  *
1022  * Multi-tap vertical scaling plus YUV->packed-pixel conversion, dispatching
1023  * on c->dstFormat.  On MMX builds (and unless SWS_BITEXACT is set) the
1024  * RGB32 / BGR24 / RGB555 / RGB565 / YUYV422 formats are handled by inline
1025  * asm composed from the YSCALEYUV2PACKEDX[_ACCURATE] + YSCALEYUV2RGBX +
1026  * WRITE* macros; SWS_ACCURATE_RND selects the accurate-rounding variants.
1027  * RGB32 additionally blends a scaled alpha plane when c->alpPixBuf is set.
1028  * The BGR24 cases spell out the asm tail (operands/clobbers) by hand
1029  * because they need an extra register for the dst pointer (dstW*3 stride,
1030  * computed with the lea).  Unhandled formats fall through to the AltiVec
1031  * helper (alpha-less formats only, as listed) or the C reference.
1032  */
1022 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1023                                        const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1024                                        const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1025 {
1026 #if COMPILE_TEMPLATE_MMX
1027     x86_reg dummy=0;
1028     x86_reg dstW_reg = dstW;
1029     if(!(c->flags & SWS_BITEXACT)) {
1030         if (c->flags & SWS_ACCURATE_RND) {
1031             switch(c->dstFormat) {
1032             case PIX_FMT_RGB32:
1033                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1034                     YSCALEYUV2PACKEDX_ACCURATE
1035                     YSCALEYUV2RGBX
1036                     "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1037                     "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1038                     "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1039                     YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1040                     "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1041                     "psraw                        $3, %%mm1         \n\t"
1042                     "psraw                        $3, %%mm7         \n\t"
1043                     "packuswb                  %%mm7, %%mm1         \n\t"
1044                     WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1045
1046                     YSCALEYUV2PACKEDX_END
1047                 } else {
1048                     YSCALEYUV2PACKEDX_ACCURATE
1049                     YSCALEYUV2RGBX
1050                     "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha: all bits set */
1051                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1052
1053                     YSCALEYUV2PACKEDX_END
1054                 }
1055                 return;
1056             case PIX_FMT_BGR24:
1057                 YSCALEYUV2PACKEDX_ACCURATE
1058                 YSCALEYUV2RGBX
1059                 "pxor %%mm7, %%mm7 \n\t"
1060                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1061                 "add %4, %%"REG_c"                        \n\t"
1062                 WRITEBGR24(%%REGc, %5, %%REGa)
1063
1064
1065                 :: "r" (&c->redDither),
1066                 "m" (dummy), "m" (dummy), "m" (dummy),
1067                 "r" (dest), "m" (dstW_reg)
1068                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1069                 );
1070                 return;
1071             case PIX_FMT_RGB555:
1072                 YSCALEYUV2PACKEDX_ACCURATE
1073                 YSCALEYUV2RGBX
1074                 "pxor %%mm7, %%mm7 \n\t"
1075                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1076 #ifdef DITHER1XBPP
1077                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1078                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1079                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1080 #endif
1081
1082                 WRITERGB15(%4, %5, %%REGa)
1083                 YSCALEYUV2PACKEDX_END
1084                 return;
1085             case PIX_FMT_RGB565:
1086                 YSCALEYUV2PACKEDX_ACCURATE
1087                 YSCALEYUV2RGBX
1088                 "pxor %%mm7, %%mm7 \n\t"
1089                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1090 #ifdef DITHER1XBPP
1091                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1092                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1093                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1094 #endif
1095
1096                 WRITERGB16(%4, %5, %%REGa)
1097                 YSCALEYUV2PACKEDX_END
1098                 return;
1099             case PIX_FMT_YUYV422:
1100                 YSCALEYUV2PACKEDX_ACCURATE
1101                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1102
1103                 "psraw $3, %%mm3    \n\t"
1104                 "psraw $3, %%mm4    \n\t"
1105                 "psraw $3, %%mm1    \n\t"
1106                 "psraw $3, %%mm7    \n\t"
1107                 WRITEYUY2(%4, %5, %%REGa)
1108                 YSCALEYUV2PACKEDX_END
1109                 return;
1110             }
1111         } else {
1112             switch(c->dstFormat) {
1113             case PIX_FMT_RGB32:
1114                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1115                     YSCALEYUV2PACKEDX
1116                     YSCALEYUV2RGBX
1117                     YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1118                     "psraw                        $3, %%mm1         \n\t"
1119                     "psraw                        $3, %%mm7         \n\t"
1120                     "packuswb                  %%mm7, %%mm1         \n\t"
1121                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1122                     YSCALEYUV2PACKEDX_END
1123                 } else {
1124                     YSCALEYUV2PACKEDX
1125                     YSCALEYUV2RGBX
1126                     "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha: all bits set */
1127                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1128                     YSCALEYUV2PACKEDX_END
1129                 }
1130                 return;
1131             case PIX_FMT_BGR24:
1132                 YSCALEYUV2PACKEDX
1133                 YSCALEYUV2RGBX
1134                 "pxor                    %%mm7, %%mm7       \n\t"
1135                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1136                 "add                        %4, %%"REG_c"   \n\t"
1137                 WRITEBGR24(%%REGc, %5, %%REGa)
1138
1139                 :: "r" (&c->redDither),
1140                 "m" (dummy), "m" (dummy), "m" (dummy),
1141                 "r" (dest),  "m" (dstW_reg)
1142                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1143                 );
1144                 return;
1145             case PIX_FMT_RGB555:
1146                 YSCALEYUV2PACKEDX
1147                 YSCALEYUV2RGBX
1148                 "pxor %%mm7, %%mm7 \n\t"
1149                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1150 #ifdef DITHER1XBPP
1151                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1152                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1153                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1154 #endif
1155
1156                 WRITERGB15(%4, %5, %%REGa)
1157                 YSCALEYUV2PACKEDX_END
1158                 return;
1159             case PIX_FMT_RGB565:
1160                 YSCALEYUV2PACKEDX
1161                 YSCALEYUV2RGBX
1162                 "pxor %%mm7, %%mm7 \n\t"
1163                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1164 #ifdef DITHER1XBPP
1165                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1166                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1167                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1168 #endif
1169
1170                 WRITERGB16(%4, %5, %%REGa)
1171                 YSCALEYUV2PACKEDX_END
1172                 return;
1173             case PIX_FMT_YUYV422:
1174                 YSCALEYUV2PACKEDX
1175                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1176
1177                 "psraw $3, %%mm3    \n\t"
1178                 "psraw $3, %%mm4    \n\t"
1179                 "psraw $3, %%mm1    \n\t"
1180                 "psraw $3, %%mm7    \n\t"
1181                 WRITEYUY2(%4, %5, %%REGa)
1182                 YSCALEYUV2PACKEDX_END
1183                 return;
1184             }
1185         }
1186     }
1187 #endif /* COMPILE_TEMPLATE_MMX */
1188 #if COMPILE_TEMPLATE_ALTIVEC
1189     /* The following list of supported dstFormat values should
1190        match what's found in the body of ff_yuv2packedX_altivec() */
1191     if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1192          (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1193           c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1194           c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1195             ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1196                                    chrFilter, chrSrc, chrFilterSize,
1197                                    dest, dstW, dstY);
1198     else
1199 #endif
1200         yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1201                        chrFilter, chrSrc, chrFilterSize,
1202                        alpSrc, dest, dstW, dstY);
1203 }
1204
1205 /**
1206  * vertical bilinear scale YV12 to RGB
1207  */
1208 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1209                           const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1210 {
1211     int  yalpha1=4095- yalpha;
1212     int uvalpha1=4095-uvalpha;
1213     int i;
1214
1215 #if COMPILE_TEMPLATE_MMX
1216     if(!(c->flags & SWS_BITEXACT)) {
1217         switch(c->dstFormat) {
1218         //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1219         case PIX_FMT_RGB32:
1220             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1221 #if ARCH_X86_64
1222                 __asm__ volatile(
1223                     YSCALEYUV2RGB(%%r8, %5)
1224                     YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1225                     "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1226                     "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1227                     "packuswb            %%mm7, %%mm1       \n\t"
1228                     WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1229
1230                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1231                     "a" (&c->redDither)
1232                     ,"r" (abuf0), "r" (abuf1)
1233                     : "%r8"
1234                 );
1235 #else
1236                 c->u_temp=(intptr_t)abuf0;
1237                 c->v_temp=(intptr_t)abuf1;
1238                 __asm__ volatile(
1239                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1240                     "mov        %4, %%"REG_b"               \n\t"
1241                     "push %%"REG_BP"                        \n\t"
1242                     YSCALEYUV2RGB(%%REGBP, %5)
1243                     "push                   %0              \n\t"
1244                     "push                   %1              \n\t"
1245                     "mov          "U_TEMP"(%5), %0          \n\t"
1246                     "mov          "V_TEMP"(%5), %1          \n\t"
1247                     YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1248                     "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1249                     "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1250                     "packuswb            %%mm7, %%mm1       \n\t"
1251                     "pop                    %1              \n\t"
1252                     "pop                    %0              \n\t"
1253                     WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1254                     "pop %%"REG_BP"                         \n\t"
1255                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1256
1257                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1258                     "a" (&c->redDither)
1259                 );
1260 #endif
1261             } else {
1262                 __asm__ volatile(
1263                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1264                     "mov        %4, %%"REG_b"               \n\t"
1265                     "push %%"REG_BP"                        \n\t"
1266                     YSCALEYUV2RGB(%%REGBP, %5)
1267                     "pcmpeqd %%mm7, %%mm7                   \n\t"
1268                     WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1269                     "pop %%"REG_BP"                         \n\t"
1270                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1271
1272                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1273                     "a" (&c->redDither)
1274                 );
1275             }
1276             return;
1277         case PIX_FMT_BGR24:
1278             __asm__ volatile(
1279                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1280                 "mov        %4, %%"REG_b"               \n\t"
1281                 "push %%"REG_BP"                        \n\t"
1282                 YSCALEYUV2RGB(%%REGBP, %5)
1283                 "pxor    %%mm7, %%mm7                   \n\t"
1284                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1285                 "pop %%"REG_BP"                         \n\t"
1286                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1287                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1288                 "a" (&c->redDither)
1289             );
1290             return;
1291         case PIX_FMT_RGB555:
1292             __asm__ volatile(
1293                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1294                 "mov        %4, %%"REG_b"               \n\t"
1295                 "push %%"REG_BP"                        \n\t"
1296                 YSCALEYUV2RGB(%%REGBP, %5)
1297                 "pxor    %%mm7, %%mm7                   \n\t"
1298                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1299 #ifdef DITHER1XBPP
1300                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1301                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1302                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1303 #endif
1304
1305                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1306                 "pop %%"REG_BP"                         \n\t"
1307                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1308
1309                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1310                 "a" (&c->redDither)
1311             );
1312             return;
1313         case PIX_FMT_RGB565:
1314             __asm__ volatile(
1315                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1316                 "mov        %4, %%"REG_b"               \n\t"
1317                 "push %%"REG_BP"                        \n\t"
1318                 YSCALEYUV2RGB(%%REGBP, %5)
1319                 "pxor    %%mm7, %%mm7                   \n\t"
1320                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1321 #ifdef DITHER1XBPP
1322                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1323                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1324                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1325 #endif
1326
1327                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1328                 "pop %%"REG_BP"                         \n\t"
1329                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1330                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1331                 "a" (&c->redDither)
1332             );
1333             return;
1334         case PIX_FMT_YUYV422:
1335             __asm__ volatile(
1336                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1337                 "mov %4, %%"REG_b"                        \n\t"
1338                 "push %%"REG_BP"                        \n\t"
1339                 YSCALEYUV2PACKED(%%REGBP, %5)
1340                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1341                 "pop %%"REG_BP"                         \n\t"
1342                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1343                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1344                 "a" (&c->redDither)
1345             );
1346             return;
1347         default: break;
1348         }
1349     }
1350 #endif //COMPILE_TEMPLATE_MMX
1351     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1352 }
1353
1354 /**
1355  * YV12 to RGB without scaling or interpolating
1356  */
1357 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1358                           const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1359 {
1360     const int yalpha1=0;
1361     int i;
1362
1363     const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1364     const int yalpha= 4096; //FIXME ...
1365
1366     if (flags&SWS_FULL_CHR_H_INT) {
1367         c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1368         return;
1369     }
1370
1371 #if COMPILE_TEMPLATE_MMX
1372     if(!(flags & SWS_BITEXACT)) {
1373         if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1374             switch(dstFormat) {
1375             case PIX_FMT_RGB32:
1376                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1377                     __asm__ volatile(
1378                         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1379                         "mov        %4, %%"REG_b"               \n\t"
1380                         "push %%"REG_BP"                        \n\t"
1381                         YSCALEYUV2RGB1(%%REGBP, %5)
1382                         YSCALEYUV2RGB1_ALPHA(%%REGBP)
1383                         WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1384                         "pop %%"REG_BP"                         \n\t"
1385                         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1386
1387                         :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1388                         "a" (&c->redDither)
1389                     );
1390                 } else {
1391                     __asm__ volatile(
1392                         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1393                         "mov        %4, %%"REG_b"               \n\t"
1394                         "push %%"REG_BP"                        \n\t"
1395                         YSCALEYUV2RGB1(%%REGBP, %5)
1396                         "pcmpeqd %%mm7, %%mm7                   \n\t"
1397                         WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1398                         "pop %%"REG_BP"                         \n\t"
1399                         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1400
1401                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1402                         "a" (&c->redDither)
1403                     );
1404                 }
1405                 return;
1406             case PIX_FMT_BGR24:
1407                 __asm__ volatile(
1408                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1409                     "mov        %4, %%"REG_b"               \n\t"
1410                     "push %%"REG_BP"                        \n\t"
1411                     YSCALEYUV2RGB1(%%REGBP, %5)
1412                     "pxor    %%mm7, %%mm7                   \n\t"
1413                     WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1414                     "pop %%"REG_BP"                         \n\t"
1415                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1416
1417                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1418                     "a" (&c->redDither)
1419                 );
1420                 return;
1421             case PIX_FMT_RGB555:
1422                 __asm__ volatile(
1423                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1424                     "mov        %4, %%"REG_b"               \n\t"
1425                     "push %%"REG_BP"                        \n\t"
1426                     YSCALEYUV2RGB1(%%REGBP, %5)
1427                     "pxor    %%mm7, %%mm7                   \n\t"
1428                     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1429 #ifdef DITHER1XBPP
1430                     "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1431                     "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1432                     "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1433 #endif
1434                     WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1435                     "pop %%"REG_BP"                         \n\t"
1436                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1437
1438                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1439                     "a" (&c->redDither)
1440                 );
1441                 return;
1442             case PIX_FMT_RGB565:
1443                 __asm__ volatile(
1444                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1445                     "mov        %4, %%"REG_b"               \n\t"
1446                     "push %%"REG_BP"                        \n\t"
1447                     YSCALEYUV2RGB1(%%REGBP, %5)
1448                     "pxor    %%mm7, %%mm7                   \n\t"
1449                     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1450 #ifdef DITHER1XBPP
1451                     "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1452                     "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1453                     "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1454 #endif
1455
1456                     WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1457                     "pop %%"REG_BP"                         \n\t"
1458                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1459
1460                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1461                     "a" (&c->redDither)
1462                 );
1463                 return;
1464             case PIX_FMT_YUYV422:
1465                 __asm__ volatile(
1466                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1467                     "mov        %4, %%"REG_b"               \n\t"
1468                     "push %%"REG_BP"                        \n\t"
1469                     YSCALEYUV2PACKED1(%%REGBP, %5)
1470                     WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1471                     "pop %%"REG_BP"                         \n\t"
1472                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1473
1474                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475                     "a" (&c->redDither)
1476                 );
1477                 return;
1478             }
1479         } else {
1480             switch(dstFormat) {
1481             case PIX_FMT_RGB32:
1482                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1483                     __asm__ volatile(
1484                         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1485                         "mov        %4, %%"REG_b"               \n\t"
1486                         "push %%"REG_BP"                        \n\t"
1487                         YSCALEYUV2RGB1b(%%REGBP, %5)
1488                         YSCALEYUV2RGB1_ALPHA(%%REGBP)
1489                         WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1490                         "pop %%"REG_BP"                         \n\t"
1491                         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1492
1493                         :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1494                         "a" (&c->redDither)
1495                     );
1496                 } else {
1497                     __asm__ volatile(
1498                         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1499                         "mov        %4, %%"REG_b"               \n\t"
1500                         "push %%"REG_BP"                        \n\t"
1501                         YSCALEYUV2RGB1b(%%REGBP, %5)
1502                         "pcmpeqd %%mm7, %%mm7                   \n\t"
1503                         WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1504                         "pop %%"REG_BP"                         \n\t"
1505                         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1506
1507                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1508                         "a" (&c->redDither)
1509                     );
1510                 }
1511                 return;
1512             case PIX_FMT_BGR24:
1513                 __asm__ volatile(
1514                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1515                     "mov        %4, %%"REG_b"               \n\t"
1516                     "push %%"REG_BP"                        \n\t"
1517                     YSCALEYUV2RGB1b(%%REGBP, %5)
1518                     "pxor    %%mm7, %%mm7                   \n\t"
1519                     WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1520                     "pop %%"REG_BP"                         \n\t"
1521                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1522
1523                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1524                     "a" (&c->redDither)
1525                 );
1526                 return;
1527             case PIX_FMT_RGB555:
1528                 __asm__ volatile(
1529                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1530                     "mov        %4, %%"REG_b"               \n\t"
1531                     "push %%"REG_BP"                        \n\t"
1532                     YSCALEYUV2RGB1b(%%REGBP, %5)
1533                     "pxor    %%mm7, %%mm7                   \n\t"
1534                     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1535 #ifdef DITHER1XBPP
1536                     "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1537                     "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1538                     "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1539 #endif
1540                     WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1541                     "pop %%"REG_BP"                         \n\t"
1542                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1543
1544                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1545                     "a" (&c->redDither)
1546                 );
1547                 return;
1548             case PIX_FMT_RGB565:
1549                 __asm__ volatile(
1550                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1551                     "mov        %4, %%"REG_b"               \n\t"
1552                     "push %%"REG_BP"                        \n\t"
1553                     YSCALEYUV2RGB1b(%%REGBP, %5)
1554                     "pxor    %%mm7, %%mm7                   \n\t"
1555                     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1556 #ifdef DITHER1XBPP
1557                     "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1558                     "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1559                     "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1560 #endif
1561
1562                     WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1563                     "pop %%"REG_BP"                         \n\t"
1564                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1565
1566                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1567                     "a" (&c->redDither)
1568                 );
1569                 return;
1570             case PIX_FMT_YUYV422:
1571                 __asm__ volatile(
1572                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1573                     "mov        %4, %%"REG_b"               \n\t"
1574                     "push %%"REG_BP"                        \n\t"
1575                     YSCALEYUV2PACKED1b(%%REGBP, %5)
1576                     WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1577                     "pop %%"REG_BP"                         \n\t"
1578                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1579
1580                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1581                     "a" (&c->redDither)
1582                 );
1583                 return;
1584             }
1585         }
1586     }
1587 #endif /* COMPILE_TEMPLATE_MMX */
1588     if (uvalpha < 2048) {
1589         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1590     } else {
1591         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1592     }
1593 }
1594
1595 //FIXME yuy2* can read up to 7 samples too much
1596
/**
 * Extract the luma plane from packed YUYV422 input: dst[i] = src[2*i]
 * (see the C fallback).  The MMX path masks off the chroma bytes with
 * bm01010101 and packs the remaining words back to bytes, processing
 * 8 pixels per iteration; the loop counts a negative index up to zero.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "pand                %%mm2, %%mm0           \n\t"
        "pand                %%mm2, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1621
/**
 * Extract the chroma planes from packed YUYV422 input:
 * dstU[i] = src1[4*i + 1], dstV[i] = src1[4*i + 3] (see the C fallback).
 * src1 and src2 must point to the same line (asserted at the bottom);
 * the second pointer exists only to match the common reader signature.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "psrlw                  $8, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1655
/**
 * Read the high byte of little-endian 16-bit chroma samples from two
 * separate planes: dstU[i] = src1[2*i + 1], dstV[i] = src2[2*i + 1]
 * (see the C fallback).  The MMX path shifts each word right by 8 and
 * packs, 8 samples per plane per iteration.
 */
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "psrlw                  $8, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
#endif
}
1687
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extract the luma plane from packed UYVY422 input: dst[i] = src[2*i + 1]
 * (see the C fallback).  The MMX path shifts each word right by 8 to keep
 * the high (luma) byte, 8 pixels per iteration.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov                  %0, %%"REG_a"         \n\t"
        "1:                                         \n\t"
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
        "psrlw                $8, %%mm0             \n\t"
        "psrlw                $8, %%mm1             \n\t"
        "packuswb          %%mm1, %%mm0             \n\t"
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
        "add                  $8, %%"REG_a"         \n\t"
        " js                  1b                    \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1713
/**
 * Extract the chroma planes from packed UYVY422 input:
 * dstU[i] = src1[4*i + 0], dstV[i] = src1[4*i + 2] (see the C fallback).
 * src1 and src2 must point to the same line (asserted at the bottom).
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
1747
/**
 * Read the high byte of big-endian 16-bit chroma samples from two
 * separate planes: dstU[i] = src1[2*i], dstV[i] = src2[2*i] (see the
 * C fallback).  The MMX path masks with bm01010101 to keep the low
 * byte of each word (which is the first/high byte of a BE sample).
 */
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "pand                %%mm4, %%mm2           \n\t"
        "pand                %%mm4, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
#endif
}
1780
/**
 * Deinterleave a packed two-component chroma line (as found in NV12/NV21):
 * dst1[i] = src[2*i + 0], dst2[i] = src[2*i + 1] (see the C fallback).
 * The MMX path keeps the even bytes via a bm01010101 mask and the odd
 * bytes via a word shift, 8 output samples per plane per iteration.
 */
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq                %%mm0, %%mm2           \n\t"
        "movq                %%mm1, %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
#else
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i+0];
        dst2[i] = src[2*i+1];
    }
#endif
}
1814
/**
 * NV12 chroma reader: the interleaved plane is ordered U,V, so the even
 * bytes go to dstU and the odd bytes to dstV.  src2 and unused are
 * ignored; they exist only to match the common reader signature.
 */
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}
1821
/**
 * NV21 chroma reader: like nv12ToUV but the interleaved plane is ordered
 * V,U, so the destination pointers are swapped in the call below.
 */
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
1828
// FIXME Maybe dither instead.
/*
 * YUV_NBPS(depth) defines a pair of plain-C input readers that reduce
 * depth-bit (9- and 10-bit below) planar samples stored in uint16_t to
 * 8 bits by simple truncation (right shift by depth-8); no rounding or
 * dithering is applied (see the FIXME above).  The 'unused' parameter
 * exists only to match the common reader signature.
 */
#define YUV_NBPS(depth) \
static inline void RENAME(yuv ## depth ## ToUV)(uint8_t *dstU, uint8_t *dstV, \
                                     const uint16_t *srcU, const uint16_t *srcV, \
                                     long width, uint32_t *unused) \
{ \
    int i; \
    for (i = 0; i < width; i++) { \
        dstU[i] = srcU[i]>>(depth-8); \
        dstV[i] = srcV[i]>>(depth-8); \
    } \
} \
\
static inline void RENAME(yuv ## depth ## ToY)(uint8_t *dstY, const uint16_t *srcY, long width, uint32_t *unused) \
{ \
    int i; \
    for (i = 0; i < width; i++) \
        dstY[i] = srcY[i]>>(depth-8); \
} \

YUV_NBPS( 9)
YUV_NBPS(10)
1851
1852 #if COMPILE_TEMPLATE_MMX
/**
 * Convert packed 24-bit BGR (or RGB, selected by srcFormat) input to an
 * 8-bit luma line using MMX.  The first two asm statements preload the
 * per-format Y coefficient pairs into mm5/mm6; the main loop then
 * computes, for 4 pixels at a time, a pmaddwd dot product of the byte
 * triplets with those coefficients, adds the rounding/offset constant
 * ff_bgr24toYOffset (in mm4), shifts right by 15 and packs to bytes.
 * NOTE(review): relies on mm5/mm6 surviving between the separate asm
 * statements — nothing may clobber them in between.
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t"
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t"
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    : "%"REG_a
    );
}
1905
/* Convert one line of packed 24-bit BGR/RGB to 8-bit U and V planes with
 * MMX.  %4 points at the ff_bgr24toUV[] coefficient block selected by
 * srcFormat: offsets 0/8 hold the U pmaddwd coefficients, offsets 16/24
 * the V coefficients (24(%4) is cached in mm6).  Each iteration consumes
 * 4 pixels (12 bytes), adds the ff_bgr24toUVOffset bias, shifts down by
 * 15 and stores 4 bytes to each of dstU and dstV. */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq                   24(%4), %%mm6       \n\t"
        "mov                        %3, %%"REG_a"   \n\t" // negative byte index, counts up to 0
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        // pixels 0 and 1: mm0 = U accumulators, mm2 = V accumulators
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                  (%4), %%mm0       \n\t"
        "pmaddwd                 8(%4), %%mm1       \n\t"
        "pmaddwd                16(%4), %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        // pixels 2 and 3: mm1 = U accumulators, mm4 = V accumulators
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                  (%4), %%mm1       \n\t"
        "pmaddwd                 8(%4), %%mm3       \n\t"
        "pmaddwd                16(%4), %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        // bias, scale down and pack 4 U bytes and 4 V bytes
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
    : "%"REG_a
    );
}
1963 #endif
1964
1965 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1966 {
1967 #if COMPILE_TEMPLATE_MMX
1968     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1969 #else
1970     int i;
1971     for (i=0; i<width; i++) {
1972         int b= src[i*3+0];
1973         int g= src[i*3+1];
1974         int r= src[i*3+2];
1975
1976         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1977     }
1978 #endif /* COMPILE_TEMPLATE_MMX */
1979 }
1980
1981 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1982 {
1983 #if COMPILE_TEMPLATE_MMX
1984     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1985 #else
1986     int i;
1987     for (i=0; i<width; i++) {
1988         int b= src1[3*i + 0];
1989         int g= src1[3*i + 1];
1990         int r= src1[3*i + 2];
1991
1992         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1993         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1994     }
1995 #endif /* COMPILE_TEMPLATE_MMX */
1996     assert(src1 == src2);
1997 }
1998
1999 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2000 {
2001     int i;
2002     for (i=0; i<width; i++) {
2003         int b= src1[6*i + 0] + src1[6*i + 3];
2004         int g= src1[6*i + 1] + src1[6*i + 4];
2005         int r= src1[6*i + 2] + src1[6*i + 5];
2006
2007         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2008         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2009     }
2010     assert(src1 == src2);
2011 }
2012
2013 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
2014 {
2015 #if COMPILE_TEMPLATE_MMX
2016     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2017 #else
2018     int i;
2019     for (i=0; i<width; i++) {
2020         int r= src[i*3+0];
2021         int g= src[i*3+1];
2022         int b= src[i*3+2];
2023
2024         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2025     }
2026 #endif
2027 }
2028
2029 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2030 {
2031 #if COMPILE_TEMPLATE_MMX
2032     assert(src1==src2);
2033     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2034 #else
2035     int i;
2036     assert(src1==src2);
2037     for (i=0; i<width; i++) {
2038         int r= src1[3*i + 0];
2039         int g= src1[3*i + 1];
2040         int b= src1[3*i + 2];
2041
2042         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2043         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2044     }
2045 #endif
2046 }
2047
2048 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2049 {
2050     int i;
2051     assert(src1==src2);
2052     for (i=0; i<width; i++) {
2053         int r= src1[6*i + 0] + src1[6*i + 3];
2054         int g= src1[6*i + 1] + src1[6*i + 4];
2055         int b= src1[6*i + 2] + src1[6*i + 5];
2056
2057         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2058         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2059     }
2060 }
2061
2062
2063 // bilinear / bicubic scaling
/* Generic horizontal scaler: for each of the dstW output samples, reads
 * filterSize source pixels starting at src[filterPos[i]], convolves them
 * with filter[filterSize*i ..] and stores the sum >> 7 (clipped to int16_t
 * range in the C path; the MMX paths saturate via packssdw).  MMX builds
 * have hand-scheduled paths for filterSize 4 and 8 plus a generic loop;
 * AltiVec builds delegate to hScale_altivec_real(). */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
#if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        // Bias the pointers by a negative counter so the loop index can
        // run from -2*dstW up to 0 and terminate on the carry flag (jnc).
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push            %%"REG_b"              \n\t"
#endif
            "pxor                %%mm7, %%mm7       \n\t"
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
            ".p2align                4              \n\t"
            "1:                                     \n\t"
            // two outputs per iteration: positions from filterPos,
            // 4 filter taps each, dot product via pmaddwd
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
            "movd      (%3, %%"REG_a"), %%mm0       \n\t"
            "movd      (%3, %%"REG_b"), %%mm2       \n\t"
            "punpcklbw           %%mm7, %%mm0       \n\t"
            "punpcklbw           %%mm7, %%mm2       \n\t"
            "pmaddwd             %%mm1, %%mm0       \n\t"
            "pmaddwd             %%mm2, %%mm3       \n\t"
            "movq                %%mm0, %%mm4       \n\t"
            "punpckldq           %%mm3, %%mm0       \n\t"
            "punpckhdq           %%mm3, %%mm4       \n\t"
            "paddd               %%mm4, %%mm0       \n\t"
            "psrad                  $7, %%mm0       \n\t"
            "packssdw            %%mm0, %%mm0       \n\t"
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
            "add                    $4, %%"REG_BP"  \n\t"
            " jnc                   1b              \n\t"

            "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
            "pop             %%"REG_b"              \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push             %%"REG_b"             \n\t"
#endif
            "pxor                 %%mm7, %%mm7      \n\t"
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
            "mov              %%"REG_a", %%"REG_BP" \n\t"
            ".p2align                 4             \n\t"
            "1:                                     \n\t"
            // same as the filterSize==4 path, but two pmaddwd rounds
            // accumulate 8 taps per output
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
            "movd       (%3, %%"REG_a"), %%mm0      \n\t"
            "movd       (%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm0      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm0      \n\t"
            "pmaddwd              %%mm2, %%mm3      \n\t"

            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
            "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
            "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm4      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm4      \n\t"
            "pmaddwd              %%mm2, %%mm5      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "paddd                %%mm5, %%mm3      \n\t"
            "movq                 %%mm0, %%mm4      \n\t"
            "punpckldq            %%mm3, %%mm0      \n\t"
            "punpckhdq            %%mm3, %%mm4      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "psrad                   $7, %%mm0      \n\t"
            "packssdw             %%mm0, %%mm0      \n\t"
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
            "add                     $4, %%"REG_BP" \n\t"
            " jnc                    1b             \n\t"

            "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
            "pop              %%"REG_b"             \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        // generic path: inner loop 2 walks the filter 4 taps at a time
        // until the src cursor reaches 'offset' (src+filterSize)
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor                  %%mm7, %%mm7     \n\t"
            ".p2align                  4            \n\t"
            "1:                                     \n\t"
            "mov                      %2, %%"REG_c" \n\t"
            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
            "mov                      %5, %%"REG_c" \n\t"
            "pxor                  %%mm4, %%mm4     \n\t"
            "pxor                  %%mm5, %%mm5     \n\t"
            "2:                                     \n\t"
            "movq                   (%1), %%mm1     \n\t"
            "movq               (%1, %6), %%mm3     \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
            "punpcklbw             %%mm7, %%mm0     \n\t"
            "punpcklbw             %%mm7, %%mm2     \n\t"
            "pmaddwd               %%mm1, %%mm0     \n\t"
            "pmaddwd               %%mm2, %%mm3     \n\t"
            "paddd                 %%mm3, %%mm5     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "add                      $8, %1        \n\t"
            "add                      $4, %%"REG_c" \n\t"
            "cmp                      %4, %%"REG_c" \n\t"
            " jb                      2b            \n\t"
            "add                      %6, %1        \n\t"
            "movq                  %%mm4, %%mm0     \n\t"
            "punpckldq             %%mm5, %%mm4     \n\t"
            "punpckhdq             %%mm5, %%mm0     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "psrad                    $7, %%mm4     \n\t"
            "packssdw              %%mm4, %%mm4     \n\t"
            "mov                      %3, %%"REG_a" \n\t"
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
            "add                      $4, %0        \n\t"
            " jnc                     1b            \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
            "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    int i;
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++) {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* COMPILE_TEMPLATE_ALTIVEC */
#endif /* COMPILE_MMX */
}
2238
2239 //FIXME all pal and rgb srcFormats could do this convertion as well
2240 //FIXME all scalers more complex than bilinear could do half of this transform
2241 static void RENAME(chrRangeToJpeg)(int16_t *dst, int width)
2242 {
2243     int i;
2244     for (i = 0; i < width; i++) {
2245         dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2246         dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2247     }
2248 }
2249 static void RENAME(chrRangeFromJpeg)(int16_t *dst, int width)
2250 {
2251     int i;
2252     for (i = 0; i < width; i++) {
2253         dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
2254         dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2255     }
2256 }
2257 static void RENAME(lumRangeToJpeg)(int16_t *dst, int width)
2258 {
2259     int i;
2260     for (i = 0; i < width; i++)
2261         dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2262 }
2263 static void RENAME(lumRangeFromJpeg)(int16_t *dst, int width)
2264 {
2265     int i;
2266     for (i = 0; i < width; i++)
2267         dst[i] = (dst[i]*14071 + 33561947)>>14;
2268 }
2269
/* Core of the fast bilinear x86 scaler.  On entry %%edi = src[xx],
 * %%esi = src[xx+1], %%ecx = 16-bit xalpha fraction.  Computes
 * src[xx]*(1-xalpha) + src[xx+1]*xalpha in 16.16 fixed point and leaves
 * the result shifted down to 7 fractional bits (sample*128) in %%esi;
 * also reloads %1 (dst) into %%REG_D as a side effect. */
#define FAST_BILINEAR_X86 \
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
    "shll      $16, %%edi    \n\t"                                              \
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
    "mov        %1, %%"REG_D"\n\t"                                              \
    "shrl       $9, %%esi    \n\t"                                              \

/* Fast (lower-quality) horizontal bilinear scaling of one luma/alpha line
 * into 8.7 fixed point (sample*128).  With MMX2 and c->canMMX2BeUsed, calls
 * into the run-time generated scaler code (c->lumMmx2FilterCode) and then
 * patches up the right edge in C; otherwise uses a generic x86 bilinear
 * loop, or plain C on non-x86. */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t" // save ebx for PIC builds
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

/* Invoke one chunk of the generated filter code, then advance the src
 * pointer/output position from the filterPos table. */
#if ARCH_X86_64

#define CALL_MMX2_FILTER_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t" // restore ebx
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        // right-edge fixup: replicate the last source pixel (scaled by 128)
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    x86_reg dstWidth_reg = dstWidth;
    //NO MMX just normal asm ...
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ".p2align    4                       \n\t"
        "1:                                  \n\t"
        // two output samples per iteration, unrolled
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2400
2401       // *** horizontal scale Y line to temp buffer
2402 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2403                                    const int16_t *hLumFilter,
2404                                    const int16_t *hLumFilterPos, int hLumFilterSize,
2405                                    uint8_t *formatConvBuffer,
2406                                    uint32_t *pal, int isAlpha)
2407 {
2408     void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2409     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2410
2411     src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2412
2413     if (toYV12) {
2414         toYV12(formatConvBuffer, src, srcW, pal);
2415         src= formatConvBuffer;
2416     }
2417
2418     if (!c->hyscale_fast) {
2419         c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2420     } else { // fast bilinear upscale / crap downscale
2421         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2422     }
2423
2424     if (convertRange)
2425         convertRange(dst, dstWidth);
2426 }
2427
/* Fast (lower-quality) horizontal bilinear scaling of one chroma line pair:
 * the U result goes to dst[0..dstWidth-1], the V result to dst[VOFW..].
 * With MMX2 and c->canMMX2BeUsed, runs the run-time generated scaler code
 * (c->chrMmx2FilterCode) once per plane and patches up the right edge in C;
 * otherwise uses a generic x86 bilinear loop, or plain C on non-x86. */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t" // save ebx for PIC builds
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            // first pass: U plane (src1 -> dst)
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            // second pass: V plane (src2 -> dst+VOF)
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t" // restore ebx
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        // right-edge fixup: replicate the last source pixels (scaled by 128)
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            //printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        x86_reg dstWidth_reg = dstWidth;
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
            ".p2align    4                          \n\t"
            "1:                                     \n\t"
            // interpolate one U and one V sample per iteration
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
            :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2549
2550 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2551                                    int srcW, int xInc, const int16_t *hChrFilter,
2552                                    const int16_t *hChrFilterPos, int hChrFilterSize,
2553                                    uint8_t *formatConvBuffer,
2554                                    uint32_t *pal)
2555 {
2556
2557     src1 += c->chrSrcOffset;
2558     src2 += c->chrSrcOffset;
2559
2560     if (c->chrToYV12) {
2561         c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2562         src1= formatConvBuffer;
2563         src2= formatConvBuffer+VOFW;
2564     }
2565
2566     if (!c->hcscale_fast) {
2567         c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2568         c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2569     } else { // fast bilinear upscale / crap downscale
2570         c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2571     }
2572
2573     if (c->chrConvertRange)
2574         c->chrConvertRange(dst, dstWidth);
2575 }
2576
2577 #define DEBUG_SWSCALE_BUFFERS 0
2578 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2579
/* Main template scaling entry point.
 * Consumes a horizontal slice of the source picture (srcSliceH lines starting
 * at srcSliceY): horizontally scales the input lines it needs into the
 * luma/chroma/alpha ring buffers, then vertically filters and converts them
 * into the destination planes.  State (dstY, buffer indices, last buffered
 * input lines) is carried across calls in the context so scaling can proceed
 * slice by slice.
 * Returns the number of destination lines produced for this slice. */
static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    /* -((-x) >> s) is a round-up shift, so partial chroma lines are counted. */
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* Packed formats carry everything in plane 0; replicate pointer and
     * stride so the per-plane code below can be used unchanged. */
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    /* vChrDrop skips chroma input lines by enlarging the chroma strides. */
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        /* First slice of a frame: reset the ring-buffer bookkeeping. */
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        /* firstLumSrcY2: first input line needed by the last luma output line
         * of the current chroma group (dstY rounded up to the subsample grid). */
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        if (!enough_lines) {
            /* Not enough input yet: clamp to what this slice provides and
             * just buffer the lines; output happens on a later slice. */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                                            lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
                            formatConvBuffer,
                            pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                /* Alpha reuses the luma scaler; last argument selects alpha mode. */
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
                                formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
        }
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
                                formatConvBuffer,
                                pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if COMPILE_TEMPLATE_MMX
        /* Per-line dither tables for the RGB output paths. */
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2) {
            /* Normal path: point into the ring buffers at the first line the
             * vertical filter needs (ring buffer is mirrored, hence +BufSize). */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            /* Build the per-line MMX filter descriptors (pointer + coefficient
             * pairs) that the asm vertical scalers read. */
            int i;
            if (flags & SWS_ACCURATE_RND) {
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                /* Non-accurate path: store each line pointer split into two
                 * 32-bit halves plus the coefficient replicated into two words. */
                for (i=0; i<vLumFilterSize; i++) {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            /* Last two lines: fall back to the C output functions so the MMX
             * code never reads past the end of the filter arrays. */
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else {
                    yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* Destination wants an alpha plane but the source had none: fill opaque. */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
    else                             __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
2945
/* Initialize the function pointers of a SwsContext for this template
 * instantiation (the RENAME() variants selected at compile time): the
 * vertical output functions, horizontal scalers, the per-format
 * luma/chroma/alpha input converters, source byte offsets, and the
 * optional range-conversion hooks. */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    /* Vertical scaling / output converters. */
    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );

    /* General horizontal FIR scaler. */
    c->hScale       = RENAME(hScale      );

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
#else
    if (c->flags & SWS_FAST_BILINEAR)
#endif
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }

    /* Chroma input converter: NULL means the input is already planar chroma. */
    c->chrToYV12 = NULL;
    switch(srcFormat) {
        case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
        case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
        case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
        case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
        case PIX_FMT_RGB8     :
        case PIX_FMT_BGR8     :
        case PIX_FMT_PAL8     :
        case PIX_FMT_BGR4_BYTE:
        case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
        case PIX_FMT_YUV420P9 : c->chrToYV12 = (void*)RENAME(yuv9ToUV ); break;
        case PIX_FMT_YUV422P10:
        case PIX_FMT_YUV420P10: c->chrToYV12 = (void*)RENAME(yuv10ToUV); break;
        case PIX_FMT_YUV420P16BE:
        case PIX_FMT_YUV422P16BE:
        case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
        case PIX_FMT_YUV420P16LE:
        case PIX_FMT_YUV422P16LE:
        case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
    }
    if (c->chrSrcHSubSample) {
        /* Horizontally subsampled chroma output: use the _half converters,
         * which average horizontal pixel pairs while unpacking. */
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
        case PIX_FMT_BGR48BE:
        case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_half; break;
        case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_half;  break;
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_half;  break;
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
        case PIX_FMT_BGR48BE:
        case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV; break;
        case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV;  break;
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
        case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV;  break;
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
        }
    }

    /* Luma and alpha input converters; NULL means input is already planar. */
    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUV420P9 : c->lumToYV12 = (void*)RENAME(yuv9ToY ); break;
    case PIX_FMT_YUV422P10:
    case PIX_FMT_YUV420P10: c->lumToYV12 = (void*)RENAME(yuv10ToY); break;
    case PIX_FMT_YUYV422  :
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY8A   :
    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
    case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY;  break;
    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY; break;
    case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY;  break;
    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY; break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
    case PIX_FMT_BGR48BE:
    case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48ToY; break;
    }
    if (c->alpPixBuf) {
        /* Alpha extraction is only needed when an alpha ring buffer exists. */
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
        case PIX_FMT_GRAY8A : c->alpToYV12 = RENAME(yuy2ToY); break;
        case PIX_FMT_PAL8   : c->alpToYV12 = palToA; break;
        }
    }

    /* Byte offsets into the packed source where each component starts. */
    switch (srcFormat) {
    case PIX_FMT_GRAY8A :
        c->alpSrcOffset = 1;
        break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB48LE:
    case PIX_FMT_BGR48LE:
        /* 16-bit little-endian samples: read the high byte of each word. */
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    }

    /* JPEG (full) <-> MPEG (limited) range conversion, YUV outputs only. */
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }

    /* Gray and monochrome formats have no chroma to scale. */
    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
}