/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined (HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
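/* Note: with MMX2, MOVNTQ expands to a non-temporal (write-combining) store that
   bypasses the cache; SFENCE above is the matching store fence. Without MMX2 it
   falls back to a plain movq. */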

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

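/* Vertical scaling/output to 8-bit planar data: the loop below walks the per-line
 * filter (16-byte entries of source pointer + coefficients, terminated by a NULL
 * pointer), accumulates coeff * src[j][i] for every output pixel, adds the
 * rounding value from VROUNDER_OFFSET and shifts the result down to 8 bits.
 * Roughly what the C fallback yuv2yuvXinC() does (a sketch, not part of the build):
 *
 *     for (i = 0; i < width; i++) {
 *         int val = 1 << 18;                    // rounding
 *         for (j = 0; j < filterSize; j++)
 *             val += src[j][i] * filter[j];
 *         dest[i] = av_clip_uint8(val >> 19);
 *     }
 *
 * YSCALEYUV2YV12X uses pmulhw and therefore drops the low 16 bits of each product;
 * the _ACCURATE variant below keeps full precision with pmaddwd. */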
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    asm volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t"\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    asm volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t"\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

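/* YSCALEYUV2YV121 covers the unscaled (1:1) vertical case: each 16-bit sample is
 * simply shifted down by 7 and packed with unsigned saturation, i.e. roughly
 * dest[i] = clip(src[i] >> 7).  The _ACCURATE variant first builds the rounding
 * constant 64 in mm7 (pcmpeqw -> 0xffff, psrlw $15 -> 1, psllw $6 -> 64) and adds
 * it before the shift, matching the (x + 64) >> 7 of the C fallback further down. */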
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX \
    asm volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
    "movq                      %%mm1, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm1         \n\t"\
    "paddw                     %%mm5, %%mm7         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\

#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );

#define YSCALEYUV2PACKEDX_ACCURATE \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\

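/* YSCALEYUV2RGBX converts the vertically filtered Y (mm1/mm7) and U/V (mm3/mm4)
 * words into packed B, G and R bytes.  In fixed point this is roughly
 *     B = (Y - y_off) * y_coeff + (U - 128) * ub_coeff
 *     G = (Y - y_off) * y_coeff + (U - 128) * ug_coeff + (V - 128) * vg_coeff
 *     R = (Y - y_off) * y_coeff + (V - 128) * vr_coeff
 * with pmulhw providing the >>16 of each product, the coefficients and offsets
 * taken from the SwsContext via the *_COEFF / *_OFFSET byte offsets, and the
 * final packuswb clipping everything to 0..255. */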
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\
    "pxor            %%mm7, %%mm7       \n\t"
#if 0
#define FULL_YSCALEYUV2RGB \
    "pxor                 %%mm7, %%mm7  \n\t"\
    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "xor              %%"REG_a", %%"REG_a"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
\
\
    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
    "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
\
\
    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
    "packuswb             %%mm3, %%mm3  \n\t"\
\
    "packuswb             %%mm0, %%mm0  \n\t"\
    "paddw                %%mm4, %%mm2  \n\t"\
    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
\
    "packuswb             %%mm1, %%mm1  \n\t"
#endif

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
462     "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
463     "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
464     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
465     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
466     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
467     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
468     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
469     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
470     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
471     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
472     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
473     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
474     "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
475     "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
476     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
477     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
478
479 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
480
481 #define REAL_YSCALEYUV2RGB(index, c) \
482     "xor            "#index", "#index"  \n\t"\
483     ASMALIGN(4)\
484     "1:                                 \n\t"\
485     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
486     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
487     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
488     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
489     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
490     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
491     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
492     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
493     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
494     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
495     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
496     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
497     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
498     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
499     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
500     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
501     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
502     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
503     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
504     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
505     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
506     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
507     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
508     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
509     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
510     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
511     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
512     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
513     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
514     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
515     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
516     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
517     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
518     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
519     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
520     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
521     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
522     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
523     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
524     "paddw             %%mm3, %%mm4     \n\t"\
525     "movq              %%mm2, %%mm0     \n\t"\
526     "movq              %%mm5, %%mm6     \n\t"\
527     "movq              %%mm4, %%mm3     \n\t"\
528     "punpcklwd         %%mm2, %%mm2     \n\t"\
529     "punpcklwd         %%mm5, %%mm5     \n\t"\
530     "punpcklwd         %%mm4, %%mm4     \n\t"\
531     "paddw             %%mm1, %%mm2     \n\t"\
532     "paddw             %%mm1, %%mm5     \n\t"\
533     "paddw             %%mm1, %%mm4     \n\t"\
534     "punpckhwd         %%mm0, %%mm0     \n\t"\
535     "punpckhwd         %%mm6, %%mm6     \n\t"\
536     "punpckhwd         %%mm3, %%mm3     \n\t"\
537     "paddw             %%mm7, %%mm0     \n\t"\
538     "paddw             %%mm7, %%mm6     \n\t"\
539     "paddw             %%mm7, %%mm3     \n\t"\
540     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
541     "packuswb          %%mm0, %%mm2     \n\t"\
542     "packuswb          %%mm6, %%mm5     \n\t"\
543     "packuswb          %%mm3, %%mm4     \n\t"\
544     "pxor              %%mm7, %%mm7     \n\t"
545 #define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
546
547 #define REAL_YSCALEYUV2PACKED1(index, c) \
548     "xor            "#index", "#index"  \n\t"\
549     ASMALIGN(4)\
550     "1:                                 \n\t"\
551     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
552     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
553     "psraw                $7, %%mm3     \n\t" \
554     "psraw                $7, %%mm4     \n\t" \
555     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
556     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
557     "psraw                $7, %%mm1     \n\t" \
558     "psraw                $7, %%mm7     \n\t" \
559
560 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
561
562 #define REAL_YSCALEYUV2RGB1(index, c) \
563     "xor            "#index", "#index"  \n\t"\
564     ASMALIGN(4)\
565     "1:                                 \n\t"\
566     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
567     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
568     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
569     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
570     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
571     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
572     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
573     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
574     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
575     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
576     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
577     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
578     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
579     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
580     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
581     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
582     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
583     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
584     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
585     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
586     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
587     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
588     "paddw             %%mm3, %%mm4     \n\t"\
589     "movq              %%mm2, %%mm0     \n\t"\
590     "movq              %%mm5, %%mm6     \n\t"\
591     "movq              %%mm4, %%mm3     \n\t"\
592     "punpcklwd         %%mm2, %%mm2     \n\t"\
593     "punpcklwd         %%mm5, %%mm5     \n\t"\
594     "punpcklwd         %%mm4, %%mm4     \n\t"\
595     "paddw             %%mm1, %%mm2     \n\t"\
596     "paddw             %%mm1, %%mm5     \n\t"\
597     "paddw             %%mm1, %%mm4     \n\t"\
598     "punpckhwd         %%mm0, %%mm0     \n\t"\
599     "punpckhwd         %%mm6, %%mm6     \n\t"\
600     "punpckhwd         %%mm3, %%mm3     \n\t"\
601     "paddw             %%mm7, %%mm0     \n\t"\
602     "paddw             %%mm7, %%mm6     \n\t"\
603     "paddw             %%mm7, %%mm3     \n\t"\
604     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
605     "packuswb          %%mm0, %%mm2     \n\t"\
606     "packuswb          %%mm6, %%mm5     \n\t"\
607     "packuswb          %%mm3, %%mm4     \n\t"\
608     "pxor              %%mm7, %%mm7     \n\t"
609 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
610
611 #define REAL_YSCALEYUV2PACKED1b(index, c) \
612     "xor "#index", "#index"             \n\t"\
613     ASMALIGN(4)\
614     "1:                                 \n\t"\
615     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
616     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
617     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
618     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
619     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
620     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
621     "psrlw                $8, %%mm3     \n\t" \
622     "psrlw                $8, %%mm4     \n\t" \
623     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
624     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
625     "psraw                $7, %%mm1     \n\t" \
626     "psraw                $7, %%mm7     \n\t"
627 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
628
629 // do vertical chrominance interpolation
630 #define REAL_YSCALEYUV2RGB1b(index, c) \
631     "xor            "#index", "#index"  \n\t"\
632     ASMALIGN(4)\
633     "1:                                 \n\t"\
634     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
635     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
636     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
637     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
638     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
639     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
640     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
641     "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
642     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
643     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
644     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
645     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
646     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
647     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
648     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
649     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
650     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
651     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
652     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
653     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
654     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
655     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
656     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
657     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
658     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
659     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
660     "paddw             %%mm3, %%mm4     \n\t"\
661     "movq              %%mm2, %%mm0     \n\t"\
662     "movq              %%mm5, %%mm6     \n\t"\
663     "movq              %%mm4, %%mm3     \n\t"\
664     "punpcklwd         %%mm2, %%mm2     \n\t"\
665     "punpcklwd         %%mm5, %%mm5     \n\t"\
666     "punpcklwd         %%mm4, %%mm4     \n\t"\
667     "paddw             %%mm1, %%mm2     \n\t"\
668     "paddw             %%mm1, %%mm5     \n\t"\
669     "paddw             %%mm1, %%mm4     \n\t"\
670     "punpckhwd         %%mm0, %%mm0     \n\t"\
671     "punpckhwd         %%mm6, %%mm6     \n\t"\
672     "punpckhwd         %%mm3, %%mm3     \n\t"\
673     "paddw             %%mm7, %%mm0     \n\t"\
674     "paddw             %%mm7, %%mm6     \n\t"\
675     "paddw             %%mm7, %%mm3     \n\t"\
676     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
677     "packuswb          %%mm0, %%mm2     \n\t"\
678     "packuswb          %%mm6, %%mm5     \n\t"\
679     "packuswb          %%mm3, %%mm4     \n\t"\
680     "pxor              %%mm7, %%mm7     \n\t"
681 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
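/* Naming of the scale macros above: the plain YSCALEYUV2RGB / YSCALEYUV2PACKED
 * versions blend two source lines using the 2-tap weights stored at
 * LUM/CHR_MMX_FILTER_OFFSET+8, the *1 variants read a single source line, and
 * the *1b variants average two chroma lines (vertical chroma interpolation)
 * while still taking luma from one line. */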
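/* The WRITE* macros below take the packed B (mm2), G (mm4) and R (mm5) bytes
 * produced above and store 8 pixels per iteration in the requested packed
 * format, then advance the index and loop back to label 1.  WRITEBGR32, for
 * instance, interleaves the bytes into 4-byte B,G,R,0 pixels (four quadwords
 * per store group); the RGB15/16 variants mask and shift the components into
 * 5/6-bit fields first. */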
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0,   (dst, index, 4))\
    MOVNTQ(%%mm2,  8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)

#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
904     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
905     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
906     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
907 \
908     "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
909     "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
910     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
911 \
912     "por    %%mm1, %%mm3        \n\t"\
913     "por    %%mm3, %%mm6        \n\t"\
914     MOVNTQ(%%mm6, 16(dst))\
915 \
916     "add      $24, "#dst"       \n\t"\
917 \
918     "add       $8, "#index"     \n\t"\
919     "cmp  "#dstw", "#index"     \n\t"\
920     " jb       1b               \n\t"
921
922 #ifdef HAVE_MMX2
923 #undef WRITEBGR24
924 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
925 #else
926 #undef WRITEBGR24
927 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
928 #endif
929
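/* WRITEYUY2 packs the 8 luma and 4+4 chroma values into the interleaved
   Y0 U0 Y1 V0 ... byte order of YUYV422, 16 output bytes per iteration. */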
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)


static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}

static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
             chrFilter, chrSrc, chrFilterSize,
             dest, uDest, dstW, chrDstW, dstFormat);
}

static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
    int i;
#ifdef HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= uDest ? 3 : 1;
        uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[3]= {dest, uDest, vDest};
        long counter[3] = {dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                asm volatile(
                    YSCALEYUV2YV121_ACCURATE
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                    "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }else{
            while(p--){
                asm volatile(
                    YSCALEYUV2YV121
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                    "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

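        /* val is roughly within -256..256 here, so testing bit 8 catches both
           the negative and the >255 case with a single branch. */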
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
}


/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
{
#ifdef HAVE_MMX
    long dummy=0;
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            switch(c->dstFormat){
            case PIX_FMT_RGB32:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                WRITEBGR32(%4, %5, %%REGa)

                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
                "add %4, %%"REG_c"                        \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)


                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3    \n\t"
                "psraw $3, %%mm4    \n\t"
                "psraw $3, %%mm1    \n\t"
                "psraw $3, %%mm7    \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }else{
            switch(c->dstFormat)
            {
            case PIX_FMT_RGB32:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                WRITEBGR32(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
                "add                        %4, %%"REG_c"   \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest),  "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3    \n\t"
                "psraw $3, %%mm4    \n\t"
                "psraw $3, %%mm1    \n\t"
                "psraw $3, %%mm7    \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* HAVE_MMX */
#ifdef HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of altivec_yuv2packedX() */
    if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                                 chrFilter, chrSrc, chrFilterSize,
                                 dest, dstW, dstY);
1199     else
1200 #endif
1201         yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1202                        chrFilter, chrSrc, chrFilterSize,
1203                        dest, dstW, dstY);
1204 }
1205
1206 /**
1207  * vertical bilinear scale YV12 to RGB
1208  */
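/* Blends the two adjacent source lines buf0/buf1 (and uvbuf0/uvbuf1) with the
 * 12-bit weights yalpha/uvalpha before converting to the packed dstFormat.
 * MMX fast paths cover RGB32, BGR24, RGB555/565 and YUYV422; the C fallback
 * goes through YSCALE_YUV_2_ANYRGB_C with the *2_C conversion macros. */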
1209 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1210                           uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1211 {
1212     int  yalpha1=4095- yalpha;
1213     int uvalpha1=4095-uvalpha;
1214     int i;
1215
1216 #if 0 //isn't used
1217     if (flags&SWS_FULL_CHR_H_INT)
1218     {
1219         switch(dstFormat)
1220         {
1221 #ifdef HAVE_MMX
1222         case PIX_FMT_RGB32:
1223             asm volatile(
1224
1225
1226 FULL_YSCALEYUV2RGB
1227             "punpcklbw %%mm1, %%mm3    \n\t" // BGBGBGBG
1228             "punpcklbw %%mm7, %%mm0    \n\t" // R0R0R0R0
1229
1230             "movq      %%mm3, %%mm1    \n\t"
1231             "punpcklwd %%mm0, %%mm3    \n\t" // BGR0BGR0
1232             "punpckhwd %%mm0, %%mm1    \n\t" // BGR0BGR0
1233
1234             MOVNTQ(%%mm3,  (%4, %%REGa, 4))
1235             MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1236
1237             "add $4, %%"REG_a"  \n\t"
1238             "cmp %5, %%"REG_a"  \n\t"
1239             " jb 1b             \n\t"
1240
1241             :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1242             "m" (yalpha1), "m" (uvalpha1)
1243             : "%"REG_a
1244             );
1245             break;
1246         case PIX_FMT_BGR24:
1247             asm volatile(
1248
1249 FULL_YSCALEYUV2RGB
1250
1251                                               // lsb ... msb
1252             "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
1253             "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0
1254
1255             "movq      %%mm3, %%mm1     \n\t"
1256             "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
1257             "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0
1258
1259             "movq      %%mm3, %%mm2     \n\t" // BGR0BGR0
1260             "psrlq        $8, %%mm3     \n\t" // GR0BGR00
1261             "pand "MANGLE(bm00000111)", %%mm2   \n\t" // BGR00000
1262             "pand "MANGLE(bm11111000)", %%mm3   \n\t" // 000BGR00
1263             "por       %%mm2, %%mm3     \n\t" // BGRBGR00
1264             "movq      %%mm1, %%mm2     \n\t"
1265             "psllq       $48, %%mm1     \n\t" // 000000BG
1266             "por       %%mm1, %%mm3     \n\t" // BGRBGRBG
1267
1268             "movq      %%mm2, %%mm1     \n\t" // BGR0BGR0
1269             "psrld       $16, %%mm2     \n\t" // R000R000
1270             "psrlq       $24, %%mm1     \n\t" // 0BGR0000
1271             "por       %%mm2, %%mm1     \n\t" // RBGRR000
1272
1273             "mov          %4, %%"REG_b" \n\t"
1274             "add   %%"REG_a", %%"REG_b" \n\t"
1275
1276 #ifdef HAVE_MMX2
1277             //FIXME Alignment
1278             "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2)   \n\t"
1279             "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)   \n\t"
1280 #else
1281             "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)     \n\t"
1282             "psrlq  $32, %%mm3                          \n\t"
1283             "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)     \n\t"
1284             "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)     \n\t"
1285 #endif
1286             "add     $4, %%"REG_a"                      \n\t"
1287             "cmp     %5, %%"REG_a"                      \n\t"
1288             " jb     1b                                 \n\t"
1289
1290             :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1291             "m" (yalpha1), "m" (uvalpha1)
1292             : "%"REG_a, "%"REG_b
1293             );
1294             break;
1295         case PIX_FMT_BGR555:
1296             asm volatile(
1297
1298 FULL_YSCALEYUV2RGB
1299 #ifdef DITHER1XBPP
1300             "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1301             "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1302             "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1303 #endif
1304             "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1305             "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1306             "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1307
1308             "psrlw                   $3, %%mm3  \n\t"
1309             "psllw                   $2, %%mm1  \n\t"
1310             "psllw                   $7, %%mm0  \n\t"
1311             "pand     "MANGLE(g15Mask)", %%mm1  \n\t"
1312             "pand     "MANGLE(r15Mask)", %%mm0  \n\t"
1313
1314             "por                  %%mm3, %%mm1  \n\t"
1315             "por                  %%mm1, %%mm0  \n\t"
1316
1317             MOVNTQ(%%mm0, (%4, %%REGa, 2))
1318
1319             "add $4, %%"REG_a"  \n\t"
1320             "cmp %5, %%"REG_a"  \n\t"
1321             " jb 1b             \n\t"
1322
1323             :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1324             "m" (yalpha1), "m" (uvalpha1)
1325             : "%"REG_a
1326             );
1327             break;
1328         case PIX_FMT_BGR565:
1329             asm volatile(
1330
1331 FULL_YSCALEYUV2RGB
1332 #ifdef DITHER1XBPP
1333             "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1334             "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1335             "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1336 #endif
1337             "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1338             "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1339             "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1340
1341             "psrlw                   $3, %%mm3  \n\t"
1342             "psllw                   $3, %%mm1  \n\t"
1343             "psllw                   $8, %%mm0  \n\t"
1344             "pand     "MANGLE(g16Mask)", %%mm1  \n\t"
1345             "pand     "MANGLE(r16Mask)", %%mm0  \n\t"
1346
1347             "por                  %%mm3, %%mm1  \n\t"
1348             "por                  %%mm1, %%mm0  \n\t"
1349
1350             MOVNTQ(%%mm0, (%4, %%REGa, 2))
1351
1352             "add $4, %%"REG_a"  \n\t"
1353             "cmp %5, %%"REG_a"  \n\t"
1354             " jb 1b             \n\t"
1355
1356             :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1357             "m" (yalpha1), "m" (uvalpha1)
1358             : "%"REG_a
1359             );
1360             break;
1361 #endif /* HAVE_MMX */
1362         case PIX_FMT_BGR32:
1363 #ifndef HAVE_MMX
1364         case PIX_FMT_RGB32:
1365 #endif
1366             if (dstFormat==PIX_FMT_RGB32)
1367             {
1368                 int i;
1369 #ifdef WORDS_BIGENDIAN
1370                 dest++;
1371 #endif
1372                 for (i=0;i<dstW;i++){
1373                     // vertical linear interpolation && yuv2rgb in a single step:
1374                     int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1375                     int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1376                     int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1377                     dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1378                     dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1379                     dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1380                     dest+= 4;
1381                 }
1382             }
1383             else if (dstFormat==PIX_FMT_BGR24)
1384             {
1385                 int i;
1386                 for (i=0;i<dstW;i++){
1387                     // vertical linear interpolation && yuv2rgb in a single step:
1388                     int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1389                     int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1390                     int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1391                     dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1392                     dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1393                     dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1394                     dest+= 3;
1395                 }
1396             }
1397             else if (dstFormat==PIX_FMT_BGR565)
1398             {
1399                 int i;
1400                 for (i=0;i<dstW;i++){
1401                     // vertical linear interpolation && yuv2rgb in a single step:
1402                     int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1403                     int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1404                     int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1405
1406                     ((uint16_t*)dest)[i] =
1407                         clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1408                         clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1409                         clip_table16r[(Y + yuvtab_3343[V]) >>13];
1410                 }
1411             }
1412             else if (dstFormat==PIX_FMT_BGR555)
1413             {
1414                 int i;
1415                 for (i=0;i<dstW;i++){
1416                     // vertical linear interpolation && yuv2rgb in a single step:
1417                     int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1418                     int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1419                     int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1420
1421                     ((uint16_t*)dest)[i] =
1422                         clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1423                         clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1424                         clip_table15r[(Y + yuvtab_3343[V]) >>13];
1425                 }
1426             }
1427         }//FULL_UV_IPOL
1428     else
1429     {
1430 #endif // if 0
1431 #ifdef HAVE_MMX
1432     if(!(c->flags & SWS_BITEXACT)){
1433         switch(c->dstFormat)
1434         {
1435             //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1436             case PIX_FMT_RGB32:
1437                 asm volatile(
1438                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1439                 "mov        %4, %%"REG_b"               \n\t"
1440                 "push %%"REG_BP"                        \n\t"
1441                 YSCALEYUV2RGB(%%REGBP, %5)
1442                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1443                 "pop %%"REG_BP"                         \n\t"
1444                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1445
1446                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1447                 "a" (&c->redDither)
1448                 );
1449                 return;
1450             case PIX_FMT_BGR24:
1451                 asm volatile(
1452                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1453                 "mov        %4, %%"REG_b"               \n\t"
1454                 "push %%"REG_BP"                        \n\t"
1455                 YSCALEYUV2RGB(%%REGBP, %5)
1456                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1457                 "pop %%"REG_BP"                         \n\t"
1458                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1459                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1460                 "a" (&c->redDither)
1461                 );
1462                 return;
1463             case PIX_FMT_RGB555:
1464                 asm volatile(
1465                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1466                 "mov        %4, %%"REG_b"               \n\t"
1467                 "push %%"REG_BP"                        \n\t"
1468                 YSCALEYUV2RGB(%%REGBP, %5)
1469                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1470 #ifdef DITHER1XBPP
1471                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1472                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1473                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1474 #endif
1475
1476                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1477                 "pop %%"REG_BP"                         \n\t"
1478                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1479
1480                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1481                 "a" (&c->redDither)
1482                 );
1483                 return;
1484             case PIX_FMT_RGB565:
1485                 asm volatile(
1486                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1487                 "mov        %4, %%"REG_b"               \n\t"
1488                 "push %%"REG_BP"                        \n\t"
1489                 YSCALEYUV2RGB(%%REGBP, %5)
1490                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1491 #ifdef DITHER1XBPP
1492                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1493                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1494                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1495 #endif
1496
1497                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1498                 "pop %%"REG_BP"                         \n\t"
1499                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1500                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1501                 "a" (&c->redDither)
1502                 );
1503                 return;
1504             case PIX_FMT_YUYV422:
1505                 asm volatile(
1506                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1507                 "mov %4, %%"REG_b"                        \n\t"
1508                 "push %%"REG_BP"                        \n\t"
1509                 YSCALEYUV2PACKED(%%REGBP, %5)
1510                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1511                 "pop %%"REG_BP"                         \n\t"
1512                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1513                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514                 "a" (&c->redDither)
1515                 );
1516                 return;
1517             default: break;
1518         }
1519     }
1520 #endif //HAVE_MMX
1521 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1522 }
1523
1524 /**
1525  * YV12 to RGB without scaling or interpolating
1526  */
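/* Uses a single luma line (buf0). For uvalpha < 2048 only uvbuf0 is read
 * (shifting chroma by half a pixel, but faster), otherwise the two chroma
 * lines are averaged; SWS_FULL_CHR_H_INT delegates to yuv2packed2(). */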
1527 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1528                           uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1529 {
1530     const int yalpha1=0;
1531     int i;
1532
1533     uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1534     const int yalpha= 4096; //FIXME ...
1535
1536     if (flags&SWS_FULL_CHR_H_INT)
1537     {
1538         RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1539         return;
1540     }
1541
1542 #ifdef HAVE_MMX
1543     if(!(flags & SWS_BITEXACT)){
1544         if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1545         {
1546             switch(dstFormat)
1547             {
1548             case PIX_FMT_RGB32:
1549                 asm volatile(
1550                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1551                 "mov        %4, %%"REG_b"               \n\t"
1552                 "push %%"REG_BP"                        \n\t"
1553                 YSCALEYUV2RGB1(%%REGBP, %5)
1554                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1555                 "pop %%"REG_BP"                         \n\t"
1556                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1557
1558                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1559                 "a" (&c->redDither)
1560                 );
1561                 return;
1562             case PIX_FMT_BGR24:
1563                 asm volatile(
1564                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1565                 "mov        %4, %%"REG_b"               \n\t"
1566                 "push %%"REG_BP"                        \n\t"
1567                 YSCALEYUV2RGB1(%%REGBP, %5)
1568                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1569                 "pop %%"REG_BP"                         \n\t"
1570                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1571
1572                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1573                 "a" (&c->redDither)
1574                 );
1575                 return;
1576             case PIX_FMT_RGB555:
1577                 asm volatile(
1578                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1579                 "mov        %4, %%"REG_b"               \n\t"
1580                 "push %%"REG_BP"                        \n\t"
1581                 YSCALEYUV2RGB1(%%REGBP, %5)
1582                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1583 #ifdef DITHER1XBPP
1584                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1585                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1586                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1587 #endif
1588                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1589                 "pop %%"REG_BP"                         \n\t"
1590                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1591
1592                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1593                 "a" (&c->redDither)
1594                 );
1595                 return;
1596             case PIX_FMT_RGB565:
1597                 asm volatile(
1598                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1599                 "mov        %4, %%"REG_b"               \n\t"
1600                 "push %%"REG_BP"                        \n\t"
1601                 YSCALEYUV2RGB1(%%REGBP, %5)
1602                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1603 #ifdef DITHER1XBPP
1604                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1605                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1606                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1607 #endif
1608
1609                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1610                 "pop %%"REG_BP"                         \n\t"
1611                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1612
1613                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1614                 "a" (&c->redDither)
1615                 );
1616                 return;
1617             case PIX_FMT_YUYV422:
1618                 asm volatile(
1619                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1620                 "mov        %4, %%"REG_b"               \n\t"
1621                 "push %%"REG_BP"                        \n\t"
1622                 YSCALEYUV2PACKED1(%%REGBP, %5)
1623                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1624                 "pop %%"REG_BP"                         \n\t"
1625                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1626
1627                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1628                 "a" (&c->redDither)
1629                 );
1630                 return;
1631             }
1632         }
1633         else
1634         {
1635             switch(dstFormat)
1636             {
1637             case PIX_FMT_RGB32:
1638                 asm volatile(
1639                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1640                 "mov        %4, %%"REG_b"               \n\t"
1641                 "push %%"REG_BP"                        \n\t"
1642                 YSCALEYUV2RGB1b(%%REGBP, %5)
1643                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1644                 "pop %%"REG_BP"                         \n\t"
1645                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1646
1647                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1648                 "a" (&c->redDither)
1649                 );
1650                 return;
1651             case PIX_FMT_BGR24:
1652                 asm volatile(
1653                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1654                 "mov        %4, %%"REG_b"               \n\t"
1655                 "push %%"REG_BP"                        \n\t"
1656                 YSCALEYUV2RGB1b(%%REGBP, %5)
1657                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1658                 "pop %%"REG_BP"                         \n\t"
1659                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1660
1661                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1662                 "a" (&c->redDither)
1663                 );
1664                 return;
1665             case PIX_FMT_RGB555:
1666                 asm volatile(
1667                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1668                 "mov        %4, %%"REG_b"               \n\t"
1669                 "push %%"REG_BP"                        \n\t"
1670                 YSCALEYUV2RGB1b(%%REGBP, %5)
1671                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1672 #ifdef DITHER1XBPP
1673                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1674                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1675                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1676 #endif
1677                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1678                 "pop %%"REG_BP"                         \n\t"
1679                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1680
1681                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1682                 "a" (&c->redDither)
1683                 );
1684                 return;
1685             case PIX_FMT_RGB565:
1686                 asm volatile(
1687                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1688                 "mov        %4, %%"REG_b"               \n\t"
1689                 "push %%"REG_BP"                        \n\t"
1690                 YSCALEYUV2RGB1b(%%REGBP, %5)
1691                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1692 #ifdef DITHER1XBPP
1693                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1694                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1695                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1696 #endif
1697
1698                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1699                 "pop %%"REG_BP"                         \n\t"
1700                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1701
1702                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1703                 "a" (&c->redDither)
1704                 );
1705                 return;
1706             case PIX_FMT_YUYV422:
1707                 asm volatile(
1708                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1709                 "mov        %4, %%"REG_b"               \n\t"
1710                 "push %%"REG_BP"                        \n\t"
1711                 YSCALEYUV2PACKED1b(%%REGBP, %5)
1712                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1713                 "pop %%"REG_BP"                         \n\t"
1714                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1715
1716                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1717                 "a" (&c->redDither)
1718                 );
1719                 return;
1720             }
1721         }
1722     }
1723 #endif /* HAVE_MMX */
1724     if (uvalpha < 2048)
1725     {
1726         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1727     }else{
1728         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1729     }
1730 }
1731
1732 //FIXME yuy2* can read up to 7 samples too many
1733
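/* Extract the luma plane from packed YUYV: Y occupies the even bytes, so the
 * MMX path masks with bm01010101 and packs, and the C path copies src[2*i]. */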
1734 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1735 {
1736 #ifdef HAVE_MMX
1737     asm volatile(
1738     "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1739     "mov                    %0, %%"REG_a"       \n\t"
1740     "1:                                         \n\t"
1741     "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1742     "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1743     "pand                %%mm2, %%mm0           \n\t"
1744     "pand                %%mm2, %%mm1           \n\t"
1745     "packuswb            %%mm1, %%mm0           \n\t"
1746     "movq                %%mm0, (%2, %%"REG_a") \n\t"
1747     "add                    $8, %%"REG_a"       \n\t"
1748     " js                    1b                  \n\t"
1749     : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1750     : "%"REG_a
1751     );
1752 #else
1753     int i;
1754     for (i=0; i<width; i++)
1755         dst[i]= src[2*i];
1756 #endif
1757 }
1758
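/* Extract U and V from packed YUYV: chroma sits in the odd bytes (U at 4*i+1,
 * V at 4*i+3). src1 and src2 must reference the same line (asserted below). */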
1759 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1760 {
1761 #ifdef HAVE_MMX
1762     asm volatile(
1763     "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1764     "mov                    %0, %%"REG_a"       \n\t"
1765     "1:                                         \n\t"
1766     "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1767     "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1768     "psrlw                  $8, %%mm0           \n\t"
1769     "psrlw                  $8, %%mm1           \n\t"
1770     "packuswb            %%mm1, %%mm0           \n\t"
1771     "movq                %%mm0, %%mm1           \n\t"
1772     "psrlw                  $8, %%mm0           \n\t"
1773     "pand                %%mm4, %%mm1           \n\t"
1774     "packuswb            %%mm0, %%mm0           \n\t"
1775     "packuswb            %%mm1, %%mm1           \n\t"
1776     "movd                %%mm0, (%3, %%"REG_a") \n\t"
1777     "movd                %%mm1, (%2, %%"REG_a") \n\t"
1778     "add                    $4, %%"REG_a"       \n\t"
1779     " js                    1b                  \n\t"
1780     : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1781     : "%"REG_a
1782     );
1783 #else
1784     int i;
1785     for (i=0; i<width; i++)
1786     {
1787         dstU[i]= src1[4*i + 1];
1788         dstV[i]= src1[4*i + 3];
1789     }
1790 #endif
1791     assert(src1 == src2);
1792 }
1793
1794 /* This is almost identical to the previous one, and exists only because
1795  * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1796 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1797 {
1798 #ifdef HAVE_MMX
1799     asm volatile(
1800     "mov                  %0, %%"REG_a"         \n\t"
1801     "1:                                         \n\t"
1802     "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1803     "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1804     "psrlw                $8, %%mm0             \n\t"
1805     "psrlw                $8, %%mm1             \n\t"
1806     "packuswb          %%mm1, %%mm0             \n\t"
1807     "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1808     "add                  $8, %%"REG_a"         \n\t"
1809     " js                  1b                    \n\t"
1810     : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1811     : "%"REG_a
1812     );
1813 #else
1814     int i;
1815     for (i=0; i<width; i++)
1816         dst[i]= src[2*i+1];
1817 #endif
1818 }
1819
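/* Extract U and V from packed UYVY: chroma sits in the even bytes (U at 4*i+0,
 * V at 4*i+2), so the MMX path masks first instead of shifting as in yuy2ToUV. */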
1820 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1821 {
1822 #ifdef HAVE_MMX
1823     asm volatile(
1824     "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1825     "mov                    %0, %%"REG_a"       \n\t"
1826     "1:                                         \n\t"
1827     "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1828     "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1829     "pand                %%mm4, %%mm0           \n\t"
1830     "pand                %%mm4, %%mm1           \n\t"
1831     "packuswb            %%mm1, %%mm0           \n\t"
1832     "movq                %%mm0, %%mm1           \n\t"
1833     "psrlw                  $8, %%mm0           \n\t"
1834     "pand                %%mm4, %%mm1           \n\t"
1835     "packuswb            %%mm0, %%mm0           \n\t"
1836     "packuswb            %%mm1, %%mm1           \n\t"
1837     "movd                %%mm0, (%3, %%"REG_a") \n\t"
1838     "movd                %%mm1, (%2, %%"REG_a") \n\t"
1839     "add                    $4, %%"REG_a"       \n\t"
1840     " js                    1b                  \n\t"
1841     : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1842     : "%"REG_a
1843     );
1844 #else
1845     int i;
1846     for (i=0; i<width; i++)
1847     {
1848         dstU[i]= src1[4*i + 0];
1849         dstV[i]= src1[4*i + 2];
1850     }
1851 #endif
1852     assert(src1 == src2);
1853 }
1854
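/* BGR2Y instantiates one packed-RGB/BGR -> Y converter per pixel format: each
 * component is isolated with its shift/mask pair and weighted by RY/GY/BY;
 * the (33<<((S)-1)) term folds the +16 luma offset and 0.5 rounding into the
 * final right shift by S. */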
1855 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1856 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
1857 {\
1858     int i;\
1859     for (i=0; i<width; i++)\
1860     {\
1861         int b= (((type*)src)[i]>>shb)&maskb;\
1862         int g= (((type*)src)[i]>>shg)&maskg;\
1863         int r= (((type*)src)[i]>>shr)&maskr;\
1864 \
1865         dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1866     }\
1867 }
1868
1869 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1870 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1871 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
1872 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
1873 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1874 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1875
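/* BGR2UV instantiates the matching U/V converters plus a *_half variant that
 * sums two horizontally adjacent pixels before the dot product (for chroma
 * subsampling); the 257-based constants fold the +128 chroma offset and the
 * rounding into the final shift. */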
1876 #define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1877 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1878 {\
1879     int i;\
1880     for (i=0; i<width; i++)\
1881     {\
1882         int b= (((type*)src)[i]&maskb)>>shb;\
1883         int g= (((type*)src)[i]&maskg)>>shg;\
1884         int r= (((type*)src)[i]&maskr)>>shr;\
1885 \
1886         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1887         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1888     }\
1889 }\
1890 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1891 {\
1892     int i;\
1893     for (i=0; i<width; i++)\
1894     {\
1895         int pix0= ((type*)src)[2*i+0];\
1896         int pix1= ((type*)src)[2*i+1];\
1897         int g= (pix0&maskg)+(pix1&maskg);\
1898         int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1899         int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1900 \
1901         g>>=shg;\
1902 \
1903         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1904         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1905     }\
1906 }
1907
1908 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1909 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1910 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
1911 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
1912 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1913 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1914
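/* MMX 24-bit RGB/BGR -> Y and -> UV converters: four input pixels per loop are
 * expanded with punpcklbw and reduced with pmaddwd against the ff_bgr24to*
 * coefficient tables; srcFormat selects the BGR24 or RGB24 coefficient order. */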
1915 #ifdef HAVE_MMX
1916 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1917 {
1918
1919     if(srcFormat == PIX_FMT_BGR24){
1920         asm volatile(
1921             "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
1922             "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
1923             :
1924         );
1925     }else{
1926         asm volatile(
1927             "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
1928             "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
1929             :
1930         );
1931     }
1932
1933     asm volatile(
1934         "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
1935         "mov                        %2, %%"REG_a"   \n\t"
1936         "pxor                    %%mm7, %%mm7       \n\t"
1937         "1:                                         \n\t"
1938         PREFETCH"               64(%0)              \n\t"
1939         "movd                     (%0), %%mm0       \n\t"
1940         "movd                    2(%0), %%mm1       \n\t"
1941         "movd                    6(%0), %%mm2       \n\t"
1942         "movd                    8(%0), %%mm3       \n\t"
1943         "add                       $12, %0          \n\t"
1944         "punpcklbw               %%mm7, %%mm0       \n\t"
1945         "punpcklbw               %%mm7, %%mm1       \n\t"
1946         "punpcklbw               %%mm7, %%mm2       \n\t"
1947         "punpcklbw               %%mm7, %%mm3       \n\t"
1948         "pmaddwd                 %%mm5, %%mm0       \n\t"
1949         "pmaddwd                 %%mm6, %%mm1       \n\t"
1950         "pmaddwd                 %%mm5, %%mm2       \n\t"
1951         "pmaddwd                 %%mm6, %%mm3       \n\t"
1952         "paddd                   %%mm1, %%mm0       \n\t"
1953         "paddd                   %%mm3, %%mm2       \n\t"
1954         "paddd                   %%mm4, %%mm0       \n\t"
1955         "paddd                   %%mm4, %%mm2       \n\t"
1956         "psrad                     $15, %%mm0       \n\t"
1957         "psrad                     $15, %%mm2       \n\t"
1958         "packssdw                %%mm2, %%mm0       \n\t"
1959         "packuswb                %%mm0, %%mm0       \n\t"
1960         "movd                %%mm0, (%1, %%"REG_a") \n\t"
1961         "add                        $4, %%"REG_a"   \n\t"
1962         " js                        1b              \n\t"
1963     : "+r" (src)
1964     : "r" (dst+width), "g" (-width)
1965     : "%"REG_a
1966     );
1967 }
1968
1969 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1970 {
1971     asm volatile(
1972         "movq                    24+%4, %%mm6       \n\t"
1973         "mov                        %3, %%"REG_a"   \n\t"
1974         "pxor                    %%mm7, %%mm7       \n\t"
1975         "1:                                         \n\t"
1976         PREFETCH"               64(%0)              \n\t"
1977         "movd                     (%0), %%mm0       \n\t"
1978         "movd                    2(%0), %%mm1       \n\t"
1979         "punpcklbw               %%mm7, %%mm0       \n\t"
1980         "punpcklbw               %%mm7, %%mm1       \n\t"
1981         "movq                    %%mm0, %%mm2       \n\t"
1982         "movq                    %%mm1, %%mm3       \n\t"
1983         "pmaddwd                    %4, %%mm0       \n\t"
1984         "pmaddwd                  8+%4, %%mm1       \n\t"
1985         "pmaddwd                 16+%4, %%mm2       \n\t"
1986         "pmaddwd                 %%mm6, %%mm3       \n\t"
1987         "paddd                   %%mm1, %%mm0       \n\t"
1988         "paddd                   %%mm3, %%mm2       \n\t"
1989
1990         "movd                    6(%0), %%mm1       \n\t"
1991         "movd                    8(%0), %%mm3       \n\t"
1992         "add                       $12, %0          \n\t"
1993         "punpcklbw               %%mm7, %%mm1       \n\t"
1994         "punpcklbw               %%mm7, %%mm3       \n\t"
1995         "movq                    %%mm1, %%mm4       \n\t"
1996         "movq                    %%mm3, %%mm5       \n\t"
1997         "pmaddwd                    %4, %%mm1       \n\t"
1998         "pmaddwd                  8+%4, %%mm3       \n\t"
1999         "pmaddwd                 16+%4, %%mm4       \n\t"
2000         "pmaddwd                 %%mm6, %%mm5       \n\t"
2001         "paddd                   %%mm3, %%mm1       \n\t"
2002         "paddd                   %%mm5, %%mm4       \n\t"
2003
2004         "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
2005         "paddd                   %%mm3, %%mm0       \n\t"
2006         "paddd                   %%mm3, %%mm2       \n\t"
2007         "paddd                   %%mm3, %%mm1       \n\t"
2008         "paddd                   %%mm3, %%mm4       \n\t"
2009         "psrad                     $15, %%mm0       \n\t"
2010         "psrad                     $15, %%mm2       \n\t"
2011         "psrad                     $15, %%mm1       \n\t"
2012         "psrad                     $15, %%mm4       \n\t"
2013         "packssdw                %%mm1, %%mm0       \n\t"
2014         "packssdw                %%mm4, %%mm2       \n\t"
2015         "packuswb                %%mm0, %%mm0       \n\t"
2016         "packuswb                %%mm2, %%mm2       \n\t"
2017         "movd                %%mm0, (%1, %%"REG_a") \n\t"
2018         "movd                %%mm2, (%2, %%"REG_a") \n\t"
2019         "add                        $4, %%"REG_a"   \n\t"
2020         " js                        1b              \n\t"
2021     : "+r" (src)
2022     : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
2023     : "%"REG_a
2024     );
2025 }
2026 #endif
2027
2028 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2029 {
2030 #ifdef HAVE_MMX
2031     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
2032 #else
2033     int i;
2034     for (i=0; i<width; i++)
2035     {
2036         int b= src[i*3+0];
2037         int g= src[i*3+1];
2038         int r= src[i*3+2];
2039
2040         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2041     }
2042 #endif /* HAVE_MMX */
2043 }
2044
2045 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2046 {
2047 #ifdef HAVE_MMX
2048     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
2049 #else
2050     int i;
2051     for (i=0; i<width; i++)
2052     {
2053         int b= src1[3*i + 0];
2054         int g= src1[3*i + 1];
2055         int r= src1[3*i + 2];
2056
2057         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2058         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2059     }
2060 #endif /* HAVE_MMX */
2061     assert(src1 == src2);
2062 }
2063
2064 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2065 {
2066     int i;
2067     for (i=0; i<width; i++)
2068     {
2069         int b= src1[6*i + 0] + src1[6*i + 3];
2070         int g= src1[6*i + 1] + src1[6*i + 4];
2071         int r= src1[6*i + 2] + src1[6*i + 5];
2072
2073         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2074         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2075     }
2076     assert(src1 == src2);
2077 }
2078
2079 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2080 {
2081 #ifdef HAVE_MMX
2082     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2083 #else
2084     int i;
2085     for (i=0; i<width; i++)
2086     {
2087         int r= src[i*3+0];
2088         int g= src[i*3+1];
2089         int b= src[i*3+2];
2090
2091         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2092     }
2093 #endif
2094 }
2095
2096 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2097 {
2098     int i;
2099     assert(src1==src2);
2100 #ifdef HAVE_MMX
2101     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2102 #else
2103     for (i=0; i<width; i++)
2104     {
2105         int r= src1[3*i + 0];
2106         int g= src1[3*i + 1];
2107         int b= src1[3*i + 2];
2108
2109         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2110         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2111     }
2112 #endif
2113 }
2114
2115 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2116 {
2117     int i;
2118     assert(src1==src2);
2119     for (i=0; i<width; i++)
2120     {
2121         int r= src1[6*i + 0] + src1[6*i + 3];
2122         int g= src1[6*i + 1] + src1[6*i + 4];
2123         int b= src1[6*i + 2] + src1[6*i + 5];
2124
2125         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2126         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2127     }
2128 }
2129
2130
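/* Paletted (and pseudo-paletted) input: each 32-bit pal[] entry carries Y in
 * bits 0-7, U in bits 8-15 and V in bits 16-23, which is how palToY and
 * palToUV unpack it below. */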
2131 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2132 {
2133     int i;
2134     for (i=0; i<width; i++)
2135     {
2136         int d= src[i];
2137
2138         dst[i]= pal[d] & 0xFF;
2139     }
2140 }
2141
2142 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2143 {
2144     int i;
2145     assert(src1 == src2);
2146     for (i=0; i<width; i++)
2147     {
2148         int p= pal[src1[i]];
2149
2150         dstU[i]= p>>8;
2151         dstV[i]= p>>16;
2152     }
2153 }
2154
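/* 1 bpp bitmap input, MSB = leftmost pixel: every bit is expanded to 0 or 255.
 * monowhite2Y inverts each byte first, since a set bit means black there. */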
2155 static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2156 {
2157     int i, j;
2158     for (i=0; i<width/8; i++){
2159         int d= ~src[i];
2160         for(j=0; j<8; j++)
2161             dst[8*i+j]= ((d>>(7-j))&1)*255;
2162     }
2163 }
2164
2165 static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2166 {
2167     int i, j;
2168     for (i=0; i<width/8; i++){
2169         int d= src[i];
2170         for(j=0; j<8; j++)
2171             dst[8*i+j]= ((d>>(7-j))&1)*255;
2172     }
2173 }
2174
2175 // bilinear / bicubic scaling
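// Each output sample is the dot product of filterSize input bytes starting at
// filterPos[i] with 16-bit filter coefficients, then >>7 and clipped (the
// bicubic filter can overshoot); the MMX paths special-case filterSize 4 and 8.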
2176 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2177                                   int16_t *filter, int16_t *filterPos, long filterSize)
2178 {
2179 #ifdef HAVE_MMX
2180     assert(filterSize % 4 == 0 && filterSize>0);
2181     if (filterSize==4) // Always true for upscaling, sometimes for downscaling, too.
2182     {
2183         long counter= -2*dstW;
2184         filter-= counter*2;
2185         filterPos-= counter/2;
2186         dst-= counter/2;
2187         asm volatile(
2188 #if defined(PIC)
2189         "push            %%"REG_b"              \n\t"
2190 #endif
2191         "pxor                %%mm7, %%mm7       \n\t"
2192         "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
2193         "mov             %%"REG_a", %%"REG_BP"  \n\t"
2194         ASMALIGN(4)
2195         "1:                                     \n\t"
2196         "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
2197         "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
2198         "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
2199         "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
2200         "movd      (%3, %%"REG_a"), %%mm0       \n\t"
2201         "movd      (%3, %%"REG_b"), %%mm2       \n\t"
2202         "punpcklbw           %%mm7, %%mm0       \n\t"
2203         "punpcklbw           %%mm7, %%mm2       \n\t"
2204         "pmaddwd             %%mm1, %%mm0       \n\t"
2205         "pmaddwd             %%mm2, %%mm3       \n\t"
2206         "movq                %%mm0, %%mm4       \n\t"
2207         "punpckldq           %%mm3, %%mm0       \n\t"
2208         "punpckhdq           %%mm3, %%mm4       \n\t"
2209         "paddd               %%mm4, %%mm0       \n\t"
2210         "psrad                  $7, %%mm0       \n\t"
2211         "packssdw            %%mm0, %%mm0       \n\t"
2212         "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
2213         "add                    $4, %%"REG_BP"  \n\t"
2214         " jnc                   1b              \n\t"
2215
2216         "pop            %%"REG_BP"              \n\t"
2217 #if defined(PIC)
2218         "pop             %%"REG_b"              \n\t"
2219 #endif
2220         : "+a" (counter)
2221         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2222 #if !defined(PIC)
2223         : "%"REG_b
2224 #endif
2225         );
2226     }
2227     else if (filterSize==8)
2228     {
2229         long counter= -2*dstW;
2230         filter-= counter*4;
2231         filterPos-= counter/2;
2232         dst-= counter/2;
2233         asm volatile(
2234 #if defined(PIC)
2235         "push             %%"REG_b"             \n\t"
2236 #endif
2237         "pxor                 %%mm7, %%mm7      \n\t"
2238         "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
2239         "mov              %%"REG_a", %%"REG_BP" \n\t"
2240         ASMALIGN(4)
2241         "1:                                     \n\t"
2242         "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
2243         "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
2244         "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
2245         "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2246         "movd       (%3, %%"REG_a"), %%mm0      \n\t"
2247         "movd       (%3, %%"REG_b"), %%mm2      \n\t"
2248         "punpcklbw            %%mm7, %%mm0      \n\t"
2249         "punpcklbw            %%mm7, %%mm2      \n\t"
2250         "pmaddwd              %%mm1, %%mm0      \n\t"
2251         "pmaddwd              %%mm2, %%mm3      \n\t"
2252
2253         "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
2254         "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2255         "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
2256         "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
2257         "punpcklbw            %%mm7, %%mm4      \n\t"
2258         "punpcklbw            %%mm7, %%mm2      \n\t"
2259         "pmaddwd              %%mm1, %%mm4      \n\t"
2260         "pmaddwd              %%mm2, %%mm5      \n\t"
2261         "paddd                %%mm4, %%mm0      \n\t"
2262         "paddd                %%mm5, %%mm3      \n\t"
2263         "movq                 %%mm0, %%mm4      \n\t"
2264         "punpckldq            %%mm3, %%mm0      \n\t"
2265         "punpckhdq            %%mm3, %%mm4      \n\t"
2266         "paddd                %%mm4, %%mm0      \n\t"
2267         "psrad                   $7, %%mm0      \n\t"
2268         "packssdw             %%mm0, %%mm0      \n\t"
2269         "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2270         "add                     $4, %%"REG_BP" \n\t"
2271         " jnc                    1b             \n\t"
2272
2273         "pop             %%"REG_BP"             \n\t"
2274 #if defined(PIC)
2275         "pop              %%"REG_b"             \n\t"
2276 #endif
2277         : "+a" (counter)
2278         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2279 #if !defined(PIC)
2280         : "%"REG_b
2281 #endif
2282         );
2283     }
2284     else
2285     {
2286         uint8_t *offset = src+filterSize;
2287         long counter= -2*dstW;
2288         //filter-= counter*filterSize/2;
2289         filterPos-= counter/2;
2290         dst-= counter/2;
2291         asm volatile(
2292         "pxor                  %%mm7, %%mm7     \n\t"
2293         ASMALIGN(4)
2294         "1:                                     \n\t"
2295         "mov                      %2, %%"REG_c" \n\t"
2296         "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2297         "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2298         "mov                      %5, %%"REG_c" \n\t"
2299         "pxor                  %%mm4, %%mm4     \n\t"
2300         "pxor                  %%mm5, %%mm5     \n\t"
2301         "2:                                     \n\t"
2302         "movq                   (%1), %%mm1     \n\t"
2303         "movq               (%1, %6), %%mm3     \n\t"
2304         "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2305         "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2306         "punpcklbw             %%mm7, %%mm0     \n\t"
2307         "punpcklbw             %%mm7, %%mm2     \n\t"
2308         "pmaddwd               %%mm1, %%mm0     \n\t"
2309         "pmaddwd               %%mm2, %%mm3     \n\t"
2310         "paddd                 %%mm3, %%mm5     \n\t"
2311         "paddd                 %%mm0, %%mm4     \n\t"
2312         "add                      $8, %1        \n\t"
2313         "add                      $4, %%"REG_c" \n\t"
2314         "cmp                      %4, %%"REG_c" \n\t"
2315         " jb                      2b            \n\t"
2316         "add                      %6, %1        \n\t"
2317         "movq                  %%mm4, %%mm0     \n\t"
2318         "punpckldq             %%mm5, %%mm4     \n\t"
2319         "punpckhdq             %%mm5, %%mm0     \n\t"
2320         "paddd                 %%mm0, %%mm4     \n\t"
2321         "psrad                    $7, %%mm4     \n\t"
2322         "packssdw              %%mm4, %%mm4     \n\t"
2323         "mov                      %3, %%"REG_a" \n\t"
2324         "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2325         "add                      $4, %0        \n\t"
2326         " jnc                     1b            \n\t"
2327
2328         : "+r" (counter), "+r" (filter)
2329         : "m" (filterPos), "m" (dst), "m"(offset),
2330           "m" (src), "r" (filterSize*2)
2331         : "%"REG_a, "%"REG_c, "%"REG_d
2332         );
2333     }
2334 #else
2335 #ifdef HAVE_ALTIVEC
2336     hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2337 #else
2338     int i;
2339     for (i=0; i<dstW; i++)
2340     {
2341         int j;
2342         int srcPos= filterPos[i];
2343         int val=0;
2344         //printf("filterPos: %d\n", filterPos[i]);
2345         for (j=0; j<filterSize; j++)
2346         {
2347             //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2348             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2349         }
2350         //filter += hFilterSize;
2351         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2352         //dst[i] = val>>7;
2353     }
2354 #endif /* HAVE_ALTIVEC */
2355 #endif /* HAVE_MMX */
2356 }
2357       // *** horizontal scale Y line to temp buffer
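// Converts any non-planar-YUV input line to an 8-bit luma line in
// formatConvBuffer via the *ToY helpers above, then scales it horizontally
// with hScale(), or with the fast bilinear path (using the generated MMX2
// "funny" code when canMMX2BeUsed) if SWS_FAST_BILINEAR is set.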
2358 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2359                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2360                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2361                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2362                                    int32_t *mmx2FilterPos, uint32_t *pal)
2363 {
2364     if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2365     {
2366         RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2367         src= formatConvBuffer;
2368     }
2369     else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2370     {
2371         RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2372         src= formatConvBuffer;
2373     }
2374     else if (srcFormat==PIX_FMT_RGB32)
2375     {
2376         RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2377         src= formatConvBuffer;
2378     }
2379     else if (srcFormat==PIX_FMT_RGB32_1)
2380     {
2381         RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2382         src= formatConvBuffer;
2383     }
2384     else if (srcFormat==PIX_FMT_BGR24)
2385     {
2386         RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2387         src= formatConvBuffer;
2388     }
2389     else if (srcFormat==PIX_FMT_BGR565)
2390     {
2391         RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2392         src= formatConvBuffer;
2393     }
2394     else if (srcFormat==PIX_FMT_BGR555)
2395     {
2396         RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2397         src= formatConvBuffer;
2398     }
2399     else if (srcFormat==PIX_FMT_BGR32)
2400     {
2401         RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2402         src= formatConvBuffer;
2403     }
2404     else if (srcFormat==PIX_FMT_BGR32_1)
2405     {
2406         RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2407         src= formatConvBuffer;
2408     }
2409     else if (srcFormat==PIX_FMT_RGB24)
2410     {
2411         RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2412         src= formatConvBuffer;
2413     }
2414     else if (srcFormat==PIX_FMT_RGB565)
2415     {
2416         RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2417         src= formatConvBuffer;
2418     }
2419     else if (srcFormat==PIX_FMT_RGB555)
2420     {
2421         RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2422         src= formatConvBuffer;
2423     }
2424     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2425     {
2426         RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2427         src= formatConvBuffer;
2428     }
2429     else if (srcFormat==PIX_FMT_MONOBLACK)
2430     {
2431         RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
2432         src= formatConvBuffer;
2433     }
2434     else if (srcFormat==PIX_FMT_MONOWHITE)
2435     {
2436         RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
2437         src= formatConvBuffer;
2438     }
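    /* From here on src points either at the original 8-bit luma line or, for
     * every packed/RGB/paletted format handled above, at its 8-bit grayscale
     * conversion in formatConvBuffer, so the scalers below only ever see plain
     * 8-bit luma. */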
2439
2440 #ifdef HAVE_MMX
2441     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2442     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2443 #else
2444     if (!(flags&SWS_FAST_BILINEAR))
2445 #endif
2446     {
2447         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2448     }
2449     else // fast bilinear upscale / crap downscale
2450     {
2451 #if defined(ARCH_X86)
2452 #ifdef HAVE_MMX2
2453         int i;
2454 #if defined(PIC)
2455         uint64_t ebxsave __attribute__((aligned(8)));
2456 #endif
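        /* funnyYCode points at MMX2 scaling code that swscale generates at init
         * time when canMMX2BeUsed is set (the generator is not in this file);
         * the asm below only loads the pointers and then calls that code once
         * per chunk of the output line via "call *%4". */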
2457         if (canMMX2BeUsed)
2458         {
2459             asm volatile(
2460 #if defined(PIC)
2461             "mov               %%"REG_b", %5        \n\t"
2462 #endif
2463             "pxor                  %%mm7, %%mm7     \n\t"
2464             "mov                      %0, %%"REG_c" \n\t"
2465             "mov                      %1, %%"REG_D" \n\t"
2466             "mov                      %2, %%"REG_d" \n\t"
2467             "mov                      %3, %%"REG_b" \n\t"
2468             "xor               %%"REG_a", %%"REG_a" \n\t" // i
2469             PREFETCH"        (%%"REG_c")            \n\t"
2470             PREFETCH"      32(%%"REG_c")            \n\t"
2471             PREFETCH"      64(%%"REG_c")            \n\t"
2472
2473 #ifdef ARCH_X86_64
2474
2475 #define FUNNY_Y_CODE \
2476             "movl            (%%"REG_b"), %%esi     \n\t"\
2477             "call                    *%4            \n\t"\
2478             "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
2479             "add               %%"REG_S", %%"REG_c" \n\t"\
2480             "add               %%"REG_a", %%"REG_D" \n\t"\
2481             "xor               %%"REG_a", %%"REG_a" \n\t"\
2482
2483 #else
2484
2485 #define FUNNY_Y_CODE \
2486             "movl (%%"REG_b"), %%esi        \n\t"\
2487             "call         *%4                       \n\t"\
2488             "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2489             "add               %%"REG_a", %%"REG_D" \n\t"\
2490             "xor               %%"REG_a", %%"REG_a" \n\t"\
2491
2492 #endif /* ARCH_X86_64 */
2493
2494 FUNNY_Y_CODE
2495 FUNNY_Y_CODE
2496 FUNNY_Y_CODE
2497 FUNNY_Y_CODE
2498 FUNNY_Y_CODE
2499 FUNNY_Y_CODE
2500 FUNNY_Y_CODE
2501 FUNNY_Y_CODE
2502
2503 #if defined(PIC)
2504             "mov                      %5, %%"REG_b" \n\t"
2505 #endif
2506             :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2507             "m" (funnyYCode)
2508 #if defined(PIC)
2509             ,"m" (ebxsave)
2510 #endif
2511             : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2512 #if !defined(PIC)
2513             ,"%"REG_b
2514 #endif
2515             );
2516             for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2517         }
2518         else
2519         {
2520 #endif /* HAVE_MMX2 */
2521         long xInc_shr16 = xInc >> 16;
2522         uint16_t xInc_mask = xInc & 0xffff;
2523         //no MMX, just plain x86 asm ...
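        /* The loop below steps through the source in 16.16 fixed point: "addw %4"
         * adds the fractional step (xInc_mask) to %%cx, and "adc %3" advances the
         * integer source index by xInc>>16 plus the carry from that add.  Two
         * output pixels are produced per iteration, hence "add $2" on the counter. */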
2524         asm volatile(
2525         "xor %%"REG_a", %%"REG_a"            \n\t" // i
2526         "xor %%"REG_d", %%"REG_d"            \n\t" // xx
2527         "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
2528         ASMALIGN(4)
2529         "1:                                  \n\t"
2530         "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
2531         "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
2532         "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2533         "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2534         "shll      $16, %%edi                \n\t"
2535         "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2536         "mov        %1, %%"REG_D"            \n\t"
2537         "shrl       $9, %%esi                \n\t"
2538         "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2539         "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
2540         "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry
2541
2542         "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
2543         "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
2544         "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2545         "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2546         "shll      $16, %%edi                \n\t"
2547         "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2548         "mov        %1, %%"REG_D"            \n\t"
2549         "shrl       $9, %%esi                \n\t"
2550         "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
2551         "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
2552         "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry
2553
2554
2555         "add        $2, %%"REG_a"            \n\t"
2556         "cmp        %2, %%"REG_a"            \n\t"
2557         " jb        1b                       \n\t"
2558
2559
2560         :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2561         : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2562         );
2563 #ifdef HAVE_MMX2
2564         } //if MMX2 can't be used
2565 #endif
2566 #else
2567         int i;
2568         unsigned int xpos=0;
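        /* Plain C fallback: xpos is a 16.16 fixed-point source position, xx its
         * integer part and xalpha the top 7 bits of its fraction, so every output
         * sample is a linear blend of src[xx] and src[xx+1] scaled by 128, the
         * same 15-bit intermediate format the asm paths produce. */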
2569         for (i=0;i<dstWidth;i++)
2570         {
2571             register unsigned int xx=xpos>>16;
2572             register unsigned int xalpha=(xpos&0xFFFF)>>9;
2573             dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2574             xpos+=xInc;
2575         }
2576 #endif /* defined(ARCH_X86) */
2577     }
2578
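    /* Range conversion, applied to the 15-bit intermediate samples.  The constants
     * appear to be the usual luma range scaling in Q14 fixed point: 14071/16384 is
     * about 219/255, with the +16 offset (16*128 plus rounding) folded into
     * 33561947 for the full->limited direction, and 19077/16384 is about 255/219
     * for limited->full, with the input clipped first so the result stays in 15 bits. */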
2579     if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2580         int i;
2581         //FIXME all pal and rgb srcFormats could do this conversion as well
2582         //FIXME all scalers more complex than bilinear could do half of this transform
2583         if(c->srcRange){
2584             for (i=0; i<dstWidth; i++)
2585                 dst[i]= (dst[i]*14071 + 33561947)>>14;
2586         }else{
2587             for (i=0; i<dstWidth; i++)
2588                 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2589         }
2590     }
2591 }
2592
2593 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2594                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2595                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2596                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2597                                    int32_t *mmx2FilterPos, uint32_t *pal)
2598 {
2599     if (srcFormat==PIX_FMT_YUYV422)
2600     {
2601         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2602         src1= formatConvBuffer;
2603         src2= formatConvBuffer+VOFW;
2604     }
2605     else if (srcFormat==PIX_FMT_UYVY422)
2606     {
2607         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2608         src1= formatConvBuffer;
2609         src2= formatConvBuffer+VOFW;
2610     }
2611     else if (srcFormat==PIX_FMT_RGB32)
2612     {
2613         if(c->chrSrcHSubSample)
2614             RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2615         else
2616             RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2617         src1= formatConvBuffer;
2618         src2= formatConvBuffer+VOFW;
2619     }
2620     else if (srcFormat==PIX_FMT_RGB32_1)
2621     {
2622         if(c->chrSrcHSubSample)
2623             RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2624         else
2625             RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2626         src1= formatConvBuffer;
2627         src2= formatConvBuffer+VOFW;
2628     }
2629     else if (srcFormat==PIX_FMT_BGR24)
2630     {
2631         if(c->chrSrcHSubSample)
2632             RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2633         else
2634             RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2635         src1= formatConvBuffer;
2636         src2= formatConvBuffer+VOFW;
2637     }
2638     else if (srcFormat==PIX_FMT_BGR565)
2639     {
2640         if(c->chrSrcHSubSample)
2641             RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2642         else
2643             RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2644         src1= formatConvBuffer;
2645         src2= formatConvBuffer+VOFW;
2646     }
2647     else if (srcFormat==PIX_FMT_BGR555)
2648     {
2649         if(c->chrSrcHSubSample)
2650             RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2651         else
2652             RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2653         src1= formatConvBuffer;
2654         src2= formatConvBuffer+VOFW;
2655     }
2656     else if (srcFormat==PIX_FMT_BGR32)
2657     {
2658         if(c->chrSrcHSubSample)
2659             RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2660         else
2661             RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2662         src1= formatConvBuffer;
2663         src2= formatConvBuffer+VOFW;
2664     }
2665     else if (srcFormat==PIX_FMT_BGR32_1)
2666     {
2667         if(c->chrSrcHSubSample)
2668             RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2669         else
2670             RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2671         src1= formatConvBuffer;
2672         src2= formatConvBuffer+VOFW;
2673     }
2674     else if (srcFormat==PIX_FMT_RGB24)
2675     {
2676         if(c->chrSrcHSubSample)
2677             RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2678         else
2679             RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2680         src1= formatConvBuffer;
2681         src2= formatConvBuffer+VOFW;
2682     }
2683     else if (srcFormat==PIX_FMT_RGB565)
2684     {
2685         if(c->chrSrcHSubSample)
2686             RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2687         else
2688             RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2689         src1= formatConvBuffer;
2690         src2= formatConvBuffer+VOFW;
2691     }
2692     else if (srcFormat==PIX_FMT_RGB555)
2693     {
2694         if(c->chrSrcHSubSample)
2695             RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2696         else
2697             RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2698         src1= formatConvBuffer;
2699         src2= formatConvBuffer+VOFW;
2700     }
2701     else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2702     {
2703         return;
2704     }
2705     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2706     {
2707         RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2708         src1= formatConvBuffer;
2709         src2= formatConvBuffer+VOFW;
2710     }
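    /* As in hyscale(), every format the chroma scaler cannot read directly has been
     * converted above: U samples into formatConvBuffer and V samples into
     * formatConvBuffer+VOFW.  The destination uses the same layout, U at dst[0] and
     * V at dst[VOFW], which is why hScale() is invoked twice below. */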
2711
2712 #ifdef HAVE_MMX
2713     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2714     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2715 #else
2716     if (!(flags&SWS_FAST_BILINEAR))
2717 #endif
2718     {
2719         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2720         RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2721     }
2722     else // fast bilinear upscale / crap downscale
2723     {
2724 #if defined(ARCH_X86)
2725 #ifdef HAVE_MMX2
2726         int i;
2727 #if defined(PIC)
2728         uint64_t ebxsave __attribute__((aligned(8)));
2729 #endif
2730         if (canMMX2BeUsed)
2731         {
2732             asm volatile(
2733 #if defined(PIC)
2734             "mov          %%"REG_b", %6         \n\t"
2735 #endif
2736             "pxor             %%mm7, %%mm7      \n\t"
2737             "mov                 %0, %%"REG_c"  \n\t"
2738             "mov                 %1, %%"REG_D"  \n\t"
2739             "mov                 %2, %%"REG_d"  \n\t"
2740             "mov                 %3, %%"REG_b"  \n\t"
2741             "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2742             PREFETCH"   (%%"REG_c")             \n\t"
2743             PREFETCH" 32(%%"REG_c")             \n\t"
2744             PREFETCH" 64(%%"REG_c")             \n\t"
2745
2746 #ifdef ARCH_X86_64
2747
2748 #define FUNNY_UV_CODE \
2749             "movl       (%%"REG_b"), %%esi      \n\t"\
2750             "call               *%4             \n\t"\
2751             "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2752             "add          %%"REG_S", %%"REG_c"  \n\t"\
2753             "add          %%"REG_a", %%"REG_D"  \n\t"\
2754             "xor          %%"REG_a", %%"REG_a"  \n\t"\
2755
2756 #else
2757
2758 #define FUNNY_UV_CODE \
2759             "movl       (%%"REG_b"), %%esi      \n\t"\
2760             "call               *%4             \n\t"\
2761             "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2762             "add          %%"REG_a", %%"REG_D"  \n\t"\
2763             "xor          %%"REG_a", %%"REG_a"  \n\t"\
2764
2765 #endif /* ARCH_X86_64 */
2766
2767 FUNNY_UV_CODE
2768 FUNNY_UV_CODE
2769 FUNNY_UV_CODE
2770 FUNNY_UV_CODE
2771             "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2772             "mov                 %5, %%"REG_c"  \n\t" // src
2773             "mov                 %1, %%"REG_D"  \n\t" // buf1
2774             "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
2775             PREFETCH"   (%%"REG_c")             \n\t"
2776             PREFETCH" 32(%%"REG_c")             \n\t"
2777             PREFETCH" 64(%%"REG_c")             \n\t"
2778
2779 FUNNY_UV_CODE
2780 FUNNY_UV_CODE
2781 FUNNY_UV_CODE
2782 FUNNY_UV_CODE
2783
2784 #if defined(PIC)
2785             "mov %6, %%"REG_b"    \n\t"
2786 #endif
2787             :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2788             "m" (funnyUVCode), "m" (src2)
2789 #if defined(PIC)
2790             ,"m" (ebxsave)
2791 #endif
2792             : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2793 #if !defined(PIC)
2794              ,"%"REG_b
2795 #endif
2796             );
2797             for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2798             {
2799                 //printf("%d %d %d\n", dstWidth, i, srcW);
2800                 dst[i] = src1[srcW-1]*128;
2801                 dst[i+VOFW] = src2[srcW-1]*128;
2802             }
2803         }
2804         else
2805         {
2806 #endif /* HAVE_MMX2 */
2807             long xInc_shr16 = (long) (xInc >> 16);
2808             uint16_t xInc_mask = xInc & 0xffff;
2809             asm volatile(
2810             "xor %%"REG_a", %%"REG_a"               \n\t" // i
2811             "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2812             "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
2813             ASMALIGN(4)
2814             "1:                                     \n\t"
2815             "mov        %0, %%"REG_S"               \n\t"
2816             "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2817             "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2818             "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2819             "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2820             "shll      $16, %%edi                   \n\t"
2821             "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2822             "mov        %1, %%"REG_D"               \n\t"
2823             "shrl       $9, %%esi                   \n\t"
2824             "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2825
2826             "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2827             "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2828             "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2829             "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2830             "shll      $16, %%edi                   \n\t"
2831             "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2832             "mov        %1, %%"REG_D"               \n\t"
2833             "shrl       $9, %%esi                   \n\t"
2834             "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
2835
2836             "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
2837             "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
2838             "add        $1, %%"REG_a"               \n\t"
2839             "cmp        %2, %%"REG_a"               \n\t"
2840             " jb        1b                          \n\t"
2841
2842 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2843    which is needed to support GCC 4.0. */
2844 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2845             :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2846 #else
2847             :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2848 #endif
2849             "r" (src2)
2850             : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2851             );
2852 #ifdef HAVE_MMX2
2853         } //if MMX2 can't be used
2854 #endif
2855 #else
2856         int i;
2857         unsigned int xpos=0;
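        /* xalpha is a 7-bit blend factor, so xalpha^127 equals 127-xalpha and each
         * output is src[xx]*(127-xalpha) + src[xx+1]*xalpha, a blend scaled by 127
         * rather than 128; the exact <<7 variant is kept below in the "slower"
         * comment. */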
2858         for (i=0;i<dstWidth;i++)
2859         {
2860             register unsigned int xx=xpos>>16;
2861             register unsigned int xalpha=(xpos&0xFFFF)>>9;
2862             dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2863             dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2864             /* slower
2865             dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2866             dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2867             */
2868             xpos+=xInc;
2869         }
2870 #endif /* defined(ARCH_X86) */
2871     }
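    /* Chroma counterpart of the range conversion in hyscale(): 1799/2048 is about
     * 224/255 (full->limited) and 4663/4096 about 255/224 (limited->full), applied
     * to both the U half and the V half (offset VOFW) of the buffer. */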
2872     if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2873         int i;
2874         //FIXME all pal and rgb srcFormats could do this conversion as well
2875         //FIXME all scalers more complex than bilinear could do half of this transform
2876         if(c->srcRange){
2877             for (i=0; i<dstWidth; i++){
2878                 dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
2879                 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2880             }
2881         }else{
2882             for (i=0; i<dstWidth; i++){
2883                 dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2884                 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2885             }
2886         }
2887     }
2888 }
2889
2890 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2891                            int srcSliceH, uint8_t* dst[], int dstStride[]){
2892
2893     /* load a few things into local vars to make the code more readable and faster */
2894     const int srcW= c->srcW;
2895     const int dstW= c->dstW;
2896     const int dstH= c->dstH;
2897     const int chrDstW= c->chrDstW;
2898     const int chrSrcW= c->chrSrcW;
2899     const int lumXInc= c->lumXInc;
2900     const int chrXInc= c->chrXInc;
2901     const int dstFormat= c->dstFormat;
2902     const int srcFormat= c->srcFormat;
2903     const int flags= c->flags;
2904     const int canMMX2BeUsed= c->canMMX2BeUsed;
2905     int16_t *vLumFilterPos= c->vLumFilterPos;
2906     int16_t *vChrFilterPos= c->vChrFilterPos;
2907     int16_t *hLumFilterPos= c->hLumFilterPos;
2908     int16_t *hChrFilterPos= c->hChrFilterPos;
2909     int16_t *vLumFilter= c->vLumFilter;
2910     int16_t *vChrFilter= c->vChrFilter;
2911     int16_t *hLumFilter= c->hLumFilter;
2912     int16_t *hChrFilter= c->hChrFilter;
2913     int32_t *lumMmxFilter= c->lumMmxFilter;
2914     int32_t *chrMmxFilter= c->chrMmxFilter;
2915     const int vLumFilterSize= c->vLumFilterSize;
2916     const int vChrFilterSize= c->vChrFilterSize;
2917     const int hLumFilterSize= c->hLumFilterSize;
2918     const int hChrFilterSize= c->hChrFilterSize;
2919     int16_t **lumPixBuf= c->lumPixBuf;
2920     int16_t **chrPixBuf= c->chrPixBuf;
2921     const int vLumBufSize= c->vLumBufSize;
2922     const int vChrBufSize= c->vChrBufSize;
2923     uint8_t *funnyYCode= c->funnyYCode;
2924     uint8_t *funnyUVCode= c->funnyUVCode;
2925     uint8_t *formatConvBuffer= c->formatConvBuffer;
2926     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
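    /* -((-x) >> n) below is a ceiling division by 2^n, so a slice whose height is
     * not a multiple of the vertical chroma subsampling still counts its last
     * partial chroma line. */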
2927     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2928     int lastDstY;
2929     uint32_t *pal=c->pal_yuv;
2930
2931     /* vars which will change and which we need to store back in the context */
2932     int dstY= c->dstY;
2933     int lumBufIndex= c->lumBufIndex;
2934     int chrBufIndex= c->chrBufIndex;
2935     int lastInLumBuf= c->lastInLumBuf;
2936     int lastInChrBuf= c->lastInChrBuf;
2937
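    /* Packed input has a single plane; aliasing all three plane pointers and strides
     * to it lets the per-plane slice arithmetic below run unchanged, while the actual
     * unpacking happens later in hyscale()/hcscale(). */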
2938     if (isPacked(c->srcFormat)){
2939         src[0]=
2940         src[1]=
2941         src[2]= src[0];
2942         srcStride[0]=
2943         srcStride[1]=
2944         srcStride[2]= srcStride[0];
2945     }
2946     srcStride[1]<<= c->vChrDrop;
2947     srcStride[2]<<= c->vChrDrop;
2948
2949     //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2950     //       (int)dst[0], (int)dst[1], (int)dst[2]);
2951
2952 #if 0 //self test FIXME move to a vfilter or something
2953     {
2954     static volatile int i=0;
2955     i++;
2956     if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2957         selfTest(src, srcStride, c->srcW, c->srcH);
2958     i--;
2959     }
2960 #endif
2961
2962     //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2963     //dstStride[0],dstStride[1],dstStride[2]);
2964
2965     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2966     {
2967         static int firstTime=1; //FIXME move this into the context perhaps
2968         if (flags & SWS_PRINT_INFO && firstTime)
2969         {
2970             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2971                    "         ->cannot do aligned memory accesses anymore\n");
2972             firstTime=0;
2973         }
2974     }
2975
2976     /* Note: the user might start scaling the picture in the middle, so this
2977        will not get executed. This is not really intended, but it currently
2978        works, so people might rely on it. */
2979     if (srcSliceY ==0){
2980         lumBufIndex=0;
2981         chrBufIndex=0;
2982         dstY=0;
2983         lastInLumBuf= -1;
2984         lastInChrBuf= -1;
2985     }
2986
2987     lastDstY= dstY;
2988
2989     for (;dstY < dstH; dstY++){
2990         unsigned char *dest =dst[0]+dstStride[0]*dstY;
2991         const int chrDstY= dstY>>c->chrDstVSubSample;
2992         unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2993         unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2994
2995         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2996         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2997         const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2998         const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
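        /* lumPixBuf/chrPixBuf are ring buffers of horizontally scaled lines and
         * lastInLumBuf/lastInChrBuf track the highest source line already stored in
         * them; the four values above say which source lines the vertical filter
         * needs in order to produce output line dstY. */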
2999
3000         //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3001         // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
3002         //handle holes (FAST_BILINEAR & weird filters)
3003         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3004         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3005         //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3006         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
3007         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
3008
3009         // Do we have enough lines in this slice to output the dstY line?
3010         if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3011         {
3012             //Do horizontal scaling
3013             while(lastInLumBuf < lastLumSrcY)
3014             {
3015                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3016                 lumBufIndex++;
3017                 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
3018                 assert(lumBufIndex < 2*vLumBufSize);
3019                 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3020                 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3021                 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3022                 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3023                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3024                                 funnyYCode, c->srcFormat, formatConvBuffer,
3025                                 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3026                 lastInLumBuf++;
3027             }
3028             while(lastInChrBuf < lastChrSrcY)
3029             {
3030                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3031                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3032                 chrBufIndex++;
3033                 assert(chrBufIndex < 2*vChrBufSize);
3034                 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3035                 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3036                 //FIXME replace parameters through context struct (some at least)
3037
3038                 if (!(isGray(srcFormat) || isGray(dstFormat)))
3039                     RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3040                                     flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3041                                     funnyUVCode, c->srcFormat, formatConvBuffer,
3042                                     c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3043                 lastInChrBuf++;
3044             }
3045             //wrap buf index around to stay inside the ring buffer
3046             if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3047             if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3048         }
3049         else // not enough lines left in this slice -> load the rest in the buffer
3050         {
3051             /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3052             firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3053             lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3054             vChrBufSize, vLumBufSize);*/
3055
3056             //Do horizontal scaling
3057             while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3058             {
3059                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3060                 lumBufIndex++;
3061                 assert(lumBufIndex < 2*vLumBufSize);
3062                 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3063                 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3064                 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3065                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3066                                 funnyYCode, c->srcFormat, formatConvBuffer,
3067                                 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3068                 lastInLumBuf++;
3069             }
3070             while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3071             {
3072                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3073                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3074                 chrBufIndex++;
3075                 assert(chrBufIndex < 2*vChrBufSize);
3076                 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3077                 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3078
3079                 if (!(isGray(srcFormat) || isGray(dstFormat)))
3080                     RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3081                             flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3082                             funnyUVCode, c->srcFormat, formatConvBuffer,
3083                             c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3084                 lastInChrBuf++;
3085             }
3086             //wrap buf index around to stay inside the ring buffer
3087             if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3088             if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3089             break; //we can't output a dstY line so let's try with the next slice
3090         }
3091
3092 #ifdef HAVE_MMX
3093         c->blueDither= ff_dither8[dstY&1];
3094         if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
3095             c->greenDither= ff_dither8[dstY&1];
3096         else
3097             c->greenDither= ff_dither4[dstY&1];
3098         c->redDither= ff_dither8[(dstY+1)&1];
3099 #endif
3100         if (dstY < dstH-2)
3101         {
3102             int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3103             int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
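            /* Ring-buffer indexing: lumPixBuf[lumBufIndex] holds source line
             * lastInLumBuf, so adding (firstLumSrcY - lastInLumBuf) points at the
             * first line the vertical filter needs; the +vLumBufSize keeps the
             * offset positive, and the asserts further down guarantee that
             * vLumFilterSize consecutive entries can be read without wrapping. */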
3104 #ifdef HAVE_MMX
3105             int i;
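            /* Pack the vertical filter for the MMX code.  With SWS_ACCURATE_RND each
             * entry stores the pointers to lines i and i+1 plus their two 16-bit
             * coefficients packed into a single dword (stored twice); otherwise each
             * entry stores line i's pointer split into two 32-bit halves and its
             * coefficient replicated via *0x10001, so an MMX multiply sees the same
             * coefficient in every 16-bit lane. */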
3106         if (flags & SWS_ACCURATE_RND){
3107             int s= APCK_SIZE / 8;
3108             for (i=0; i<vLumFilterSize; i+=2){
3109                 *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
3110                 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
3111                           lumMmxFilter[s*i+APCK_COEF/4  ]=
3112                           lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
3113                     + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3114             }
3115             for (i=0; i<vChrFilterSize; i+=2){
3116                 *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
3117                 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
3118                           chrMmxFilter[s*i+APCK_COEF/4  ]=
3119                           chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
3120                     + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3121             }
3122         }else{
3123             for (i=0; i<vLumFilterSize; i++)
3124             {
3125                 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3126                 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3127                 lumMmxFilter[4*i+2]=
3128                 lumMmxFilter[4*i+3]=
3129                     ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3130             }
3131             for (i=0; i<vChrFilterSize; i++)
3132             {
3133                 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3134                 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3135                 chrMmxFilter[4*i+2]=
3136                 chrMmxFilter[4*i+3]=
3137                     ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3138             }
3139         }
3140 #endif
3141             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3142                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3143                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions into luma / chroma
3144                 RENAME(yuv2nv12X)(c,
3145                     vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3146                     vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3147                     dest, uDest, dstW, chrDstW, dstFormat);
3148             }
3149             else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
3150             {
3151                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3152                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions into luma / chroma
3153                 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3154                 {
3155                     int16_t *lumBuf = lumPixBuf[0];
3156                     int16_t *chrBuf= chrPixBuf[0];
3157                     RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3158                 }
3159                 else //General YV12
3160                 {
3161                     RENAME(yuv2yuvX)(c,
3162                         vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3163                         vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3164                         dest, uDest, vDest, dstW, chrDstW);
3165                 }
3166             }
3167             else
3168             {
3169                 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3170                 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3171                 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3172                 {
3173                     int chrAlpha= vChrFilter[2*dstY+1];
3174                     if(flags & SWS_FULL_CHR_H_INT){
3175                         yuv2rgbXinC_full(c, //FIXME write a packed1_full function
3176                             vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3177                             vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3178                             dest, dstW, dstY);
3179                     }else{
3180                         RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3181                             dest, dstW, chrAlpha, dstFormat, flags, dstY);
3182                     }
3183                 }
3184                 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3185                 {
3186                     int lumAlpha= vLumFilter[2*dstY+1];
3187                     int chrAlpha= vChrFilter[2*dstY+1];
3188                     lumMmxFilter[2]=
3189                     lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
3190                     chrMmxFilter[2]=
3191                     chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3192                     if(flags & SWS_FULL_CHR_H_INT){
3193                         yuv2rgbXinC_full(c, //FIXME write a packed2_full function
3194                             vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3195                             vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3196                             dest, dstW, dstY);
3197                     }else{
3198                         RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3199                             dest, dstW, lumAlpha, chrAlpha, dstY);
3200                     }
3201                 }
3202                 else //general RGB
3203                 {
3204                     if(flags & SWS_FULL_CHR_H_INT){
3205                         yuv2rgbXinC_full(c,
3206                             vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3207                             vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3208                             dest, dstW, dstY);
3209                     }else{
3210                         RENAME(yuv2packedX)(c,
3211                             vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3212                             vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3213                             dest, dstW, dstY);
3214                     }
3215                 }
3216             }
3217         }
3218         else // hmm looks like we can't use MMX here without overwriting this array's tail
3219         {
3220             int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3221             int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3222             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3223                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3224                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions into luma / chroma
3225                 yuv2nv12XinC(
3226                     vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3227                     vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3228                     dest, uDest, dstW, chrDstW, dstFormat);
3229             }
3230             else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
3231             {
3232                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3233                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions into luma / chroma
3234                 yuv2yuvXinC(
3235                     vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3236                     vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3237                     dest, uDest, vDest, dstW, chrDstW);
3238             }
3239             else
3240             {
3241                 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3242                 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3243                 if(flags & SWS_FULL_CHR_H_INT){
3244                     yuv2rgbXinC_full(c,
3245                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3246                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3247                         dest, dstW, dstY);
3248                 }else{
3249                     yuv2packedXinC(c,
3250                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3251                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3252                         dest, dstW, dstY);
3253                 }
3254             }
3255         }
3256     }
3257
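    /* The MMX2 output paths use non-temporal stores (movntq), so the sfence below
     * makes them visible before the caller touches the destination, and emms (femms
     * on 3DNow) clears the MMX state for any x87 code that runs afterwards. */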
3258 #ifdef HAVE_MMX
3259     asm volatile(SFENCE:::"memory");
3260     asm volatile(EMMS:::"memory");
3261 #endif
3262     /* store changed local vars back in the context */
3263     c->dstY= dstY;
3264     c->lumBufIndex= lumBufIndex;
3265     c->chrBufIndex= chrBufIndex;
3266     c->lastInLumBuf= lastInLumBuf;
3267     c->lastInChrBuf= lastInChrBuf;
3268
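    // number of destination lines written by this call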
3269     return dstY - lastDstY;
3270 }