1 /*
2     Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 #undef REAL_MOVNTQ
20 #undef MOVNTQ
21 #undef PAVGB
22 #undef PREFETCH
23 #undef PREFETCHW
24 #undef EMMS
25 #undef SFENCE
26
27 #ifdef HAVE_3DNOW
28 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
29 #define EMMS     "femms"
30 #else
31 #define EMMS     "emms"
32 #endif
33
34 #ifdef HAVE_3DNOW
35 #define PREFETCH  "prefetch"
36 #define PREFETCHW "prefetchw"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #else
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
43 #endif
44
45 #ifdef HAVE_MMX2
46 #define SFENCE "sfence"
47 #else
48 #define SFENCE "/nop"
49 #endif
50
51 #ifdef HAVE_MMX2
52 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53 #elif defined (HAVE_3DNOW)
54 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
55 #endif
56
57 #ifdef HAVE_MMX2
58 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
59 #else
60 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
61 #endif
62 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
63
64 #ifdef HAVE_ALTIVEC
65 #include "swscale_altivec_template.c"
66 #endif
67
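/* Vertical N-tap scale to a single plane: walks the zero-terminated
   (source pointer, coefficient) list at 'offset'(%0), accumulates the
   pmulhw products on top of the rounder, shifts right by 3, packs to
   unsigned bytes and streams 8 output pixels per pass until the
   destination width in %2 is reached. */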
68 #define YSCALEYUV2YV12X(x, offset) \
69                         "xor %%"REG_a", %%"REG_a"       \n\t"\
70                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71                         "movq %%mm3, %%mm4              \n\t"\
72                         "lea " offset "(%0), %%"REG_d"  \n\t"\
73                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
74                         ".balign 16                     \n\t" /* FIXME Unroll? */\
75                         "1:                             \n\t"\
76                         "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
77                         "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78                         "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79                         "add $16, %%"REG_d"             \n\t"\
80                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
81                         "test %%"REG_S", %%"REG_S"      \n\t"\
82                         "pmulhw %%mm0, %%mm2            \n\t"\
83                         "pmulhw %%mm0, %%mm5            \n\t"\
84                         "paddw %%mm2, %%mm3             \n\t"\
85                         "paddw %%mm5, %%mm4             \n\t"\
86                         " jnz 1b                        \n\t"\
87                         "psraw $3, %%mm3                \n\t"\
88                         "psraw $3, %%mm4                \n\t"\
89                         "packuswb %%mm4, %%mm3          \n\t"\
90                         MOVNTQ(%%mm3, (%1, %%REGa))\
91                         "add $8, %%"REG_a"              \n\t"\
92                         "cmp %2, %%"REG_a"              \n\t"\
93                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94                         "movq %%mm3, %%mm4              \n\t"\
95                         "lea " offset "(%0), %%"REG_d"  \n\t"\
96                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
97                         "jb 1b                          \n\t"
98
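/* 1-tap (unscaled) vertical pass: shifts the 16-bit source right by 7,
   packs to bytes and streams 8 pixels per iteration.  The callers pass
   pointers to the end of the row and a negative width in %2, so the loop
   ends when the counter wraps past zero ("jnc"). */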
99 #define YSCALEYUV2YV121 \
100                         "mov %2, %%"REG_a"              \n\t"\
101                         ".balign 16                     \n\t" /* FIXME Unroll? */\
102                         "1:                             \n\t"\
103                         "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104                         "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
105                         "psraw $7, %%mm0                \n\t"\
106                         "psraw $7, %%mm1                \n\t"\
107                         "packuswb %%mm1, %%mm0          \n\t"\
108                         MOVNTQ(%%mm0, (%1, %%REGa))\
109                         "add $8, %%"REG_a"              \n\t"\
110                         "jnc 1b                         \n\t"
111
112 /*
113                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
114                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115                            "r" (dest), "m" (dstW),
116                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
118 */
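/* Same multi-tap accumulation as YSCALEYUV2YV12X, but for packed output:
   the chroma list leaves U in mm3 and V in mm4 (V is read 4096 bytes past
   U in the shared buffer), then the luma list leaves Y1 in mm1 and Y2 in
   mm7, ready for one of the packed-pixel writers. */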
119 #define YSCALEYUV2PACKEDX \
120                 "xor %%"REG_a", %%"REG_a"       \n\t"\
121                 ".balign 16                     \n\t"\
122                 "nop                            \n\t"\
123                 "1:                             \n\t"\
124                 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
126                 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127                 "movq %%mm3, %%mm4              \n\t"\
128                 ".balign 16                     \n\t"\
129                 "2:                             \n\t"\
130                 "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
131                 "movq (%%"REG_S", %%"REG_a"), %%mm2     \n\t" /* UsrcData */\
132                 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133                 "add $16, %%"REG_d"             \n\t"\
134                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
135                 "pmulhw %%mm0, %%mm2            \n\t"\
136                 "pmulhw %%mm0, %%mm5            \n\t"\
137                 "paddw %%mm2, %%mm3             \n\t"\
138                 "paddw %%mm5, %%mm4             \n\t"\
139                 "test %%"REG_S", %%"REG_S"      \n\t"\
140                 " jnz 2b                        \n\t"\
141 \
142                 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
144                 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145                 "movq %%mm1, %%mm7              \n\t"\
146                 ".balign 16                     \n\t"\
147                 "2:                             \n\t"\
148                 "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
149                 "movq (%%"REG_S", %%"REG_a", 2), %%mm2  \n\t" /* Y1srcData */\
150                 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151                 "add $16, %%"REG_d"             \n\t"\
152                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
153                 "pmulhw %%mm0, %%mm2            \n\t"\
154                 "pmulhw %%mm0, %%mm5            \n\t"\
155                 "paddw %%mm2, %%mm1             \n\t"\
156                 "paddw %%mm5, %%mm7             \n\t"\
157                 "test %%"REG_S", %%"REG_S"      \n\t"\
158                 " jnz 2b                        \n\t"\
159
160
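/* YSCALEYUV2PACKEDX followed by the YUV->RGB matrix: the U/V/Y offsets are
   subtracted, the per-context coefficients applied, and the results end up
   packed as B in mm2, G in mm4, R in mm5 with mm7 cleared, which is the
   layout the WRITEBGR* macros below expect. */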
161 #define YSCALEYUV2RGBX \
162                 YSCALEYUV2PACKEDX\
163                 "psubw "U_OFFSET"(%0), %%mm3    \n\t" /* (U-128)8*/\
164                 "psubw "V_OFFSET"(%0), %%mm4    \n\t" /* (V-128)8*/\
165                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
166                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
167                 "pmulhw "UG_COEFF"(%0), %%mm3   \n\t"\
168                 "pmulhw "VG_COEFF"(%0), %%mm4   \n\t"\
169         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
170                 "pmulhw "UB_COEFF"(%0), %%mm2   \n\t"\
171                 "pmulhw "VR_COEFF"(%0), %%mm5   \n\t"\
172                 "psubw "Y_OFFSET"(%0), %%mm1    \n\t" /* 8(Y-16)*/\
173                 "psubw "Y_OFFSET"(%0), %%mm7    \n\t" /* 8(Y-16)*/\
174                 "pmulhw "Y_COEFF"(%0), %%mm1    \n\t"\
175                 "pmulhw "Y_COEFF"(%0), %%mm7    \n\t"\
176         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177                 "paddw %%mm3, %%mm4             \n\t"\
178                 "movq %%mm2, %%mm0              \n\t"\
179                 "movq %%mm5, %%mm6              \n\t"\
180                 "movq %%mm4, %%mm3              \n\t"\
181                 "punpcklwd %%mm2, %%mm2         \n\t"\
182                 "punpcklwd %%mm5, %%mm5         \n\t"\
183                 "punpcklwd %%mm4, %%mm4         \n\t"\
184                 "paddw %%mm1, %%mm2             \n\t"\
185                 "paddw %%mm1, %%mm5             \n\t"\
186                 "paddw %%mm1, %%mm4             \n\t"\
187                 "punpckhwd %%mm0, %%mm0         \n\t"\
188                 "punpckhwd %%mm6, %%mm6         \n\t"\
189                 "punpckhwd %%mm3, %%mm3         \n\t"\
190                 "paddw %%mm7, %%mm0             \n\t"\
191                 "paddw %%mm7, %%mm6             \n\t"\
192                 "paddw %%mm7, %%mm3             \n\t"\
193                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194                 "packuswb %%mm0, %%mm2          \n\t"\
195                 "packuswb %%mm6, %%mm5          \n\t"\
196                 "packuswb %%mm3, %%mm4          \n\t"\
197                 "pxor %%mm7, %%mm7              \n\t"
198 #if 0
199 #define FULL_YSCALEYUV2RGB \
200                 "pxor %%mm7, %%mm7              \n\t"\
201                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
202                 "punpcklwd %%mm6, %%mm6         \n\t"\
203                 "punpcklwd %%mm6, %%mm6         \n\t"\
204                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
205                 "punpcklwd %%mm5, %%mm5         \n\t"\
206                 "punpcklwd %%mm5, %%mm5         \n\t"\
207                 "xor %%"REG_a", %%"REG_a"               \n\t"\
208                 ".balign 16                     \n\t"\
209                 "1:                             \n\t"\
210                 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211                 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212                 "movq (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
213                 "movq (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
214                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
215                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216                 "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217                 "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
219                 "movq 4096(%2, %%"REG_a",2), %%mm4      \n\t" /* uvbuf0[eax+2048]*/\
220                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
222                 "movq 4096(%3, %%"REG_a",2), %%mm0      \n\t" /* uvbuf1[eax+2048]*/\
223                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224                 "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
225                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
226                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
227                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
228 \
229 \
230                 "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
232                 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
233                 "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
234                 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
235                 "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
236                 "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
237 \
238 \
239                 "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
240                 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
242                 "paddw %%mm1, %%mm3             \n\t" /* B*/\
243                 "paddw %%mm1, %%mm0             \n\t" /* R*/\
244                 "packuswb %%mm3, %%mm3          \n\t"\
245 \
246                 "packuswb %%mm0, %%mm0          \n\t"\
247                 "paddw %%mm4, %%mm2             \n\t"\
248                 "paddw %%mm2, %%mm1             \n\t" /* G*/\
249 \
250                 "packuswb %%mm1, %%mm1          \n\t"
251 #endif
252
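/* Bilinear vertical blend of two rows for packed YUV output: the stored
   luma/chroma coefficients are first shifted right by 3 and written back,
   then each value is computed as buf1>>7 + (buf0-buf1)*coeff>>16, leaving
   U/V in mm3/mm4 and Y in mm1/mm7. */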
253 #define REAL_YSCALEYUV2PACKED(index, c) \
254                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255                 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256                 "psraw $3, %%mm0                \n\t"\
257                 "psraw $3, %%mm1                \n\t"\
258                 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259                 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
260                 "xor "#index", "#index"         \n\t"\
261                 ".balign 16                     \n\t"\
262                 "1:                             \n\t"\
263                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
264                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
265                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
267                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
269                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
270                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272                 "psraw $7, %%mm3                \n\t" /* uvbuf1[eax] >>7*/\
273                 "psraw $7, %%mm4                \n\t" /* uvbuf1[eax+2048] >>7*/\
274                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
275                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
276                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
277                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
278                 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279                 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
280                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
281                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
282                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
284                 "psraw $7, %%mm1                \n\t" /* buf1[eax] >>7*/\
285                 "psraw $7, %%mm7                \n\t" /* buf1[eax+4] >>7*/\
286                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
288                 
289 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
290                 
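/* Blends two source rows as above, but keeps the values scaled for the
   colour matrix (>>4 shifts), then applies the YUV->RGB conversion; ends
   with B/G/R packed in mm2/mm4/mm5 and mm7 zeroed, as required by the
   WRITEBGR* macros. */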
291 #define REAL_YSCALEYUV2RGB(index, c) \
292                 "xor "#index", "#index" \n\t"\
293                 ".balign 16                     \n\t"\
294                 "1:                             \n\t"\
295                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
296                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
297                 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298                 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
299                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
301                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
302                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
307                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
308                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
309                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
310                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
311                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
312                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
314         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
316                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
317                 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318                 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
319                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
320                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
321                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
323                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
327                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
330                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
331                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
332                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
333         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334                 "paddw %%mm3, %%mm4             \n\t"\
335                 "movq %%mm2, %%mm0              \n\t"\
336                 "movq %%mm5, %%mm6              \n\t"\
337                 "movq %%mm4, %%mm3              \n\t"\
338                 "punpcklwd %%mm2, %%mm2         \n\t"\
339                 "punpcklwd %%mm5, %%mm5         \n\t"\
340                 "punpcklwd %%mm4, %%mm4         \n\t"\
341                 "paddw %%mm1, %%mm2             \n\t"\
342                 "paddw %%mm1, %%mm5             \n\t"\
343                 "paddw %%mm1, %%mm4             \n\t"\
344                 "punpckhwd %%mm0, %%mm0         \n\t"\
345                 "punpckhwd %%mm6, %%mm6         \n\t"\
346                 "punpckhwd %%mm3, %%mm3         \n\t"\
347                 "paddw %%mm7, %%mm0             \n\t"\
348                 "paddw %%mm7, %%mm6             \n\t"\
349                 "paddw %%mm7, %%mm3             \n\t"\
350                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351                 "packuswb %%mm0, %%mm2          \n\t"\
352                 "packuswb %%mm6, %%mm5          \n\t"\
353                 "packuswb %%mm3, %%mm4          \n\t"\
354                 "pxor %%mm7, %%mm7              \n\t"
355 #define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
356                 
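/* Single-row variants: only buf0/uvbuf0 are read and simply shifted down
   (>>7 here, >>4 in the RGB version below), with no second-row blend. */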
357 #define REAL_YSCALEYUV2PACKED1(index, c) \
358                 "xor "#index", "#index"         \n\t"\
359                 ".balign 16                     \n\t"\
360                 "1:                             \n\t"\
361                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
362                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
363                 "psraw $7, %%mm3                \n\t" \
364                 "psraw $7, %%mm4                \n\t" \
365                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
366                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
367                 "psraw $7, %%mm1                \n\t" \
368                 "psraw $7, %%mm7                \n\t" \
369                 
370 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
371                 
372 #define REAL_YSCALEYUV2RGB1(index, c) \
373                 "xor "#index", "#index" \n\t"\
374                 ".balign 16                     \n\t"\
375                 "1:                             \n\t"\
376                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
377                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
378                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
380                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
381                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
382                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
383                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
384                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
386         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
387                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
388                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
389                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
394                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
395                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
396                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
397         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398                 "paddw %%mm3, %%mm4             \n\t"\
399                 "movq %%mm2, %%mm0              \n\t"\
400                 "movq %%mm5, %%mm6              \n\t"\
401                 "movq %%mm4, %%mm3              \n\t"\
402                 "punpcklwd %%mm2, %%mm2         \n\t"\
403                 "punpcklwd %%mm5, %%mm5         \n\t"\
404                 "punpcklwd %%mm4, %%mm4         \n\t"\
405                 "paddw %%mm1, %%mm2             \n\t"\
406                 "paddw %%mm1, %%mm5             \n\t"\
407                 "paddw %%mm1, %%mm4             \n\t"\
408                 "punpckhwd %%mm0, %%mm0         \n\t"\
409                 "punpckhwd %%mm6, %%mm6         \n\t"\
410                 "punpckhwd %%mm3, %%mm3         \n\t"\
411                 "paddw %%mm7, %%mm0             \n\t"\
412                 "paddw %%mm7, %%mm6             \n\t"\
413                 "paddw %%mm7, %%mm3             \n\t"\
414                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415                 "packuswb %%mm0, %%mm2          \n\t"\
416                 "packuswb %%mm6, %%mm5          \n\t"\
417                 "packuswb %%mm3, %%mm4          \n\t"\
418                 "pxor %%mm7, %%mm7              \n\t"
419 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
420
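/* The "1b" variants average the two chroma rows instead of taking only the
   first one (paddw followed by a one-bit-larger shift); luma still comes
   from a single row. */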
421 #define REAL_YSCALEYUV2PACKED1b(index, c) \
422                 "xor "#index", "#index"         \n\t"\
423                 ".balign 16                     \n\t"\
424                 "1:                             \n\t"\
425                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
426                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
427                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431                 "psrlw $8, %%mm3                \n\t" \
432                 "psrlw $8, %%mm4                \n\t" \
433                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
434                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
435                 "psraw $7, %%mm1                \n\t" \
436                 "psraw $7, %%mm7                \n\t" 
437 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
438                 
439 // do vertical chrominance interpolation
440 #define REAL_YSCALEYUV2RGB1b(index, c) \
441                 "xor "#index", "#index"         \n\t"\
442                 ".balign 16                     \n\t"\
443                 "1:                             \n\t"\
444                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
445                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
446                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
448                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
450                 "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
451                 "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
452                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
453                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
454                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
455                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
456                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
458         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
459                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
460                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
461                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
463                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
466                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
467                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
468                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
469         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470                 "paddw %%mm3, %%mm4             \n\t"\
471                 "movq %%mm2, %%mm0              \n\t"\
472                 "movq %%mm5, %%mm6              \n\t"\
473                 "movq %%mm4, %%mm3              \n\t"\
474                 "punpcklwd %%mm2, %%mm2         \n\t"\
475                 "punpcklwd %%mm5, %%mm5         \n\t"\
476                 "punpcklwd %%mm4, %%mm4         \n\t"\
477                 "paddw %%mm1, %%mm2             \n\t"\
478                 "paddw %%mm1, %%mm5             \n\t"\
479                 "paddw %%mm1, %%mm4             \n\t"\
480                 "punpckhwd %%mm0, %%mm0         \n\t"\
481                 "punpckhwd %%mm6, %%mm6         \n\t"\
482                 "punpckhwd %%mm3, %%mm3         \n\t"\
483                 "paddw %%mm7, %%mm0             \n\t"\
484                 "paddw %%mm7, %%mm6             \n\t"\
485                 "paddw %%mm7, %%mm3             \n\t"\
486                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487                 "packuswb %%mm0, %%mm2          \n\t"\
488                 "packuswb %%mm6, %%mm5          \n\t"\
489                 "packuswb %%mm3, %%mm4          \n\t"\
490                 "pxor %%mm7, %%mm7              \n\t"
491 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
492
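/* Expands the packed B/G/R bytes (mm2/mm4/mm5, mm7=0) into 4-byte BGR0
   pixels and stores four quadwords (8 pixels, 32 bytes) per iteration. */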
493 #define REAL_WRITEBGR32(dst, dstw, index) \
494                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495                         "movq %%mm2, %%mm1              \n\t" /* B */\
496                         "movq %%mm5, %%mm6              \n\t" /* R */\
497                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
498                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
499                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
500                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
501                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
502                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
503                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
504                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
505                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
506                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
507 \
508                         MOVNTQ(%%mm0, (dst, index, 4))\
509                         MOVNTQ(%%mm2, 8(dst, index, 4))\
510                         MOVNTQ(%%mm1, 16(dst, index, 4))\
511                         MOVNTQ(%%mm3, 24(dst, index, 4))\
512 \
513                         "add $8, "#index"               \n\t"\
514                         "cmp "#dstw", "#index"          \n\t"\
515                         " jb 1b                         \n\t"
516 #define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
517
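/* WRITEBGR16 reduces the B/G/R bytes to 5-6-5 bits (WRITEBGR15 below to
   5-5-5), merges them with shifts and ORs, and stores 8 pixels (16 bytes)
   per iteration. */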
518 #define REAL_WRITEBGR16(dst, dstw, index) \
519                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
520                         "pand "MANGLE(bFC)", %%mm4      \n\t" /* G */\
521                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
522                         "psrlq $3, %%mm2                \n\t"\
523 \
524                         "movq %%mm2, %%mm1              \n\t"\
525                         "movq %%mm4, %%mm3              \n\t"\
526 \
527                         "punpcklbw %%mm7, %%mm3         \n\t"\
528                         "punpcklbw %%mm5, %%mm2         \n\t"\
529                         "punpckhbw %%mm7, %%mm4         \n\t"\
530                         "punpckhbw %%mm5, %%mm1         \n\t"\
531 \
532                         "psllq $3, %%mm3                \n\t"\
533                         "psllq $3, %%mm4                \n\t"\
534 \
535                         "por %%mm3, %%mm2               \n\t"\
536                         "por %%mm4, %%mm1               \n\t"\
537 \
538                         MOVNTQ(%%mm2, (dst, index, 2))\
539                         MOVNTQ(%%mm1, 8(dst, index, 2))\
540 \
541                         "add $8, "#index"               \n\t"\
542                         "cmp "#dstw", "#index"          \n\t"\
543                         " jb 1b                         \n\t"
544 #define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
545
546 #define REAL_WRITEBGR15(dst, dstw, index) \
547                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
548                         "pand "MANGLE(bF8)", %%mm4      \n\t" /* G */\
549                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
550                         "psrlq $3, %%mm2                \n\t"\
551                         "psrlq $1, %%mm5                \n\t"\
552 \
553                         "movq %%mm2, %%mm1              \n\t"\
554                         "movq %%mm4, %%mm3              \n\t"\
555 \
556                         "punpcklbw %%mm7, %%mm3         \n\t"\
557                         "punpcklbw %%mm5, %%mm2         \n\t"\
558                         "punpckhbw %%mm7, %%mm4         \n\t"\
559                         "punpckhbw %%mm5, %%mm1         \n\t"\
560 \
561                         "psllq $2, %%mm3                \n\t"\
562                         "psllq $2, %%mm4                \n\t"\
563 \
564                         "por %%mm3, %%mm2               \n\t"\
565                         "por %%mm4, %%mm1               \n\t"\
566 \
567                         MOVNTQ(%%mm2, (dst, index, 2))\
568                         MOVNTQ(%%mm1, 8(dst, index, 2))\
569 \
570                         "add $8, "#index"               \n\t"\
571                         "cmp "#dstw", "#index"          \n\t"\
572                         " jb 1b                         \n\t"
573 #define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
574
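/* Three 24-bit packers follow: WRITEBGR24OLD (an earlier shift/mask
   version, apparently kept only for reference), WRITEBGR24MMX, and an MMX2
   version built on pshufw; the #ifdef below selects the MMX2 one when
   available.  Each writes 24 bytes (8 pixels) per iteration and advances
   the destination pointer itself. */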
575 #define WRITEBGR24OLD(dst, dstw, index) \
576                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577                         "movq %%mm2, %%mm1              \n\t" /* B */\
578                         "movq %%mm5, %%mm6              \n\t" /* R */\
579                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
580                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
581                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
582                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
583                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
584                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
585                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
586                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
587                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
588                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
589 \
590                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
591                         "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
592                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593                         "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
594                         "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
595                         "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
596                         "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
597                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
598 \
599                         "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
600                         "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
601                         "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
602                         "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
603                         "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
604                         "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
605                         "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
606                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607                         "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
608                         "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
609                         "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
610                         "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
611                         "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
612 \
613                         "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
614                         "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
615                         "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
616                         "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617                         "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
618                         "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
619                         "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
620                         "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
621 \
622                         MOVNTQ(%%mm0, (dst))\
623                         MOVNTQ(%%mm2, 8(dst))\
624                         MOVNTQ(%%mm3, 16(dst))\
625                         "add $24, "#dst"                \n\t"\
626 \
627                         "add $8, "#index"               \n\t"\
628                         "cmp "#dstw", "#index"          \n\t"\
629                         " jb 1b                         \n\t"
630
631 #define WRITEBGR24MMX(dst, dstw, index) \
632                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633                         "movq %%mm2, %%mm1              \n\t" /* B */\
634                         "movq %%mm5, %%mm6              \n\t" /* R */\
635                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
636                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
637                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
638                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
639                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
640                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
641                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
642                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
643                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
644                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
645 \
646                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
647                         "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
648                         "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
649                         "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
650 \
651                         "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
652                         "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
653                         "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
654                         "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
655 \
656                         "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
657                         "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
658                         "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
659                         "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
660 \
661                         "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
662                         "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
663                         "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
664                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
665                         MOVNTQ(%%mm0, (dst))\
666 \
667                         "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
668                         "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
669                         "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
670                         "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
671                         MOVNTQ(%%mm6, 8(dst))\
672 \
673                         "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
674                         "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
675                         "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
676                         MOVNTQ(%%mm5, 16(dst))\
677 \
678                         "add $24, "#dst"                \n\t"\
679 \
680                         "add $8, "#index"                       \n\t"\
681                         "cmp "#dstw", "#index"                  \n\t"\
682                         " jb 1b                         \n\t"
683
684 #define WRITEBGR24MMX2(dst, dstw, index) \
685                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686                         "movq "MANGLE(M24A)", %%mm0     \n\t"\
687                         "movq "MANGLE(M24C)", %%mm7     \n\t"\
688                         "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
689                         "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
690                         "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
691 \
692                         "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
693                         "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
694                         "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
695 \
696                         "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
697                         "por %%mm1, %%mm6               \n\t"\
698                         "por %%mm3, %%mm6               \n\t"\
699                         MOVNTQ(%%mm6, (dst))\
700 \
701                         "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
702                         "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
703                         "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
704                         "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
705 \
706                         "pand "MANGLE(M24B)", %%mm1     \n\t" /* B5       B4        B3    */\
707                         "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
708                         "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
709 \
710                         "por %%mm1, %%mm3               \n\t" /* B5    G4 B4     G3 B3    */\
711                         "por %%mm3, %%mm6               \n\t"\
712                         MOVNTQ(%%mm6, 8(dst))\
713 \
714                         "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
715                         "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
716                         "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
717 \
718                         "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
719                         "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
720                         "pand "MANGLE(M24B)", %%mm6     \n\t" /* R7       R6        R5    */\
721 \
722                         "por %%mm1, %%mm3               \n\t"\
723                         "por %%mm3, %%mm6               \n\t"\
724                         MOVNTQ(%%mm6, 16(dst))\
725 \
726                         "add $24, "#dst"                \n\t"\
727 \
728                         "add $8, "#index"               \n\t"\
729                         "cmp "#dstw", "#index"          \n\t"\
730                         " jb 1b                         \n\t"
731
732 #ifdef HAVE_MMX2
733 #undef WRITEBGR24
734 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
735 #else
736 #undef WRITEBGR24
737 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
738 #endif
739
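/* Packs Y (mm1/mm7) and the interleaved U/V bytes (mm3/mm4) into YUYV
   order and stores 16 bytes (8 pixels) per iteration. */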
740 #define REAL_WRITEYUY2(dst, dstw, index) \
741                         "packuswb %%mm3, %%mm3          \n\t"\
742                         "packuswb %%mm4, %%mm4          \n\t"\
743                         "packuswb %%mm7, %%mm1          \n\t"\
744                         "punpcklbw %%mm4, %%mm3         \n\t"\
745                         "movq %%mm1, %%mm7              \n\t"\
746                         "punpcklbw %%mm3, %%mm1         \n\t"\
747                         "punpckhbw %%mm3, %%mm7         \n\t"\
748 \
749                         MOVNTQ(%%mm1, (dst, index, 2))\
750                         MOVNTQ(%%mm7, 8(dst, index, 2))\
751 \
752                         "add $8, "#index"               \n\t"\
753                         "cmp "#dstw", "#index"          \n\t"\
754                         " jb 1b                         \n\t"
755 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
756
757
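/**
 * vertical multi-tap scale to planar YV12; uses MMX when available,
 * otherwise falls back to the AltiVec or plain C implementation
 */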
758 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
759                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
760                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
761 {
762 #ifdef HAVE_MMX
763         if(uDest != NULL)
764         {
765                 asm volatile(
766                                 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
767                                 :: "r" (&c->redDither),
768                                 "r" (uDest), "p" (chrDstW)
769                                 : "%"REG_a, "%"REG_d, "%"REG_S
770                         );
771
772                 asm volatile(
773                                 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
774                                 :: "r" (&c->redDither),
775                                 "r" (vDest), "p" (chrDstW)
776                                 : "%"REG_a, "%"REG_d, "%"REG_S
777                         );
778         }
779
780         asm volatile(
781                         YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
782                         :: "r" (&c->redDither),
783                            "r" (dest), "p" (dstW)
784                         : "%"REG_a, "%"REG_d, "%"REG_S
785                 );
786 #else
787 #ifdef HAVE_ALTIVEC
788 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
789                       chrFilter, chrSrc, chrFilterSize,
790                       dest, uDest, vDest, dstW, chrDstW);
791 #else //HAVE_ALTIVEC
792 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
793             chrFilter, chrSrc, chrFilterSize,
794             dest, uDest, vDest, dstW, chrDstW);
795 #endif //!HAVE_ALTIVEC
796 #endif
797 }
798
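/**
 * vertical scale with interleaved-chroma (NV12-style) output;
 * simply forwards to the C implementation
 */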
799 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
800                                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
801                                      uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
802 {
803 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
804              chrFilter, chrSrc, chrFilterSize,
805              dest, uDest, dstW, chrDstW, dstFormat);
806 }
807
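/**
 * unscaled (1-tap) vertical pass to planar output (source >>7,
 * saturated to 0..255)
 */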
808 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
809                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
810 {
811 #ifdef HAVE_MMX
812         if(uDest != NULL)
813         {
814                 asm volatile(
815                                 YSCALEYUV2YV121
816                                 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
817                                 "g" (-chrDstW)
818                                 : "%"REG_a
819                         );
820
821                 asm volatile(
822                                 YSCALEYUV2YV121
823                                 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
824                                 "g" (-chrDstW)
825                                 : "%"REG_a
826                         );
827         }
828
829         asm volatile(
830                 YSCALEYUV2YV121
831                 :: "r" (lumSrc + dstW), "r" (dest + dstW),
832                 "g" (-dstW)
833                 : "%"REG_a
834         );
835 #else
836         int i;
837         for(i=0; i<dstW; i++)
838         {
839                 int val= lumSrc[i]>>7;
840                 
841                 if(val&256){
842                         if(val<0) val=0;
843                         else      val=255;
844                 }
845
846                 dest[i]= val;
847         }
848
849         if(uDest != NULL)
850                 for(i=0; i<chrDstW; i++)
851                 {
852                         int u=chrSrc[i]>>7;
853                         int v=chrSrc[i + 2048]>>7;
854
855                         if((u|v)&256){
856                                 if(u<0)         u=0;
857                                 else if (u>255) u=255;
858                                 if(v<0)         v=0;
859                                 else if (v>255) v=255;
860                         }
861
862                         uDest[i]= u;
863                         vDest[i]= v;
864                 }
865 #endif
866 }
867
868
869 /**
870  * vertical scale YV12 to RGB
871  */
872 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
873                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
874                             uint8_t *dest, int dstW, int dstY)
875 {
876         int dummy=0;
877         switch(c->dstFormat)
878         {
879 #ifdef HAVE_MMX
880         case IMGFMT_BGR32:
881                 {
882                         asm volatile(
883                                 YSCALEYUV2RGBX
884                                 WRITEBGR32(%4, %5, %%REGa)
885
886                         :: "r" (&c->redDither), 
887                            "m" (dummy), "m" (dummy), "m" (dummy),
888                            "r" (dest), "m" (dstW)
889                         : "%"REG_a, "%"REG_d, "%"REG_S
890                         );
891                 }
892                 break;
893         case IMGFMT_BGR24:
894                 {
895                         asm volatile(
896                                 YSCALEYUV2RGBX
897                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
898                                 "add %4, %%"REG_b"                      \n\t"
899                                 WRITEBGR24(%%REGb, %5, %%REGa)
900
901                         :: "r" (&c->redDither), 
902                            "m" (dummy), "m" (dummy), "m" (dummy),
903                            "r" (dest), "m" (dstW)
904                         : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
905                         );
906                 }
907                 break;
908         case IMGFMT_BGR15:
909                 {
910                         asm volatile(
911                                 YSCALEYUV2RGBX
912                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
913 #ifdef DITHER1XBPP
914                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
915                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
916                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
917 #endif
918
919                                 WRITEBGR15(%4, %5, %%REGa)
920
921                         :: "r" (&c->redDither), 
922                            "m" (dummy), "m" (dummy), "m" (dummy),
923                            "r" (dest), "m" (dstW)
924                         : "%"REG_a, "%"REG_d, "%"REG_S
925                         );
926                 }
927                 break;
928         case IMGFMT_BGR16:
929                 {
930                         asm volatile(
931                                 YSCALEYUV2RGBX
932                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
933 #ifdef DITHER1XBPP
934                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
935                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
936                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
937 #endif
938
939                                 WRITEBGR16(%4, %5, %%REGa)
940
941                         :: "r" (&c->redDither), 
942                            "m" (dummy), "m" (dummy), "m" (dummy),
943                            "r" (dest), "m" (dstW)
944                         : "%"REG_a, "%"REG_d, "%"REG_S
945                         );
946                 }
947                 break;
948         case IMGFMT_YUY2:
949                 {
950                         asm volatile(
951                                 YSCALEYUV2PACKEDX
952                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
953
954                                 "psraw $3, %%mm3                \n\t"
955                                 "psraw $3, %%mm4                \n\t"
956                                 "psraw $3, %%mm1                \n\t"
957                                 "psraw $3, %%mm7                \n\t"
958                                 WRITEYUY2(%4, %5, %%REGa)
959
960                         :: "r" (&c->redDither), 
961                            "m" (dummy), "m" (dummy), "m" (dummy),
962                            "r" (dest), "m" (dstW)
963                         : "%"REG_a, "%"REG_d, "%"REG_S
964                         );
965                 }
966                 break;
967 #endif
968         default:
969 #ifdef HAVE_ALTIVEC
970                 /* The following list of supported dstFormat values should
971                    match what's found in the body of altivec_yuv2packedX() */
972                 if(c->dstFormat==IMGFMT_ABGR  || c->dstFormat==IMGFMT_BGRA  ||
973                    c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
974                    c->dstFormat==IMGFMT_RGBA  || c->dstFormat==IMGFMT_ARGB)
975                         altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
976                                     chrFilter, chrSrc, chrFilterSize,
977                                     dest, dstW, dstY);
978                 else
979 #endif
980                         yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
981                                     chrFilter, chrSrc, chrFilterSize,
982                                     dest, dstW, dstY);
983                 break;
984         }
985 }
986
987 /**
988  * vertical bilinear scale YV12 to RGB
989  */
990 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
991                             uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
992 {
993         int yalpha1=yalpha^4095;
994         int uvalpha1=uvalpha^4095;
995         int i;
996
997 #if 0 //isn't used
998         if(flags&SWS_FULL_CHR_H_INT)
999         {
1000                 switch(dstFormat)
1001                 {
1002 #ifdef HAVE_MMX
1003                 case IMGFMT_BGR32:
1004                         asm volatile(
1005
1006
1007 FULL_YSCALEYUV2RGB
1008                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
1009                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
1010
1011                         "movq %%mm3, %%mm1              \n\t"
1012                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
1013                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1014
1015                         MOVNTQ(%%mm3, (%4, %%REGa, 4))
1016                         MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1017
1018                         "add $4, %%"REG_a"              \n\t"
1019                         "cmp %5, %%"REG_a"              \n\t"
1020                         " jb 1b                         \n\t"
1021
1022
1023                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1024                         "m" (yalpha1), "m" (uvalpha1)
1025                         : "%"REG_a
1026                         );
1027                         break;
1028                 case IMGFMT_BGR24:
1029                         asm volatile(
1030
1031 FULL_YSCALEYUV2RGB
1032
1033                                                                 // lsb ... msb
1034                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
1035                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
1036
1037                         "movq %%mm3, %%mm1              \n\t"
1038                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
1039                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1040
1041                         "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
1042                         "psrlq $8, %%mm3                \n\t" // GR0BGR00
1043                         "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1044                         "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1045                         "por %%mm2, %%mm3               \n\t" // BGRBGR00
1046                         "movq %%mm1, %%mm2              \n\t"
1047                         "psllq $48, %%mm1               \n\t" // 000000BG
1048                         "por %%mm1, %%mm3               \n\t" // BGRBGRBG
1049
1050                         "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
1051                         "psrld $16, %%mm2               \n\t" // R000R000
1052                         "psrlq $24, %%mm1               \n\t" // 0BGR0000
1053                         "por %%mm2, %%mm1               \n\t" // RBGRR000
1054
1055                         "mov %4, %%"REG_b"              \n\t"
1056                         "add %%"REG_a", %%"REG_b"       \n\t"
1057
1058 #ifdef HAVE_MMX2
1059                         //FIXME Alignment
1060                         "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1061                         "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1062 #else
1063                         "movd %%mm3, (%%"REG_b", %%"REG_a", 2)  \n\t"
1064                         "psrlq $32, %%mm3               \n\t"
1065                         "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1066                         "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1067 #endif
1068                         "add $4, %%"REG_a"              \n\t"
1069                         "cmp %5, %%"REG_a"              \n\t"
1070                         " jb 1b                         \n\t"
1071
1072                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1073                         "m" (yalpha1), "m" (uvalpha1)
1074                         : "%"REG_a, "%"REG_b
1075                         );
1076                         break;
1077                 case IMGFMT_BGR15:
1078                         asm volatile(
1079
1080 FULL_YSCALEYUV2RGB
1081 #ifdef DITHER1XBPP
1082                         "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1083                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1084                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1085 #endif
1086                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1087                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1088                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1089
1090                         "psrlw $3, %%mm3                \n\t"
1091                         "psllw $2, %%mm1                \n\t"
1092                         "psllw $7, %%mm0                \n\t"
1093                         "pand "MANGLE(g15Mask)", %%mm1  \n\t"
1094                         "pand "MANGLE(r15Mask)", %%mm0  \n\t"
1095
1096                         "por %%mm3, %%mm1               \n\t"
1097                         "por %%mm1, %%mm0               \n\t"
1098
1099                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1100
1101                         "add $4, %%"REG_a"              \n\t"
1102                         "cmp %5, %%"REG_a"              \n\t"
1103                         " jb 1b                         \n\t"
1104
1105                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1106                         "m" (yalpha1), "m" (uvalpha1)
1107                         : "%"REG_a
1108                         );
1109                         break;
1110                 case IMGFMT_BGR16:
1111                         asm volatile(
1112
1113 FULL_YSCALEYUV2RGB
1114 #ifdef DITHER1XBPP
1115                         "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1116                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1117                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1118 #endif
1119                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1120                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1121                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1122
1123                         "psrlw $3, %%mm3                \n\t"
1124                         "psllw $3, %%mm1                \n\t"
1125                         "psllw $8, %%mm0                \n\t"
1126                         "pand "MANGLE(g16Mask)", %%mm1  \n\t"
1127                         "pand "MANGLE(r16Mask)", %%mm0  \n\t"
1128
1129                         "por %%mm3, %%mm1               \n\t"
1130                         "por %%mm1, %%mm0               \n\t"
1131
1132                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1133
1134                         "add $4, %%"REG_a"              \n\t"
1135                         "cmp %5, %%"REG_a"              \n\t"
1136                         " jb 1b                         \n\t"
1137
1138                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1139                         "m" (yalpha1), "m" (uvalpha1)
1140                         : "%"REG_a
1141                         );
1142                 break;
1143 #endif
1144                 case IMGFMT_RGB32:
1145 #ifndef HAVE_MMX
1146                 case IMGFMT_BGR32:
1147 #endif
1148                 if(dstFormat==IMGFMT_BGR32)
1149                 {
1150                         int i;
1151 #ifdef WORDS_BIGENDIAN
1152                         dest++;
1153 #endif
1154                         for(i=0;i<dstW;i++){
1155                                 // vertical linear interpolation && yuv2rgb in a single step:
1156                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1157                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1158                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1159                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1160                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1161                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1162                                 dest+= 4;
1163                         }
1164                 }
1165                 else if(dstFormat==IMGFMT_BGR24)
1166                 {
1167                         int i;
1168                         for(i=0;i<dstW;i++){
1169                                 // vertical linear interpolation && yuv2rgb in a single step:
1170                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1171                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1172                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1173                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1174                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1175                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1176                                 dest+= 3;
1177                         }
1178                 }
1179                 else if(dstFormat==IMGFMT_BGR16)
1180                 {
1181                         int i;
1182                         for(i=0;i<dstW;i++){
1183                                 // vertical linear interpolation && yuv2rgb in a single step:
1184                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1185                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1186                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1187
1188                                 ((uint16_t*)dest)[i] =
1189                                         clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1190                                         clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1191                                         clip_table16r[(Y + yuvtab_3343[V]) >>13];
1192                         }
1193                 }
1194                 else if(dstFormat==IMGFMT_BGR15)
1195                 {
1196                         int i;
1197                         for(i=0;i<dstW;i++){
1198                                 // vertical linear interpolation && yuv2rgb in a single step:
1199                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1200                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1201                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1202
1203                                 ((uint16_t*)dest)[i] =
1204                                         clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1205                                         clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1206                                         clip_table15r[(Y + yuvtab_3343[V]) >>13];
1207                         }
1208                 }
1209         }//FULL_UV_IPOL
1210         else
1211         {
1212 #endif // if 0
1213 #ifdef HAVE_MMX
1214         switch(c->dstFormat)
1215         {
1216 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1217         case IMGFMT_BGR32:
1218                         asm volatile(
1219                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1220                                 "mov %4, %%"REG_SP"                     \n\t"
1221                                 YSCALEYUV2RGB(%%REGa, %5)
1222                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1223                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1224
1225                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1226                         "r" (&c->redDither)
1227                         : "%"REG_a
1228                         );
1229                         return;
1230         case IMGFMT_BGR24:
1231                         asm volatile(
1232                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1233                                 "mov %4, %%"REG_SP"                     \n\t"
1234                                 YSCALEYUV2RGB(%%REGa, %5)
1235                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1236                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1237                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1238                         "r" (&c->redDither)
1239                         : "%"REG_a
1240                         );
1241                         return;
1242         case IMGFMT_BGR15:
1243                         asm volatile(
1244                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1245                                 "mov %4, %%"REG_SP"                     \n\t"
1246                                 YSCALEYUV2RGB(%%REGa, %5)
1247                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1248 #ifdef DITHER1XBPP
1249                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1250                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1251                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1252 #endif
1253
1254                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1255                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1256
1257                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1258                         "r" (&c->redDither)
1259                         : "%"REG_a
1260                         );
1261                         return;
1262         case IMGFMT_BGR16:
1263                         asm volatile(
1264                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1265                                 "mov %4, %%"REG_SP"                     \n\t"
1266                                 YSCALEYUV2RGB(%%REGa, %5)
1267                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1268 #ifdef DITHER1XBPP
1269                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1270                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1271                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1272 #endif
1273
1274                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1275                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1276                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1277                         "r" (&c->redDither)
1278                         : "%"REG_a
1279                         );
1280                         return;
1281         case IMGFMT_YUY2:
1282                         asm volatile(
1283                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1284                                 "mov %4, %%"REG_SP"                     \n\t"
1285                                 YSCALEYUV2PACKED(%%REGa, %5)
1286                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1287                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1288                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1289                         "r" (&c->redDither)
1290                         : "%"REG_a
1291                         );
1292                         return;
1293         default: break;
1294         }
1295 #endif //HAVE_MMX
1296 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1297 }
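
/* A rough scalar sketch, not built, of what yuv2packed2 does for one BGR32
   pixel: blend two source lines with 12-bit weights, then convert to RGB.
   The >>19 step matches the unscaled C paths kept under #if 0 above; the
   integer BT.601 coefficients and the clip helper here are generic
   stand-ins for the yuvtab_* / clip_table lookups used elsewhere in this
   file, chosen for illustration only. */
#if 0
static inline int RENAME(clip8)(int v){ return v<0 ? 0 : v>255 ? 255 : v; }

static inline void RENAME(blendOnePixelBGR32)(uint16_t *buf0, uint16_t *buf1,
                            uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int i, int yalpha, int uvalpha)
{
        int yalpha1= yalpha^4095, uvalpha1= uvalpha^4095;       // complementary 12-bit weights
        int Y= ((buf0[i]  *yalpha1  + buf1[i]  *yalpha )>>19) - 16;
        int U= ((uvbuf0[i]*uvalpha1 + uvbuf1[i]*uvalpha)>>19) - 128;
        int V= ((uvbuf0[i+2048]*uvalpha1 + uvbuf1[i+2048]*uvalpha)>>19) - 128;
        dest[4*i+0]= RENAME(clip8)((298*Y + 516*U         )>>8);        // B
        dest[4*i+1]= RENAME(clip8)((298*Y - 100*U - 208*V )>>8);        // G
        dest[4*i+2]= RENAME(clip8)((298*Y         + 409*V )>>8);        // R
}
#endif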
1298
1299 /**
1300  * YV12 to RGB without scaling or interpolating
1301  */
1302 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1303                             uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1304 {
1305         const int yalpha1=0;
1306         int i;
1307         
1308         uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1309         const int yalpha= 4096; //FIXME ...
1310
1311         if(flags&SWS_FULL_CHR_H_INT)
1312         {
1313                 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1314                 return;
1315         }
1316
1317 #ifdef HAVE_MMX
1318         if( uvalpha < 2048 ) // note: this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster
1319         {
1320                 switch(dstFormat)
1321                 {
1322                 case IMGFMT_BGR32:
1323                         asm volatile(
1324                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1325                                 "mov %4, %%"REG_SP"                     \n\t"
1326                                 YSCALEYUV2RGB1(%%REGa, %5)
1327                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1328                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1329
1330                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1331                         "r" (&c->redDither)
1332                         : "%"REG_a
1333                         );
1334                         return;
1335                 case IMGFMT_BGR24:
1336                         asm volatile(
1337                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1338                                 "mov %4, %%"REG_SP"                     \n\t"
1339                                 YSCALEYUV2RGB1(%%REGa, %5)
1340                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1341                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1342
1343                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1344                         "r" (&c->redDither)
1345                         : "%"REG_a
1346                         );
1347                         return;
1348                 case IMGFMT_BGR15:
1349                         asm volatile(
1350                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1351                                 "mov %4, %%"REG_SP"                     \n\t"
1352                                 YSCALEYUV2RGB1(%%REGa, %5)
1353                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1354 #ifdef DITHER1XBPP
1355                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1356                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1357                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1358 #endif
1359                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1360                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1361
1362                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1363                         "r" (&c->redDither)
1364                         : "%"REG_a
1365                         );
1366                         return;
1367                 case IMGFMT_BGR16:
1368                         asm volatile(
1369                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1370                                 "mov %4, %%"REG_SP"                     \n\t"
1371                                 YSCALEYUV2RGB1(%%REGa, %5)
1372                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1373 #ifdef DITHER1XBPP
1374                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1375                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1376                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1377 #endif
1378
1379                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1380                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1381
1382                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1383                         "r" (&c->redDither)
1384                         : "%"REG_a
1385                         );
1386                         return;
1387                 case IMGFMT_YUY2:
1388                         asm volatile(
1389                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1390                                 "mov %4, %%"REG_SP"                     \n\t"
1391                                 YSCALEYUV2PACKED1(%%REGa, %5)
1392                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1393                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1394
1395                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1396                         "r" (&c->redDither)
1397                         : "%"REG_a
1398                         );
1399                         return;
1400                 }
1401         }
1402         else
1403         {
1404                 switch(dstFormat)
1405                 {
1406                 case IMGFMT_BGR32:
1407                         asm volatile(
1408                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1409                                 "mov %4, %%"REG_SP"                     \n\t"
1410                                 YSCALEYUV2RGB1b(%%REGa, %5)
1411                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1412                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1413
1414                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1415                         "r" (&c->redDither)
1416                         : "%"REG_a
1417                         );
1418                         return;
1419                 case IMGFMT_BGR24:
1420                         asm volatile(
1421                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1422                                 "mov %4, %%"REG_SP"                     \n\t"
1423                                 YSCALEYUV2RGB1b(%%REGa, %5)
1424                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1425                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1426
1427                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1428                         "r" (&c->redDither)
1429                         : "%"REG_a
1430                         );
1431                         return;
1432                 case IMGFMT_BGR15:
1433                         asm volatile(
1434                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1435                                 "mov %4, %%"REG_SP"                     \n\t"
1436                                 YSCALEYUV2RGB1b(%%REGa, %5)
1437                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1438 #ifdef DITHER1XBPP
1439                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1440                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1441                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1442 #endif
1443                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1444                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1445
1446                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1447                         "r" (&c->redDither)
1448                         : "%"REG_a
1449                         );
1450                         return;
1451                 case IMGFMT_BGR16:
1452                         asm volatile(
1453                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1454                                 "mov %4, %%"REG_SP"                     \n\t"
1455                                 YSCALEYUV2RGB1b(%%REGa, %5)
1456                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1457 #ifdef DITHER1XBPP
1458                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1459                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1460                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1461 #endif
1462
1463                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1464                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1465
1466                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1467                         "r" (&c->redDither)
1468                         : "%"REG_a
1469                         );
1470                         return;
1471                 case IMGFMT_YUY2:
1472                         asm volatile(
1473                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1474                                 "mov %4, %%"REG_SP"                     \n\t"
1475                                 YSCALEYUV2PACKED1b(%%REGa, %5)
1476                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1477                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1478
1479                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1480                         "r" (&c->redDither)
1481                         : "%"REG_a
1482                         );
1483                         return;
1484                 }
1485         }
1486 #endif
1487         if( uvalpha < 2048 )
1488         {
1489                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1490         }else{
1491                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1492         }
1493 }
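
/* The unscaled path above only has to decide where its chroma comes from:
   with uvalpha < 2048 a single chroma line is read as-is (the "shifts
   chrominance by 0.5 pixels" shortcut noted above), otherwise the two
   chroma lines are averaged.  A scalar sketch of that choice, using the
   same >>19 fixed-point convention as the blended path and kept out of
   the build: */
#if 0
static inline int RENAME(chromaSample)(uint16_t *uvbufA, uint16_t *uvbufB,
                                       int i, int uvalpha)
{
        if(uvalpha < 2048)
                return uvbufA[i]>>7;                    // one chroma line, weight 4096
        else
                return (uvbufA[i] + uvbufB[i])>>8;      // both lines, weight 2048 each
}
#endif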
1494
1495 //FIXME the yuy2* functions can read up to 7 samples too many
1496
1497 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1498 {
1499 #ifdef HAVE_MMX
1500         asm volatile(
1501                 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1502                 "mov %0, %%"REG_a"              \n\t"
1503                 "1:                             \n\t"
1504                 "movq (%1, %%"REG_a",2), %%mm0  \n\t"
1505                 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1506                 "pand %%mm2, %%mm0              \n\t"
1507                 "pand %%mm2, %%mm1              \n\t"
1508                 "packuswb %%mm1, %%mm0          \n\t"
1509                 "movq %%mm0, (%2, %%"REG_a")    \n\t"
1510                 "add $8, %%"REG_a"              \n\t"
1511                 " js 1b                         \n\t"
1512                 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1513                 : "%"REG_a
1514         );
1515 #else
1516         int i;
1517         for(i=0; i<width; i++)
1518                 dst[i]= src[2*i];
1519 #endif
1520 }
1521
1522 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1523 {
1524 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1525         asm volatile(
1526                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1527                 "mov %0, %%"REG_a"              \n\t"
1528                 "1:                             \n\t"
1529                 "movq (%1, %%"REG_a",4), %%mm0  \n\t"
1530                 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1531                 "movq (%2, %%"REG_a",4), %%mm2  \n\t"
1532                 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1533                 PAVGB(%%mm2, %%mm0)
1534                 PAVGB(%%mm3, %%mm1)
1535                 "psrlw $8, %%mm0                \n\t"
1536                 "psrlw $8, %%mm1                \n\t"
1537                 "packuswb %%mm1, %%mm0          \n\t"
1538                 "movq %%mm0, %%mm1              \n\t"
1539                 "psrlw $8, %%mm0                \n\t"
1540                 "pand %%mm4, %%mm1              \n\t"
1541                 "packuswb %%mm0, %%mm0          \n\t"
1542                 "packuswb %%mm1, %%mm1          \n\t"
1543                 "movd %%mm0, (%4, %%"REG_a")    \n\t"
1544                 "movd %%mm1, (%3, %%"REG_a")    \n\t"
1545                 "add $4, %%"REG_a"              \n\t"
1546                 " js 1b                         \n\t"
1547                 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1548                 : "%"REG_a
1549         );
1550 #else
1551         int i;
1552         for(i=0; i<width; i++)
1553         {
1554                 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1555                 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1556         }
1557 #endif
1558 }
1559
1560 // This is almost identical to the previous function, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses.
1561 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1562 {
1563 #ifdef HAVE_MMX
1564         asm volatile(
1565                 "mov %0, %%"REG_a"              \n\t"
1566                 "1:                             \n\t"
1567                 "movq (%1, %%"REG_a",2), %%mm0  \n\t"
1568                 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1569                 "psrlw $8, %%mm0                \n\t"
1570                 "psrlw $8, %%mm1                \n\t"
1571                 "packuswb %%mm1, %%mm0          \n\t"
1572                 "movq %%mm0, (%2, %%"REG_a")    \n\t"
1573                 "add $8, %%"REG_a"              \n\t"
1574                 " js 1b                         \n\t"
1575                 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1576                 : "%"REG_a
1577         );
1578 #else
1579         int i;
1580         for(i=0; i<width; i++)
1581                 dst[i]= src[2*i+1];
1582 #endif
1583 }
1584
1585 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1586 {
1587 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1588         asm volatile(
1589                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1590                 "mov %0, %%"REG_a"              \n\t"
1591                 "1:                             \n\t"
1592                 "movq (%1, %%"REG_a",4), %%mm0  \n\t"
1593                 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1594                 "movq (%2, %%"REG_a",4), %%mm2  \n\t"
1595                 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1596                 PAVGB(%%mm2, %%mm0)
1597                 PAVGB(%%mm3, %%mm1)
1598                 "pand %%mm4, %%mm0              \n\t"
1599                 "pand %%mm4, %%mm1              \n\t"
1600                 "packuswb %%mm1, %%mm0          \n\t"
1601                 "movq %%mm0, %%mm1              \n\t"
1602                 "psrlw $8, %%mm0                \n\t"
1603                 "pand %%mm4, %%mm1              \n\t"
1604                 "packuswb %%mm0, %%mm0          \n\t"
1605                 "packuswb %%mm1, %%mm1          \n\t"
1606                 "movd %%mm0, (%4, %%"REG_a")    \n\t"
1607                 "movd %%mm1, (%3, %%"REG_a")    \n\t"
1608                 "add $4, %%"REG_a"              \n\t"
1609                 " js 1b                         \n\t"
1610                 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1611                 : "%"REG_a
1612         );
1613 #else
1614         int i;
1615         for(i=0; i<width; i++)
1616         {
1617                 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1618                 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
1619         }
1620 #endif
1621 }
1622
1623 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1624 {
1625         int i;
1626         for(i=0; i<width; i++)
1627         {
1628                 int b=  ((uint32_t*)src)[i]&0xFF;
1629                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1630                 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1631
1632                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1633         }
1634 }
1635
1636 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1637 {
1638         int i;
1639         for(i=0; i<width; i++)
1640         {
1641                 const int a= ((uint32_t*)src1)[2*i+0];
1642                 const int e= ((uint32_t*)src1)[2*i+1];
1643                 const int c= ((uint32_t*)src2)[2*i+0];
1644                 const int d= ((uint32_t*)src2)[2*i+1];
1645                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1646                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1647                 const int b=  l&0x3FF;
1648                 const int g=  h>>8;
1649                 const int r=  l>>16;
1650
1651                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1652                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1653         }
1654 }
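
/* bgr32ToUV sums a 2x2 block per output sample: the 0xFF00FF mask keeps B and
   R of each pixel in fields wide enough that four of them can be added
   without carrying into each other, 0x00FF00 does the same for G, and the
   +2 in the final shift divides the 4-pixel sum back down.  A tiny
   self-check of that packing (illustrative only, not built): */
#if 0
static void RENAME(bgr32SwarCheck)(void)
{
        uint32_t px[4]= { 0x00102030, 0x00FFFFFF, 0x00000000, 0x00804020 };
        uint32_t l=0, h=0;
        int i, sb=0, sg=0, sr=0;                // naive per-channel sums
        for(i=0; i<4; i++)
        {
                l += px[i]&0xFF00FF;
                h += px[i]&0x00FF00;
                sb +=  px[i]      &0xFF;
                sg += (px[i]>> 8) &0xFF;
                sr += (px[i]>>16) &0xFF;
        }
        assert((int)(l&0x3FF) == sb);           // four blues stay below bit 10
        assert((int)(h>>8)    == sg);
        assert((int)(l>>16)   == sr);
}
#endif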
1655
1656 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1657 {
1658 #ifdef HAVE_MMX
1659         asm volatile(
1660                 "mov %2, %%"REG_a"              \n\t"
1661                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1662                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1663                 "pxor %%mm7, %%mm7              \n\t"
1664                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1665                 ".balign 16                     \n\t"
1666                 "1:                             \n\t"
1667                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
1668                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
1669                 "movd 3(%0, %%"REG_b"), %%mm1   \n\t"
1670                 "punpcklbw %%mm7, %%mm0         \n\t"
1671                 "punpcklbw %%mm7, %%mm1         \n\t"
1672                 "movd 6(%0, %%"REG_b"), %%mm2   \n\t"
1673                 "movd 9(%0, %%"REG_b"), %%mm3   \n\t"
1674                 "punpcklbw %%mm7, %%mm2         \n\t"
1675                 "punpcklbw %%mm7, %%mm3         \n\t"
1676                 "pmaddwd %%mm6, %%mm0           \n\t"
1677                 "pmaddwd %%mm6, %%mm1           \n\t"
1678                 "pmaddwd %%mm6, %%mm2           \n\t"
1679                 "pmaddwd %%mm6, %%mm3           \n\t"
1680 #ifndef FAST_BGR2YV12
1681                 "psrad $8, %%mm0                \n\t"
1682                 "psrad $8, %%mm1                \n\t"
1683                 "psrad $8, %%mm2                \n\t"
1684                 "psrad $8, %%mm3                \n\t"
1685 #endif
1686                 "packssdw %%mm1, %%mm0          \n\t"
1687                 "packssdw %%mm3, %%mm2          \n\t"
1688                 "pmaddwd %%mm5, %%mm0           \n\t"
1689                 "pmaddwd %%mm5, %%mm2           \n\t"
1690                 "packssdw %%mm2, %%mm0          \n\t"
1691                 "psraw $7, %%mm0                \n\t"
1692
1693                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
1694                 "movd 15(%0, %%"REG_b"), %%mm1  \n\t"
1695                 "punpcklbw %%mm7, %%mm4         \n\t"
1696                 "punpcklbw %%mm7, %%mm1         \n\t"
1697                 "movd 18(%0, %%"REG_b"), %%mm2  \n\t"
1698                 "movd 21(%0, %%"REG_b"), %%mm3  \n\t"
1699                 "punpcklbw %%mm7, %%mm2         \n\t"
1700                 "punpcklbw %%mm7, %%mm3         \n\t"
1701                 "pmaddwd %%mm6, %%mm4           \n\t"
1702                 "pmaddwd %%mm6, %%mm1           \n\t"
1703                 "pmaddwd %%mm6, %%mm2           \n\t"
1704                 "pmaddwd %%mm6, %%mm3           \n\t"
1705 #ifndef FAST_BGR2YV12
1706                 "psrad $8, %%mm4                \n\t"
1707                 "psrad $8, %%mm1                \n\t"
1708                 "psrad $8, %%mm2                \n\t"
1709                 "psrad $8, %%mm3                \n\t"
1710 #endif
1711                 "packssdw %%mm1, %%mm4          \n\t"
1712                 "packssdw %%mm3, %%mm2          \n\t"
1713                 "pmaddwd %%mm5, %%mm4           \n\t"
1714                 "pmaddwd %%mm5, %%mm2           \n\t"
1715                 "add $24, %%"REG_b"             \n\t"
1716                 "packssdw %%mm2, %%mm4          \n\t"
1717                 "psraw $7, %%mm4                \n\t"
1718
1719                 "packuswb %%mm4, %%mm0          \n\t"
1720                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1721
1722                 "movq %%mm0, (%1, %%"REG_a")    \n\t"
1723                 "add $8, %%"REG_a"              \n\t"
1724                 " js 1b                         \n\t"
1725                 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1726                 : "%"REG_a, "%"REG_b
1727         );
1728 #else
1729         int i;
1730         for(i=0; i<width; i++)
1731         {
1732                 int b= src[i*3+0];
1733                 int g= src[i*3+1];
1734                 int r= src[i*3+2];
1735
1736                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1737         }
1738 #endif
1739 }
1740
1741 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1742 {
1743 #ifdef HAVE_MMX
1744         asm volatile(
1745                 "mov %4, %%"REG_a"              \n\t"
1746                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1747                 "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1748                 "pxor %%mm7, %%mm7              \n\t"
1749                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"       \n\t"
1750                 "add %%"REG_b", %%"REG_b"       \n\t"
1751                 ".balign 16                     \n\t"
1752                 "1:                             \n\t"
1753                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
1754                 PREFETCH" 64(%1, %%"REG_b")     \n\t"
1755 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1756                 "movq (%0, %%"REG_b"), %%mm0    \n\t"
1757                 "movq (%1, %%"REG_b"), %%mm1    \n\t"
1758                 "movq 6(%0, %%"REG_b"), %%mm2   \n\t"
1759                 "movq 6(%1, %%"REG_b"), %%mm3   \n\t"
1760                 PAVGB(%%mm1, %%mm0)
1761                 PAVGB(%%mm3, %%mm2)
1762                 "movq %%mm0, %%mm1              \n\t"
1763                 "movq %%mm2, %%mm3              \n\t"
1764                 "psrlq $24, %%mm0               \n\t"
1765                 "psrlq $24, %%mm2               \n\t"
1766                 PAVGB(%%mm1, %%mm0)
1767                 PAVGB(%%mm3, %%mm2)
1768                 "punpcklbw %%mm7, %%mm0         \n\t"
1769                 "punpcklbw %%mm7, %%mm2         \n\t"
1770 #else
1771                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
1772                 "movd (%1, %%"REG_b"), %%mm1    \n\t"
1773                 "movd 3(%0, %%"REG_b"), %%mm2   \n\t"
1774                 "movd 3(%1, %%"REG_b"), %%mm3   \n\t"
1775                 "punpcklbw %%mm7, %%mm0         \n\t"
1776                 "punpcklbw %%mm7, %%mm1         \n\t"
1777                 "punpcklbw %%mm7, %%mm2         \n\t"
1778                 "punpcklbw %%mm7, %%mm3         \n\t"
1779                 "paddw %%mm1, %%mm0             \n\t"
1780                 "paddw %%mm3, %%mm2             \n\t"
1781                 "paddw %%mm2, %%mm0             \n\t"
1782                 "movd 6(%0, %%"REG_b"), %%mm4   \n\t"
1783                 "movd 6(%1, %%"REG_b"), %%mm1   \n\t"
1784                 "movd 9(%0, %%"REG_b"), %%mm2   \n\t"
1785                 "movd 9(%1, %%"REG_b"), %%mm3   \n\t"
1786                 "punpcklbw %%mm7, %%mm4         \n\t"
1787                 "punpcklbw %%mm7, %%mm1         \n\t"
1788                 "punpcklbw %%mm7, %%mm2         \n\t"
1789                 "punpcklbw %%mm7, %%mm3         \n\t"
1790                 "paddw %%mm1, %%mm4             \n\t"
1791                 "paddw %%mm3, %%mm2             \n\t"
1792                 "paddw %%mm4, %%mm2             \n\t"
1793                 "psrlw $2, %%mm0                \n\t"
1794                 "psrlw $2, %%mm2                \n\t"
1795 #endif
1796                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1797                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1798                 
1799                 "pmaddwd %%mm0, %%mm1           \n\t"
1800                 "pmaddwd %%mm2, %%mm3           \n\t"
1801                 "pmaddwd %%mm6, %%mm0           \n\t"
1802                 "pmaddwd %%mm6, %%mm2           \n\t"
1803 #ifndef FAST_BGR2YV12
1804                 "psrad $8, %%mm0                \n\t"
1805                 "psrad $8, %%mm1                \n\t"
1806                 "psrad $8, %%mm2                \n\t"
1807                 "psrad $8, %%mm3                \n\t"
1808 #endif
1809                 "packssdw %%mm2, %%mm0          \n\t"
1810                 "packssdw %%mm3, %%mm1          \n\t"
1811                 "pmaddwd %%mm5, %%mm0           \n\t"
1812                 "pmaddwd %%mm5, %%mm1           \n\t"
1813                 "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1814                 "psraw $7, %%mm0                \n\t"
1815
1816 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1817                 "movq 12(%0, %%"REG_b"), %%mm4  \n\t"
1818                 "movq 12(%1, %%"REG_b"), %%mm1  \n\t"
1819                 "movq 18(%0, %%"REG_b"), %%mm2  \n\t"
1820                 "movq 18(%1, %%"REG_b"), %%mm3  \n\t"
1821                 PAVGB(%%mm1, %%mm4)
1822                 PAVGB(%%mm3, %%mm2)
1823                 "movq %%mm4, %%mm1              \n\t"
1824                 "movq %%mm2, %%mm3              \n\t"
1825                 "psrlq $24, %%mm4               \n\t"
1826                 "psrlq $24, %%mm2               \n\t"
1827                 PAVGB(%%mm1, %%mm4)
1828                 PAVGB(%%mm3, %%mm2)
1829                 "punpcklbw %%mm7, %%mm4         \n\t"
1830                 "punpcklbw %%mm7, %%mm2         \n\t"
1831 #else
1832                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
1833                 "movd 12(%1, %%"REG_b"), %%mm1  \n\t"
1834                 "movd 15(%0, %%"REG_b"), %%mm2  \n\t"
1835                 "movd 15(%1, %%"REG_b"), %%mm3  \n\t"
1836                 "punpcklbw %%mm7, %%mm4         \n\t"
1837                 "punpcklbw %%mm7, %%mm1         \n\t"
1838                 "punpcklbw %%mm7, %%mm2         \n\t"
1839                 "punpcklbw %%mm7, %%mm3         \n\t"
1840                 "paddw %%mm1, %%mm4             \n\t"
1841                 "paddw %%mm3, %%mm2             \n\t"
1842                 "paddw %%mm2, %%mm4             \n\t"
1843                 "movd 18(%0, %%"REG_b"), %%mm5  \n\t"
1844                 "movd 18(%1, %%"REG_b"), %%mm1  \n\t"
1845                 "movd 21(%0, %%"REG_b"), %%mm2  \n\t"
1846                 "movd 21(%1, %%"REG_b"), %%mm3  \n\t"
1847                 "punpcklbw %%mm7, %%mm5         \n\t"
1848                 "punpcklbw %%mm7, %%mm1         \n\t"
1849                 "punpcklbw %%mm7, %%mm2         \n\t"
1850                 "punpcklbw %%mm7, %%mm3         \n\t"
1851                 "paddw %%mm1, %%mm5             \n\t"
1852                 "paddw %%mm3, %%mm2             \n\t"
1853                 "paddw %%mm5, %%mm2             \n\t"
1854                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1855                 "psrlw $2, %%mm4                \n\t"
1856                 "psrlw $2, %%mm2                \n\t"
1857 #endif
1858                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1859                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1860                 
1861                 "pmaddwd %%mm4, %%mm1           \n\t"
1862                 "pmaddwd %%mm2, %%mm3           \n\t"
1863                 "pmaddwd %%mm6, %%mm4           \n\t"
1864                 "pmaddwd %%mm6, %%mm2           \n\t"
1865 #ifndef FAST_BGR2YV12
1866                 "psrad $8, %%mm4                \n\t"
1867                 "psrad $8, %%mm1                \n\t"
1868                 "psrad $8, %%mm2                \n\t"
1869                 "psrad $8, %%mm3                \n\t"
1870 #endif
1871                 "packssdw %%mm2, %%mm4          \n\t"
1872                 "packssdw %%mm3, %%mm1          \n\t"
1873                 "pmaddwd %%mm5, %%mm4           \n\t"
1874                 "pmaddwd %%mm5, %%mm1           \n\t"
1875                 "add $24, %%"REG_b"             \n\t"
1876                 "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1877                 "psraw $7, %%mm4                \n\t"
1878                 
1879                 "movq %%mm0, %%mm1              \n\t"
1880                 "punpckldq %%mm4, %%mm0         \n\t"
1881                 "punpckhdq %%mm4, %%mm1         \n\t"
1882                 "packsswb %%mm1, %%mm0          \n\t"
1883                 "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
1884
1885                 "movd %%mm0, (%2, %%"REG_a")    \n\t"
1886                 "punpckhdq %%mm0, %%mm0         \n\t"
1887                 "movd %%mm0, (%3, %%"REG_a")    \n\t"
1888                 "add $4, %%"REG_a"              \n\t"
1889                 " js 1b                         \n\t"
1890                 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1891                 : "%"REG_a, "%"REG_b
1892         );
1893 #else
1894         int i;
1895         for(i=0; i<width; i++)
1896         {
1897                 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1898                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1899                 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1900
1901                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1902                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1903         }
1904 #endif
1905 }
1906
1907 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1908 {
1909         int i;
1910         for(i=0; i<width; i++)
1911         {
1912                 int d= ((uint16_t*)src)[i];
1913                 int b= d&0x1F;
1914                 int g= (d>>5)&0x3F;
1915                 int r= (d>>11)&0x1F;
1916
1917                 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1918         }
1919 }
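
/* Note on bgr16ToY: the channels are still only 5/6/5 bits wide, so doubling
   the R and B coefficients and shifting by RGB2YUV_SHIFT-2 is the same as
   first expanding the channels to 8-bit scale (r*8, g*4, b*8) and then
   applying the usual >>RGB2YUV_SHIFT -- no separate scaling pass is needed. */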
1920
1921 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1922 {
1923         int i;
1924         for(i=0; i<width; i++)
1925         {
1926                 int d0= ((uint32_t*)src1)[i];
1927                 int d1= ((uint32_t*)src2)[i];
1928                 
1929                 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1930                 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1931
1932                 int dh2= (dh>>11) + (dh<<21);
1933                 int d= dh2 + dl;
1934
1935                 int b= d&0x7F;
1936                 int r= (d>>11)&0x7F;
1937                 int g= d>>21;
1938                 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1939                 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1940         }
1941 }
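
/* bgr16ToUV (and bgr15ToUV below) apply the same 2x2 summing idea to packed
   16-bit pixels: each 32-bit load holds two neighbouring pixels, one masked
   copy keeps B and R of the low pixel plus G of the high one, the >>5 copy
   keeps the complementary fields, and the >>11 / <<21 realignment leaves the
   four-pixel sums of B, R and G in disjoint bit ranges that the &0x7F masks
   and shifts then extract.  The RGB2YUV_SHIFT+2-2 folds the /4 average and
   the 5/6-bit to 8-bit expansion into one shift. */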
1942
1943 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1944 {
1945         int i;
1946         for(i=0; i<width; i++)
1947         {
1948                 int d= ((uint16_t*)src)[i];
1949                 int b= d&0x1F;
1950                 int g= (d>>5)&0x1F;
1951                 int r= (d>>10)&0x1F;
1952
1953                 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1954         }
1955 }
1956
1957 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1958 {
1959         int i;
1960         for(i=0; i<width; i++)
1961         {
1962                 int d0= ((uint32_t*)src1)[i];
1963                 int d1= ((uint32_t*)src2)[i];
1964                 
1965                 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1966                 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1967
1968                 int dh2= (dh>>11) + (dh<<21);
1969                 int d= dh2 + dl;
1970
1971                 int b= d&0x7F;
1972                 int r= (d>>10)&0x7F;
1973                 int g= d>>21;
1974                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1975                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1976         }
1977 }
1978
1979
1980 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1981 {
1982         int i;
1983         for(i=0; i<width; i++)
1984         {
1985                 int r=  ((uint32_t*)src)[i]&0xFF;
1986                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1987                 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1988
1989                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1990         }
1991 }
1992
1993 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1994 {
1995         int i;
1996         for(i=0; i<width; i++)
1997         {
1998                 const int a= ((uint32_t*)src1)[2*i+0];
1999                 const int e= ((uint32_t*)src1)[2*i+1];
2000                 const int c= ((uint32_t*)src2)[2*i+0];
2001                 const int d= ((uint32_t*)src2)[2*i+1];
2002                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
2003                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
2004                 const int r=  l&0x3FF;
2005                 const int g=  h>>8;
2006                 const int b=  l>>16;
2007
2008                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2009                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2010         }
2011 }
2012
2013 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2014 {
2015         int i;
2016         for(i=0; i<width; i++)
2017         {
2018                 int r= src[i*3+0];
2019                 int g= src[i*3+1];
2020                 int b= src[i*3+2];
2021
2022                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2023         }
2024 }
2025
2026 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2027 {
2028         int i;
2029         for(i=0; i<width; i++)
2030         {
2031                 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2032                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2033                 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2034
2035                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2036                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2037         }
2038 }
2039
2040
2041 // Bilinear / Bicubic scaling
2042 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2043                                   int16_t *filter, int16_t *filterPos, long filterSize)
2044 {
2045 #ifdef HAVE_MMX
2046         assert(filterSize % 4 == 0 && filterSize>0);
2047         if(filterSize==4) // always true for upscaling, sometimes for downscaling too
2048         {
2049                 long counter= -2*dstW;
2050                 filter-= counter*2;
2051                 filterPos-= counter/2;
2052                 dst-= counter/2;
2053                 asm volatile(
2054                         "pxor %%mm7, %%mm7              \n\t"
2055                         "movq "MANGLE(w02)", %%mm6      \n\t"
2056                         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2057                         "mov %%"REG_a", %%"REG_BP"      \n\t"
2058                         ".balign 16                     \n\t"
2059                         "1:                             \n\t"
2060                         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2061                         "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2062                         "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2063                         "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2064                         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2065                         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2066                         "punpcklbw %%mm7, %%mm0         \n\t"
2067                         "punpcklbw %%mm7, %%mm2         \n\t"
2068                         "pmaddwd %%mm1, %%mm0           \n\t"
2069                         "pmaddwd %%mm2, %%mm3           \n\t"
2070                         "psrad $8, %%mm0                \n\t"
2071                         "psrad $8, %%mm3                \n\t"
2072                         "packssdw %%mm3, %%mm0          \n\t"
2073                         "pmaddwd %%mm6, %%mm0           \n\t"
2074                         "packssdw %%mm0, %%mm0          \n\t"
2075                         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2076                         "add $4, %%"REG_BP"             \n\t"
2077                         " jnc 1b                        \n\t"
2078
2079                         "pop %%"REG_BP"                 \n\t"
2080                         : "+a" (counter)
2081                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2082                         : "%"REG_b
2083                 );
2084         }
2085         else if(filterSize==8)
2086         {
2087                 long counter= -2*dstW;
2088                 filter-= counter*4;
2089                 filterPos-= counter/2;
2090                 dst-= counter/2;
2091                 asm volatile(
2092                         "pxor %%mm7, %%mm7              \n\t"
2093                         "movq "MANGLE(w02)", %%mm6      \n\t"
2094                         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2095                         "mov %%"REG_a", %%"REG_BP"      \n\t"
2096                         ".balign 16                     \n\t"
2097                         "1:                             \n\t"
2098                         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2099                         "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2100                         "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2101                         "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2102                         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2103                         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2104                         "punpcklbw %%mm7, %%mm0         \n\t"
2105                         "punpcklbw %%mm7, %%mm2         \n\t"
2106                         "pmaddwd %%mm1, %%mm0           \n\t"
2107                         "pmaddwd %%mm2, %%mm3           \n\t"
2108
2109                         "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2110                         "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2111                         "movd 4(%3, %%"REG_a"), %%mm4   \n\t"
2112                         "movd 4(%3, %%"REG_b"), %%mm2   \n\t"
2113                         "punpcklbw %%mm7, %%mm4         \n\t"
2114                         "punpcklbw %%mm7, %%mm2         \n\t"
2115                         "pmaddwd %%mm1, %%mm4           \n\t"
2116                         "pmaddwd %%mm2, %%mm5           \n\t"
2117                         "paddd %%mm4, %%mm0             \n\t"
2118                         "paddd %%mm5, %%mm3             \n\t"
2119                                                 
2120                         "psrad $8, %%mm0                \n\t"
2121                         "psrad $8, %%mm3                \n\t"
2122                         "packssdw %%mm3, %%mm0          \n\t"
2123                         "pmaddwd %%mm6, %%mm0           \n\t"
2124                         "packssdw %%mm0, %%mm0          \n\t"
2125                         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2126                         "add $4, %%"REG_BP"             \n\t"
2127                         " jnc 1b                        \n\t"
2128
2129                         "pop %%"REG_BP"                 \n\t"
2130                         : "+a" (counter)
2131                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2132                         : "%"REG_b
2133                 );
2134         }
2135         else
2136         {
2137                 uint8_t *offset = src+filterSize;
2138                 long counter= -2*dstW;
2139 //              filter-= counter*filterSize/2;
2140                 filterPos-= counter/2;
2141                 dst-= counter/2;
2142                 asm volatile(
2143                         "pxor %%mm7, %%mm7              \n\t"
2144                         "movq "MANGLE(w02)", %%mm6      \n\t"
2145                         ".balign 16                     \n\t"
2146                         "1:                             \n\t"
2147                         "mov %2, %%"REG_c"              \n\t"
2148                         "movzwl (%%"REG_c", %0), %%eax  \n\t"
2149                         "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2150                         "mov %5, %%"REG_c"              \n\t"
2151                         "pxor %%mm4, %%mm4              \n\t"
2152                         "pxor %%mm5, %%mm5              \n\t"
2153                         "2:                             \n\t"
2154                         "movq (%1), %%mm1               \n\t"
2155                         "movq (%1, %6), %%mm3           \n\t"
2156                         "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2157                         "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2158                         "punpcklbw %%mm7, %%mm0         \n\t"
2159                         "punpcklbw %%mm7, %%mm2         \n\t"
2160                         "pmaddwd %%mm1, %%mm0           \n\t"
2161                         "pmaddwd %%mm2, %%mm3           \n\t"
2162                         "paddd %%mm3, %%mm5             \n\t"
2163                         "paddd %%mm0, %%mm4             \n\t"
2164                         "add $8, %1                     \n\t"
2165                         "add $4, %%"REG_c"              \n\t"
2166                         "cmp %4, %%"REG_c"              \n\t"
2167                         " jb 2b                         \n\t"
2168                         "add %6, %1                     \n\t"
2169                         "psrad $8, %%mm4                \n\t"
2170                         "psrad $8, %%mm5                \n\t"
2171                         "packssdw %%mm5, %%mm4          \n\t"
2172                         "pmaddwd %%mm6, %%mm4           \n\t"
2173                         "packssdw %%mm4, %%mm4          \n\t"
2174                         "mov %3, %%"REG_a"              \n\t"
2175                         "movd %%mm4, (%%"REG_a", %0)    \n\t"
2176                         "add $4, %0                     \n\t"
2177                         " jnc 1b                        \n\t"
2178
2179                         : "+r" (counter), "+r" (filter)
2180                         : "m" (filterPos), "m" (dst), "m"(offset),
2181                           "m" (src), "r" (filterSize*2)
2182                         : "%"REG_b, "%"REG_a, "%"REG_c
2183                 );
2184         }
2185 #else
2186 #ifdef HAVE_ALTIVEC
2187         hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2188 #else
2189         int i;
2190         for(i=0; i<dstW; i++)
2191         {
2192                 int j;
2193                 int srcPos= filterPos[i];
2194                 int val=0;
2195 //              printf("filterPos: %d\n", filterPos[i]);
2196                 for(j=0; j<filterSize; j++)
2197                 {
2198 //                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2199                         val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2200                 }
2201 //              filter += hFilterSize;
2202                 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2203 //              dst[i] = val>>7;
2204         }
2205 #endif
2206 #endif
2207 }
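/* All hScale() code paths above are intended to compute the same thing as the
   plain C reference loop:
       dst[i] = clip( ( sum over j < filterSize of
                        src[filterPos[i]+j] * filter[filterSize*i+j] ) >> 7,
                      0, (1<<15)-1 );
   i.e. 8-bit source samples are filtered into clipped 15-bit output. */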
2208 // *** horizontal scale Y line to temp buffer
2209 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2210                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2211                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
2212                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2213                                    int32_t *mmx2FilterPos)
2214 {
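    /* packed YUV and RGB/BGR sources are first converted to a plain 8-bit luma
       line in formatConvBuffer, which then replaces src for the scaling below */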
2215     if(srcFormat==IMGFMT_YUY2)
2216     {
2217         RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2218         src= formatConvBuffer;
2219     }
2220     else if(srcFormat==IMGFMT_UYVY)
2221     {
2222         RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2223         src= formatConvBuffer;
2224     }
2225     else if(srcFormat==IMGFMT_BGR32)
2226     {
2227         RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2228         src= formatConvBuffer;
2229     }
2230     else if(srcFormat==IMGFMT_BGR24)
2231     {
2232         RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2233         src= formatConvBuffer;
2234     }
2235     else if(srcFormat==IMGFMT_BGR16)
2236     {
2237         RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2238         src= formatConvBuffer;
2239     }
2240     else if(srcFormat==IMGFMT_BGR15)
2241     {
2242         RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2243         src= formatConvBuffer;
2244     }
2245     else if(srcFormat==IMGFMT_RGB32)
2246     {
2247         RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2248         src= formatConvBuffer;
2249     }
2250     else if(srcFormat==IMGFMT_RGB24)
2251     {
2252         RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2253         src= formatConvBuffer;
2254     }
2255
2256 #ifdef HAVE_MMX
2257         // use the new MMX scaler if MMX2 can't be used (it's faster than the plain x86 asm one)
2258     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2259 #else
2260     if(!(flags&SWS_FAST_BILINEAR))
2261 #endif
2262     {
2263         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2264     }
2265     else // Fast Bilinear upscale / crap downscale
2266     {
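        /* fast bilinear: walk the source with a 16.16 fixed-point position xpos;
           xx = xpos>>16 selects the source sample, xalpha = (xpos&0xFFFF)>>9 is a
           7-bit blend factor, and (roughly, see the plain C fallback below)
               dst[i] = src[xx]*(128-xalpha) + src[xx+1]*xalpha;
           then xpos advances by xInc. */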
2267 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2268 #ifdef HAVE_MMX2
2269         int i;
2270         if(canMMX2BeUsed)
2271         {
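                /* MMX2 path: %4 is funnyYCode, which points to run-time generated
                   scaler code.  Each FUNNY_Y_CODE chunk below calls it once, then
                   advances the source pointer (REG_c) by an offset taken from
                   mmx2FilterPos (REG_b) and the destination (REG_D) by the value
                   left in REG_a; the macro is instantiated eight times. */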
2272                 asm volatile(
2273                         "pxor %%mm7, %%mm7              \n\t"
2274                         "mov %0, %%"REG_c"              \n\t"
2275                         "mov %1, %%"REG_D"              \n\t"
2276                         "mov %2, %%"REG_d"              \n\t"
2277                         "mov %3, %%"REG_b"              \n\t"
2278                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2279                         PREFETCH" (%%"REG_c")           \n\t"
2280                         PREFETCH" 32(%%"REG_c")         \n\t"
2281                         PREFETCH" 64(%%"REG_c")         \n\t"
2282
2283 #ifdef ARCH_X86_64
2284
2285 #define FUNNY_Y_CODE \
2286                         "movl (%%"REG_b"), %%esi        \n\t"\
2287                         "call *%4                       \n\t"\
2288                         "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2289                         "add %%"REG_S", %%"REG_c"       \n\t"\
2290                         "add %%"REG_a", %%"REG_D"       \n\t"\
2291                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2292
2293 #else
2294
2295 #define FUNNY_Y_CODE \
2296                         "movl (%%"REG_b"), %%esi        \n\t"\
2297                         "call *%4                       \n\t"\
2298                         "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2299                         "add %%"REG_a", %%"REG_D"       \n\t"\
2300                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2301
2302 #endif
2303
2304 FUNNY_Y_CODE
2305 FUNNY_Y_CODE
2306 FUNNY_Y_CODE
2307 FUNNY_Y_CODE
2308 FUNNY_Y_CODE
2309 FUNNY_Y_CODE
2310 FUNNY_Y_CODE
2311 FUNNY_Y_CODE
2312
2313                         :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2314                         "m" (funnyYCode)
2315                         : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2316                 );
2317                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2318         }
2319         else
2320         {
2321 #endif
2322         int xInc_shr16 = xInc >> 16;
2323         int xInc_mask = xInc & 0xffff;
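        /* xInc is split into its integer part (xInc_shr16) and its 16-bit
           fractional part (xInc_mask); the loop keeps the running fraction in
           %cx and folds the carry of each "addw" into the source index xx
           via "adc" */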
2324         // no MMX, just plain x86 asm
2325         asm volatile(
2326                 "xor %%"REG_a", %%"REG_a"       \n\t" // i
2327                 "xor %%"REG_b", %%"REG_b"       \n\t" // xx
2328                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2329                 ".balign 16                     \n\t"
2330                 "1:                             \n\t"
2331                 "movzbl  (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2332                 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2333                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2334                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2335                 "shll $16, %%edi                \n\t"
2336                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2337                 "mov %1, %%"REG_D"              \n\t"
2338                 "shrl $9, %%esi                 \n\t"
2339                 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2340                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2341                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2342
2343                 "movzbl (%0, %%"REG_b"), %%edi  \n\t" //src[xx]
2344                 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2345                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2346                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2347                 "shll $16, %%edi                \n\t"
2348                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2349                 "mov %1, %%"REG_D"              \n\t"
2350                 "shrl $9, %%esi                 \n\t"
2351                 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2352                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2353                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2354
2355
2356                 "add $2, %%"REG_a"              \n\t"
2357                 "cmp %2, %%"REG_a"              \n\t"
2358                 " jb 1b                         \n\t"
2359
2360
2361                 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2362                 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2363                 );
2364 #ifdef HAVE_MMX2
2365         } //if MMX2 can't be used
2366 #endif
2367 #else
2368         int i;
2369         unsigned int xpos=0;
2370         for(i=0;i<dstWidth;i++)
2371         {
2372                 register unsigned int xx=xpos>>16;
2373                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2374                 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2375                 xpos+=xInc;
2376         }
2377 #endif
2378     }
2379 }
2380
2381 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2382                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2383                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2384                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2385                                    int32_t *mmx2FilterPos)
2386 {
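    /* packed YUV and RGB/BGR sources are first split into plain 8-bit chroma
       lines (one at formatConvBuffer, the other at formatConvBuffer+2048);
       grayscale input has no chroma, so we return immediately */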
2387     if(srcFormat==IMGFMT_YUY2)
2388     {
2389         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2390         src1= formatConvBuffer;
2391         src2= formatConvBuffer+2048;
2392     }
2393     else if(srcFormat==IMGFMT_UYVY)
2394     {
2395         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2396         src1= formatConvBuffer;
2397         src2= formatConvBuffer+2048;
2398     }
2399     else if(srcFormat==IMGFMT_BGR32)
2400     {
2401         RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2402         src1= formatConvBuffer;
2403         src2= formatConvBuffer+2048;
2404     }
2405     else if(srcFormat==IMGFMT_BGR24)
2406     {
2407         RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2408         src1= formatConvBuffer;
2409         src2= formatConvBuffer+2048;
2410     }
2411     else if(srcFormat==IMGFMT_BGR16)
2412     {
2413         RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2414         src1= formatConvBuffer;
2415         src2= formatConvBuffer+2048;
2416     }
2417     else if(srcFormat==IMGFMT_BGR15)
2418     {
2419         RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2420         src1= formatConvBuffer;
2421         src2= formatConvBuffer+2048;
2422     }
2423     else if(srcFormat==IMGFMT_RGB32)
2424     {
2425         RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2426         src1= formatConvBuffer;
2427         src2= formatConvBuffer+2048;
2428     }
2429     else if(srcFormat==IMGFMT_RGB24)
2430     {
2431         RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2432         src1= formatConvBuffer;
2433         src2= formatConvBuffer+2048;
2434     }
2435     else if(isGray(srcFormat))
2436     {
2437         return;
2438     }
2439
2440 #ifdef HAVE_MMX
2441         // use the new MMX scaler if MMX2 can't be used (it's faster than the plain x86 asm one)
2442     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2443 #else
2444     if(!(flags&SWS_FAST_BILINEAR))
2445 #endif
2446     {
2447         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2448         RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2449     }
2450     else // Fast Bilinear upscale / crap downscale
2451     {
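        /* same fast-bilinear scheme as in hyscale(), applied to both chroma
           planes: results for src1 go to dst[0..dstWidth), results for src2 go
           to dst+2048 (4096 bytes into the int16_t buffer) */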
2452 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2453 #ifdef HAVE_MMX2
2454         int i;
2455         if(canMMX2BeUsed)
2456         {
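                /* MMX2 path: as in hyscale(), but funnyUVCode is invoked in two
                   groups of four chunks; the first group scales src1, then REG_D
                   is advanced by 4096 bytes and the second group scales src2 into
                   the upper half of dst */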
2457                 asm volatile(
2458                         "pxor %%mm7, %%mm7              \n\t"
2459                         "mov %0, %%"REG_c"              \n\t"
2460                         "mov %1, %%"REG_D"              \n\t"
2461                         "mov %2, %%"REG_d"              \n\t"
2462                         "mov %3, %%"REG_b"              \n\t"
2463                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2464                         PREFETCH" (%%"REG_c")           \n\t"
2465                         PREFETCH" 32(%%"REG_c")         \n\t"
2466                         PREFETCH" 64(%%"REG_c")         \n\t"
2467
2468 #ifdef ARCH_X86_64
2469
2470 #define FUNNY_UV_CODE \
2471                         "movl (%%"REG_b"), %%esi        \n\t"\
2472                         "call *%4                       \n\t"\
2473                         "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2474                         "add %%"REG_S", %%"REG_c"       \n\t"\
2475                         "add %%"REG_a", %%"REG_D"       \n\t"\
2476                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2477
2478 #else
2479
2480 #define FUNNY_UV_CODE \
2481                         "movl (%%"REG_b"), %%esi        \n\t"\
2482                         "call *%4                       \n\t"\
2483                         "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2484                         "add %%"REG_a", %%"REG_D"       \n\t"\
2485                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2486
2487 #endif
2488
2489 FUNNY_UV_CODE
2490 FUNNY_UV_CODE
2491 FUNNY_UV_CODE
2492 FUNNY_UV_CODE
2493                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2494                         "mov %5, %%"REG_c"              \n\t" // src
2495                         "mov %1, %%"REG_D"              \n\t" // buf1
2496                         "add $4096, %%"REG_D"           \n\t"
2497                         PREFETCH" (%%"REG_c")           \n\t"
2498                         PREFETCH" 32(%%"REG_c")         \n\t"
2499                         PREFETCH" 64(%%"REG_c")         \n\t"
2500
2501 FUNNY_UV_CODE
2502 FUNNY_UV_CODE
2503 FUNNY_UV_CODE
2504 FUNNY_UV_CODE
2505
2506                         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2507                         "m" (funnyUVCode), "m" (src2)
2508                         : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2509                 );
2510                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2511                 {
2512 //                      printf("%d %d %d\n", dstWidth, i, srcW);
2513                         dst[i] = src1[srcW-1]*128;
2514                         dst[i+2048] = src2[srcW-1]*128;
2515                 }
2516         }
2517         else
2518         {
2519 #endif
2520         long xInc_shr16 = (long) (xInc >> 16);
2521         int xInc_mask = xInc & 0xffff; 
2522         asm volatile(
2523                 "xor %%"REG_a", %%"REG_a"       \n\t" // i
2524                 "xor %%"REG_b", %%"REG_b"               \n\t" // xx
2525                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2526                 ".balign 16                     \n\t"
2527                 "1:                             \n\t"
2528                 "mov %0, %%"REG_S"              \n\t"
2529                 "movzbl  (%%"REG_S", %%"REG_b"), %%edi  \n\t" //src[xx]
2530                 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi  \n\t" //src[xx+1]
2531                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2532                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2533                 "shll $16, %%edi                \n\t"
2534                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2535                 "mov %1, %%"REG_D"              \n\t"
2536                 "shrl $9, %%esi                 \n\t"
2537                 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2538
2539                 "movzbl  (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2540                 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2541                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2542                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2543                 "shll $16, %%edi                \n\t"
2544                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2545                 "mov %1, %%"REG_D"              \n\t"
2546                 "shrl $9, %%esi                 \n\t"
2547                 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2548
2549                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2550                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2551                 "add $1, %%"REG_a"              \n\t"
2552                 "cmp %2, %%"REG_a"              \n\t"
2553                 " jb 1b                         \n\t"
2554
2555 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2556    which is needed to support GCC-4.0 */
2557 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2558                 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2559 #else
2560                 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2561 #endif
2562                 "r" (src2)
2563                 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2564                 );
2565 #ifdef HAVE_MMX2
2566         } //if MMX2 can't be used
2567 #endif
2568 #else
2569         int i;
2570         unsigned int xpos=0;
2571         for(i=0;i<dstWidth;i++)
2572         {
2573                 register unsigned int xx=xpos>>16;
2574                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2575                 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2576                 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2577 /* slower
2578           dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2579           dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2580 */
2581                 xpos+=xInc;
2582         }
2583 #endif
2584    }
2585 }
2586
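/* Scale and convert one horizontal slice of the source picture (srcSliceH lines
   starting at line srcSliceY) into dst; returns the number of output lines
   written for this slice. */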
2587 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2588              int srcSliceH, uint8_t* dst[], int dstStride[]){
2589
2590         /* load a few things into local vars to make the code more readable and faster */
2591         const int srcW= c->srcW;
2592         const int dstW= c->dstW;
2593         const int dstH= c->dstH;
2594         const int chrDstW= c->chrDstW;
2595         const int chrSrcW= c->chrSrcW;
2596         const int lumXInc= c->lumXInc;
2597         const int chrXInc= c->chrXInc;
2598         const int dstFormat= c->dstFormat;
2599         const int srcFormat= c->srcFormat;
2600         const int flags= c->flags;
2601         const int canMMX2BeUsed= c->canMMX2BeUsed;
2602         int16_t *vLumFilterPos= c->vLumFilterPos;
2603         int16_t *vChrFilterPos= c->vChrFilterPos;
2604         int16_t *hLumFilterPos= c->hLumFilterPos;
2605         int16_t *hChrFilterPos= c->hChrFilterPos;
2606         int16_t *vLumFilter= c->vLumFilter;
2607         int16_t *vChrFilter= c->vChrFilter;
2608         int16_t *hLumFilter= c->hLumFilter;
2609         int16_t *hChrFilter= c->hChrFilter;
2610         int32_t *lumMmxFilter= c->lumMmxFilter;
2611         int32_t *chrMmxFilter= c->chrMmxFilter;
2612         const int vLumFilterSize= c->vLumFilterSize;
2613         const int vChrFilterSize= c->vChrFilterSize;
2614         const int hLumFilterSize= c->hLumFilterSize;
2615         const int hChrFilterSize= c->hChrFilterSize;
2616         int16_t **lumPixBuf= c->lumPixBuf;
2617         int16_t **chrPixBuf= c->chrPixBuf;
2618         const int vLumBufSize= c->vLumBufSize;
2619         const int vChrBufSize= c->vChrBufSize;
2620         uint8_t *funnyYCode= c->funnyYCode;
2621         uint8_t *funnyUVCode= c->funnyUVCode;
2622         uint8_t *formatConvBuffer= c->formatConvBuffer;
2623         const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2624         const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); // rounds srcSliceH>>chrSrcVSubSample up instead of down
2625         int lastDstY;
2626
2627         /* vars which will change and which we need to store back in the context */
2628         int dstY= c->dstY;
2629         int lumBufIndex= c->lumBufIndex;
2630         int chrBufIndex= c->chrBufIndex;
2631         int lastInLumBuf= c->lastInLumBuf;
2632         int lastInChrBuf= c->lastInChrBuf;
2633         
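        /* packed source formats carry all their data in plane 0; mirror its
           pointer and stride into planes 1 and 2 so the per-plane code below
           works unchanged */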
2634         if(isPacked(c->srcFormat)){
2635                 src[0]=
2636                 src[1]=
2637                 src[2]= src[0];
2638                 srcStride[0]=
2639                 srcStride[1]=
2640                 srcStride[2]= srcStride[0];
2641         }
2642         srcStride[1]<<= c->vChrDrop;
2643         srcStride[2]<<= c->vChrDrop;
2644
2645 //      printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2646 //              (int)dst[0], (int)dst[1], (int)dst[2]);
2647
2648 #if 0 //self test FIXME move to a vfilter or something
2649 {
2650 static volatile int i=0;
2651 i++;
2652 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2653         selfTest(src, srcStride, c->srcW, c->srcH);
2654 i--;
2655 }
2656 #endif
2657
2658 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2659 //dstStride[0],dstStride[1],dstStride[2]);
2660
2661         if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2662         {
2663                 static int firstTime=1; //FIXME move this into the context perhaps
2664                 if(flags & SWS_PRINT_INFO && firstTime)
2665                 {
2666                         MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2667                                         "SwScaler:          ->cannot do aligned memory accesses anymore\n");
2668                         firstTime=0;
2669                 }
2670         }
2671
2672         /* Note: the user might start scaling in the middle of the picture, so this will not get
2673            executed. That is not really intended, but it currently works, so people might do it. */
2674         if(srcSliceY ==0){
2675                 lumBufIndex=0;
2676                 chrBufIndex=0;
2677                 dstY=0; 
2678                 lastInLumBuf= -1;
2679                 lastInChrBuf= -1;
2680         }
2681
2682         lastDstY= dstY;
2683
2684         for(;dstY < dstH; dstY++){
2685                 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2686                 const int chrDstY= dstY>>c->chrDstVSubSample;
2687                 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2688                 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2689
2690                 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2691                 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2692                 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2693                 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2694
2695 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2696 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
2697                 //handle holes (FAST_BILINEAR & weird filters)
2698                 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2699                 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2700 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2701                 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2702                 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2703
2704                 // Do we have enough lines in this slice to output the dstY line?
2705                 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2706                 {
2707                         //Do horizontal scaling
2708                         while(lastInLumBuf < lastLumSrcY)
2709                         {
2710                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2711                                 lumBufIndex++;
2712 //                              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
2713                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2714                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2715                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2716 //                              printf("%d %d\n", lumBufIndex, vLumBufSize);
2717                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2718                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2719                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2720                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2721                                 lastInLumBuf++;
2722                         }
2723                         while(lastInChrBuf < lastChrSrcY)
2724                         {
2725                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2726                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2727                                 chrBufIndex++;
2728                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2729                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2730                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2731                                 //FIXME pass at least some of these parameters through the context struct
2732
2733                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2734                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2735                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2736                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2737                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2738                                 lastInChrBuf++;
2739                         }
2740                         //wrap buf index around to stay inside the ring buffer
2741                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2742                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2743                 }
2744                 else // not enough lines left in this slice -> load the rest in the buffer
2745                 {
2746 /*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2747                         firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2748                         lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2749                         vChrBufSize, vLumBufSize);*/
2750
2751                         //Do horizontal scaling
2752                         while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2753                         {
2754                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2755                                 lumBufIndex++;
2756                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2757                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2758                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2759                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2760                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2761                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2762                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2763                                 lastInLumBuf++;
2764                         }
2765                         while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2766                         {
2767                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2768                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2769                                 chrBufIndex++;
2770                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2771                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2772                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2773
2774                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2775                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2776                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2777                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2778                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2779                                 lastInChrBuf++;
2780                         }
2781                         //wrap buf index around to stay inside the ring buffer
2782                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2783                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2784                         break; //we can't output a dstY line so let's try with the next slice
2785                 }
2786
2787 #ifdef HAVE_MMX
2788                 b5Dither= dither8[dstY&1];
2789                 g6Dither= dither4[dstY&1];
2790                 g5Dither= dither8[dstY&1];
2791                 r5Dither= dither8[(dstY+1)&1];
2792 #endif
2793             if(dstY < dstH-2)
2794             {
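                /* lumSrcPtr/chrSrcPtr address the first input line needed for
                   this output line within the line-pointer ring buffers; the
                   pointer arrays hold each entry twice (note the vLumBufSize*2 /
                   vChrBufSize*2 bounds in the ASSERTs below), so a filter window
                   never has to wrap around. */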
2795                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2796                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2797 #ifdef HAVE_MMX
2798                 int i;
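                /* build the 16-byte records read by the MMX vertical scalers:
                   dword 0 holds the source line pointer, dwords 2 and 3 hold the
                   16-bit filter coefficient replicated into both halves */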
2799                 for(i=0; i<vLumFilterSize; i++)
2800                 {
2801                         lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2802                         lumMmxFilter[4*i+2]= 
2803                         lumMmxFilter[4*i+3]= 
2804                                 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2805                 }
2806                 for(i=0; i<vChrFilterSize; i++)
2807                 {
2808                         chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2809                         chrMmxFilter[4*i+2]= 
2810                         chrMmxFilter[4*i+3]= 
2811                                 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2812                 }
2813 #endif
2814                 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2815                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2816                         if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2817                         RENAME(yuv2nv12X)(c,
2818                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2819                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2820                                 dest, uDest, dstW, chrDstW, dstFormat);
2821                 }
2822                 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2823                 {
2824                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2825                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2826                         if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2827                         {
2828                                 int16_t *lumBuf = lumPixBuf[0];
2829                                 int16_t *chrBuf= chrPixBuf[0];
2830                                 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2831                         }
2832                         else //General YV12
2833                         {
2834                                 RENAME(yuv2yuvX)(c,
2835                                         vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2836                                         vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2837                                         dest, uDest, vDest, dstW, chrDstW);
2838                         }
2839                 }
2840                 else
2841                 {
2842                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2843                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2844                         if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2845                         {
2846                                 int chrAlpha= vChrFilter[2*dstY+1];
2847                                 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2848                                                  dest, dstW, chrAlpha, dstFormat, flags, dstY);
2849                         }
2850                         else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2851                         {
2852                                 int lumAlpha= vLumFilter[2*dstY+1];
2853                                 int chrAlpha= vChrFilter[2*dstY+1];
2854                                 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2855                                                  dest, dstW, lumAlpha, chrAlpha, dstY);
2856                         }
2857                         else //General RGB
2858                         {
2859                                 RENAME(yuv2packedX)(c,
2860                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2861                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2862                                         dest, dstW, dstY);
2863                         }
2864                 }
2865             }
2866             else // hmm looks like we can't use MMX here without overwriting this array's tail
2867             {
2868                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2869                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2870                 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2871                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2872                         if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2873                         yuv2nv12XinC(
2874                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2875                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2876                                 dest, uDest, dstW, chrDstW, dstFormat);
2877                 }
2878                 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2879                 {
2880                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2881                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2882                         yuv2yuvXinC(
2883                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2884                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2885                                 dest, uDest, vDest, dstW, chrDstW);
2886                 }
2887                 else
2888                 {
2889                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2890                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2891                         yuv2packedXinC(c, 
2892                                 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2893                                 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2894                                 dest, dstW, dstY);
2895                 }
2896             }
2897         }
2898
2899 #ifdef HAVE_MMX
2900         __asm __volatile(SFENCE:::"memory");
2901         __asm __volatile(EMMS:::"memory");
2902 #endif
2903         /* store changed local vars back in the context */
2904         c->dstY= dstY;
2905         c->lumBufIndex= lumBufIndex;
2906         c->chrBufIndex= chrBufIndex;
2907         c->lastInLumBuf= lastInLumBuf;
2908         c->lastInChrBuf= lastInChrBuf;
2909
2910         return dstY - lastDstY; // number of output lines written for this slice
2911 }