]> git.sesse.net Git - ffmpeg/blob - postproc/swscale_template.c
remove unused variables patch by (Eric Sesterhenn <SnakeByte at gmx dot de>)
[ffmpeg] / postproc / swscale_template.c
1 /*
2     Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
/* Drop any earlier definitions of the helper macros before they are redefined
   below (presumably this template is included more than once with different
   HAVE_* settings -- confirm against the including file). */
19 #undef MOVNTQ
20 #undef PAVGB
21 #undef PREFETCH
22 #undef PREFETCHW
23 #undef EMMS
24 #undef SFENCE
25
/* EMMS: mnemonic used to leave MMX state. */
26 #ifdef HAVE_3DNOW
27 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
28 #define EMMS     "femms"
29 #else
30 #define EMMS     "emms"
31 #endif
32
/* PREFETCH / PREFETCHW: prefetch mnemonics for the 3DNow and MMX2 variants.
   Without either, both expand to a string starting with a slash.
   NOTE(review): this appears to rely on the assembler treating a leading
   slash as the start of a comment -- confirm for the target toolchain. */
33 #ifdef HAVE_3DNOW
34 #define PREFETCH  "prefetch"
35 #define PREFETCHW "prefetchw"
36 #elif defined ( HAVE_MMX2 )
37 #define PREFETCH "prefetchnta"
38 #define PREFETCHW "prefetcht0"
39 #else
40 #define PREFETCH "/nop"
41 #define PREFETCHW "/nop"
42 #endif
43
/* SFENCE: store fence, only emitted for MMX2; otherwise the same slash-nop
   placeholder as above. */
44 #ifdef HAVE_MMX2
45 #define SFENCE "sfence"
46 #else
47 #define SFENCE "/nop"
48 #endif
49
/* PAVGB(a,b): packed byte average, a complete instruction line. Left
   undefined when neither HAVE_MMX2 nor HAVE_3DNOW is set. */
50 #ifdef HAVE_MMX2
51 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
52 #elif defined (HAVE_3DNOW)
53 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
54 #endif
55
/* MOVNTQ(a,b): 64-bit store of a to b, a complete instruction line. Uses
   movntq with MMX2 and falls back to a plain movq otherwise. */
56 #ifdef HAVE_MMX2
57 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
58 #else
59 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
60 #endif
61
/*
 * YSCALEYUV2YV12X(x): multi-tap filter that produces 8 output bytes per outer
 * iteration. x is a byte displacement pasted in front of both source loads.
 * Operand roles as used by the code below (NOTE(review): the asm statement
 * that supplies them is outside this excerpt -- confirm there):
 *   %0  initial tap counter, incremented until it reaches zero
 *   %1  base of a table of 4-byte source pointers, indexed by the counter
 *   %2  base of a table of 8-byte coefficient quads, indexed by the counter
 *   %3  destination base, indexed by eax
 *   %4  end value that eax is compared against (unsigned)
 * Label 1 serves both loops: the inner jnz falls through once edx reaches
 * zero, and the final jb uses the flags of cmpl, which the pxor/movl in
 * between do not modify.
 */
62 #define YSCALEYUV2YV12X(x) \
63                         "xorl %%eax, %%eax              \n\t" /* eax = output index */\
64                         "pxor %%mm3, %%mm3              \n\t" /* clear accumulator, samples 0..3 */\
65                         "pxor %%mm4, %%mm4              \n\t" /* clear accumulator, samples 4..7 */\
66                         "movl %0, %%edx                 \n\t" /* edx = tap counter */\
67                         ".balign 16                     \n\t" /* FIXME Unroll? */\
68                         "1:                             \n\t"\
69                         "movl (%1, %%edx, 4), %%esi     \n\t" /* esi = source pointer for this tap */\
70                         "movq (%2, %%edx, 8), %%mm0     \n\t" /* filterCoeff */\
71                         "movq " #x "(%%esi, %%eax, 2), %%mm2    \n\t" /* srcData */\
72                         "movq 8+" #x "(%%esi, %%eax, 2), %%mm5  \n\t" /* srcData */\
73                         "pmulhw %%mm0, %%mm2            \n\t"\
74                         "pmulhw %%mm0, %%mm5            \n\t"\
75                         "paddw %%mm2, %%mm3             \n\t"\
76                         "paddw %%mm5, %%mm4             \n\t"\
77                         "addl $1, %%edx                 \n\t"\
78                         " jnz 1b                        \n\t" /* next tap until edx == 0 */\
79                         "psraw $3, %%mm3                \n\t"\
80                         "psraw $3, %%mm4                \n\t"\
81                         "packuswb %%mm4, %%mm3          \n\t" /* saturate 8 words to 8 unsigned bytes */\
82                         MOVNTQ(%%mm3, (%3, %%eax))\
83                         "addl $8, %%eax                 \n\t"\
84                         "cmpl %4, %%eax                 \n\t"\
85                         "pxor %%mm3, %%mm3              \n\t" /* reset state for the next 8 outputs */\
86                         "pxor %%mm4, %%mm4              \n\t"\
87                         "movl %0, %%edx                 \n\t"\
88                         "jb 1b                          \n\t" /* flags still those of cmpl */
89
/*
 * YSCALEYUV2YV121: converts 16-bit samples to bytes without filtering; each
 * iteration shifts 8 words right by 7, saturates them to unsigned bytes and
 * stores 8 bytes.
 * Operand roles as used below (NOTE(review): supplied by an asm statement
 * outside this excerpt -- confirm there):
 *   %0  source base, indexed by eax*2
 *   %1  destination base, indexed by eax
 *   %2  initial value of eax
 * The loop repeats until addl $8 produces a carry, so eax presumably starts
 * negative and counts up to zero.
 */
90 #define YSCALEYUV2YV121 \
91                         "movl %2, %%eax                 \n\t"\
92                         ".balign 16                     \n\t" /* FIXME Unroll? */\
93                         "1:                             \n\t"\
94                         "movq (%0, %%eax, 2), %%mm0     \n\t" /* samples 0..3 */\
95                         "movq 8(%0, %%eax, 2), %%mm1    \n\t" /* samples 4..7 */\
96                         "psraw $7, %%mm0                \n\t"\
97                         "psraw $7, %%mm1                \n\t"\
98                         "packuswb %%mm1, %%mm0          \n\t"\
99                         MOVNTQ(%%mm0, (%1, %%eax))\
100                         "addl $8, %%eax                 \n\t"\
101                         "jnc 1b                         \n\t" /* loop until the add carries */
102
103 /*
104                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
105                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
106                            "r" (dest), "m" (dstW),
107                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
108                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
109 */
/*
 * YSCALEYUV2RGBX: multi-tap filtering of chroma and luma followed by the
 * YUV -> RGB arithmetic for 8 pixels per iteration of loop 1.
 * The operand list is given in the commented-out constraint block directly
 * above. Each inner loop (label 2) accumulates coefficient * sample products
 * until its counter in edx reaches zero.
 * On exit from the macro text: mm2 = B, mm4 = G, mm5 = R as packed unsigned
 * bytes and mm7 = 0. Loop 1 is left open here; the WRITEBGR* macros contain
 * the matching "jb 1b" (NOTE(review): pairing is done by the caller, which is
 * outside this excerpt).
 */
110 #define YSCALEYUV2RGBX \
111                 "xorl %%eax, %%eax              \n\t"\
112                 ".balign 16                     \n\t"\
113                 "1:                             \n\t"\
114                 "movl %1, %%edx                 \n\t" /* -chrFilterSize */\
115                 "movl %3, %%ebx                 \n\t" /* chrMmxFilter+chrFilterSize */\
116                 "movl %7, %%ecx                 \n\t" /* chrSrc+chrFilterSize */\
117                 "pxor %%mm3, %%mm3              \n\t" /* U accumulator */\
118                 "pxor %%mm4, %%mm4              \n\t" /* V accumulator */\
119                 "2:                             \n\t"\
120                 "movl (%%ecx, %%edx, 4), %%esi  \n\t"\
121                 "movq (%%ebx, %%edx, 8), %%mm0  \n\t" /* filterCoeff */\
122                 "movq (%%esi, %%eax), %%mm2     \n\t" /* UsrcData */\
123                 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
124                 "pmulhw %%mm0, %%mm2            \n\t"\
125                 "pmulhw %%mm0, %%mm5            \n\t"\
126                 "paddw %%mm2, %%mm3             \n\t"\
127                 "paddw %%mm5, %%mm4             \n\t"\
128                 "addl $1, %%edx                 \n\t"\
129                 " jnz 2b                        \n\t"\
130 \
131                 "movl %0, %%edx                 \n\t" /* -lumFilterSize */\
132                 "movl %2, %%ebx                 \n\t" /* lumMmxFilter+lumFilterSize */\
133                 "movl %6, %%ecx                 \n\t" /* lumSrc+lumFilterSize */\
134                 "pxor %%mm1, %%mm1              \n\t" /* Y1 accumulator */\
135                 "pxor %%mm7, %%mm7              \n\t" /* Y2 accumulator */\
136                 "2:                             \n\t"\
137                 "movl (%%ecx, %%edx, 4), %%esi  \n\t"\
138                 "movq (%%ebx, %%edx, 8), %%mm0  \n\t" /* filterCoeff */\
139                 "movq (%%esi, %%eax, 2), %%mm2  \n\t" /* Y1srcData */\
140                 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
141                 "pmulhw %%mm0, %%mm2            \n\t"\
142                 "pmulhw %%mm0, %%mm5            \n\t"\
143                 "paddw %%mm2, %%mm1             \n\t"\
144                 "paddw %%mm5, %%mm7             \n\t"\
145                 "addl $1, %%edx                 \n\t"\
146                 " jnz 2b                        \n\t"\
147 \
148                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* (U-128)8*/\
149                 "psubw "MANGLE(w400)", %%mm4    \n\t" /* (V-128)8*/\
150                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
151                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
152                 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
153                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
154         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
155                 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
156                 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
157                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
158                 "psubw "MANGLE(w80)", %%mm7     \n\t" /* 8(Y-16)*/\
159                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
160                 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
161         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
162                 "paddw %%mm3, %%mm4             \n\t" /* ug + vg */\
163                 "movq %%mm2, %%mm0              \n\t"\
164                 "movq %%mm5, %%mm6              \n\t"\
165                 "movq %%mm4, %%mm3              \n\t"\
166                 "punpcklwd %%mm2, %%mm2         \n\t" /* duplicate each low chroma term for 2 pixels */\
167                 "punpcklwd %%mm5, %%mm5         \n\t"\
168                 "punpcklwd %%mm4, %%mm4         \n\t"\
169                 "paddw %%mm1, %%mm2             \n\t"\
170                 "paddw %%mm1, %%mm5             \n\t"\
171                 "paddw %%mm1, %%mm4             \n\t"\
172                 "punpckhwd %%mm0, %%mm0         \n\t" /* duplicate each high chroma term for 2 pixels */\
173                 "punpckhwd %%mm6, %%mm6         \n\t"\
174                 "punpckhwd %%mm3, %%mm3         \n\t"\
175                 "paddw %%mm7, %%mm0             \n\t"\
176                 "paddw %%mm7, %%mm6             \n\t"\
177                 "paddw %%mm7, %%mm3             \n\t"\
178                 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
179                 "packuswb %%mm0, %%mm2          \n\t"\
180                 "packuswb %%mm6, %%mm5          \n\t"\
181                 "packuswb %%mm3, %%mm4          \n\t"\
182                 "pxor %%mm7, %%mm7              \n\t"
183
/*
 * FULL_YSCALEYUV2RGB: two-line blend of luma and chroma followed by the
 * YUV -> RGB arithmetic for 4 pixels per iteration; chroma is indexed with
 * the same eax*2 stride as luma.
 * Operand roles, following the comments inside the macro (NOTE(review): the
 * asm statement supplying them is outside this excerpt):
 *   %0 buf0, %1 buf1, %2 uvbuf0, %3 uvbuf1, %6 yalpha1, %7 uvalpha1
 * The blend is computed as (line0 - line1) * alpha >> 16 plus line1 >> 4.
 * On exit from the macro text: mm3 = B, mm0 = R, mm1 = G (each packed against
 * itself with packuswb) and mm7 = 0. Loop 1 is left open for the code that
 * follows the macro.
 */
184 #define FULL_YSCALEYUV2RGB \
185                 "pxor %%mm7, %%mm7              \n\t"\
186                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
187                 "punpcklwd %%mm6, %%mm6         \n\t" /* broadcast to all 4 words */\
188                 "punpcklwd %%mm6, %%mm6         \n\t"\
189                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
190                 "punpcklwd %%mm5, %%mm5         \n\t" /* broadcast to all 4 words */\
191                 "punpcklwd %%mm5, %%mm5         \n\t"\
192                 "xorl %%eax, %%eax              \n\t"\
193                 ".balign 16                     \n\t"\
194                 "1:                             \n\t"\
195                 "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
196                 "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
197                 "movq (%2, %%eax,2), %%mm2      \n\t" /* uvbuf0[eax]*/\
198                 "movq (%3, %%eax,2), %%mm3      \n\t" /* uvbuf1[eax]*/\
199                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
200                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
201                 "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
202                 "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
203                 "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
204                 "movq 4096(%2, %%eax,2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
205                 "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
206                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
207                 "movq 4096(%3, %%eax,2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
208                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
209                 "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
210                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
211                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
212                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
213 \
214 \
215                 "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
216                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
217                 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
218                 "psraw $4, %%mm0                \n\t" /* uvbuf1[eax+2048] >>4*/\
219                 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
220                 "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
221                 "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
222 \
223 \
224                 "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
225                 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
226                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
227                 "paddw %%mm1, %%mm3             \n\t" /* B*/\
228                 "paddw %%mm1, %%mm0             \n\t" /* R*/\
229                 "packuswb %%mm3, %%mm3          \n\t"\
230 \
231                 "packuswb %%mm0, %%mm0          \n\t"\
232                 "paddw %%mm4, %%mm2             \n\t" /* ug + vg */\
233                 "paddw %%mm2, %%mm1             \n\t" /* G*/\
234 \
235                 "packuswb %%mm1, %%mm1          \n\t"
236
/*
 * YSCALEYUV2RGB: two-line blend of luma and chroma followed by the
 * YUV -> RGB arithmetic for 8 pixels per iteration; chroma is indexed by eax
 * and luma by eax*2.
 * Operand roles, following the comments inside the macro (NOTE(review): the
 * asm statement supplying them is outside this excerpt):
 *   %0 buf0, %1 buf1, %2 uvbuf0, %3 uvbuf1, %6 yalpha1, %7 uvalpha1
 * The broadcast blend factors are written to memory at 3968(%2) and 3976(%2)
 * and re-read inside the loop, so the buffer behind %2 must be writable at
 * those offsets (presumably done because every MMX register is in use in the
 * loop body).
 * On exit from the macro text: mm2 = B, mm4 = G, mm5 = R as packed unsigned
 * bytes and mm7 = 0. Loop 1 is left open; the WRITEBGR* macros contain the
 * matching "jb 1b".
 */
237 #define YSCALEYUV2RGB \
238                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
239                 "punpcklwd %%mm6, %%mm6         \n\t"\
240                 "punpcklwd %%mm6, %%mm6         \n\t"\
241                 "movq %%mm6, 3968(%2)           \n\t" /* spill broadcast yalpha1 */\
242                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
243                 "punpcklwd %%mm5, %%mm5         \n\t"\
244                 "punpcklwd %%mm5, %%mm5         \n\t"\
245                 "movq %%mm5, 3976(%2)           \n\t" /* spill broadcast uvalpha1 */\
246                 "xorl %%eax, %%eax              \n\t"\
247                 ".balign 16                     \n\t"\
248                 "1:                             \n\t"\
249                 "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
250                 "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
251                 "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
252                 "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
253                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
254                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
255                 "movq 3976(%2), %%mm0           \n\t" /* reload uvalpha1 */\
256                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
257                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
258                 "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
259                 "psraw $4, %%mm4                \n\t" /* uvbuf1[eax+2048] >>4*/\
260                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
261                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
262                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* (U-128)8*/\
263                 "psubw "MANGLE(w400)", %%mm4    \n\t" /* (V-128)8*/\
264                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
265                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
266                 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
267                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
268         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
269                 "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
270                 "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
271                 "movq 8(%0, %%eax, 2), %%mm6    \n\t" /*buf0[eax+4]*/\
272                 "movq 8(%1, %%eax, 2), %%mm7    \n\t" /*buf1[eax+4]*/\
273                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
274                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
275                 "pmulhw 3968(%2), %%mm0         \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
276                 "pmulhw 3968(%2), %%mm6         \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
277                 "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
278                 "psraw $4, %%mm7                \n\t" /* buf1[eax+4] >>4*/\
279                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
280                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\
281                 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
282                 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
283                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
284                 "psubw "MANGLE(w80)", %%mm7     \n\t" /* 8(Y-16)*/\
285                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
286                 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
287         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
288                 "paddw %%mm3, %%mm4             \n\t"\
289                 "movq %%mm2, %%mm0              \n\t"\
290                 "movq %%mm5, %%mm6              \n\t"\
291                 "movq %%mm4, %%mm3              \n\t"\
292                 "punpcklwd %%mm2, %%mm2         \n\t"\
293                 "punpcklwd %%mm5, %%mm5         \n\t"\
294                 "punpcklwd %%mm4, %%mm4         \n\t"\
295                 "paddw %%mm1, %%mm2             \n\t"\
296                 "paddw %%mm1, %%mm5             \n\t"\
297                 "paddw %%mm1, %%mm4             \n\t"\
298                 "punpckhwd %%mm0, %%mm0         \n\t"\
299                 "punpckhwd %%mm6, %%mm6         \n\t"\
300                 "punpckhwd %%mm3, %%mm3         \n\t"\
301                 "paddw %%mm7, %%mm0             \n\t"\
302                 "paddw %%mm7, %%mm6             \n\t"\
303                 "paddw %%mm7, %%mm3             \n\t"\
304                 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
305                 "packuswb %%mm0, %%mm2          \n\t"\
306                 "packuswb %%mm6, %%mm5          \n\t"\
307                 "packuswb %%mm3, %%mm4          \n\t"\
308                 "pxor %%mm7, %%mm7              \n\t"
309
/*
 * YSCALEYUV2RGB1: single-line variant of the YUV -> RGB arithmetic for 8
 * pixels per iteration. Only %0 (luma, labelled buf0 in the comments) and %2
 * (chroma, labelled uvbuf0) are read; no blending with a second line is done.
 * NOTE(review): operand meanings come from the in-macro comments; the asm
 * statement supplying them is outside this excerpt.
 * On exit from the macro text: mm2 = B, mm4 = G, mm5 = R as packed unsigned
 * bytes and mm7 = 0. Loop 1 is left open; the WRITEBGR* macros contain the
 * matching "jb 1b".
 */
310 #define YSCALEYUV2RGB1 \
311                 "xorl %%eax, %%eax              \n\t"\
312                 ".balign 16                     \n\t"\
313                 "1:                             \n\t"\
314                 "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
315                 "movq 4096(%2, %%eax), %%mm4    \n\t" /* uvbuf0[eax+2048]*/\
316                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] >>4*/\
317                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] >>4*/\
318                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* (U-128)8*/\
319                 "psubw "MANGLE(w400)", %%mm4    \n\t" /* (V-128)8*/\
320                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
321                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
322                 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
323                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
324         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
325                 "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
326                 "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax+4]*/\
327                 "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
328                 "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
329                 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
330                 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
331                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
332                 "psubw "MANGLE(w80)", %%mm7     \n\t" /* 8(Y-16)*/\
333                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
334                 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
335         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
336                 "paddw %%mm3, %%mm4             \n\t"\
337                 "movq %%mm2, %%mm0              \n\t"\
338                 "movq %%mm5, %%mm6              \n\t"\
339                 "movq %%mm4, %%mm3              \n\t"\
340                 "punpcklwd %%mm2, %%mm2         \n\t"\
341                 "punpcklwd %%mm5, %%mm5         \n\t"\
342                 "punpcklwd %%mm4, %%mm4         \n\t"\
343                 "paddw %%mm1, %%mm2             \n\t"\
344                 "paddw %%mm1, %%mm5             \n\t"\
345                 "paddw %%mm1, %%mm4             \n\t"\
346                 "punpckhwd %%mm0, %%mm0         \n\t"\
347                 "punpckhwd %%mm6, %%mm6         \n\t"\
348                 "punpckhwd %%mm3, %%mm3         \n\t"\
349                 "paddw %%mm7, %%mm0             \n\t"\
350                 "paddw %%mm7, %%mm6             \n\t"\
351                 "paddw %%mm7, %%mm3             \n\t"\
352                 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
353                 "packuswb %%mm0, %%mm2          \n\t"\
354                 "packuswb %%mm6, %%mm5          \n\t"\
355                 "packuswb %%mm3, %%mm4          \n\t"\
356                 "pxor %%mm7, %%mm7              \n\t"
357
/*
 * YSCALEYUV2RGB1b: like the single-line variant for luma (only %0 is read),
 * but chroma is the sum of the two lines behind %2 and %3 shifted right by 5
 * with a logical shift. 8 pixels per iteration.
 * NOTE(review): operand meanings come from the in-macro comments; the asm
 * statement supplying them is outside this excerpt.
 * On exit from the macro text: mm2 = B, mm4 = G, mm5 = R as packed unsigned
 * bytes and mm7 = 0. Loop 1 is left open; the WRITEBGR* macros contain the
 * matching "jb 1b".
 */
358 // do vertical chrominance interpolation
359 #define YSCALEYUV2RGB1b \
360                 "xorl %%eax, %%eax              \n\t"\
361                 ".balign 16                     \n\t"\
362                 "1:                             \n\t"\
363                 "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
364                 "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
365                 "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
366                 "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
367                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
368                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
369                 "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
370                 "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
371                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* (U-128)8*/\
372                 "psubw "MANGLE(w400)", %%mm4    \n\t" /* (V-128)8*/\
373                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
374                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
375                 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
376                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
377         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
378                 "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
379                 "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax+4]*/\
380                 "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
381                 "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
382                 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
383                 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
384                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
385                 "psubw "MANGLE(w80)", %%mm7     \n\t" /* 8(Y-16)*/\
386                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
387                 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
388         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
389                 "paddw %%mm3, %%mm4             \n\t"\
390                 "movq %%mm2, %%mm0              \n\t"\
391                 "movq %%mm5, %%mm6              \n\t"\
392                 "movq %%mm4, %%mm3              \n\t"\
393                 "punpcklwd %%mm2, %%mm2         \n\t"\
394                 "punpcklwd %%mm5, %%mm5         \n\t"\
395                 "punpcklwd %%mm4, %%mm4         \n\t"\
396                 "paddw %%mm1, %%mm2             \n\t"\
397                 "paddw %%mm1, %%mm5             \n\t"\
398                 "paddw %%mm1, %%mm4             \n\t"\
399                 "punpckhwd %%mm0, %%mm0         \n\t"\
400                 "punpckhwd %%mm6, %%mm6         \n\t"\
401                 "punpckhwd %%mm3, %%mm3         \n\t"\
402                 "paddw %%mm7, %%mm0             \n\t"\
403                 "paddw %%mm7, %%mm6             \n\t"\
404                 "paddw %%mm7, %%mm3             \n\t"\
405                 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
406                 "packuswb %%mm0, %%mm2          \n\t"\
407                 "packuswb %%mm6, %%mm5          \n\t"\
408                 "packuswb %%mm3, %%mm4          \n\t"\
409                 "pxor %%mm7, %%mm7              \n\t"
410
/*
 * WRITEBGR32: interleaves 8 pixels held as packed bytes (mm2 = B, mm4 = G,
 * mm5 = R, mm7 = 0) into four quadwords of 4-byte pixels and stores 32 bytes
 * at %4 + eax*4. It then advances eax by 8 and jumps back to label 1, which
 * must be defined by the preceding code, while eax is below %5 (unsigned).
 * NOTE(review): %4 and %5 look like the destination pointer and the width,
 * but the asm statement supplying them is outside this excerpt.
 * Clobbers mm0..mm3, mm5 and mm6.
 */
411 #define WRITEBGR32 \
412                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
413                         "movq %%mm2, %%mm1              \n\t" /* B */\
414                         "movq %%mm5, %%mm6              \n\t" /* R */\
415                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
416                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
417                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
418                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
419                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
420                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
421                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
422                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
423                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
424                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
425 \
426                         MOVNTQ(%%mm0, (%4, %%eax, 4))\
427                         MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
428                         MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
429                         MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
430 \
431                         "addl $8, %%eax                 \n\t"\
432                         "cmpl %5, %%eax                 \n\t"\
433                         " jb 1b                         \n\t"
434
/*
 * WRITEBGR16: packs 8 pixels held as packed bytes (mm2 = B, mm4 = G, mm5 = R,
 * mm7 expected to be 0) into 16-bit pixels and stores 16 bytes at %4 + eax*2.
 * It then advances eax by 8 and jumps back to label 1, which must be defined
 * by the preceding code, while eax is below %5 (unsigned).
 * NOTE(review): bF8 and bFC are defined outside this excerpt; from the names
 * they are presumably per-byte masks 0xF8 and 0xFC, which would give a 5-6-5
 * layout with B in the low bits -- confirm against their definitions.
 */
435 #define WRITEBGR16 \
436                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
437                         "pand "MANGLE(bFC)", %%mm4      \n\t" /* G */\
438                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
439                         "psrlq $3, %%mm2                \n\t" /* move B towards the low bits of each byte */\
440 \
441                         "movq %%mm2, %%mm1              \n\t"\
442                         "movq %%mm4, %%mm3              \n\t"\
443 \
444                         "punpcklbw %%mm7, %%mm3         \n\t" /* G widened to words, pixels 0..3 */\
445                         "punpcklbw %%mm5, %%mm2         \n\t" /* R:B byte pairs, pixels 0..3 */\
446                         "punpckhbw %%mm7, %%mm4         \n\t" /* G widened to words, pixels 4..7 */\
447                         "punpckhbw %%mm5, %%mm1         \n\t" /* R:B byte pairs, pixels 4..7 */\
448 \
449                         "psllq $3, %%mm3                \n\t" /* shift G into place between B and R */\
450                         "psllq $3, %%mm4                \n\t"\
451 \
452                         "por %%mm3, %%mm2               \n\t"\
453                         "por %%mm4, %%mm1               \n\t"\
454 \
455                         MOVNTQ(%%mm2, (%4, %%eax, 2))\
456                         MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
457 \
458                         "addl $8, %%eax                 \n\t"\
459                         "cmpl %5, %%eax                 \n\t"\
460                         " jb 1b                         \n\t"
461
/*
 * WRITEBGR15: same structure as the 16-bit writer, but all three components
 * are masked with bF8, R is additionally shifted right by 1 and G is shifted
 * left by 2 instead of 3. Stores 16 bytes at %4 + eax*2, advances eax by 8
 * and jumps back to label 1 (defined by the preceding code) while eax is
 * below %5 (unsigned). Expects mm2 = B, mm4 = G, mm5 = R, mm7 = 0.
 * NOTE(review): bF8 is defined outside this excerpt; if it is a per-byte 0xF8
 * mask this yields a 5-5-5 layout with B in the low bits -- confirm.
 */
462 #define WRITEBGR15 \
463                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
464                         "pand "MANGLE(bF8)", %%mm4      \n\t" /* G */\
465                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
466                         "psrlq $3, %%mm2                \n\t" /* move B towards the low bits of each byte */\
467                         "psrlq $1, %%mm5                \n\t" /* move R down by one bit */\
468 \
469                         "movq %%mm2, %%mm1              \n\t"\
470                         "movq %%mm4, %%mm3              \n\t"\
471 \
472                         "punpcklbw %%mm7, %%mm3         \n\t" /* G widened to words, pixels 0..3 */\
473                         "punpcklbw %%mm5, %%mm2         \n\t" /* R:B byte pairs, pixels 0..3 */\
474                         "punpckhbw %%mm7, %%mm4         \n\t" /* G widened to words, pixels 4..7 */\
475                         "punpckhbw %%mm5, %%mm1         \n\t" /* R:B byte pairs, pixels 4..7 */\
476 \
477                         "psllq $2, %%mm3                \n\t" /* shift G into place between B and R */\
478                         "psllq $2, %%mm4                \n\t"\
479 \
480                         "por %%mm3, %%mm2               \n\t"\
481                         "por %%mm4, %%mm1               \n\t"\
482 \
483                         MOVNTQ(%%mm2, (%4, %%eax, 2))\
484                         MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
485 \
486                         "addl $8, %%eax                 \n\t"\
487                         "cmpl %5, %%eax                 \n\t"\
488                         " jb 1b                         \n\t"
489
/*
 * WRITEBGR24OLD: builds the same four 0RGB quadwords as the 32-bit writer,
 * then squeezes out the padding bytes with shifts and masks so that 8 pixels
 * occupy 24 bytes, stored through ebx. ebx is advanced by 24 and eax by 8 per
 * pass; the macro jumps back to label 1 (defined by the preceding code) while
 * eax is below %5 (unsigned).
 * Expects mm2 = B, mm4 = G, mm5 = R, mm7 = 0. ebx is used as the running
 * destination pointer and is not initialised here (NOTE(review): it must be
 * set up by the surrounding asm, which is outside this excerpt).
 * The bm00000111 / bm11111000 / bm00001111 masks are defined elsewhere.
 */
490 #define WRITEBGR24OLD \
491                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
492                         "movq %%mm2, %%mm1              \n\t" /* B */\
493                         "movq %%mm5, %%mm6              \n\t" /* R */\
494                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
495                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
496                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
497                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
498                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
499                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
500                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
501                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
502                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
503                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
504 \
505                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
506                         "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
507                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
508                         "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
509                         "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
510                         "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
511                         "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
512                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
513 \
514                         "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
515                         "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
516                         "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
517                         "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
518                         "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
519                         "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
520                         "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
521                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
522                         "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
523                         "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
524                         "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
525                         "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
526                         "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
527 \
528                         "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
529                         "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
530                         "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
531                         "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
532                         "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
533                         "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
534                         "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
535                         "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
536 \
537                         MOVNTQ(%%mm0, (%%ebx))\
538                         MOVNTQ(%%mm2, 8(%%ebx))\
539                         MOVNTQ(%%mm3, 16(%%ebx))\
540                         "addl $24, %%ebx                \n\t" /* 8 pixels * 3 bytes */\
541 \
542                         "addl $8, %%eax                 \n\t"\
543                         "cmpl %5, %%eax                 \n\t"\
544                         " jb 1b                         \n\t"
545
/*
 * WRITEBGR24MMX: plain-MMX asm fragment that packs eight pixels into 24 bytes
 * of 3-byte-per-pixel output and closes the surrounding asm loop.
 *
 * On entry: mm2=B, mm4=G, mm5=R (eight bytes each), mm7=0, %%ebx = store
 * address, %%eax = pixel counter, asm operand %5 = loop bound.
 *
 * The bytes are first interleaved into four 0RGB0RGB quadwords (pixel pairs
 * 0..3), the padding bytes are squeezed out with shifts/punpckhdq, and the
 * result is merged into three quadwords stored with MOVNTQ at (%%ebx),
 * 8(%%ebx) and 16(%%ebx). The per-line layout comments list register bytes
 * most-significant first, so on little-endian x86 each pixel lands in memory
 * as B, G, R.
 *
 * Afterwards %%ebx is advanced by 24 and %%eax by 8, and the fragment jumps
 * back to local label 1 while %%eax is below operand %5 (unsigned compare).
 * Label 1 is not defined here; it has to come from the asm text placed in
 * front of this macro. All of mm0-mm7 are overwritten.
 */
546 #define WRITEBGR24MMX \
547                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
548                         "movq %%mm2, %%mm1              \n\t" /* B */\
549                         "movq %%mm5, %%mm6              \n\t" /* R */\
550                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
551                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
552                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
553                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
554                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
555                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
556                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
557                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
558                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
559                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
560 \
561                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
562                         "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
563                         "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
564                         "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
565 \
566                         "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
567                         "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
568                         "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
569                         "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
570 \
571                         "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
572                         "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
573                         "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
574                         "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
575 \
576                         "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
577                         "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
578                         "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
579                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
580                         MOVNTQ(%%mm0, (%%ebx))\
581 \
582                         "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
583                         "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
584                         "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
585                         "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
586                         MOVNTQ(%%mm6, 8(%%ebx))\
587 \
588                         "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
589                         "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
590                         "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
591                         MOVNTQ(%%mm5, 16(%%ebx))\
592 \
593                         "addl $24, %%ebx                \n\t"\
594 \
595                         "addl $8, %%eax                 \n\t"\
596                         "cmpl %5, %%eax                 \n\t"\
597                         " jb 1b                         \n\t"
598
/*
 * WRITEBGR24MMX2: pshufw-based variant of the 24-bit writer. It packs eight
 * pixels into 24 bytes and closes the surrounding asm loop.
 *
 * On entry: mm2=B, mm4=G, mm5=R (eight bytes each), %%ebx = store address,
 * %%eax = pixel counter, asm operand %5 = loop bound. The zero that the
 * inherited comment lists for mm7 is not used: mm7 is reloaded with M24C
 * right away, and mm0 with M24A; M24B is applied straight from memory.
 *
 * Each of the three output quadwords is built the same way: pshufw replicates
 * the needed 16-bit words of B, G and R, the M24x masks keep one byte per
 * output position, and the three results are OR-ed together and stored with
 * MOVNTQ at (%%ebx), 8(%%ebx) and 16(%%ebx).
 *
 * mm2 and mm5 are only read; mm4 is shifted right by 8 bits in place before
 * the second quadword; mm0, mm1, mm3, mm6 and mm7 are overwritten.
 *
 * Afterwards %%ebx is advanced by 24 and %%eax by 8, and the fragment jumps
 * back to local label 1 while %%eax is below operand %5 (unsigned compare).
 * Label 1 is not defined here; it has to come from the asm text placed in
 * front of this macro.
 */
599 #define WRITEBGR24MMX2 \
600                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
601                         "movq "MANGLE(M24A)", %%mm0     \n\t"\
602                         "movq "MANGLE(M24C)", %%mm7     \n\t"\
603                         "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
604                         "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
605                         "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
606 \
607                         "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
608                         "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
609                         "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
610 \
611                         "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
612                         "por %%mm1, %%mm6               \n\t"\
613                         "por %%mm3, %%mm6               \n\t"\
614                         MOVNTQ(%%mm6, (%%ebx))\
615 \
616                         "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
617                         "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
618                         "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
619                         "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
620 \
621                         "pand "MANGLE(M24B)", %%mm1     \n\t" /* B5       B4        B3    */\
622                         "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
623                         "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
624 \
625                         "por %%mm1, %%mm3               \n\t" /* B5    G4 B4     G3 B3    */\
626                         "por %%mm3, %%mm6               \n\t"\
627                         MOVNTQ(%%mm6, 8(%%ebx))\
628 \
629                         "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
630                         "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
631                         "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
632 \
633                         "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
634                         "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
635                         "pand "MANGLE(M24B)", %%mm6     \n\t" /* R7       R6        R5    */\
636 \
637                         "por %%mm1, %%mm3               \n\t"\
638                         "por %%mm3, %%mm6               \n\t"\
639                         MOVNTQ(%%mm6, 16(%%ebx))\
640 \
641                         "addl $24, %%ebx                \n\t"\
642 \
643                         "addl $8, %%eax                 \n\t"\
644                         "cmpl %5, %%eax                 \n\t"\
645                         " jb 1b                         \n\t"
646
/*
 * Select the 24-bit writer for this instantiation of the template:
 * the pshufw-based WRITEBGR24MMX2 when HAVE_MMX2 is defined, the plain-MMX
 * WRITEBGR24MMX otherwise. WRITEBGR24 is #undef'ed first in both branches so
 * that a definition left over from an earlier point does not clash.
 */
647 #ifdef HAVE_MMX2
648 #undef WRITEBGR24
649 #define WRITEBGR24 WRITEBGR24MMX2
650 #else
651 #undef WRITEBGR24
652 #define WRITEBGR24 WRITEBGR24MMX
653 #endif
654
/**
 * Produce one line of planar output through the vertical filter: luma into
 * dest and, when uDest is non-NULL, chroma into uDest and vDest.
 *
 * With HAVE_MMX the work is done by up to three runs of the YSCALEYUV2YV12X
 * asm macro (U, V, then Y). Each run receives, in operand order:
 *   %0  -filterSize                 (memory operand)
 *   %1  src + filterSize
 *   %2  mmxFilter + filterSize*4
 *   %3  destination plane pointer
 *   %4  output width                (memory operand)
 * and declares %eax, %edx and %esi as clobbered. lumFilter and chrFilter are
 * not referenced on this path.
 *
 * Without HAVE_MMX everything is forwarded to yuv2yuvXinC(); lumMmxFilter and
 * chrMmxFilter are not referenced on that path.
 *
 * @param lumFilter      forwarded to yuv2yuvXinC() (C path only)
 * @param lumSrc         luma source line pointers
 * @param lumFilterSize  number of entries used from lumSrc / the luma filter
 * @param chrFilter      forwarded to yuv2yuvXinC() (C path only)
 * @param chrSrc         chroma source line pointers
 * @param chrFilterSize  number of entries used from chrSrc / the chroma filter
 * @param dest           luma output line
 * @param uDest          U output line; NULL skips both chroma planes on the
 *                       MMX path
 * @param vDest          V output line
 * @param dstW           luma output width
 * @param chrDstW        chroma output width
 * @param lumMmxFilter   luma filter data for the asm (MMX path only)
 * @param chrMmxFilter   chroma filter data for the asm (MMX path only)
 */
655 static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
656                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
657                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW,
658                                     int16_t * lumMmxFilter, int16_t * chrMmxFilter)
659 {
660 #ifdef HAVE_MMX
661         if(uDest != NULL)
662         {
                /* U plane */
663                 asm volatile(
664                                 YSCALEYUV2YV12X(0)
665                                 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
666                                 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (chrDstW)
667                                 : "%eax", "%edx", "%esi"
668                         );
669
                /* V plane: same operands except for the destination and the
                   macro argument 4096 -- presumably a byte offset to the V
                   samples inside each chroma line (2048 int16_t entries);
                   TODO confirm against the YSCALEYUV2YV12X definition */
670                 asm volatile(
671                                 YSCALEYUV2YV12X(4096)
672                                 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
673                                 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (chrDstW)
674                                 : "%eax", "%edx", "%esi"
675                         );
676         }
677
        /* Y plane, always written */
678         asm volatile(
679                         YSCALEYUV2YV12X(0)
680                         :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
681                            "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
682                         : "%eax", "%edx", "%esi"
683                 );
684 #else
/* portable fallback: handles luma and chroma in one call */
685 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
686             chrFilter, chrSrc, chrFilterSize,
687             dest, uDest, vDest, dstW, chrDstW);
688 #endif
689 }
690
/**
 * Convert one line of int16_t samples to 8-bit planar output without any
 * vertical filtering: luma into dest and, when uDest is non-NULL, chroma into
 * uDest and vDest.
 *
 * V samples are read 2048 int16_t entries after the U samples in chrSrc (both
 * code paths use that offset).
 *
 * With HAVE_MMX each plane is handled by one run of the YSCALEYUV2YV121 asm
 * macro, which receives source and destination pointers already advanced by
 * the plane width plus the negated width as third operand -- presumably the
 * macro counts an index up from -width to 0; TODO confirm against its
 * definition. Only %eax is declared clobbered.
 *
 * Without HAVE_MMX every sample is shifted right by 7 and clamped to 0..255.
 *
 * @param lumSrc   luma source line
 * @param chrSrc   chroma source line (U at offset 0, V at offset 2048)
 * @param dest     luma output line
 * @param uDest    U output line; NULL skips both chroma planes
 * @param vDest    V output line
 * @param dstW     luma output width
 * @param chrDstW  chroma output width
 */
691 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
692                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
693 {
694 #ifdef HAVE_MMX
695         if(uDest != NULL)
696         {
                /* U plane */
697                 asm volatile(
698                                 YSCALEYUV2YV121
699                                 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
700                                 "g" (-chrDstW)
701                                 : "%eax"
702                         );
703
                /* V plane: source starts 2048 entries after U */
704                 asm volatile(
705                                 YSCALEYUV2YV121
706                                 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
707                                 "g" (-chrDstW)
708                                 : "%eax"
709                         );
710         }
711
        /* Y plane, always written */
712         asm volatile(
713                 YSCALEYUV2YV121
714                 :: "r" (lumSrc + dstW), "r" (dest + dstW),
715                 "g" (-dstW)
716                 : "%eax"
717         );
718 #else
719         int i;
720         for(i=0; i<dstW; i++)
721         {
722                 int val= lumSrc[i]>>7;
723                 
                /* cheap out-of-range test: bit 8 is set for 256..511 and for
                   negative values down to -256, never for 0..255 */
724                 if(val&256){
725                         if(val<0) val=0;
726                         else      val=255;
727                 }
728
729                 dest[i]= val;
730         }
731
732         if(uDest != NULL)
733                 for(i=0; i<chrDstW; i++)
734                 {
735                         int u=chrSrc[i]>>7;
736                         int v=chrSrc[i + 2048]>>7;
737
                        /* one combined bit-8 test for both samples, then
                           clamp each of them to 0..255 */
738                         if((u|v)&256){
739                                 if(u<0)         u=0;
740                                 else if (u>255) u=255;
741                                 if(v<0)         v=0;
742                                 else if (v>255) v=255;
743                         }
744
745                         uDest[i]= u;
746                         vDest[i]= v;
747                 }
748 #endif
749 }
750
751
752 /**
753  * vertical scale YV12 to RGB
 *
 * Dispatches on c->dstFormat. With HAVE_MMX the formats IMGFMT_BGR32,
 * IMGFMT_BGR24, IMGFMT_BGR15 and IMGFMT_BGR16 are handled by one asm
 * statement each: the YSCALEYUV2RGBX macro followed by the matching
 * WRITEBGR* macro. Every other format -- and every format when HAVE_MMX is
 * not defined -- goes to yuv2rgbXinC().
 *
 * All four asm statements use the same operand list:
 *   %0  -lumFilterSize               %1  -chrFilterSize
 *   %2  lumMmxFilter+lumFilterSize*4 %3  chrMmxFilter+chrFilterSize*4
 *   %4  dest (register)              %5  dstW
 *   %6  lumSrc+lumFilterSize         %7  chrSrc+chrFilterSize
 * and declare %eax, %ebx, %ecx, %edx and %esi as clobbered.
 *
 * lumFilter, chrFilter and dstY are only passed to yuv2rgbXinC();
 * lumMmxFilter and chrMmxFilter are only used by the asm paths.
754  */
755 static inline void RENAME(yuv2rgbX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
756                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
757                             uint8_t *dest, int dstW, int16_t * lumMmxFilter, int16_t * chrMmxFilter, int dstY)
758 {
759         switch(c->dstFormat)
760         {
761 #ifdef HAVE_MMX
762         case IMGFMT_BGR32:
763                 {
764                         asm volatile(
765                                 YSCALEYUV2RGBX
766                                 WRITEBGR32
767
768                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
769                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
770                            "r" (dest), "m" (dstW),
771                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
772                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
773                         );
774                 }
775                 break;
776         case IMGFMT_BGR24:
777                 {
                        /* WRITEBGR24 stores through %ebx, so it is set to
                           dest + 3*%eax (3 bytes per pixel) before the writer */
778                         asm volatile(
779                                 YSCALEYUV2RGBX
780                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t" //FIXME optimize
781                                 "addl %4, %%ebx                 \n\t"
782                                 WRITEBGR24
783
784                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
785                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
786                            "r" (dest), "m" (dstW),
787                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
788                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
789                         );
790                 }
791                 break;
792         case IMGFMT_BGR15:
793                 {
794                         asm volatile(
795                                 YSCALEYUV2RGBX
796                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
                /* optional dithering: add the per-channel dither constants
                   with unsigned saturation before the channels are packed */
797 #ifdef DITHER1XBPP
798                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
799                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
800                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
801 #endif
802
803                                 WRITEBGR15
804
805                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
806                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
807                            "r" (dest), "m" (dstW),
808                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
809                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
810                         );
811                 }
812                 break;
813         case IMGFMT_BGR16:
814                 {
815                         asm volatile(
816                                 YSCALEYUV2RGBX
817                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
                /* same as BGR15 except that green uses g6Dither */
818 #ifdef DITHER1XBPP
819                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
820                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
821                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
822 #endif
823
824                                 WRITEBGR16
825
826                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
827                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
828                            "r" (dest), "m" (dstW),
829                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
830                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
831                         );
832                 }
833                 break;
834 #endif
835         default:
                /* portable fallback for every format without an asm path */
836                 yuv2rgbXinC(c, lumFilter, lumSrc, lumFilterSize,
837                             chrFilter, chrSrc, chrFilterSize,
838                             dest, dstW, dstY);
839                 break;
840         }
841 }
842
843 /**
844  * vertical bilinear scale YV12 to RGB
 *
 * Blends two source lines (buf0/buf1 for luma, uvbuf0/uvbuf1 for chroma) and
 * writes one packed output line to dest.
 *
 * With HAVE_MMX the formats IMGFMT_BGR32, IMGFMT_BGR24, IMGFMT_BGR15 and
 * IMGFMT_BGR16 (selected by c->dstFormat) are handled by one asm statement
 * each -- YSCALEYUV2RGB followed by the matching WRITEBGR* macro -- and the
 * function returns right after it. Every other format, and every format when
 * HAVE_MMX is not defined, is handled by the
 * YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C) macro at the end.
 *
 * Operand list shared by the active asm statements:
 *   %0 buf0   %1 buf1   %2 uvbuf0   %3 uvbuf1
 *   %4 dest   %5 dstW   %6 yalpha1  %7 uvalpha1
 *
 * The large "#if 0" region is compiled out. It refers to `flags` and
 * `dstFormat`, which are not parameters of this function, so it could not be
 * re-enabled as it stands.
 *
 * NOTE(review): `i` and `y` are not referenced by the code visible here;
 * presumably the YSCALE_YUV_2_ANYRGB_C expansion uses them -- confirm before
 * removing either.
845  */
846 static inline void RENAME(yuv2rgb2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
847                             uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
848 {
        // for an alpha in 0..4095, alpha^4095 equals 4095-alpha: the weight of the other line
849         int yalpha1=yalpha^4095;
850         int uvalpha1=uvalpha^4095;
851         int i;
852
853 #if 0 // not used
854         if(flags&SWS_FULL_CHR_H_INT)
855         {
856                 switch(dstFormat)
857                 {
858 #ifdef HAVE_MMX
859                 case IMGFMT_BGR32:
860                         asm volatile(
861
862
863 FULL_YSCALEYUV2RGB
864                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
865                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
866
867                         "movq %%mm3, %%mm1              \n\t"
868                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
869                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
870
871                         MOVNTQ(%%mm3, (%4, %%eax, 4))
872                         MOVNTQ(%%mm1, 8(%4, %%eax, 4))
873
874                         "addl $4, %%eax                 \n\t"
875                         "cmpl %5, %%eax                 \n\t"
876                         " jb 1b                         \n\t"
877
878
879                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
880                         "m" (yalpha1), "m" (uvalpha1)
881                         : "%eax"
882                         );
883                         break;
884                 case IMGFMT_BGR24:
885                         asm volatile(
886
887 FULL_YSCALEYUV2RGB
888
889                                                                 // lsb ... msb
890                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
891                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
892
893                         "movq %%mm3, %%mm1              \n\t"
894                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
895                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
896
897                         "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
898                         "psrlq $8, %%mm3                \n\t" // GR0BGR00
899                         "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
900                         "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
901                         "por %%mm2, %%mm3               \n\t" // BGRBGR00
902                         "movq %%mm1, %%mm2              \n\t"
903                         "psllq $48, %%mm1               \n\t" // 000000BG
904                         "por %%mm1, %%mm3               \n\t" // BGRBGRBG
905
906                         "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
907                         "psrld $16, %%mm2               \n\t" // R000R000
908                         "psrlq $24, %%mm1               \n\t" // 0BGR0000
909                         "por %%mm2, %%mm1               \n\t" // RBGRR000
910
911                         "movl %4, %%ebx                 \n\t"
912                         "addl %%eax, %%ebx              \n\t"
913
914 #ifdef HAVE_MMX2
915                         //FIXME Alignment
916                         "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
917                         "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
918 #else
919                         "movd %%mm3, (%%ebx, %%eax, 2)  \n\t"
920                         "psrlq $32, %%mm3               \n\t"
921                         "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
922                         "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
923 #endif
924                         "addl $4, %%eax                 \n\t"
925                         "cmpl %5, %%eax                 \n\t"
926                         " jb 1b                         \n\t"
927
928                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
929                         "m" (yalpha1), "m" (uvalpha1)
930                         : "%eax", "%ebx"
931                         );
932                         break;
933                 case IMGFMT_BGR15:
934                         asm volatile(
935
936 FULL_YSCALEYUV2RGB
937 #ifdef DITHER1XBPP
938                         "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
939                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
940                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
941 #endif
942                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
943                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
944                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
945
946                         "psrlw $3, %%mm3                \n\t"
947                         "psllw $2, %%mm1                \n\t"
948                         "psllw $7, %%mm0                \n\t"
949                         "pand "MANGLE(g15Mask)", %%mm1  \n\t"
950                         "pand "MANGLE(r15Mask)", %%mm0  \n\t"
951
952                         "por %%mm3, %%mm1               \n\t"
953                         "por %%mm1, %%mm0               \n\t"
954
955                         MOVNTQ(%%mm0, (%4, %%eax, 2))
956
957                         "addl $4, %%eax                 \n\t"
958                         "cmpl %5, %%eax                 \n\t"
959                         " jb 1b                         \n\t"
960
961                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
962                         "m" (yalpha1), "m" (uvalpha1)
963                         : "%eax"
964                         );
965                         break;
966                 case IMGFMT_BGR16:
967                         asm volatile(
968
969 FULL_YSCALEYUV2RGB
970 #ifdef DITHER1XBPP
971                         "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
972                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
973                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
974 #endif
975                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
976                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
977                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
978
979                         "psrlw $3, %%mm3                \n\t"
980                         "psllw $3, %%mm1                \n\t"
981                         "psllw $8, %%mm0                \n\t"
982                         "pand "MANGLE(g16Mask)", %%mm1  \n\t"
983                         "pand "MANGLE(r16Mask)", %%mm0  \n\t"
984
985                         "por %%mm3, %%mm1               \n\t"
986                         "por %%mm1, %%mm0               \n\t"
987
988                         MOVNTQ(%%mm0, (%4, %%eax, 2))
989
990                         "addl $4, %%eax                 \n\t"
991                         "cmpl %5, %%eax                 \n\t"
992                         " jb 1b                         \n\t"
993
994                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
995                         "m" (yalpha1), "m" (uvalpha1)
996                         : "%eax"
997                         );
998                 break;
999 #endif
1000                 case IMGFMT_RGB32:
1001 #ifndef HAVE_MMX
1002                 case IMGFMT_BGR32:
1003 #endif
1004                 if(dstFormat==IMGFMT_BGR32)
1005                 {
1006                         int i;
1007 #ifdef WORDS_BIGENDIAN
1008                         dest++;
1009 #endif
1010                         for(i=0;i<dstW;i++){
1011                                 // vertical linear interpolation && yuv2rgb in a single step:
1012                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1013                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1014                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1015                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1016                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1017                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1018                                 dest+= 4;
1019                         }
1020                 }
1021                 else if(dstFormat==IMGFMT_BGR24)
1022                 {
1023                         int i;
1024                         for(i=0;i<dstW;i++){
1025                                 // vertical linear interpolation && yuv2rgb in a single step:
1026                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1027                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1028                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1029                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1030                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1031                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1032                                 dest+= 3;
1033                         }
1034                 }
1035                 else if(dstFormat==IMGFMT_BGR16)
1036                 {
1037                         int i;
1038                         for(i=0;i<dstW;i++){
1039                                 // vertical linear interpolation && yuv2rgb in a single step:
1040                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1041                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1042                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1043
1044                                 ((uint16_t*)dest)[i] =
1045                                         clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1046                                         clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1047                                         clip_table16r[(Y + yuvtab_3343[V]) >>13];
1048                         }
1049                 }
1050                 else if(dstFormat==IMGFMT_BGR15)
1051                 {
1052                         int i;
1053                         for(i=0;i<dstW;i++){
1054                                 // vertical linear interpolation && yuv2rgb in a single step:
1055                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1056                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1057                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1058
1059                                 ((uint16_t*)dest)[i] =
1060                                         clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1061                                         clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1062                                         clip_table15r[(Y + yuvtab_3343[V]) >>13];
1063                         }
1064                 }
1065         }//FULL_UV_IPOL
1066         else
1067         {
1068 #endif // if 0
1069 #ifdef HAVE_MMX
        /* active MMX paths: each handled format returns directly */
1070         switch(c->dstFormat)
1071         {
1072         case IMGFMT_BGR32:
1073                         asm volatile(
1074                                 YSCALEYUV2RGB
1075                                 WRITEBGR32
1076
1077                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1078                         "m" (yalpha1), "m" (uvalpha1)
1079                         : "%eax"
1080                         );
1081                         return;
1082         case IMGFMT_BGR24:
                        /* dest is a memory operand here and is loaded into
                           %ebx, the store pointer used by WRITEBGR24 */
1083                         asm volatile(
1084                                 "movl %4, %%ebx                 \n\t"
1085                                 YSCALEYUV2RGB
1086                                 WRITEBGR24
1087
1088                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1089                         "m" (yalpha1), "m" (uvalpha1)
1090                         : "%eax", "%ebx"
1091                         );
1092                         return;
1093         case IMGFMT_BGR15:
1094                         asm volatile(
1095                                 YSCALEYUV2RGB
1096                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1097 #ifdef DITHER1XBPP
1098                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1099                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1100                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1101 #endif
1102
1103                                 WRITEBGR15
1104
1105                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1106                         "m" (yalpha1), "m" (uvalpha1)
1107                         : "%eax"
1108                         );
1109                         return;
1110         case IMGFMT_BGR16:
1111                         asm volatile(
1112                                 YSCALEYUV2RGB
1113                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1114 #ifdef DITHER1XBPP
1115                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1116                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1117                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1118 #endif
1119
1120                                 WRITEBGR16
1121
1122                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1123                         "m" (yalpha1), "m" (uvalpha1)
1124                         : "%eax"
1125                         );
1126                         return;
1127         default: break;
1128         }
1129 #endif //HAVE_MMX
/* C fallback for every format not returned from above */
1130 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C)
1131 }
1132
1133 /**
1134  * YV12 to RGB without scaling or interpolating
1135  */
1136 static inline void RENAME(yuv2rgb1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1137                             uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1138 {
1139         int uvalpha1=uvalpha^4095;
1140         const int yalpha1=0;
1141         int i;
1142         
1143         uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1144         const int yalpha= 4096; //FIXME ...
1145
1146         if(flags&SWS_FULL_CHR_H_INT)
1147         {
1148                 RENAME(yuv2rgb2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1149                 return;
1150         }
1151
1152 #ifdef HAVE_MMX
1153         if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1154         {
1155                 switch(dstFormat)
1156                 {
1157                 case IMGFMT_BGR32:
1158                         asm volatile(
1159                                 YSCALEYUV2RGB1
1160                                 WRITEBGR32
1161                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1162                         "m" (yalpha1), "m" (uvalpha1)
1163                         : "%eax"
1164                         );
1165                         return;
1166                 case IMGFMT_BGR24:
1167                         asm volatile(
1168                                 "movl %4, %%ebx                 \n\t"
1169                                 YSCALEYUV2RGB1
1170                                 WRITEBGR24
1171                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1172                         "m" (yalpha1), "m" (uvalpha1)
1173                         : "%eax", "%ebx"
1174                         );
1175                         return;
1176                 case IMGFMT_BGR15:
1177                         asm volatile(
1178                                 YSCALEYUV2RGB1
1179                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1180 #ifdef DITHER1XBPP
1181                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1182                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1183                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1184 #endif
1185                                 WRITEBGR15
1186                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1187                         "m" (yalpha1), "m" (uvalpha1)
1188                         : "%eax"
1189                         );
1190                         return;
1191                 case IMGFMT_BGR16:
1192                         asm volatile(
1193                                 YSCALEYUV2RGB1
1194                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1195 #ifdef DITHER1XBPP
1196                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1197                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1198                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1199 #endif
1200
1201                                 WRITEBGR16
1202                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1203                         "m" (yalpha1), "m" (uvalpha1)
1204                         : "%eax"
1205                         );
1206                         return;
1207                 }
1208         }
1209         else
1210         {
1211                 switch(dstFormat)
1212                 {
1213                 case IMGFMT_BGR32:
1214                         asm volatile(
1215                                 YSCALEYUV2RGB1b
1216                                 WRITEBGR32
1217                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1218                         "m" (yalpha1), "m" (uvalpha1)
1219                         : "%eax"
1220                         );
1221                         return;
1222                 case IMGFMT_BGR24:
1223                         asm volatile(
1224                                 "movl %4, %%ebx                 \n\t"
1225                                 YSCALEYUV2RGB1b
1226                                 WRITEBGR24
1227                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1228                         "m" (yalpha1), "m" (uvalpha1)
1229                         : "%eax", "%ebx"
1230                         );
1231                         return;
1232                 case IMGFMT_BGR15:
1233                         asm volatile(
1234                                 YSCALEYUV2RGB1b
1235                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1236 #ifdef DITHER1XBPP
1237                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1238                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1239                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1240 #endif
1241                                 WRITEBGR15
1242                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1243                         "m" (yalpha1), "m" (uvalpha1)
1244                         : "%eax"
1245                         );
1246                         return;
1247                 case IMGFMT_BGR16:
1248                         asm volatile(
1249                                 YSCALEYUV2RGB1b
1250                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1251 #ifdef DITHER1XBPP
1252                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1253                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1254                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1255 #endif
1256
1257                                 WRITEBGR16
1258                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1259                         "m" (yalpha1), "m" (uvalpha1)
1260                         : "%eax"
1261                         );
1262                         return;
1263                 }
1264         }
1265 #endif
1266         if( uvalpha < 2048 )
1267         {
1268                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C)
1269         }else{
1270                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C)
1271         }
1272 }
1273
1274 //FIXME yuy2* can read up to 7 samples too many
1275
/**
 * Copies every second byte of src into dst: dst[i] = src[2*i] for i in [0, width).
 * (For packed YUY2 input this is presumably the luma sample of each pixel -
 * inferred from the function name, confirm against the caller.)
 *
 * MMX path: a negative counter in eax runs from -width towards 0 in steps of 8,
 * indexing relative to the END of both buffers (src+width*2, dst+width).
 * Each iteration turns 16 source bytes into 8 destination bytes, and the loop
 * body always runs at least once, so when width is not a multiple of 8 it
 * reads and writes beyond the nominal end of the line.
 */
1276 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1277 {
1278 #ifdef HAVE_MMX
1279         asm volatile(
1280                 "movq "MANGLE(bm01010101)", %%mm2\n\t" // byte-select mask (presumably keeps the low byte of each word - confirm)
1281                 "movl %0, %%eax                 \n\t" // eax = -width
1282                 "1:                             \n\t"
1283                 "movq (%1, %%eax,2), %%mm0      \n\t" // 2 source bytes per output byte
1284                 "movq 8(%1, %%eax,2), %%mm1     \n\t"
1285                 "pand %%mm2, %%mm0              \n\t"
1286                 "pand %%mm2, %%mm1              \n\t"
1287                 "packuswb %%mm1, %%mm0          \n\t" // 2x4 words -> 8 bytes
1288                 "movq %%mm0, (%2, %%eax)        \n\t"
1289                 "addl $8, %%eax                 \n\t"
1290                 " js 1b                         \n\t" // loop while the counter is still negative
1291                 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1292                 : "%eax"
1293         );
1294 #else
        /* portable reference implementation */
1295         int i;
1296         for(i=0; i<width; i++)
1297                 dst[i]= src[2*i];
1298 #endif
1299 }
1300
/**
 * Produces `width` U and V samples from two input lines, averaging the lines:
 *   dstU[i] = average of byte 4*i+1 of src1 and src2
 *   dstV[i] = average of byte 4*i+3 of src1 and src2
 *
 * MMX2/3DNow path: a negative counter in eax runs from -width towards 0 in
 * steps of 4, indexing from the END of all buffers. Each iteration consumes
 * 16 bytes from each source line and stores 4 bytes to each destination;
 * the loop body always runs at least once.
 * NOTE(review): PAVGB (pavgb/pavgusb) is a rounding average, while the C
 * fallback truncates with >>1 - confirm the one-LSB difference is acceptable.
 */
1301 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1302 {
1303 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1304         asm volatile(
1305                 "movq "MANGLE(bm01010101)", %%mm4\n\t" // byte-select mask (presumably keeps the low byte of each word - confirm)
1306                 "movl %0, %%eax                 \n\t" // eax = -width
1307                 "1:                             \n\t"
1308                 "movq (%1, %%eax,4), %%mm0      \n\t" // 4 source bytes per output sample
1309                 "movq 8(%1, %%eax,4), %%mm1     \n\t"
1310                 "movq (%2, %%eax,4), %%mm2      \n\t"
1311                 "movq 8(%2, %%eax,4), %%mm3     \n\t"
1312                 PAVGB(%%mm2, %%mm0)
1313                 PAVGB(%%mm3, %%mm1)
1314                 "psrlw $8, %%mm0                \n\t" // keep the high (odd) byte of every word
1315                 "psrlw $8, %%mm1                \n\t"
1316                 "packuswb %%mm1, %%mm0          \n\t" // 8 interleaved chroma bytes
1317                 "movq %%mm0, %%mm1              \n\t"
1318                 "psrlw $8, %%mm0                \n\t" // mm0: odd chroma bytes (stored to dstV below)
1319                 "pand %%mm4, %%mm1              \n\t" // mm1: masked chroma bytes (stored to dstU below)
1320                 "packuswb %%mm0, %%mm0          \n\t"
1321                 "packuswb %%mm1, %%mm1          \n\t"
1322                 "movd %%mm0, (%4, %%eax)        \n\t" // %4 = dstV+width
1323                 "movd %%mm1, (%3, %%eax)        \n\t" // %3 = dstU+width
1324                 "addl $4, %%eax                 \n\t"
1325                 " js 1b                         \n\t" // loop while the counter is still negative
1326                 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1327                 : "%eax"
1328         );
1329 #else
        /* portable reference implementation */
1330         int i;
1331         for(i=0; i<width; i++)
1332         {
1333                 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1334                 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1335         }
1336 #endif
1337 }
1338
1339 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1340 {
1341 #ifdef HAVE_MMXFIXME
1342 #else
1343         int i;
1344         for(i=0; i<width; i++)
1345         {
1346                 int b= src[i*4+0];
1347                 int g= src[i*4+1];
1348                 int r= src[i*4+2];
1349
1350                 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1351         }
1352 #endif
1353 }
1354
1355 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1356 {
1357 #ifdef HAVE_MMXFIXME
1358 #else
1359         int i;
1360         for(i=0; i<width; i++)
1361         {
1362                 int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1363                 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1364                 int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1365
1366                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1367                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1368         }
1369 #endif
1370 }
1371
/**
 * Converts one line of 3-byte pixels (byte 0 = blue, 1 = green, 2 = red)
 * into `width` luma bytes. Reference formula (C fallback):
 *   Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16
 *
 * MMX path: eax is a negative output counter (-width .. 0, step 8) and
 * ebx = 3*eax the matching source byte offset (step 24); both index from the
 * END of the buffers (src+width*3, dst+width). Eight pixels are converted per
 * iteration in two groups of four, and the loop body always runs at least
 * once, so widths that are not a multiple of 8 are over-read/over-written.
 * Every pixel is fetched with a 4-byte movd, so the last load of an iteration
 * (offset 21) touches one byte beyond the 24 bytes consumed.
 * NOTE(review): the 4th byte of each load is multiplied by the 4th word of
 * bgr2YCoeff - presumably 0, confirm where that constant is defined.
 */
1372 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1373 {
1374 #ifdef HAVE_MMX
1375         asm volatile(
1376                 "movl %2, %%eax                 \n\t" // eax = -width
1377                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1378                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1379                 "pxor %%mm7, %%mm7              \n\t" // mm7 = 0, used to widen bytes to words
1380                 "leal (%%eax, %%eax, 2), %%ebx  \n\t" // ebx = 3*eax (source byte offset)
1381                 ".balign 16                     \n\t"
1382                 "1:                             \n\t"
1383                 PREFETCH" 64(%0, %%ebx)         \n\t"
                /* first group: pixels 0..3 */
1384                 "movd (%0, %%ebx), %%mm0        \n\t"
1385                 "movd 3(%0, %%ebx), %%mm1       \n\t"
1386                 "punpcklbw %%mm7, %%mm0         \n\t"
1387                 "punpcklbw %%mm7, %%mm1         \n\t"
1388                 "movd 6(%0, %%ebx), %%mm2       \n\t"
1389                 "movd 9(%0, %%ebx), %%mm3       \n\t"
1390                 "punpcklbw %%mm7, %%mm2         \n\t"
1391                 "punpcklbw %%mm7, %%mm3         \n\t"
1392                 "pmaddwd %%mm6, %%mm0           \n\t" // multiply by the coefficients, pairwise add
1393                 "pmaddwd %%mm6, %%mm1           \n\t"
1394                 "pmaddwd %%mm6, %%mm2           \n\t"
1395                 "pmaddwd %%mm6, %%mm3           \n\t"
1396 #ifndef FAST_BGR2YV12
1397                 "psrad $8, %%mm0                \n\t"
1398                 "psrad $8, %%mm1                \n\t"
1399                 "psrad $8, %%mm2                \n\t"
1400                 "psrad $8, %%mm3                \n\t"
1401 #endif
1402                 "packssdw %%mm1, %%mm0          \n\t"
1403                 "packssdw %%mm3, %%mm2          \n\t"
1404                 "pmaddwd %%mm5, %%mm0           \n\t" // multiply-add with w1111 (presumably all ones: sums the two partial products per pixel)
1405                 "pmaddwd %%mm5, %%mm2           \n\t"
1406                 "packssdw %%mm2, %%mm0          \n\t"
1407                 "psraw $7, %%mm0                \n\t"
1408
                /* second group: pixels 4..7, same sequence with mm4 as accumulator */
1409                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1410                 "movd 15(%0, %%ebx), %%mm1      \n\t"
1411                 "punpcklbw %%mm7, %%mm4         \n\t"
1412                 "punpcklbw %%mm7, %%mm1         \n\t"
1413                 "movd 18(%0, %%ebx), %%mm2      \n\t"
1414                 "movd 21(%0, %%ebx), %%mm3      \n\t"
1415                 "punpcklbw %%mm7, %%mm2         \n\t"
1416                 "punpcklbw %%mm7, %%mm3         \n\t"
1417                 "pmaddwd %%mm6, %%mm4           \n\t"
1418                 "pmaddwd %%mm6, %%mm1           \n\t"
1419                 "pmaddwd %%mm6, %%mm2           \n\t"
1420                 "pmaddwd %%mm6, %%mm3           \n\t"
1421 #ifndef FAST_BGR2YV12
1422                 "psrad $8, %%mm4                \n\t"
1423                 "psrad $8, %%mm1                \n\t"
1424                 "psrad $8, %%mm2                \n\t"
1425                 "psrad $8, %%mm3                \n\t"
1426 #endif
1427                 "packssdw %%mm1, %%mm4          \n\t"
1428                 "packssdw %%mm3, %%mm2          \n\t"
1429                 "pmaddwd %%mm5, %%mm4           \n\t"
1430                 "pmaddwd %%mm5, %%mm2           \n\t"
1431                 "addl $24, %%ebx                \n\t" // advance source by 8 pixels * 3 bytes
1432                 "packssdw %%mm2, %%mm4          \n\t"
1433                 "psraw $7, %%mm4                \n\t"
1434
1435                 "packuswb %%mm4, %%mm0          \n\t" // 8 words -> 8 unsigned bytes
1436                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1437
1438                 "movq %%mm0, (%1, %%eax)        \n\t" // store 8 luma bytes
1439                 "addl $8, %%eax                 \n\t"
1440                 " js 1b                         \n\t" // loop while the counter is still negative
1441                 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1442                 : "%eax", "%ebx"
1443         );
1444 #else
        /* portable reference implementation */
1445         int i;
1446         for(i=0; i<width; i++)
1447         {
1448                 int b= src[i*3+0];
1449                 int g= src[i*3+1];
1450                 int r= src[i*3+2];
1451
1452                 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1453         }
1454 #endif
1455 }
1456
/**
 * Produces `width` U and V bytes from two lines of 3-byte pixels
 * (byte 0 = blue, 1 = green, 2 = red). Each output sample is derived from a
 * 2x2 block: two neighbouring pixels of src1 and the same two of src2.
 * Reference formula (C fallback), with r/g/b being the 4-pixel sums:
 *   U = ((RU*r + GU*g + BU*b) >> (RGB2YUV_SHIFT+2)) + 128
 *   V = ((RV*r + GV*g + BV*b) >> (RGB2YUV_SHIFT+2)) + 128
 *
 * MMX path: eax is a negative output counter (-width .. 0, step 4) and
 * ebx = 6*eax the matching source byte offset (step 24); all pointers index
 * from the END of their buffers. Four U and four V samples are produced per
 * iteration (in two halves of two samples each); the loop body always runs
 * at least once.
 *  - MMX2/3DNow variant: averages the two lines with PAVGB, then averages
 *    each pixel with its right neighbour (copy shifted by 24 bits). It uses
 *    8-byte loads, the last one at offset 18, i.e. 2 bytes beyond the 24
 *    bytes consumed per line.
 *  - plain MMX variant: widens to words, sums the four pixels and divides by
 *    4 (psrlw $2). It temporarily reuses mm5 and reloads w1111 afterwards.
 */
1457 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1458 {
1459 #ifdef HAVE_MMX
1460         asm volatile(
1461                 "movl %4, %%eax                 \n\t" // eax = -width
1462                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1463                 "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1464                 "pxor %%mm7, %%mm7              \n\t" // mm7 = 0, used to widen bytes to words
1465                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1466                 "addl %%ebx, %%ebx              \n\t" // ebx = 6*eax (source byte offset)
1467                 ".balign 16                     \n\t"
1468                 "1:                             \n\t"
1469                 PREFETCH" 64(%0, %%ebx)         \n\t"
1470                 PREFETCH" 64(%1, %%ebx)         \n\t"
                /* first half: averaged pixels for output samples 0 and 1 -> mm0, mm2 */
1471 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1472                 "movq (%0, %%ebx), %%mm0        \n\t"
1473                 "movq (%1, %%ebx), %%mm1        \n\t"
1474                 "movq 6(%0, %%ebx), %%mm2       \n\t"
1475                 "movq 6(%1, %%ebx), %%mm3       \n\t"
1476                 PAVGB(%%mm1, %%mm0)
1477                 PAVGB(%%mm3, %%mm2)
1478                 "movq %%mm0, %%mm1              \n\t"
1479                 "movq %%mm2, %%mm3              \n\t"
1480                 "psrlq $24, %%mm0               \n\t" // shift by one 3-byte pixel to line up the right neighbour
1481                 "psrlq $24, %%mm2               \n\t"
1482                 PAVGB(%%mm1, %%mm0)
1483                 PAVGB(%%mm3, %%mm2)
1484                 "punpcklbw %%mm7, %%mm0         \n\t"
1485                 "punpcklbw %%mm7, %%mm2         \n\t"
1486 #else
1487                 "movd (%0, %%ebx), %%mm0        \n\t"
1488                 "movd (%1, %%ebx), %%mm1        \n\t"
1489                 "movd 3(%0, %%ebx), %%mm2       \n\t"
1490                 "movd 3(%1, %%ebx), %%mm3       \n\t"
1491                 "punpcklbw %%mm7, %%mm0         \n\t"
1492                 "punpcklbw %%mm7, %%mm1         \n\t"
1493                 "punpcklbw %%mm7, %%mm2         \n\t"
1494                 "punpcklbw %%mm7, %%mm3         \n\t"
1495                 "paddw %%mm1, %%mm0             \n\t"
1496                 "paddw %%mm3, %%mm2             \n\t"
1497                 "paddw %%mm2, %%mm0             \n\t"
1498                 "movd 6(%0, %%ebx), %%mm4       \n\t"
1499                 "movd 6(%1, %%ebx), %%mm1       \n\t"
1500                 "movd 9(%0, %%ebx), %%mm2       \n\t"
1501                 "movd 9(%1, %%ebx), %%mm3       \n\t"
1502                 "punpcklbw %%mm7, %%mm4         \n\t"
1503                 "punpcklbw %%mm7, %%mm1         \n\t"
1504                 "punpcklbw %%mm7, %%mm2         \n\t"
1505                 "punpcklbw %%mm7, %%mm3         \n\t"
1506                 "paddw %%mm1, %%mm4             \n\t"
1507                 "paddw %%mm3, %%mm2             \n\t"
1508                 "paddw %%mm4, %%mm2             \n\t"
1509                 "psrlw $2, %%mm0                \n\t" // sum of 4 pixels -> average
1510                 "psrlw $2, %%mm2                \n\t"
1511 #endif
1512                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1513                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1514                 
1515                 "pmaddwd %%mm0, %%mm1           \n\t" // mm1, mm3: V products
1516                 "pmaddwd %%mm2, %%mm3           \n\t"
1517                 "pmaddwd %%mm6, %%mm0           \n\t" // mm0, mm2: U products (mm6 = bgr2UCoeff)
1518                 "pmaddwd %%mm6, %%mm2           \n\t"
1519 #ifndef FAST_BGR2YV12
1520                 "psrad $8, %%mm0                \n\t"
1521                 "psrad $8, %%mm1                \n\t"
1522                 "psrad $8, %%mm2                \n\t"
1523                 "psrad $8, %%mm3                \n\t"
1524 #endif
1525                 "packssdw %%mm2, %%mm0          \n\t"
1526                 "packssdw %%mm3, %%mm1          \n\t"
1527                 "pmaddwd %%mm5, %%mm0           \n\t"
1528                 "pmaddwd %%mm5, %%mm1           \n\t"
1529                 "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1530                 "psraw $7, %%mm0                \n\t"
1531
                /* second half: averaged pixels for output samples 2 and 3 -> mm4, mm2 */
1532 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1533                 "movq 12(%0, %%ebx), %%mm4      \n\t"
1534                 "movq 12(%1, %%ebx), %%mm1      \n\t"
1535                 "movq 18(%0, %%ebx), %%mm2      \n\t"
1536                 "movq 18(%1, %%ebx), %%mm3      \n\t"
1537                 PAVGB(%%mm1, %%mm4)
1538                 PAVGB(%%mm3, %%mm2)
1539                 "movq %%mm4, %%mm1              \n\t"
1540                 "movq %%mm2, %%mm3              \n\t"
1541                 "psrlq $24, %%mm4               \n\t"
1542                 "psrlq $24, %%mm2               \n\t"
1543                 PAVGB(%%mm1, %%mm4)
1544                 PAVGB(%%mm3, %%mm2)
1545                 "punpcklbw %%mm7, %%mm4         \n\t"
1546                 "punpcklbw %%mm7, %%mm2         \n\t"
1547 #else
1548                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1549                 "movd 12(%1, %%ebx), %%mm1      \n\t"
1550                 "movd 15(%0, %%ebx), %%mm2      \n\t"
1551                 "movd 15(%1, %%ebx), %%mm3      \n\t"
1552                 "punpcklbw %%mm7, %%mm4         \n\t"
1553                 "punpcklbw %%mm7, %%mm1         \n\t"
1554                 "punpcklbw %%mm7, %%mm2         \n\t"
1555                 "punpcklbw %%mm7, %%mm3         \n\t"
1556                 "paddw %%mm1, %%mm4             \n\t"
1557                 "paddw %%mm3, %%mm2             \n\t"
1558                 "paddw %%mm2, %%mm4             \n\t"
1559                 "movd 18(%0, %%ebx), %%mm5      \n\t" // mm5 borrowed as scratch here ...
1560                 "movd 18(%1, %%ebx), %%mm1      \n\t"
1561                 "movd 21(%0, %%ebx), %%mm2      \n\t"
1562                 "movd 21(%1, %%ebx), %%mm3      \n\t"
1563                 "punpcklbw %%mm7, %%mm5         \n\t"
1564                 "punpcklbw %%mm7, %%mm1         \n\t"
1565                 "punpcklbw %%mm7, %%mm2         \n\t"
1566                 "punpcklbw %%mm7, %%mm3         \n\t"
1567                 "paddw %%mm1, %%mm5             \n\t"
1568                 "paddw %%mm3, %%mm2             \n\t"
1569                 "paddw %%mm5, %%mm2             \n\t"
1570                 "movq "MANGLE(w1111)", %%mm5            \n\t" // ... and restored to w1111 here
1571                 "psrlw $2, %%mm4                \n\t"
1572                 "psrlw $2, %%mm2                \n\t"
1573 #endif
1574                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1575                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1576                 
1577                 "pmaddwd %%mm4, %%mm1           \n\t"
1578                 "pmaddwd %%mm2, %%mm3           \n\t"
1579                 "pmaddwd %%mm6, %%mm4           \n\t"
1580                 "pmaddwd %%mm6, %%mm2           \n\t"
1581 #ifndef FAST_BGR2YV12
1582                 "psrad $8, %%mm4                \n\t"
1583                 "psrad $8, %%mm1                \n\t"
1584                 "psrad $8, %%mm2                \n\t"
1585                 "psrad $8, %%mm3                \n\t"
1586 #endif
1587                 "packssdw %%mm2, %%mm4          \n\t"
1588                 "packssdw %%mm3, %%mm1          \n\t"
1589                 "pmaddwd %%mm5, %%mm4           \n\t"
1590                 "pmaddwd %%mm5, %%mm1           \n\t"
1591                 "addl $24, %%ebx                \n\t" // advance source by 4 samples * 2 pixels * 3 bytes
1592                 "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1593                 "psraw $7, %%mm4                \n\t"
1594                 
                /* regroup both halves: low dword = 4 U bytes, high dword = 4 V bytes */
1595                 "movq %%mm0, %%mm1              \n\t"
1596                 "punpckldq %%mm4, %%mm0         \n\t"
1597                 "punpckhdq %%mm4, %%mm1         \n\t"
1598                 "packsswb %%mm1, %%mm0          \n\t"
1599                 "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
1600
1601                 "movd %%mm0, (%2, %%eax)        \n\t" // %2 = dstU+width
1602                 "punpckhdq %%mm0, %%mm0         \n\t"
1603                 "movd %%mm0, (%3, %%eax)        \n\t" // %3 = dstV+width
1604                 "addl $4, %%eax                 \n\t"
1605                 " js 1b                         \n\t" // loop while the counter is still negative
1606                 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1607                 : "%eax", "%ebx"
1608         );
1609 #else
        /* portable reference implementation */
1610         int i;
1611         for(i=0; i<width; i++)
1612         {
1613                 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1614                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1615                 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1616
1617                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1618                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1619         }
1620 #endif
1621 }
1622
1623 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1624 {
1625         int i;
1626         for(i=0; i<width; i++)
1627         {
1628                 int d= src[i*2] + (src[i*2+1]<<8);
1629                 int b= d&0x1F;
1630                 int g= (d>>5)&0x3F;
1631                 int r= (d>>11)&0x1F;
1632
1633                 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1634         }
1635 }
1636
1637 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1638 {
1639         int i;
1640         for(i=0; i<width; i++)
1641         {
1642 #if 1
1643                 int d0= le2me_32( ((uint32_t*)src1)[i] );
1644                 int d1= le2me_32( ((uint32_t*)src2)[i] );
1645                 
1646                 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1647                 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1648
1649                 int dh2= (dh>>11) + (dh<<21);
1650                 int d= dh2 + dl;
1651
1652                 int b= d&0x7F;
1653                 int r= (d>>11)&0x7F;
1654                 int g= d>>21;
1655 #else
1656                 int d0= src1[i*4] + (src1[i*4+1]<<8);
1657                 int b0= d0&0x1F;
1658                 int g0= (d0>>5)&0x3F;
1659                 int r0= (d0>>11)&0x1F;
1660
1661                 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1662                 int b1= d1&0x1F;
1663                 int g1= (d1>>5)&0x3F;
1664                 int r1= (d1>>11)&0x1F;
1665
1666                 int d2= src2[i*4] + (src2[i*4+1]<<8);
1667                 int b2= d2&0x1F;
1668                 int g2= (d2>>5)&0x3F;
1669                 int r2= (d2>>11)&0x1F;
1670
1671                 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1672                 int b3= d3&0x1F;
1673                 int g3= (d3>>5)&0x3F;
1674                 int r3= (d3>>11)&0x1F;
1675
1676                 int b= b0 + b1 + b2 + b3;
1677                 int g= g0 + g1 + g2 + g3;
1678                 int r= r0 + r1 + r2 + r3;
1679 #endif
1680                 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1681                 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1682         }
1683 }
1684
1685 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1686 {
1687         int i;
1688         for(i=0; i<width; i++)
1689         {
1690                 int d= src[i*2] + (src[i*2+1]<<8);
1691                 int b= d&0x1F;
1692                 int g= (d>>5)&0x1F;
1693                 int r= (d>>10)&0x1F;
1694
1695                 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1696         }
1697 }
1698
1699 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1700 {
1701         int i;
1702         for(i=0; i<width; i++)
1703         {
1704 #if 1
1705                 int d0= le2me_32( ((uint32_t*)src1)[i] );
1706                 int d1= le2me_32( ((uint32_t*)src2)[i] );
1707                 
1708                 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1709                 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1710
1711                 int dh2= (dh>>11) + (dh<<21);
1712                 int d= dh2 + dl;
1713
1714                 int b= d&0x7F;
1715                 int r= (d>>10)&0x7F;
1716                 int g= d>>21;
1717 #else
1718                 int d0= src1[i*4] + (src1[i*4+1]<<8);
1719                 int b0= d0&0x1F;
1720                 int g0= (d0>>5)&0x1F;
1721                 int r0= (d0>>10)&0x1F;
1722
1723                 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1724                 int b1= d1&0x1F;
1725                 int g1= (d1>>5)&0x1F;
1726                 int r1= (d1>>10)&0x1F;
1727
1728                 int d2= src2[i*4] + (src2[i*4+1]<<8);
1729                 int b2= d2&0x1F;
1730                 int g2= (d2>>5)&0x1F;
1731                 int r2= (d2>>10)&0x1F;
1732
1733                 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1734                 int b3= d3&0x1F;
1735                 int g3= (d3>>5)&0x1F;
1736                 int r3= (d3>>10)&0x1F;
1737
1738                 int b= b0 + b1 + b2 + b3;
1739                 int g= g0 + g1 + g2 + g3;
1740                 int r= r0 + r1 + r2 + r3;
1741 #endif
1742                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1743                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1744         }
1745 }
1746
1747
1748 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1749 {
1750         int i;
1751         for(i=0; i<width; i++)
1752         {
1753                 int r= src[i*4+0];
1754                 int g= src[i*4+1];
1755                 int b= src[i*4+2];
1756
1757                 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1758         }
1759 }
1760
1761 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1762 {
1763         int i;
1764         for(i=0; i<width; i++)
1765         {
1766                 int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1767                 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1768                 int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1769
1770                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1771                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1772         }
1773 }
1774
1775 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
1776 {
1777         int i;
1778         for(i=0; i<width; i++)
1779         {
1780                 int r= src[i*3+0];
1781                 int g= src[i*3+1];
1782                 int b= src[i*3+2];
1783
1784                 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1785         }
1786 }
1787
1788 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1789 {
1790         int i;
1791         for(i=0; i<width; i++)
1792         {
1793                 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1794                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1795                 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1796
1797                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1798                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1799         }
1800 }
1801
1802
1803 // Bilinear / Bicubic scaling
1804 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1805                                   int16_t *filter, int16_t *filterPos, int filterSize)
1806 {
1807 #ifdef HAVE_MMX
1808         if(filterSize==4) // allways true for upscaling, sometimes for down too
1809         {
1810                 int counter= -2*dstW;
1811                 filter-= counter*2;
1812                 filterPos-= counter/2;
1813                 dst-= counter/2;
1814                 asm volatile(
1815                         "pxor %%mm7, %%mm7              \n\t"
1816                         "movq "MANGLE(w02)", %%mm6      \n\t"
1817                         "pushl %%ebp                    \n\t" // we use 7 regs here ...
1818                         "movl %%eax, %%ebp              \n\t"
1819                         ".balign 16                     \n\t"
1820                         "1:                             \n\t"
1821                         "movzwl (%2, %%ebp), %%eax      \n\t"
1822                         "movzwl 2(%2, %%ebp), %%ebx     \n\t"
1823                         "movq (%1, %%ebp, 4), %%mm1     \n\t"
1824                         "movq 8(%1, %%ebp, 4), %%mm3    \n\t"
1825                         "movd (%3, %%eax), %%mm0        \n\t"
1826                         "movd (%3, %%ebx), %%mm2        \n\t"
1827                         "punpcklbw %%mm7, %%mm0         \n\t"
1828                         "punpcklbw %%mm7, %%mm2         \n\t"
1829                         "pmaddwd %%mm1, %%mm0           \n\t"
1830                         "pmaddwd %%mm2, %%mm3           \n\t"
1831                         "psrad $8, %%mm0                \n\t"
1832                         "psrad $8, %%mm3                \n\t"
1833                         "packssdw %%mm3, %%mm0          \n\t"
1834                         "pmaddwd %%mm6, %%mm0           \n\t"
1835                         "packssdw %%mm0, %%mm0          \n\t"
1836                         "movd %%mm0, (%4, %%ebp)        \n\t"
1837                         "addl $4, %%ebp                 \n\t"
1838                         " jnc 1b                        \n\t"
1839
1840                         "popl %%ebp                     \n\t"
1841                         : "+a" (counter)
1842                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1843                         : "%ebx"
1844                 );
1845         }
1846         else if(filterSize==8)
1847         {
1848                 int counter= -2*dstW;
1849                 filter-= counter*4;
1850                 filterPos-= counter/2;
1851                 dst-= counter/2;
1852                 asm volatile(
1853                         "pxor %%mm7, %%mm7              \n\t"
1854                         "movq "MANGLE(w02)", %%mm6      \n\t"
1855                         "pushl %%ebp                    \n\t" // we use 7 regs here ...
1856                         "movl %%eax, %%ebp              \n\t"
1857                         ".balign 16                     \n\t"
1858                         "1:                             \n\t"
1859                         "movzwl (%2, %%ebp), %%eax      \n\t"
1860                         "movzwl 2(%2, %%ebp), %%ebx     \n\t"
1861                         "movq (%1, %%ebp, 8), %%mm1     \n\t"
1862                         "movq 16(%1, %%ebp, 8), %%mm3   \n\t"
1863                         "movd (%3, %%eax), %%mm0        \n\t"
1864                         "movd (%3, %%ebx), %%mm2        \n\t"
1865                         "punpcklbw %%mm7, %%mm0         \n\t"
1866                         "punpcklbw %%mm7, %%mm2         \n\t"
1867                         "pmaddwd %%mm1, %%mm0           \n\t"
1868                         "pmaddwd %%mm2, %%mm3           \n\t"
1869
1870                         "movq 8(%1, %%ebp, 8), %%mm1    \n\t"
1871                         "movq 24(%1, %%ebp, 8), %%mm5   \n\t"
1872                         "movd 4(%3, %%eax), %%mm4       \n\t"
1873                         "movd 4(%3, %%ebx), %%mm2       \n\t"
1874                         "punpcklbw %%mm7, %%mm4         \n\t"
1875                         "punpcklbw %%mm7, %%mm2         \n\t"
1876                         "pmaddwd %%mm1, %%mm4           \n\t"
1877                         "pmaddwd %%mm2, %%mm5           \n\t"
1878                         "paddd %%mm4, %%mm0             \n\t"
1879                         "paddd %%mm5, %%mm3             \n\t"
1880                                                 
1881                         "psrad $8, %%mm0                \n\t"
1882                         "psrad $8, %%mm3                \n\t"
1883                         "packssdw %%mm3, %%mm0          \n\t"
1884                         "pmaddwd %%mm6, %%mm0           \n\t"
1885                         "packssdw %%mm0, %%mm0          \n\t"
1886                         "movd %%mm0, (%4, %%ebp)        \n\t"
1887                         "addl $4, %%ebp                 \n\t"
1888                         " jnc 1b                        \n\t"
1889
1890                         "popl %%ebp                     \n\t"
1891                         : "+a" (counter)
1892                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1893                         : "%ebx"
1894                 );
1895         }
1896         else
1897         {
1898                 int counter= -2*dstW;
1899 //              filter-= counter*filterSize/2;
1900                 filterPos-= counter/2;
1901                 dst-= counter/2;
1902                 asm volatile(
1903                         "pxor %%mm7, %%mm7              \n\t"
1904                         "movq "MANGLE(w02)", %%mm6      \n\t"
1905                         ".balign 16                     \n\t"
1906                         "1:                             \n\t"
1907                         "movl %2, %%ecx                 \n\t"
1908                         "movzwl (%%ecx, %0), %%eax      \n\t"
1909                         "movzwl 2(%%ecx, %0), %%ebx     \n\t"
1910                         "movl %5, %%ecx                 \n\t"
1911                         "pxor %%mm4, %%mm4              \n\t"
1912                         "pxor %%mm5, %%mm5              \n\t"
1913                         "2:                             \n\t"
1914                         "movq (%1), %%mm1               \n\t"
1915                         "movq (%1, %6), %%mm3           \n\t"
1916                         "movd (%%ecx, %%eax), %%mm0     \n\t"
1917                         "movd (%%ecx, %%ebx), %%mm2     \n\t"
1918                         "punpcklbw %%mm7, %%mm0         \n\t"
1919                         "punpcklbw %%mm7, %%mm2         \n\t"
1920                         "pmaddwd %%mm1, %%mm0           \n\t"
1921                         "pmaddwd %%mm2, %%mm3           \n\t"
1922                         "paddd %%mm3, %%mm5             \n\t"
1923                         "paddd %%mm0, %%mm4             \n\t"
1924                         "addl $8, %1                    \n\t"
1925                         "addl $4, %%ecx                 \n\t"
1926                         "cmpl %4, %%ecx                 \n\t"
1927                         " jb 2b                         \n\t"
1928                         "addl %6, %1                    \n\t"
1929                         "psrad $8, %%mm4                \n\t"
1930                         "psrad $8, %%mm5                \n\t"
1931                         "packssdw %%mm5, %%mm4          \n\t"
1932                         "pmaddwd %%mm6, %%mm4           \n\t"
1933                         "packssdw %%mm4, %%mm4          \n\t"
1934                         "movl %3, %%eax                 \n\t"
1935                         "movd %%mm4, (%%eax, %0)        \n\t"
1936                         "addl $4, %0                    \n\t"
1937                         " jnc 1b                        \n\t"
1938
1939                         : "+r" (counter), "+r" (filter)
1940                         : "m" (filterPos), "m" (dst), "m"(src+filterSize),
1941                           "m" (src), "r" (filterSize*2)
1942                         : "%ebx", "%eax", "%ecx"
1943                 );
1944         }
1945 #else
1946         int i;
1947         for(i=0; i<dstW; i++)
1948         {
1949                 int j;
1950                 int srcPos= filterPos[i];
1951                 int val=0;
1952 //              printf("filterPos: %d\n", filterPos[i]);
1953                 for(j=0; j<filterSize; j++)
1954                 {
1955 //                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
1956                         val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1957                 }
1958 //              filter += hFilterSize;
1959                 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
1960 //              dst[i] = val>>7;
1961         }
1962 #endif
1963 }
1964       // *** horizontal scale Y line to temp buffer
           // Horizontally scales one luma line from src into dst (dstWidth output samples).
           //  - For the source formats tested below, the line is first passed through the matching
           //    RENAME(...ToY) helper, which writes into formatConvBuffer, and src is redirected
           //    to that buffer.
           //  - Unless SWS_FAST_BILINEAR is set in flags (and, in HAVE_MMX builds, canMMX2BeUsed is
           //    also nonzero), the filter based RENAME(hScale) is called with hLumFilter,
           //    hLumFilterPos and hLumFilterSize.
           //  - Otherwise a fast bilinear path runs: the code at funnyYCode (HAVE_MMX2 builds with
           //    canMMX2BeUsed set), plain x86 asm (ARCH_X86), or a portable C loop.
           //    The plain asm and C paths step through src in 16.16 fixed point using xInc and store
           //    the interpolated 8 bit samples scaled by 128.
           //  mmx2Filter and mmx2FilterPos are only read on the funnyYCode path.
1965 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
1966                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
1967                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
1968                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
1969                                    int32_t *mmx2FilterPos)
1970 {
         // input format conversion: every branch fills formatConvBuffer and then scales from it
1971     if(srcFormat==IMGFMT_YUY2)
1972     {
1973         RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
1974         src= formatConvBuffer;
1975     }
1976     else if(srcFormat==IMGFMT_BGR32)
1977     {
1978         RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
1979         src= formatConvBuffer;
1980     }
1981     else if(srcFormat==IMGFMT_BGR24)
1982     {
1983         RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
1984         src= formatConvBuffer;
1985     }
1986     else if(srcFormat==IMGFMT_BGR16)
1987     {
1988         RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
1989         src= formatConvBuffer;
1990     }
1991     else if(srcFormat==IMGFMT_BGR15)
1992     {
1993         RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
1994         src= formatConvBuffer;
1995     }
1996     else if(srcFormat==IMGFMT_RGB32)
1997     {
1998         RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
1999         src= formatConvBuffer;
2000     }
2001     else if(srcFormat==IMGFMT_RGB24)
2002     {
2003         RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2004         src= formatConvBuffer;
2005     }
2006
2007 #ifdef HAVE_MMX
2008         // use the new MMX scaler if the MMX2 one cannot be used (it is faster than the x86 asm one)
2009     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2010 #else
2011     if(!(flags&SWS_FAST_BILINEAR))
2012 #endif
2013     {
2014         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2015     }
2016     else // Fast Bilinear upscale / crap downscale
2017     {
2018 #ifdef ARCH_X86
2019 #ifdef HAVE_MMX2
2020         int i;
2021         if(canMMX2BeUsed)
2022         {
                 // Set up the registers and run FUNNY_Y_CODE eight times. What the code behind
                 // funnyYCode does with them is not visible in this function.
2023                 asm volatile(
2024                         "pxor %%mm7, %%mm7              \n\t"
2025                         "movl %0, %%ecx                 \n\t" // src
2026                         "movl %1, %%edi                 \n\t" // dst
2027                         "movl %2, %%edx                 \n\t" // mmx2Filter
2028                         "movl %3, %%ebx                 \n\t" // mmx2FilterPos
2029                         "xorl %%eax, %%eax              \n\t" // i
2030                         PREFETCH" (%%ecx)               \n\t"
2031                         PREFETCH" 32(%%ecx)             \n\t"
2032                         PREFETCH" 64(%%ecx)             \n\t"
2033
                 // One FUNNY_Y_CODE step: load the dword at ebx into esi, call through operand 4
                 // (funnyYCode), then advance ecx by the dword at ebx+eax, advance edi by eax
                 // and clear eax again.
2034 #define FUNNY_Y_CODE \
2035                         "movl (%%ebx), %%esi            \n\t"\
2036                         "call *%4                       \n\t"\
2037                         "addl (%%ebx, %%eax), %%ecx     \n\t"\
2038                         "addl %%eax, %%edi              \n\t"\
2039                         "xorl %%eax, %%eax              \n\t"\
2040
2041 FUNNY_Y_CODE
2042 FUNNY_Y_CODE
2043 FUNNY_Y_CODE
2044 FUNNY_Y_CODE
2045 FUNNY_Y_CODE
2046 FUNNY_Y_CODE
2047 FUNNY_Y_CODE
2048 FUNNY_Y_CODE
2049
2050                         :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2051                         "m" (funnyYCode)
2052                         : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2053                 );
                 // Walking back from the right edge, overwrite every output whose source position
                 // (i*xInc)>>16 is at or beyond the last source sample with that sample times 128.
2054                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2055         }
2056         else
2057         {
2058 #endif
2059         //NO MMX just normal asm ...
         // Operands: %0 src (register), %1 dst, %2 dstWidth, %3 xInc>>16, %4 xInc&0xFFFF.
         // Two output samples are produced per loop iteration.
         // NOTE(review): the loop tests eax < dstWidth only after storing both samples, so with an
         // odd dstWidth the element dst[dstWidth] is written as well - confirm dst has room.
2060         asm volatile(
2061                 "xorl %%eax, %%eax              \n\t" // i
2062                 "xorl %%ebx, %%ebx              \n\t" // xx
2063                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2064                 ".balign 16                     \n\t"
2065                 "1:                             \n\t"
2066                 "movzbl  (%0, %%ebx), %%edi     \n\t" //src[xx]
2067                 "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
2068                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2069                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2070                 "shll $16, %%edi                \n\t"
2071                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2072                 "movl %1, %%edi                 \n\t"
2073                 "shrl $9, %%esi                 \n\t"
2074                 "movw %%si, (%%edi, %%eax, 2)   \n\t"
2075                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2076                 "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry
2077
2078                 "movzbl (%0, %%ebx), %%edi      \n\t" //src[xx]
2079                 "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
2080                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2081                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2082                 "shll $16, %%edi                \n\t"
2083                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2084                 "movl %1, %%edi                 \n\t"
2085                 "shrl $9, %%esi                 \n\t"
2086                 "movw %%si, 2(%%edi, %%eax, 2)  \n\t"
2087                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2088                 "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry
2089
2090
2091                 "addl $2, %%eax                 \n\t"
2092                 "cmpl %2, %%eax                 \n\t"
2093                 " jb 1b                         \n\t"
2094
2095
2096                 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2097                 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2098                 );
2099 #ifdef HAVE_MMX2
2100         } //if MMX2 cant be used
2101 #endif
2102 #else
         // Portable fallback: xpos is a 16.16 fixed point source position, xalpha keeps the top
         // 7 bits of its fraction.
         // NOTE(review): src[xx+1] is read for every output, including when xx is the last
         // source sample - confirm the source line is readable one byte past that position.
2103         int i;
2104         unsigned int xpos=0;
2105         for(i=0;i<dstWidth;i++)
2106         {
2107                 register unsigned int xx=xpos>>16;
2108                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2109                 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2110                 xpos+=xInc;
2111         }
2112 #endif
2113     }
2114 }
2115
// *** horizontal scale of the two chroma lines to the temp buffer
// Horizontally scales src1 and src2 into dst and dst+2048 (dstWidth output samples each;
// dst is uint16_t, so the second plane starts 4096 bytes after the first).
//  - For the source formats tested below, the matching RENAME(...ToUV) helper is called first;
//    it is handed formatConvBuffer and formatConvBuffer+2048 as outputs, and src1/src2 are
//    redirected to those two buffers.
//  - If isGray(srcFormat) is true the function returns without writing to dst.
//  - Unless SWS_FAST_BILINEAR is set in flags (and, in HAVE_MMX builds, canMMX2BeUsed is also
//    nonzero), RENAME(hScale) is called once per plane with hChrFilter, hChrFilterPos and
//    hChrFilterSize.
//  - Otherwise a fast bilinear path runs: the code at funnyUVCode (HAVE_MMX2 builds with
//    canMMX2BeUsed set), plain x86 asm (ARCH_X86), or a portable C loop.
//  mmx2Filter and mmx2FilterPos are only read on the funnyUVCode path.
2116 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2117                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2118                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2119                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2120                                    int32_t *mmx2FilterPos)
2121 {
         // input format conversion: every converting branch fills the two halves of
         // formatConvBuffer and then scales from them
2122     if(srcFormat==IMGFMT_YUY2)
2123     {
2124         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2125         src1= formatConvBuffer;
2126         src2= formatConvBuffer+2048;
2127     }
2128     else if(srcFormat==IMGFMT_BGR32)
2129     {
2130         RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2131         src1= formatConvBuffer;
2132         src2= formatConvBuffer+2048;
2133     }
2134     else if(srcFormat==IMGFMT_BGR24)
2135     {
2136         RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2137         src1= formatConvBuffer;
2138         src2= formatConvBuffer+2048;
2139     }
2140     else if(srcFormat==IMGFMT_BGR16)
2141     {
2142         RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2143         src1= formatConvBuffer;
2144         src2= formatConvBuffer+2048;
2145     }
2146     else if(srcFormat==IMGFMT_BGR15)
2147     {
2148         RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2149         src1= formatConvBuffer;
2150         src2= formatConvBuffer+2048;
2151     }
2152     else if(srcFormat==IMGFMT_RGB32)
2153     {
2154         RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2155         src1= formatConvBuffer;
2156         src2= formatConvBuffer+2048;
2157     }
2158     else if(srcFormat==IMGFMT_RGB24)
2159     {
2160         RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2161         src1= formatConvBuffer;
2162         src2= formatConvBuffer+2048;
2163     }
2164     else if(isGray(srcFormat))
2165     {
             // gray input: nothing is scaled and dst is left untouched
2166         return;
2167     }
2168
2169 #ifdef HAVE_MMX
2170         // use the new MMX scaler if the MMX2 one cannot be used (it is faster than the x86 asm one)
2171     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2172 #else
2173     if(!(flags&SWS_FAST_BILINEAR))
2174 #endif
2175     {
2176         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2177         RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2178     }
2179     else // Fast Bilinear upscale / crap downscale
2180     {
2181 #ifdef ARCH_X86
2182 #ifdef HAVE_MMX2
2183         int i;
2184         if(canMMX2BeUsed)
2185         {
                 // Run FUNNY_UV_CODE four times for src1 into dst, then reload the registers and
                 // run it four more times for src2 into dst + 4096 bytes. What the code behind
                 // funnyUVCode does with the registers is not visible in this function.
2186                 asm volatile(
2187                         "pxor %%mm7, %%mm7              \n\t"
2188                         "movl %0, %%ecx                 \n\t" // src1
2189                         "movl %1, %%edi                 \n\t" // dst
2190                         "movl %2, %%edx                 \n\t" // mmx2Filter
2191                         "movl %3, %%ebx                 \n\t" // mmx2FilterPos
2192                         "xorl %%eax, %%eax              \n\t" // i
2193                         PREFETCH" (%%ecx)               \n\t"
2194                         PREFETCH" 32(%%ecx)             \n\t"
2195                         PREFETCH" 64(%%ecx)             \n\t"
2196
                 // One FUNNY_UV_CODE step: load the dword at ebx into esi, call through operand 4
                 // (funnyUVCode), then advance ecx by the dword at ebx+eax, advance edi by eax
                 // and clear eax again.
2197 #define FUNNY_UV_CODE \
2198                         "movl (%%ebx), %%esi            \n\t"\
2199                         "call *%4                       \n\t"\
2200                         "addl (%%ebx, %%eax), %%ecx     \n\t"\
2201                         "addl %%eax, %%edi              \n\t"\
2202                         "xorl %%eax, %%eax              \n\t"\
2203
2204 FUNNY_UV_CODE
2205 FUNNY_UV_CODE
2206 FUNNY_UV_CODE
2207 FUNNY_UV_CODE
2208                         "xorl %%eax, %%eax              \n\t" // i
2209                         "movl %5, %%ecx                 \n\t" // src2
2210                         "movl %1, %%edi                 \n\t" // dst
2211                         "addl $4096, %%edi              \n\t" // second plane, same as dst+2048
2212                         PREFETCH" (%%ecx)               \n\t"
2213                         PREFETCH" 32(%%ecx)             \n\t"
2214                         PREFETCH" 64(%%ecx)             \n\t"
2215
2216 FUNNY_UV_CODE
2217 FUNNY_UV_CODE
2218 FUNNY_UV_CODE
2219 FUNNY_UV_CODE
2220
2221                         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2222                         "m" (funnyUVCode), "m" (src2)
2223                         : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2224                 );
                 // Walking back from the right edge, overwrite every output whose source position
                 // (i*xInc)>>16 is at or beyond the last source sample with that sample times 128,
                 // in both planes.
2225                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2226                 {
2227 //                      printf("%d %d %d\n", dstWidth, i, srcW);
2228                         dst[i] = src1[srcW-1]*128;
2229                         dst[i+2048] = src2[srcW-1]*128;
2230                 }
2231         }
2232         else
2233         {
2234 #endif
         // Plain x86 asm path.
         // Operands: %0 src1, %1 dst, %2 dstWidth, %3 xInc>>16, %4 xInc&0xFFFF, %5 src2 (register).
         // Each iteration produces one sample for each plane, then advances the position once.
2235         asm volatile(
2236                 "xorl %%eax, %%eax              \n\t" // i
2237                 "xorl %%ebx, %%ebx              \n\t" // xx
2238                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2239                 ".balign 16                     \n\t"
2240                 "1:                             \n\t"
2241                 "movl %0, %%esi                 \n\t"
2242                 "movzbl  (%%esi, %%ebx), %%edi  \n\t" //src[xx]
2243                 "movzbl 1(%%esi, %%ebx), %%esi  \n\t" //src[xx+1]
2244                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2245                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2246                 "shll $16, %%edi                \n\t"
2247                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2248                 "movl %1, %%edi                 \n\t"
2249                 "shrl $9, %%esi                 \n\t"
2250                 "movw %%si, (%%edi, %%eax, 2)   \n\t"
2251
2252                 "movzbl  (%5, %%ebx), %%edi     \n\t" //src[xx]
2253                 "movzbl 1(%5, %%ebx), %%esi     \n\t" //src[xx+1]
2254                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2255                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2256                 "shll $16, %%edi                \n\t"
2257                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2258                 "movl %1, %%edi                 \n\t"
2259                 "shrl $9, %%esi                 \n\t"
2260                 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
2261
2262                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2263                 "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry
2264                 "addl $1, %%eax                 \n\t"
2265                 "cmpl %2, %%eax                 \n\t"
2266                 " jb 1b                         \n\t"
2267
2268                 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
2269                 "r" (src2)
2270                 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2271                 );
2272 #ifdef HAVE_MMX2
2273         } //if MMX2 cant be used
2274 #endif
2275 #else
         // Portable fallback: xpos is a 16.16 fixed point source position, xalpha keeps the top
         // 7 bits of its fraction. xalpha^127 equals 127-xalpha for this range, so the two
         // weights add up to 127 rather than 128.
         // NOTE(review): src1[xx+1] and src2[xx+1] are read for every output, including when xx
         // is the last source sample - confirm both lines are readable one byte past that position.
2276         int i;
2277         unsigned int xpos=0;
2278         for(i=0;i<dstWidth;i++)
2279         {
2280                 register unsigned int xx=xpos>>16;
2281                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2282                 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2283                 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2284 /* slower
2285           dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2286           dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2287 */
2288                 xpos+=xInc;
2289         }
2290 #endif
2291    }
2292 }
2293
2294 static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
2295              int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
2296
2297         /* load a few things into local vars to make the code more readable? and faster */
2298         const int srcW= c->srcW;
2299         const int dstW= c->dstW;
2300         const int dstH= c->dstH;
2301         const int chrDstW= c->chrDstW;
2302         const int chrSrcW= c->chrSrcW;
2303         const int lumXInc= c->lumXInc;
2304         const int chrXInc= c->chrXInc;
2305         const int dstFormat= c->dstFormat;
2306         const int srcFormat= c->srcFormat;
2307         const int flags= c->flags;
2308         const int canMMX2BeUsed= c->canMMX2BeUsed;
2309         int16_t *vLumFilterPos= c->vLumFilterPos;
2310         int16_t *vChrFilterPos= c->vChrFilterPos;
2311         int16_t *hLumFilterPos= c->hLumFilterPos;
2312         int16_t *hChrFilterPos= c->hChrFilterPos;
2313         int16_t *vLumFilter= c->vLumFilter;
2314         int16_t *vChrFilter= c->vChrFilter;
2315         int16_t *hLumFilter= c->hLumFilter;
2316         int16_t *hChrFilter= c->hChrFilter;
2317         int16_t *lumMmxFilter= c->lumMmxFilter;
2318         int16_t *chrMmxFilter= c->chrMmxFilter;
2319         const int vLumFilterSize= c->vLumFilterSize;
2320         const int vChrFilterSize= c->vChrFilterSize;
2321         const int hLumFilterSize= c->hLumFilterSize;
2322         const int hChrFilterSize= c->hChrFilterSize;
2323         int16_t **lumPixBuf= c->lumPixBuf;
2324         int16_t **chrPixBuf= c->chrPixBuf;
2325         const int vLumBufSize= c->vLumBufSize;
2326         const int vChrBufSize= c->vChrBufSize;
2327         uint8_t *funnyYCode= c->funnyYCode;
2328         uint8_t *funnyUVCode= c->funnyUVCode;
2329         uint8_t *formatConvBuffer= c->formatConvBuffer;
2330         const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2331         const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2332
2333         /* vars whch will change and which we need to storw back in the context */
2334         int dstY= c->dstY;
2335         int lumBufIndex= c->lumBufIndex;
2336         int chrBufIndex= c->chrBufIndex;
2337         int lastInLumBuf= c->lastInLumBuf;
2338         int lastInChrBuf= c->lastInChrBuf;
2339         int srcStride[3];
2340         int dstStride[3];
2341         uint8_t *src[3];
2342         uint8_t *dst[3];
2343         
2344         orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
2345         orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
2346
2347         if(isPacked(c->srcFormat)){
2348                 src[0]=
2349                 src[1]=
2350                 src[2]= srcParam[0];
2351                 srcStride[0]=
2352                 srcStride[1]=
2353                 srcStride[2]= srcStrideParam[0];
2354         }
2355         srcStride[1]<<= c->vChrDrop;
2356         srcStride[2]<<= c->vChrDrop;
2357
2358 //      printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2359 //              (int)dst[0], (int)dst[1], (int)dst[2]);
2360
2361 #if 0 //self test FIXME move to a vfilter or something
2362 {
2363 static volatile int i=0;
2364 i++;
2365 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2366         selfTest(src, srcStride, c->srcW, c->srcH);
2367 i--;
2368 }
2369 #endif
2370
2371 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2372 //dstStride[0],dstStride[1],dstStride[2]);
2373
2374         if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2375         {
2376                 static int firstTime=1; //FIXME move this into the context perhaps
2377                 if(flags & SWS_PRINT_INFO && firstTime)
2378                 {
2379                         mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: Warning: dstStride is not aligned!\n"
2380                                         "SwScaler:          ->cannot do aligned memory acesses anymore\n");
2381                         firstTime=0;
2382                 }
2383         }
2384
2385         /* Note the user might start scaling the picture in the middle so this will not get executed
2386            this is not really intended but works currently, so ppl might do it */
2387         if(srcSliceY ==0){
2388                 lumBufIndex=0;
2389                 chrBufIndex=0;
2390                 dstY=0; 
2391                 lastInLumBuf= -1;
2392                 lastInChrBuf= -1;
2393         }
2394
2395         for(;dstY < dstH; dstY++){
2396                 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2397                 const int chrDstY= dstY>>c->chrDstVSubSample;
2398                 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2399                 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2400
2401                 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2402                 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2403                 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2404                 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2405
2406                 //handle holes (FAST_BILINEAR & weird filters)
2407                 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2408                 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2409 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2410                 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2411                 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2412
2413                 // Do we have enough lines in this slice to output the dstY line
2414                 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2415                 {
2416                         //Do horizontal scaling
2417                         while(lastInLumBuf < lastLumSrcY)
2418                         {
2419                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2420                                 lumBufIndex++;
2421 //                              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
2422                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2423                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2424                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2425 //                              printf("%d %d\n", lumBufIndex, vLumBufSize);
2426                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2427                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2428                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2429                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2430                                 lastInLumBuf++;
2431                         }
2432                         while(lastInChrBuf < lastChrSrcY)
2433                         {
2434                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2435                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2436                                 chrBufIndex++;
2437                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2438                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2439                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2440                                 //FIXME replace parameters through context struct (some at least)
2441
2442                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2443                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2444                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2445                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2446                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2447                                 lastInChrBuf++;
2448                         }
2449                         //wrap buf index around to stay inside the ring buffer
2450                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2451                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2452                 }
2453                 else // not enough lines left in this slice -> load the rest in the buffer
2454                 {
2455 /*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2456                         firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2457                         lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2458                         vChrBufSize, vLumBufSize);*/
2459
2460                         //Do horizontal scaling
2461                         while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2462                         {
2463                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2464                                 lumBufIndex++;
2465                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2466                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2467                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2468                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2469                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2470                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2471                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2472                                 lastInLumBuf++;
2473                         }
2474                         while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2475                         {
2476                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2477                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2478                                 chrBufIndex++;
2479                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2480                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2481                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2482
2483                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2484                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2485                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2486                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2487                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2488                                 lastInChrBuf++;
2489                         }
2490                         //wrap buf index around to stay inside the ring buffer
2491                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2492                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2493                         break; //we cant output a dstY line so lets try with the next slice
2494                 }
2495
2496 #ifdef HAVE_MMX
2497                 b5Dither= dither8[dstY&1];
2498                 g6Dither= dither4[dstY&1];
2499                 g5Dither= dither8[dstY&1];
2500                 r5Dither= dither8[(dstY+1)&1];
2501 #endif
2502             if(dstY < dstH-2)
2503             {
2504                 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2505                 {
2506                         if((dstY&1) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2507                         if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2508                         {
2509                                 int16_t *lumBuf = lumPixBuf[0];
2510                                 int16_t *chrBuf= chrPixBuf[0];
2511                                 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2512                         }
2513                         else //General YV12
2514                         {
2515                                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2516                                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2517                                 RENAME(yuv2yuvX)(
2518                                         vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2519                                         vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2520                                         dest, uDest, vDest, dstW, chrDstW,
2521                                         lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+chrDstY*vChrFilterSize*4);
2522                         }
2523                 }
2524                 else
2525                 {
2526                         int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2527                         int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2528
2529                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2530                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2531                         if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2532                         {
2533                                 int chrAlpha= vChrFilter[2*dstY+1];
2534
2535                                 RENAME(yuv2rgb1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2536                                                  dest, dstW, chrAlpha, dstFormat, flags, dstY);
2537                         }
2538                         else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2539                         {
2540                                 int lumAlpha= vLumFilter[2*dstY+1];
2541                                 int chrAlpha= vChrFilter[2*dstY+1];
2542
2543                                 RENAME(yuv2rgb2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2544                                                  dest, dstW, lumAlpha, chrAlpha, dstY);
2545                         }
2546                         else //General RGB
2547                         {
2548                                 RENAME(yuv2rgbX)(c,
2549                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2550                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2551                                         dest, dstW,
2552                                         lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4, dstY);
2553                         }
2554                 }
2555             }
2556             else // hmm, looks like we can't use MMX here without overwriting this array's tail
2557             {
2558                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2559                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2560                 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2561                 {
2562                         if((dstY&1) || isGray(dstFormat)) uDest=vDest= NULL;
2563                         yuv2yuvXinC(
2564                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2565                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2566                                 dest, uDest, vDest, dstW, chrDstW);
2567                 }
2568                 else
2569                 {
2570                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2571                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2572                         yuv2rgbXinC(c, 
2573                                 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2574                                 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2575                                 dest, dstW, dstY);
2576                 }
2577             }
2578         }
2579
2580 #ifdef HAVE_MMX
2581         __asm __volatile(SFENCE:::"memory");
2582         __asm __volatile(EMMS:::"memory");
2583 #endif
2584         /* store changed local vars back in the context */
2585         c->dstY= dstY;
2586         c->lumBufIndex= lumBufIndex;
2587         c->chrBufIndex= chrBufIndex;
2588         c->lastInLumBuf= lastInLumBuf;
2589         c->lastInChrBuf= lastInChrBuf;
2590 }