/* postproc/swscale_template.c */
1 /*
2     Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 #undef MOVNTQ
20 #undef PAVGB
21 #undef PREFETCH
22 #undef PREFETCHW
23 #undef EMMS
24 #undef SFENCE
25
26 #ifdef HAVE_3DNOW
27 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
28 #define EMMS     "femms"
29 #else
30 #define EMMS     "emms"
31 #endif
32
33 #ifdef HAVE_3DNOW
34 #define PREFETCH  "prefetch"
35 #define PREFETCHW "prefetchw"
36 #elif defined ( HAVE_MMX2 )
37 #define PREFETCH "prefetchnta"
38 #define PREFETCHW "prefetcht0"
39 #else
40 #define PREFETCH "/nop"
41 #define PREFETCHW "/nop"
42 #endif
43
44 #ifdef HAVE_MMX2
45 #define SFENCE "sfence"
46 #else
47 #define SFENCE "/nop"
48 #endif
49
50 #ifdef HAVE_MMX2
51 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
52 #elif defined (HAVE_3DNOW)
53 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
54 #endif
55
56 #ifdef HAVE_MMX2
57 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
58 #else
59 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
60 #endif
61
62 #ifdef HAVE_ALTIVEC
63 #include "swscale_altivec_template.c"
64 #endif
65
66 #define YSCALEYUV2YV12X(x, offset) \
67                         "xorl %%eax, %%eax              \n\t"\
68                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
69                         "movq %%mm3, %%mm4              \n\t"\
70                         "leal " offset "(%0), %%edx     \n\t"\
71                         "movl (%%edx), %%esi            \n\t"\
72                         ".balign 16                     \n\t" /* FIXME Unroll? */\
73                         "1:                             \n\t"\
74                         "movq 8(%%edx), %%mm0           \n\t" /* filterCoeff */\
75                         "movq " #x "(%%esi, %%eax, 2), %%mm2    \n\t" /* srcData */\
76                         "movq 8+" #x "(%%esi, %%eax, 2), %%mm5  \n\t" /* srcData */\
77                         "addl $16, %%edx                \n\t"\
78                         "movl (%%edx), %%esi            \n\t"\
79                         "testl %%esi, %%esi             \n\t"\
80                         "pmulhw %%mm0, %%mm2            \n\t"\
81                         "pmulhw %%mm0, %%mm5            \n\t"\
82                         "paddw %%mm2, %%mm3             \n\t"\
83                         "paddw %%mm5, %%mm4             \n\t"\
84                         " jnz 1b                        \n\t"\
85                         "psraw $3, %%mm3                \n\t"\
86                         "psraw $3, %%mm4                \n\t"\
87                         "packuswb %%mm4, %%mm3          \n\t"\
88                         MOVNTQ(%%mm3, (%1, %%eax))\
89                         "addl $8, %%eax                 \n\t"\
90                         "cmpl %2, %%eax                 \n\t"\
91                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
92                         "movq %%mm3, %%mm4              \n\t"\
93                         "leal " offset "(%0), %%edx     \n\t"\
94                         "movl (%%edx), %%esi            \n\t"\
95                         "jb 1b                          \n\t"
96
97 #define YSCALEYUV2YV121 \
98                         "movl %2, %%eax                 \n\t"\
99                         ".balign 16                     \n\t" /* FIXME Unroll? */\
100                         "1:                             \n\t"\
101                         "movq (%0, %%eax, 2), %%mm0     \n\t"\
102                         "movq 8(%0, %%eax, 2), %%mm1    \n\t"\
103                         "psraw $7, %%mm0                \n\t"\
104                         "psraw $7, %%mm1                \n\t"\
105                         "packuswb %%mm1, %%mm0          \n\t"\
106                         MOVNTQ(%%mm0, (%1, %%eax))\
107                         "addl $8, %%eax                 \n\t"\
108                         "jnc 1b                         \n\t"
109
110 /*
111                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
112                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
113                            "r" (dest), "m" (dstW),
114                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
115                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
116 */
117 #define YSCALEYUV2PACKEDX \
118                 "xorl %%eax, %%eax              \n\t"\
119                 ".balign 16                     \n\t"\
120                 "nop                            \n\t"\
121                 "1:                             \n\t"\
122                 "leal "CHR_MMX_FILTER_OFFSET"(%0), %%edx        \n\t"\
123                 "movl (%%edx), %%esi            \n\t"\
124                 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
125                 "movq %%mm3, %%mm4              \n\t"\
126                 ".balign 16                     \n\t"\
127                 "2:                             \n\t"\
128                 "movq 8(%%edx), %%mm0           \n\t" /* filterCoeff */\
129                 "movq (%%esi, %%eax), %%mm2     \n\t" /* UsrcData */\
130                 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
131                 "addl $16, %%edx                \n\t"\
132                 "movl (%%edx), %%esi            \n\t"\
133                 "pmulhw %%mm0, %%mm2            \n\t"\
134                 "pmulhw %%mm0, %%mm5            \n\t"\
135                 "paddw %%mm2, %%mm3             \n\t"\
136                 "paddw %%mm5, %%mm4             \n\t"\
137                 "testl %%esi, %%esi             \n\t"\
138                 " jnz 2b                        \n\t"\
139 \
140                 "leal "LUM_MMX_FILTER_OFFSET"(%0), %%edx        \n\t"\
141                 "movl (%%edx), %%esi            \n\t"\
142                 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
143                 "movq %%mm1, %%mm7              \n\t"\
144                 ".balign 16                     \n\t"\
145                 "2:                             \n\t"\
146                 "movq 8(%%edx), %%mm0           \n\t" /* filterCoeff */\
147                 "movq (%%esi, %%eax, 2), %%mm2  \n\t" /* Y1srcData */\
148                 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
149                 "addl $16, %%edx                \n\t"\
150                 "movl (%%edx), %%esi            \n\t"\
151                 "pmulhw %%mm0, %%mm2            \n\t"\
152                 "pmulhw %%mm0, %%mm5            \n\t"\
153                 "paddw %%mm2, %%mm1             \n\t"\
154                 "paddw %%mm5, %%mm7             \n\t"\
155                 "testl %%esi, %%esi             \n\t"\
156                 " jnz 2b                        \n\t"\
157
158
159 #define YSCALEYUV2RGBX \
160                 YSCALEYUV2PACKEDX\
161                 "psubw "U_OFFSET"(%0), %%mm3    \n\t" /* (U-128)8*/\
162                 "psubw "V_OFFSET"(%0), %%mm4    \n\t" /* (V-128)8*/\
163                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
164                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
165                 "pmulhw "UG_COEFF"(%0), %%mm3   \n\t"\
166                 "pmulhw "VG_COEFF"(%0), %%mm4   \n\t"\
167         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
168                 "pmulhw "UB_COEFF"(%0), %%mm2   \n\t"\
169                 "pmulhw "VR_COEFF"(%0), %%mm5   \n\t"\
170                 "psubw "Y_OFFSET"(%0), %%mm1    \n\t" /* 8(Y-16)*/\
171                 "psubw "Y_OFFSET"(%0), %%mm7    \n\t" /* 8(Y-16)*/\
172                 "pmulhw "Y_COEFF"(%0), %%mm1    \n\t"\
173                 "pmulhw "Y_COEFF"(%0), %%mm7    \n\t"\
174         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
175                 "paddw %%mm3, %%mm4             \n\t"\
176                 "movq %%mm2, %%mm0              \n\t"\
177                 "movq %%mm5, %%mm6              \n\t"\
178                 "movq %%mm4, %%mm3              \n\t"\
179                 "punpcklwd %%mm2, %%mm2         \n\t"\
180                 "punpcklwd %%mm5, %%mm5         \n\t"\
181                 "punpcklwd %%mm4, %%mm4         \n\t"\
182                 "paddw %%mm1, %%mm2             \n\t"\
183                 "paddw %%mm1, %%mm5             \n\t"\
184                 "paddw %%mm1, %%mm4             \n\t"\
185                 "punpckhwd %%mm0, %%mm0         \n\t"\
186                 "punpckhwd %%mm6, %%mm6         \n\t"\
187                 "punpckhwd %%mm3, %%mm3         \n\t"\
188                 "paddw %%mm7, %%mm0             \n\t"\
189                 "paddw %%mm7, %%mm6             \n\t"\
190                 "paddw %%mm7, %%mm3             \n\t"\
191                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
192                 "packuswb %%mm0, %%mm2          \n\t"\
193                 "packuswb %%mm6, %%mm5          \n\t"\
194                 "packuswb %%mm3, %%mm4          \n\t"\
195                 "pxor %%mm7, %%mm7              \n\t"
196 #if 0
197 #define FULL_YSCALEYUV2RGB \
198                 "pxor %%mm7, %%mm7              \n\t"\
199                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
200                 "punpcklwd %%mm6, %%mm6         \n\t"\
201                 "punpcklwd %%mm6, %%mm6         \n\t"\
202                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
203                 "punpcklwd %%mm5, %%mm5         \n\t"\
204                 "punpcklwd %%mm5, %%mm5         \n\t"\
205                 "xorl %%eax, %%eax              \n\t"\
206                 ".balign 16                     \n\t"\
207                 "1:                             \n\t"\
208                 "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
209                 "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
210                 "movq (%2, %%eax,2), %%mm2      \n\t" /* uvbuf0[eax]*/\
211                 "movq (%3, %%eax,2), %%mm3      \n\t" /* uvbuf1[eax]*/\
212                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
213                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
214                 "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
215                 "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
216                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
217                 "movq 4096(%2, %%eax,2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
218                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
219                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
220                 "movq 4096(%3, %%eax,2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
221                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
222                 "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
223                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
224                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
225                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
226 \
227 \
228                 "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
229                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
230                 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
231                 "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
232                 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
233                 "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
234                 "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
235 \
236 \
237                 "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
238                 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
239                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
240                 "paddw %%mm1, %%mm3             \n\t" /* B*/\
241                 "paddw %%mm1, %%mm0             \n\t" /* R*/\
242                 "packuswb %%mm3, %%mm3          \n\t"\
243 \
244                 "packuswb %%mm0, %%mm0          \n\t"\
245                 "paddw %%mm4, %%mm2             \n\t"\
246                 "paddw %%mm2, %%mm1             \n\t" /* G*/\
247 \
248                 "packuswb %%mm1, %%mm1          \n\t"
249 #endif
250
251 #define YSCALEYUV2PACKED(index, c) \
252                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
253                 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
254                 "psraw $3, %%mm0                \n\t"\
255                 "psraw $3, %%mm1                \n\t"\
256                 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
257                 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
258                 "xorl "#index", "#index"                \n\t"\
259                 ".balign 16                     \n\t"\
260                 "1:                             \n\t"\
261                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
262                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
263                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
264                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
265                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
266                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
267                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
268                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
269                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
270                 "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
271                 "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
272                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
273                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
274                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
275                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
276                 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
277                 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
278                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
279                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
280                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
281                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
282                 "psraw $7, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
283                 "psraw $7, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
284                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
285                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
286                 
287 #define YSCALEYUV2RGB(index, c) \
288                 "xorl "#index", "#index"        \n\t"\
289                 ".balign 16                     \n\t"\
290                 "1:                             \n\t"\
291                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
292                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
293                 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
294                 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
295                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
296                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
297                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
298                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
299                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
300                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
301                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
302                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
303                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
304                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
305                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
306                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
307                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
308                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
309                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
310         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
311                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
312                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
313                 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
314                 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
315                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
316                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
317                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
318                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
319                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
320                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
321                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
322                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
323                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
324                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
325                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
326                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
327                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
328                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
329         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
330                 "paddw %%mm3, %%mm4             \n\t"\
331                 "movq %%mm2, %%mm0              \n\t"\
332                 "movq %%mm5, %%mm6              \n\t"\
333                 "movq %%mm4, %%mm3              \n\t"\
334                 "punpcklwd %%mm2, %%mm2         \n\t"\
335                 "punpcklwd %%mm5, %%mm5         \n\t"\
336                 "punpcklwd %%mm4, %%mm4         \n\t"\
337                 "paddw %%mm1, %%mm2             \n\t"\
338                 "paddw %%mm1, %%mm5             \n\t"\
339                 "paddw %%mm1, %%mm4             \n\t"\
340                 "punpckhwd %%mm0, %%mm0         \n\t"\
341                 "punpckhwd %%mm6, %%mm6         \n\t"\
342                 "punpckhwd %%mm3, %%mm3         \n\t"\
343                 "paddw %%mm7, %%mm0             \n\t"\
344                 "paddw %%mm7, %%mm6             \n\t"\
345                 "paddw %%mm7, %%mm3             \n\t"\
346                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
347                 "packuswb %%mm0, %%mm2          \n\t"\
348                 "packuswb %%mm6, %%mm5          \n\t"\
349                 "packuswb %%mm3, %%mm4          \n\t"\
350                 "pxor %%mm7, %%mm7              \n\t"
351                 
352 #define YSCALEYUV2PACKED1(index, c) \
353                 "xorl "#index", "#index"                \n\t"\
354                 ".balign 16                     \n\t"\
355                 "1:                             \n\t"\
356                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
357                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
358                 "psraw $7, %%mm3                \n\t" \
359                 "psraw $7, %%mm4                \n\t" \
360                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
361                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
362                 "psraw $7, %%mm1                \n\t" \
363                 "psraw $7, %%mm7                \n\t" \
364                 
365 #define YSCALEYUV2RGB1(index, c) \
366                 "xorl "#index", "#index"        \n\t"\
367                 ".balign 16                     \n\t"\
368                 "1:                             \n\t"\
369                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
370                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
371                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
372                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
373                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
374                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
375                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
376                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
377                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
378                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
379         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
380                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
381                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
382                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
383                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
384                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
385                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
386                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
387                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
388                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
389                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
390         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
391                 "paddw %%mm3, %%mm4             \n\t"\
392                 "movq %%mm2, %%mm0              \n\t"\
393                 "movq %%mm5, %%mm6              \n\t"\
394                 "movq %%mm4, %%mm3              \n\t"\
395                 "punpcklwd %%mm2, %%mm2         \n\t"\
396                 "punpcklwd %%mm5, %%mm5         \n\t"\
397                 "punpcklwd %%mm4, %%mm4         \n\t"\
398                 "paddw %%mm1, %%mm2             \n\t"\
399                 "paddw %%mm1, %%mm5             \n\t"\
400                 "paddw %%mm1, %%mm4             \n\t"\
401                 "punpckhwd %%mm0, %%mm0         \n\t"\
402                 "punpckhwd %%mm6, %%mm6         \n\t"\
403                 "punpckhwd %%mm3, %%mm3         \n\t"\
404                 "paddw %%mm7, %%mm0             \n\t"\
405                 "paddw %%mm7, %%mm6             \n\t"\
406                 "paddw %%mm7, %%mm3             \n\t"\
407                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
408                 "packuswb %%mm0, %%mm2          \n\t"\
409                 "packuswb %%mm6, %%mm5          \n\t"\
410                 "packuswb %%mm3, %%mm4          \n\t"\
411                 "pxor %%mm7, %%mm7              \n\t"
412
413 #define YSCALEYUV2PACKED1b(index, c) \
414                 "xorl "#index", "#index"                \n\t"\
415                 ".balign 16                     \n\t"\
416                 "1:                             \n\t"\
417                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
418                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
419                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
420                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
421                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
422                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
423                 "psrlw $8, %%mm3                \n\t" \
424                 "psrlw $8, %%mm4                \n\t" \
425                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
426                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
427                 "psraw $7, %%mm1                \n\t" \
428                 "psraw $7, %%mm7                \n\t" 
429                 
430 // do vertical chrominance interpolation
431 #define YSCALEYUV2RGB1b(index, c) \
432                 "xorl "#index", "#index"                \n\t"\
433                 ".balign 16                     \n\t"\
434                 "1:                             \n\t"\
435                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
436                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
437                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
439                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
440                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
441                 "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
442                 "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
443                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
444                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
445                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
446                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
447                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
448                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
449         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
450                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
451                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
452                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
453                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
454                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
455                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
456                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
457                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
458                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
459                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
460         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
461                 "paddw %%mm3, %%mm4             \n\t"\
462                 "movq %%mm2, %%mm0              \n\t"\
463                 "movq %%mm5, %%mm6              \n\t"\
464                 "movq %%mm4, %%mm3              \n\t"\
465                 "punpcklwd %%mm2, %%mm2         \n\t"\
466                 "punpcklwd %%mm5, %%mm5         \n\t"\
467                 "punpcklwd %%mm4, %%mm4         \n\t"\
468                 "paddw %%mm1, %%mm2             \n\t"\
469                 "paddw %%mm1, %%mm5             \n\t"\
470                 "paddw %%mm1, %%mm4             \n\t"\
471                 "punpckhwd %%mm0, %%mm0         \n\t"\
472                 "punpckhwd %%mm6, %%mm6         \n\t"\
473                 "punpckhwd %%mm3, %%mm3         \n\t"\
474                 "paddw %%mm7, %%mm0             \n\t"\
475                 "paddw %%mm7, %%mm6             \n\t"\
476                 "paddw %%mm7, %%mm3             \n\t"\
477                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
478                 "packuswb %%mm0, %%mm2          \n\t"\
479                 "packuswb %%mm6, %%mm5          \n\t"\
480                 "packuswb %%mm3, %%mm4          \n\t"\
481                 "pxor %%mm7, %%mm7              \n\t"
482
483 #define WRITEBGR32(dst, dstw, index) \
484                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
485                         "movq %%mm2, %%mm1              \n\t" /* B */\
486                         "movq %%mm5, %%mm6              \n\t" /* R */\
487                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
488                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
489                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
490                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
491                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
492                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
493                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
494                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
495                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
496                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
497 \
498                         MOVNTQ(%%mm0, (dst, index, 4))\
499                         MOVNTQ(%%mm2, 8(dst, index, 4))\
500                         MOVNTQ(%%mm1, 16(dst, index, 4))\
501                         MOVNTQ(%%mm3, 24(dst, index, 4))\
502 \
503                         "addl $8, "#index"              \n\t"\
504                         "cmpl "#dstw", "#index"         \n\t"\
505                         " jb 1b                         \n\t"
506
/* Pack 8 pixels (mm2=B, mm4=G, mm5=R as bytes, mm7=0) into RGB565 and
   store 16 bytes at (dst + index*2); jumps back to local label "1" until
   index reaches dstw. Masks keep 5/6/5 significant bits before packing. */
#define WRITEBGR16(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
                        "pand "MANGLE(bFC)", %%mm4      \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
\
                        "movq %%mm2, %%mm1              \n\t"\
                        "movq %%mm4, %%mm3              \n\t"\
\
                        "punpcklbw %%mm7, %%mm3         \n\t"\
                        "punpcklbw %%mm5, %%mm2         \n\t"\
                        "punpckhbw %%mm7, %%mm4         \n\t"\
                        "punpckhbw %%mm5, %%mm1         \n\t"\
\
                        "psllq $3, %%mm3                \n\t"\
                        "psllq $3, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2               \n\t"\
                        "por %%mm4, %%mm1               \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "addl $8, "#index"              \n\t"\
                        "cmpl "#dstw", "#index"         \n\t"\
                        " jb 1b                         \n\t"
533
/* Pack 8 pixels (mm2=B, mm4=G, mm5=R as bytes, mm7=0) into RGB555 and
   store 16 bytes at (dst + index*2); same loop structure as WRITEBGR16
   but 5 bits per channel (note the extra "psrlq $1" on R and the $2
   shifts instead of $3). */
#define WRITEBGR15(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
                        "pand "MANGLE(bF8)", %%mm4      \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
                        "psrlq $1, %%mm5                \n\t"\
\
                        "movq %%mm2, %%mm1              \n\t"\
                        "movq %%mm4, %%mm3              \n\t"\
\
                        "punpcklbw %%mm7, %%mm3         \n\t"\
                        "punpcklbw %%mm5, %%mm2         \n\t"\
                        "punpckhbw %%mm7, %%mm4         \n\t"\
                        "punpckhbw %%mm5, %%mm1         \n\t"\
\
                        "psllq $2, %%mm3                \n\t"\
                        "psllq $2, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2               \n\t"\
                        "por %%mm4, %%mm1               \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "addl $8, "#index"              \n\t"\
                        "cmpl "#dstw", "#index"         \n\t"\
                        " jb 1b                         \n\t"
561
/* Obsolete BGR24 writer: expands mm2=B, mm4=G, mm5=R (mm7=0) to four
   0RGB quadwords, then shifts/masks them into three packed 24bpp
   quadwords written at dst (which is advanced by 24 each iteration).
   Kept for reference; WRITEBGR24MMX/MMX2 below are the live variants. */
#define WRITEBGR24OLD(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1              \n\t" /* B */\
                        "movq %%mm5, %%mm6              \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
                        "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
                        "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
                        "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
\
                        MOVNTQ(%%mm0, (dst))\
                        MOVNTQ(%%mm2, 8(dst))\
                        MOVNTQ(%%mm3, 16(dst))\
                        "addl $24, "#dst"               \n\t"\
\
                        "addl $8, "#index"              \n\t"\
                        "cmpl "#dstw", "#index"         \n\t"\
                        " jb 1b                         \n\t"
617
/* BGR24 writer, plain MMX: interleaves mm2=B, mm4=G, mm5=R (mm7=0) into
   four 0RGB quadwords, folds each pair into 0RGBRGB0 with punpckhdq,
   then shifts/ORs them into three contiguous 24bpp quadwords stored at
   dst (advanced by 24 per iteration of loop label "1"). */
#define WRITEBGR24MMX(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1              \n\t" /* B */\
                        "movq %%mm5, %%mm6              \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
                        "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
                        "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
                        "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
\
                        "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
                        "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
                        "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
                        "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
\
                        "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
                        "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
                        "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
                        "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
\
                        "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
                        "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
                        MOVNTQ(%%mm0, (dst))\
\
                        "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
                        "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
                        "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
                        MOVNTQ(%%mm5, 16(dst))\
\
                        "addl $24, "#dst"               \n\t"\
\
                        "addl $8, "#index"                      \n\t"\
                        "cmpl "#dstw", "#index"                 \n\t"\
                        " jb 1b                         \n\t"
670
/* BGR24 writer, MMX2: uses pshufw plus the M24A/M24B/M24C byte-select
   masks to scatter B/G/R bytes directly into the three packed 24bpp
   output quadwords, avoiding the unpack/shift cascade of the plain MMX
   version. dst is advanced by 24 per iteration of loop label "1". */
#define WRITEBGR24MMX2(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq "MANGLE(M24A)", %%mm0     \n\t"\
                        "movq "MANGLE(M24C)", %%mm7     \n\t"\
                        "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
                        "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
                        "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
                        "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
                        "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
                        "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
\
                        "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
                        "por %%mm1, %%mm6               \n\t"\
                        "por %%mm3, %%mm6               \n\t"\
                        MOVNTQ(%%mm6, (dst))\
\
                        "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
                        "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
                        "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
                        "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
                        "pand "MANGLE(M24B)", %%mm1     \n\t" /* B5       B4        B3    */\
                        "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
                        "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
\
                        "por %%mm1, %%mm3               \n\t" /* B5    G4 B4     G3 B3    */\
                        "por %%mm3, %%mm6               \n\t"\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
                        "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
                        "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
                        "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
                        "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
                        "pand "MANGLE(M24B)", %%mm6     \n\t" /* R7       R6        R5    */\
\
                        "por %%mm1, %%mm3               \n\t"\
                        "por %%mm3, %%mm6               \n\t"\
                        MOVNTQ(%%mm6, 16(dst))\
\
                        "addl $24, "#dst"               \n\t"\
\
                        "addl $8, "#index"              \n\t"\
                        "cmpl "#dstw", "#index"         \n\t"\
                        " jb 1b                         \n\t"
718
/* Pick the fastest BGR24 writer for the compile target: the MMX2
   variant relies on pshufw, so plain MMX builds fall back to the
   unpack/shift implementation. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif
726
/* Pack luma (mm1/mm7) and chroma (mm3=U, mm4=V) into YUYV order and
   store 16 bytes at (dst + index*2); loops via label "1" until index
   reaches dstw. */
#define WRITEYUY2(dst, dstw, index) \
                        "packuswb %%mm3, %%mm3          \n\t"\
                        "packuswb %%mm4, %%mm4          \n\t"\
                        "packuswb %%mm7, %%mm1          \n\t"\
                        "punpcklbw %%mm4, %%mm3         \n\t"\
                        "movq %%mm1, %%mm7              \n\t"\
                        "punpcklbw %%mm3, %%mm1         \n\t"\
                        "punpckhbw %%mm3, %%mm7         \n\t"\
\
                        MOVNTQ(%%mm1, (dst, index, 2))\
                        MOVNTQ(%%mm7, 8(dst, index, 2))\
\
                        "addl $8, "#index"              \n\t"\
                        "cmpl "#dstw", "#index"         \n\t"\
                        " jb 1b                         \n\t"
742
743
744 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
745                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
746                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
747 {
748 #ifdef HAVE_MMX
749         if(uDest != NULL)
750         {
751                 asm volatile(
752                                 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
753                                 :: "r" (&c->redDither),
754                                 "r" (uDest), "m" (chrDstW)
755                                 : "%eax", "%edx", "%esi"
756                         );
757
758                 asm volatile(
759                                 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
760                                 :: "r" (&c->redDither),
761                                 "r" (vDest), "m" (chrDstW)
762                                 : "%eax", "%edx", "%esi"
763                         );
764         }
765
766         asm volatile(
767                         YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
768                         :: "r" (&c->redDither),
769                            "r" (dest), "m" (dstW)
770                         : "%eax", "%edx", "%esi"
771                 );
772 #else
773 #ifdef HAVE_ALTIVEC
774 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
775                       chrFilter, chrSrc, chrFilterSize,
776                       dest, uDest, vDest, dstW, chrDstW);
777 #else //HAVE_ALTIVEC
778 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
779             chrFilter, chrSrc, chrFilterSize,
780             dest, uDest, vDest, dstW, chrDstW);
781 #endif //!HAVE_ALTIVEC
782 #endif
783 }
784
785 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
786                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
787 {
788 #ifdef HAVE_MMX
789         if(uDest != NULL)
790         {
791                 asm volatile(
792                                 YSCALEYUV2YV121
793                                 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
794                                 "g" (-chrDstW)
795                                 : "%eax"
796                         );
797
798                 asm volatile(
799                                 YSCALEYUV2YV121
800                                 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
801                                 "g" (-chrDstW)
802                                 : "%eax"
803                         );
804         }
805
806         asm volatile(
807                 YSCALEYUV2YV121
808                 :: "r" (lumSrc + dstW), "r" (dest + dstW),
809                 "g" (-dstW)
810                 : "%eax"
811         );
812 #else
813         int i;
814         for(i=0; i<dstW; i++)
815         {
816                 int val= lumSrc[i]>>7;
817                 
818                 if(val&256){
819                         if(val<0) val=0;
820                         else      val=255;
821                 }
822
823                 dest[i]= val;
824         }
825
826         if(uDest != NULL)
827                 for(i=0; i<chrDstW; i++)
828                 {
829                         int u=chrSrc[i]>>7;
830                         int v=chrSrc[i + 2048]>>7;
831
832                         if((u|v)&256){
833                                 if(u<0)         u=0;
834                                 else if (u>255) u=255;
835                                 if(v<0)         v=0;
836                                 else if (v>255) v=255;
837                         }
838
839                         uDest[i]= u;
840                         vDest[i]= v;
841                 }
842 #endif
843 }
844
845
846 /**
847  * vertical scale YV12 to RGB
848  */
849 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
850                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
851                             uint8_t *dest, int dstW, int dstY)
852 {
853         int dummy=0;
854         switch(c->dstFormat)
855         {
856 #ifdef HAVE_MMX
857         case IMGFMT_BGR32:
858                 {
859                         asm volatile(
860                                 YSCALEYUV2RGBX
861                                 WRITEBGR32(%4, %5, %%eax)
862
863                         :: "r" (&c->redDither), 
864                            "m" (dummy), "m" (dummy), "m" (dummy),
865                            "r" (dest), "m" (dstW)
866                         : "%eax", "%edx", "%esi"
867                         );
868                 }
869                 break;
870         case IMGFMT_BGR24:
871                 {
872                         asm volatile(
873                                 YSCALEYUV2RGBX
874                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t" //FIXME optimize
875                                 "addl %4, %%ebx                 \n\t"
876                                 WRITEBGR24(%%ebx, %5, %%eax)
877
878                         :: "r" (&c->redDither), 
879                            "m" (dummy), "m" (dummy), "m" (dummy),
880                            "r" (dest), "m" (dstW)
881                         : "%eax", "%ebx", "%edx", "%esi" //FIXME ebx
882                         );
883                 }
884                 break;
885         case IMGFMT_BGR15:
886                 {
887                         asm volatile(
888                                 YSCALEYUV2RGBX
889                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
890 #ifdef DITHER1XBPP
891                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
892                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
893                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
894 #endif
895
896                                 WRITEBGR15(%4, %5, %%eax)
897
898                         :: "r" (&c->redDither), 
899                            "m" (dummy), "m" (dummy), "m" (dummy),
900                            "r" (dest), "m" (dstW)
901                         : "%eax", "%edx", "%esi"
902                         );
903                 }
904                 break;
905         case IMGFMT_BGR16:
906                 {
907                         asm volatile(
908                                 YSCALEYUV2RGBX
909                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
910 #ifdef DITHER1XBPP
911                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
912                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
913                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
914 #endif
915
916                                 WRITEBGR16(%4, %5, %%eax)
917
918                         :: "r" (&c->redDither), 
919                            "m" (dummy), "m" (dummy), "m" (dummy),
920                            "r" (dest), "m" (dstW)
921                         : "%eax", "%edx", "%esi"
922                         );
923                 }
924                 break;
925         case IMGFMT_YUY2:
926                 {
927                         asm volatile(
928                                 YSCALEYUV2PACKEDX
929                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
930
931                                 "psraw $3, %%mm3                \n\t"
932                                 "psraw $3, %%mm4                \n\t"
933                                 "psraw $3, %%mm1                \n\t"
934                                 "psraw $3, %%mm7                \n\t"
935                                 WRITEYUY2(%4, %5, %%eax)
936
937                         :: "r" (&c->redDither), 
938                            "m" (dummy), "m" (dummy), "m" (dummy),
939                            "r" (dest), "m" (dstW)
940                         : "%eax", "%edx", "%esi"
941                         );
942                 }
943                 break;
944 #endif
945         default:
946                 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
947                             chrFilter, chrSrc, chrFilterSize,
948                             dest, dstW, dstY);
949                 break;
950         }
951 }
952
953 /**
954  * vertical bilinear scale YV12 to RGB
955  */
956 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
957                             uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
958 {
959         int yalpha1=yalpha^4095;
960         int uvalpha1=uvalpha^4095;
961         int i;
962
963 #if 0 //isn't used
964         if(flags&SWS_FULL_CHR_H_INT)
965         {
966                 switch(dstFormat)
967                 {
968 #ifdef HAVE_MMX
969                 case IMGFMT_BGR32:
970                         asm volatile(
971
972
973 FULL_YSCALEYUV2RGB
974                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
975                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
976
977                         "movq %%mm3, %%mm1              \n\t"
978                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
979                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
980
981                         MOVNTQ(%%mm3, (%4, %%eax, 4))
982                         MOVNTQ(%%mm1, 8(%4, %%eax, 4))
983
984                         "addl $4, %%eax                 \n\t"
985                         "cmpl %5, %%eax                 \n\t"
986                         " jb 1b                         \n\t"
987
988
989                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
990                         "m" (yalpha1), "m" (uvalpha1)
991                         : "%eax"
992                         );
993                         break;
994                 case IMGFMT_BGR24:
995                         asm volatile(
996
997 FULL_YSCALEYUV2RGB
998
999                                                                 // lsb ... msb
1000                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
1001                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
1002
1003                         "movq %%mm3, %%mm1              \n\t"
1004                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
1005                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1006
1007                         "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
1008                         "psrlq $8, %%mm3                \n\t" // GR0BGR00
1009                         "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1010                         "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1011                         "por %%mm2, %%mm3               \n\t" // BGRBGR00
1012                         "movq %%mm1, %%mm2              \n\t"
1013                         "psllq $48, %%mm1               \n\t" // 000000BG
1014                         "por %%mm1, %%mm3               \n\t" // BGRBGRBG
1015
1016                         "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
1017                         "psrld $16, %%mm2               \n\t" // R000R000
1018                         "psrlq $24, %%mm1               \n\t" // 0BGR0000
1019                         "por %%mm2, %%mm1               \n\t" // RBGRR000
1020
1021                         "movl %4, %%ebx                 \n\t"
1022                         "addl %%eax, %%ebx              \n\t"
1023
1024 #ifdef HAVE_MMX2
1025                         //FIXME Alignment
1026                         "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
1027                         "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
1028 #else
1029                         "movd %%mm3, (%%ebx, %%eax, 2)  \n\t"
1030                         "psrlq $32, %%mm3               \n\t"
1031                         "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
1032                         "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
1033 #endif
1034                         "addl $4, %%eax                 \n\t"
1035                         "cmpl %5, %%eax                 \n\t"
1036                         " jb 1b                         \n\t"
1037
1038                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1039                         "m" (yalpha1), "m" (uvalpha1)
1040                         : "%eax", "%ebx"
1041                         );
1042                         break;
1043                 case IMGFMT_BGR15:
1044                         asm volatile(
1045
1046 FULL_YSCALEYUV2RGB
1047 #ifdef DITHER1XBPP
1048                         "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1049                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1050                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1051 #endif
1052                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1053                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1054                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1055
1056                         "psrlw $3, %%mm3                \n\t"
1057                         "psllw $2, %%mm1                \n\t"
1058                         "psllw $7, %%mm0                \n\t"
1059                         "pand "MANGLE(g15Mask)", %%mm1  \n\t"
1060                         "pand "MANGLE(r15Mask)", %%mm0  \n\t"
1061
1062                         "por %%mm3, %%mm1               \n\t"
1063                         "por %%mm1, %%mm0               \n\t"
1064
1065                         MOVNTQ(%%mm0, (%4, %%eax, 2))
1066
1067                         "addl $4, %%eax                 \n\t"
1068                         "cmpl %5, %%eax                 \n\t"
1069                         " jb 1b                         \n\t"
1070
1071                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1072                         "m" (yalpha1), "m" (uvalpha1)
1073                         : "%eax"
1074                         );
1075                         break;
1076                 case IMGFMT_BGR16:
1077                         asm volatile(
1078
1079 FULL_YSCALEYUV2RGB
1080 #ifdef DITHER1XBPP
1081                         "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1082                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1083                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1084 #endif
1085                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1086                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1087                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1088
1089                         "psrlw $3, %%mm3                \n\t"
1090                         "psllw $3, %%mm1                \n\t"
1091                         "psllw $8, %%mm0                \n\t"
1092                         "pand "MANGLE(g16Mask)", %%mm1  \n\t"
1093                         "pand "MANGLE(r16Mask)", %%mm0  \n\t"
1094
1095                         "por %%mm3, %%mm1               \n\t"
1096                         "por %%mm1, %%mm0               \n\t"
1097
1098                         MOVNTQ(%%mm0, (%4, %%eax, 2))
1099
1100                         "addl $4, %%eax                 \n\t"
1101                         "cmpl %5, %%eax                 \n\t"
1102                         " jb 1b                         \n\t"
1103
1104                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1105                         "m" (yalpha1), "m" (uvalpha1)
1106                         : "%eax"
1107                         );
1108                 break;
1109 #endif
1110                 case IMGFMT_RGB32:
1111 #ifndef HAVE_MMX
1112                 case IMGFMT_BGR32:
1113 #endif
1114                 if(dstFormat==IMGFMT_BGR32)
1115                 {
1116                         int i;
1117 #ifdef WORDS_BIGENDIAN
1118                         dest++;
1119 #endif
1120                         for(i=0;i<dstW;i++){
1121                                 // vertical linear interpolation && yuv2rgb in a single step:
1122                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1123                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1124                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1125                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1126                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1127                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1128                                 dest+= 4;
1129                         }
1130                 }
1131                 else if(dstFormat==IMGFMT_BGR24)
1132                 {
1133                         int i;
1134                         for(i=0;i<dstW;i++){
1135                                 // vertical linear interpolation && yuv2rgb in a single step:
1136                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1137                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1138                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1139                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1140                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1141                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1142                                 dest+= 3;
1143                         }
1144                 }
1145                 else if(dstFormat==IMGFMT_BGR16)
1146                 {
1147                         int i;
1148                         for(i=0;i<dstW;i++){
1149                                 // vertical linear interpolation && yuv2rgb in a single step:
1150                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1151                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1152                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1153
1154                                 ((uint16_t*)dest)[i] =
1155                                         clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1156                                         clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1157                                         clip_table16r[(Y + yuvtab_3343[V]) >>13];
1158                         }
1159                 }
1160                 else if(dstFormat==IMGFMT_BGR15)
1161                 {
1162                         int i;
1163                         for(i=0;i<dstW;i++){
1164                                 // vertical linear interpolation && yuv2rgb in a single step:
1165                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1166                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1167                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1168
1169                                 ((uint16_t*)dest)[i] =
1170                                         clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1171                                         clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1172                                         clip_table15r[(Y + yuvtab_3343[V]) >>13];
1173                         }
1174                 }
1175         }//FULL_UV_IPOL
1176         else
1177         {
1178 #endif // if 0
1179 #ifdef HAVE_MMX
1180         switch(c->dstFormat)
1181         {
1182 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1183         case IMGFMT_BGR32:
1184                         asm volatile(
1185                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
1186                                 "movl %4, %%esp                         \n\t"
1187                                 YSCALEYUV2RGB(%%eax, %5)
1188                                 WRITEBGR32(%%esp, 8280(%5), %%eax)
1189                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
1190
1191                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1192                         "r" (&c->redDither)
1193                         : "%eax"
1194                         );
1195                         return;
1196         case IMGFMT_BGR24:
1197                         asm volatile(
1198                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
1199                                 "movl %4, %%esp                 \n\t"
1200                                 YSCALEYUV2RGB(%%eax, %5)
1201                                 WRITEBGR24(%%esp, 8280(%5), %%eax)
1202                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
1203                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1204                         "r" (&c->redDither)
1205                         : "%eax"
1206                         );
1207                         return;
1208         case IMGFMT_BGR15:
1209                         asm volatile(
1210                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
1211                                 "movl %4, %%esp                         \n\t"
1212                                 YSCALEYUV2RGB(%%eax, %5)
1213                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1214 #ifdef DITHER1XBPP
1215                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1216                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1217                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1218 #endif
1219
1220                                 WRITEBGR15(%%esp, 8280(%5), %%eax)
1221                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
1222
1223                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1224                         "r" (&c->redDither)
1225                         : "%eax"
1226                         );
1227                         return;
1228         case IMGFMT_BGR16:
1229                         asm volatile(
1230                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
1231                                 "movl %4, %%esp                         \n\t"
1232                                 YSCALEYUV2RGB(%%eax, %5)
1233                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1234 #ifdef DITHER1XBPP
1235                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1236                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1237                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1238 #endif
1239
1240                                 WRITEBGR16(%%esp, 8280(%5), %%eax)
1241                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
1242                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1243                         "r" (&c->redDither)
1244                         : "%eax"
1245                         );
1246                         return;
1247         case IMGFMT_YUY2:
1248                         asm volatile(
1249                                 "movl %%esp, "ESP_OFFSET"(%5)           \n\t"
1250                                 "movl %4, %%esp                         \n\t"
1251                                 YSCALEYUV2PACKED(%%eax, %5)
1252                                 WRITEYUY2(%%esp, 8280(%5), %%eax)
1253                                 "movl "ESP_OFFSET"(%5), %%esp           \n\t"
1254                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1255                         "r" (&c->redDither)
1256                         : "%eax"
1257                         );
1258                         return;
1259         default: break;
1260         }
1261 #endif //HAVE_MMX
1262 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1263 }
1264
/**
 * YV12 to RGB without scaling or interpolating
 *
 * Converts one source line (buf0) to the packed format given by dstFormat.
 * No vertical luma interpolation is done (yalpha1 is 0); chroma is either
 * taken from uvbuf0 alone (uvalpha < 2048) or averaged 50/50 with uvbuf1.
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
	const int yalpha1=0;
	int i;

	uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
	const int yalpha= 4096; //FIXME ...

	if(flags&SWS_FULL_CHR_H_INT)
	{
		// full-chroma output requested: reuse the two-line version with
		// both luma inputs pointing at buf0 (degenerates to one line)
		RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
		return;
	}

#ifdef HAVE_MMX
	// In every asm block below, %esp is saved into the context at
	// ESP_OFFSET and then reloaded with the destination pointer (%4), so
	// that dest can live in a register despite all GPRs being in use; it
	// is restored before the asm block exits.  8280(%5) is DSTW_OFFSET
	// inside SwsContext (the preprocessor cannot expand the name there).
	if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
	{
		// chroma taken from uvbuf0 only (the "1" macro variants)
		switch(dstFormat)
		{
		case IMGFMT_BGR32:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)           \n\t"
				"movl %4, %%esp                         \n\t"
				YSCALEYUV2RGB1(%%eax, %5)
				WRITEBGR32(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp           \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR24:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)           \n\t"
				"movl %4, %%esp                         \n\t"
				YSCALEYUV2RGB1(%%eax, %5)
				WRITEBGR24(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp           \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR15:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)           \n\t"
				"movl %4, %%esp                         \n\t"
				YSCALEYUV2RGB1(%%eax, %5)
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
				WRITEBGR15(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp           \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR16:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)           \n\t"
				"movl %4, %%esp                         \n\t"
				YSCALEYUV2RGB1(%%eax, %5)
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp           \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_YUY2:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)           \n\t"
				"movl %4, %%esp                         \n\t"
				YSCALEYUV2PACKED1(%%eax, %5)
				WRITEYUY2(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp           \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		}
	}
	else
	{
		// chroma is the average of uvbuf0 and uvbuf1 (the "1b" variants)
		switch(dstFormat)
		{
		case IMGFMT_BGR32:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)           \n\t"
				"movl %4, %%esp                         \n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
				WRITEBGR32(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp           \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR24:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)           \n\t"
				"movl %4, %%esp                         \n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
				WRITEBGR24(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp           \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR15:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)           \n\t"
				"movl %4, %%esp                         \n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
				WRITEBGR15(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp           \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR16:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)           \n\t"
				"movl %4, %%esp                         \n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp           \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_YUY2:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)           \n\t"
				"movl %4, %%esp                         \n\t"
				YSCALEYUV2PACKED1b(%%eax, %5)
				WRITEYUY2(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp           \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		}
	}
#endif
	// portable C fallback (also handles formats the asm has no case for)
	if( uvalpha < 2048 )
	{
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
	}else{
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
	}
}
1460
//FIXME the yuy2*/uyvy* input functions can read up to 7 samples too many
1462
/**
 * Extract the luma bytes (even positions) of a YUY2 line into dst.
 * The MMX path processes 16 source bytes (8 pixels) per iteration,
 * masking out the chroma bytes and packing the lumas together, so it
 * may read slightly past the end of the line for non-multiple widths.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm2\n\t"  /* byte mask 0x00FF00FF... selects luma */
		"movl %0, %%eax                 \n\t"   /* eax = -width, counts up to 0 */
		"1:                             \n\t"
		"movq (%1, %%eax,2), %%mm0      \n\t"
		"movq 8(%1, %%eax,2), %%mm1     \n\t"
		"pand %%mm2, %%mm0              \n\t"   /* keep Y bytes, zero U/V */
		"pand %%mm2, %%mm1              \n\t"
		"packuswb %%mm1, %%mm0          \n\t"   /* 8 luma bytes */
		"movq %%mm0, (%2, %%eax)        \n\t"
		"addl $8, %%eax                 \n\t"
		" js 1b                         \n\t"
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i];
#endif
}
1487
/**
 * Extract chroma from two YUY2 lines, vertically averaging them, into
 * planar dstU/dstV (width chroma samples each).  The MMX2/3DNow path
 * averages with PAVGB and deinterleaves U/V with shifts and masks; the
 * scalar fallback averages the corresponding U and V bytes directly.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"  /* mask for even bytes */
		"movl %0, %%eax                 \n\t"   /* eax = -width */
		"1:                             \n\t"
		"movq (%1, %%eax,4), %%mm0      \n\t"
		"movq 8(%1, %%eax,4), %%mm1     \n\t"
		"movq (%2, %%eax,4), %%mm2      \n\t"
		"movq 8(%2, %%eax,4), %%mm3     \n\t"
		PAVGB(%%mm2, %%mm0)                     /* average the two source lines */
		PAVGB(%%mm3, %%mm1)
		"psrlw $8, %%mm0                \n\t"   /* drop luma, keep chroma bytes */
		"psrlw $8, %%mm1                \n\t"
		"packuswb %%mm1, %%mm0          \n\t"   /* mm0 = U0 V0 U1 V1 ... */
		"movq %%mm0, %%mm1              \n\t"
		"psrlw $8, %%mm0                \n\t"   /* mm0 = V samples */
		"pand %%mm4, %%mm1              \n\t"   /* mm1 = U samples */
		"packuswb %%mm0, %%mm0          \n\t"
		"packuswb %%mm1, %%mm1          \n\t"
		"movd %%mm0, (%4, %%eax)        \n\t"   /* 4 V samples */
		"movd %%mm1, (%3, %%eax)        \n\t"   /* 4 U samples */
		"addl $4, %%eax                 \n\t"
		" js 1b                         \n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
		dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
	}
#endif
}
1525
//this is almost identical to yuy2ToY/yuy2ToUV, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
/**
 * Extract the luma bytes (odd positions) of a UYVY line into dst.
 * Same structure as yuy2ToY, but luma is reached with a right shift
 * instead of a mask because it sits in the odd bytes.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %0, %%eax                 \n\t"   /* eax = -width, counts up to 0 */
		"1:                             \n\t"
		"movq (%1, %%eax,2), %%mm0      \n\t"
		"movq 8(%1, %%eax,2), %%mm1     \n\t"
		"psrlw $8, %%mm0                \n\t"   /* move luma (odd bytes) down */
		"psrlw $8, %%mm1                \n\t"
		"packuswb %%mm1, %%mm0          \n\t"   /* 8 luma bytes */
		"movq %%mm0, (%2, %%eax)        \n\t"
		"addl $8, %%eax                 \n\t"
		" js 1b                         \n\t"
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i+1];
#endif
}
1550
/**
 * Extract chroma from two UYVY lines, vertically averaging them, into
 * planar dstU/dstV.  Mirrors yuy2ToUV, except chroma lives in the even
 * bytes here, so the first deinterleave step masks instead of shifting.
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"  /* mask for even bytes */
		"movl %0, %%eax                 \n\t"   /* eax = -width */
		"1:                             \n\t"
		"movq (%1, %%eax,4), %%mm0      \n\t"
		"movq 8(%1, %%eax,4), %%mm1     \n\t"
		"movq (%2, %%eax,4), %%mm2      \n\t"
		"movq 8(%2, %%eax,4), %%mm3     \n\t"
		PAVGB(%%mm2, %%mm0)                     /* average the two source lines */
		PAVGB(%%mm3, %%mm1)
		"pand %%mm4, %%mm0              \n\t"   /* keep chroma (even bytes) */
		"pand %%mm4, %%mm1              \n\t"
		"packuswb %%mm1, %%mm0          \n\t"   /* mm0 = U0 V0 U1 V1 ... */
		"movq %%mm0, %%mm1              \n\t"
		"psrlw $8, %%mm0                \n\t"   /* mm0 = V samples */
		"pand %%mm4, %%mm1              \n\t"   /* mm1 = U samples */
		"packuswb %%mm0, %%mm0          \n\t"
		"packuswb %%mm1, %%mm1          \n\t"
		"movd %%mm0, (%4, %%eax)        \n\t"   /* 4 V samples */
		"movd %%mm1, (%3, %%eax)        \n\t"   /* 4 U samples */
		"addl $4, %%eax                 \n\t"
		" js 1b                         \n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
		dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
	}
#endif
}
1588
1589 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1590 {
1591 #ifdef HAVE_MMXFIXME
1592 #else
1593         int i;
1594         for(i=0; i<width; i++)
1595         {
1596                 int b=  ((uint32_t*)src)[i]&0xFF;
1597                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1598                 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1599
1600                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1601         }
1602 #endif
1603 }
1604
1605 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1606 {
1607 #ifdef HAVE_MMXFIXME
1608 #else
1609         int i;
1610         for(i=0; i<width; i++)
1611         {
1612                 const int a= ((uint32_t*)src1)[2*i+0];
1613                 const int e= ((uint32_t*)src1)[2*i+1];
1614                 const int c= ((uint32_t*)src2)[2*i+0];
1615                 const int d= ((uint32_t*)src2)[2*i+1];
1616                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1617                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1618                 const int b=  l&0x3FF;
1619                 const int g=  h>>8;
1620                 const int r=  l>>16;
1621
1622                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1623                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1624         }
1625 #endif
1626 }
1627
/**
 * Convert one line of 24bpp BGR to 8 bit luma.
 * The MMX path handles 8 pixels (24 source bytes) per loop iteration:
 * each pixel's B,G,R words are multiplied by the bgr2YCoeff table with
 * pmaddwd, the partial products are summed via pmaddwd with w1111, and
 * the offset bgr2YOffset is added after packing to bytes.
 */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %2, %%eax                 \n\t"   /* eax = -width (luma index) */
		"movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
		"movq "MANGLE(w1111)", %%mm5            \n\t"
		"pxor %%mm7, %%mm7              \n\t"   /* zero, for byte->word unpack */
		"leal (%%eax, %%eax, 2), %%ebx  \n\t"   /* ebx = 3*eax (byte index into BGR24) */
		".balign 16                     \n\t"
		"1:                             \n\t"
		PREFETCH" 64(%0, %%ebx)         \n\t"
		/* first group of 4 pixels */
		"movd (%0, %%ebx), %%mm0        \n\t"
		"movd 3(%0, %%ebx), %%mm1       \n\t"
		"punpcklbw %%mm7, %%mm0         \n\t"
		"punpcklbw %%mm7, %%mm1         \n\t"
		"movd 6(%0, %%ebx), %%mm2       \n\t"
		"movd 9(%0, %%ebx), %%mm3       \n\t"
		"punpcklbw %%mm7, %%mm2         \n\t"
		"punpcklbw %%mm7, %%mm3         \n\t"
		"pmaddwd %%mm6, %%mm0           \n\t"   /* multiply by luma coefficients */
		"pmaddwd %%mm6, %%mm1           \n\t"
		"pmaddwd %%mm6, %%mm2           \n\t"
		"pmaddwd %%mm6, %%mm3           \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0                \n\t"   /* extra precision step */
		"psrad $8, %%mm1                \n\t"
		"psrad $8, %%mm2                \n\t"
		"psrad $8, %%mm3                \n\t"
#endif
		"packssdw %%mm1, %%mm0          \n\t"
		"packssdw %%mm3, %%mm2          \n\t"
		"pmaddwd %%mm5, %%mm0           \n\t"   /* horizontal add of partial sums */
		"pmaddwd %%mm5, %%mm2           \n\t"
		"packssdw %%mm2, %%mm0          \n\t"
		"psraw $7, %%mm0                \n\t"   /* final scale: 4 luma words */

		/* second group of 4 pixels */
		"movd 12(%0, %%ebx), %%mm4      \n\t"
		"movd 15(%0, %%ebx), %%mm1      \n\t"
		"punpcklbw %%mm7, %%mm4         \n\t"
		"punpcklbw %%mm7, %%mm1         \n\t"
		"movd 18(%0, %%ebx), %%mm2      \n\t"
		"movd 21(%0, %%ebx), %%mm3      \n\t"
		"punpcklbw %%mm7, %%mm2         \n\t"
		"punpcklbw %%mm7, %%mm3         \n\t"
		"pmaddwd %%mm6, %%mm4           \n\t"
		"pmaddwd %%mm6, %%mm1           \n\t"
		"pmaddwd %%mm6, %%mm2           \n\t"
		"pmaddwd %%mm6, %%mm3           \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4                \n\t"
		"psrad $8, %%mm1                \n\t"
		"psrad $8, %%mm2                \n\t"
		"psrad $8, %%mm3                \n\t"
#endif
		"packssdw %%mm1, %%mm4          \n\t"
		"packssdw %%mm3, %%mm2          \n\t"
		"pmaddwd %%mm5, %%mm4           \n\t"
		"pmaddwd %%mm5, %%mm2           \n\t"
		"addl $24, %%ebx                \n\t"   /* advance source by 8 pixels */
		"packssdw %%mm2, %%mm4          \n\t"
		"psraw $7, %%mm4                \n\t"

		"packuswb %%mm4, %%mm0          \n\t"   /* 8 luma bytes */
		"paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"

		"movq %%mm0, (%1, %%eax)        \n\t"
		"addl $8, %%eax                 \n\t"
		" js 1b                         \n\t"
		: : "r" (src+width*3), "r" (dst+width), "g" (-width)
		: "%eax", "%ebx"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src[i*3+0];
		int g= src[i*3+1];
		int r= src[i*3+2];

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
	}
#endif
}
1712
1713 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1714 {
1715 #ifdef HAVE_MMX
1716         asm volatile(
1717                 "movl %4, %%eax                 \n\t"
1718                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1719                 "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1720                 "pxor %%mm7, %%mm7              \n\t"
1721                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1722                 "addl %%ebx, %%ebx              \n\t"
1723                 ".balign 16                     \n\t"
1724                 "1:                             \n\t"
1725                 PREFETCH" 64(%0, %%ebx)         \n\t"
1726                 PREFETCH" 64(%1, %%ebx)         \n\t"
1727 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1728                 "movq (%0, %%ebx), %%mm0        \n\t"
1729                 "movq (%1, %%ebx), %%mm1        \n\t"
1730                 "movq 6(%0, %%ebx), %%mm2       \n\t"
1731                 "movq 6(%1, %%ebx), %%mm3       \n\t"
1732                 PAVGB(%%mm1, %%mm0)
1733                 PAVGB(%%mm3, %%mm2)
1734                 "movq %%mm0, %%mm1              \n\t"
1735                 "movq %%mm2, %%mm3              \n\t"
1736                 "psrlq $24, %%mm0               \n\t"
1737                 "psrlq $24, %%mm2               \n\t"
1738                 PAVGB(%%mm1, %%mm0)
1739                 PAVGB(%%mm3, %%mm2)
1740                 "punpcklbw %%mm7, %%mm0         \n\t"
1741                 "punpcklbw %%mm7, %%mm2         \n\t"
1742 #else
1743                 "movd (%0, %%ebx), %%mm0        \n\t"
1744                 "movd (%1, %%ebx), %%mm1        \n\t"
1745                 "movd 3(%0, %%ebx), %%mm2       \n\t"
1746                 "movd 3(%1, %%ebx), %%mm3       \n\t"
1747                 "punpcklbw %%mm7, %%mm0         \n\t"
1748                 "punpcklbw %%mm7, %%mm1         \n\t"
1749                 "punpcklbw %%mm7, %%mm2         \n\t"
1750                 "punpcklbw %%mm7, %%mm3         \n\t"
1751                 "paddw %%mm1, %%mm0             \n\t"
1752                 "paddw %%mm3, %%mm2             \n\t"
1753                 "paddw %%mm2, %%mm0             \n\t"
1754                 "movd 6(%0, %%ebx), %%mm4       \n\t"
1755                 "movd 6(%1, %%ebx), %%mm1       \n\t"
1756                 "movd 9(%0, %%ebx), %%mm2       \n\t"
1757                 "movd 9(%1, %%ebx), %%mm3       \n\t"
1758                 "punpcklbw %%mm7, %%mm4         \n\t"
1759                 "punpcklbw %%mm7, %%mm1         \n\t"
1760                 "punpcklbw %%mm7, %%mm2         \n\t"
1761                 "punpcklbw %%mm7, %%mm3         \n\t"
1762                 "paddw %%mm1, %%mm4             \n\t"
1763                 "paddw %%mm3, %%mm2             \n\t"
1764                 "paddw %%mm4, %%mm2             \n\t"
1765                 "psrlw $2, %%mm0                \n\t"
1766                 "psrlw $2, %%mm2                \n\t"
1767 #endif
1768                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1769                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1770                 
1771                 "pmaddwd %%mm0, %%mm1           \n\t"
1772                 "pmaddwd %%mm2, %%mm3           \n\t"
1773                 "pmaddwd %%mm6, %%mm0           \n\t"
1774                 "pmaddwd %%mm6, %%mm2           \n\t"
1775 #ifndef FAST_BGR2YV12
1776                 "psrad $8, %%mm0                \n\t"
1777                 "psrad $8, %%mm1                \n\t"
1778                 "psrad $8, %%mm2                \n\t"
1779                 "psrad $8, %%mm3                \n\t"
1780 #endif
1781                 "packssdw %%mm2, %%mm0          \n\t"
1782                 "packssdw %%mm3, %%mm1          \n\t"
1783                 "pmaddwd %%mm5, %%mm0           \n\t"
1784                 "pmaddwd %%mm5, %%mm1           \n\t"
1785                 "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1786                 "psraw $7, %%mm0                \n\t"
1787
1788 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1789                 "movq 12(%0, %%ebx), %%mm4      \n\t"
1790                 "movq 12(%1, %%ebx), %%mm1      \n\t"
1791                 "movq 18(%0, %%ebx), %%mm2      \n\t"
1792                 "movq 18(%1, %%ebx), %%mm3      \n\t"
1793                 PAVGB(%%mm1, %%mm4)
1794                 PAVGB(%%mm3, %%mm2)
1795                 "movq %%mm4, %%mm1              \n\t"
1796                 "movq %%mm2, %%mm3              \n\t"
1797                 "psrlq $24, %%mm4               \n\t"
1798                 "psrlq $24, %%mm2               \n\t"
1799                 PAVGB(%%mm1, %%mm4)
1800                 PAVGB(%%mm3, %%mm2)
1801                 "punpcklbw %%mm7, %%mm4         \n\t"
1802                 "punpcklbw %%mm7, %%mm2         \n\t"
1803 #else
1804                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1805                 "movd 12(%1, %%ebx), %%mm1      \n\t"
1806                 "movd 15(%0, %%ebx), %%mm2      \n\t"
1807                 "movd 15(%1, %%ebx), %%mm3      \n\t"
1808                 "punpcklbw %%mm7, %%mm4         \n\t"
1809                 "punpcklbw %%mm7, %%mm1         \n\t"
1810                 "punpcklbw %%mm7, %%mm2         \n\t"
1811                 "punpcklbw %%mm7, %%mm3         \n\t"
1812                 "paddw %%mm1, %%mm4             \n\t"
1813                 "paddw %%mm3, %%mm2             \n\t"
1814                 "paddw %%mm2, %%mm4             \n\t"
1815                 "movd 18(%0, %%ebx), %%mm5      \n\t"
1816                 "movd 18(%1, %%ebx), %%mm1      \n\t"
1817                 "movd 21(%0, %%ebx), %%mm2      \n\t"
1818                 "movd 21(%1, %%ebx), %%mm3      \n\t"
1819                 "punpcklbw %%mm7, %%mm5         \n\t"
1820                 "punpcklbw %%mm7, %%mm1         \n\t"
1821                 "punpcklbw %%mm7, %%mm2         \n\t"
1822                 "punpcklbw %%mm7, %%mm3         \n\t"
1823                 "paddw %%mm1, %%mm5             \n\t"
1824                 "paddw %%mm3, %%mm2             \n\t"
1825                 "paddw %%mm5, %%mm2             \n\t"
1826                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1827                 "psrlw $2, %%mm4                \n\t"
1828                 "psrlw $2, %%mm2                \n\t"
1829 #endif
1830                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1831                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1832                 
1833                 "pmaddwd %%mm4, %%mm1           \n\t"
1834                 "pmaddwd %%mm2, %%mm3           \n\t"
1835                 "pmaddwd %%mm6, %%mm4           \n\t"
1836                 "pmaddwd %%mm6, %%mm2           \n\t"
1837 #ifndef FAST_BGR2YV12
1838                 "psrad $8, %%mm4                \n\t"
1839                 "psrad $8, %%mm1                \n\t"
1840                 "psrad $8, %%mm2                \n\t"
1841                 "psrad $8, %%mm3                \n\t"
1842 #endif
1843                 "packssdw %%mm2, %%mm4          \n\t"
1844                 "packssdw %%mm3, %%mm1          \n\t"
1845                 "pmaddwd %%mm5, %%mm4           \n\t"
1846                 "pmaddwd %%mm5, %%mm1           \n\t"
1847                 "addl $24, %%ebx                \n\t"
1848                 "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1849                 "psraw $7, %%mm4                \n\t"
1850                 
1851                 "movq %%mm0, %%mm1              \n\t"
1852                 "punpckldq %%mm4, %%mm0         \n\t"
1853                 "punpckhdq %%mm4, %%mm1         \n\t"
1854                 "packsswb %%mm1, %%mm0          \n\t"
1855                 "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
1856
1857                 "movd %%mm0, (%2, %%eax)        \n\t"
1858                 "punpckhdq %%mm0, %%mm0         \n\t"
1859                 "movd %%mm0, (%3, %%eax)        \n\t"
1860                 "addl $4, %%eax                 \n\t"
1861                 " js 1b                         \n\t"
1862                 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1863                 : "%eax", "%ebx"
1864         );
1865 #else
1866         int i;
1867         for(i=0; i<width; i++)
1868         {
1869                 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1870                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1871                 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1872
1873                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1874                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1875         }
1876 #endif
1877 }
1878
1879 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1880 {
1881         int i;
1882         for(i=0; i<width; i++)
1883         {
1884                 int d= ((uint16_t*)src)[i];
1885                 int b= d&0x1F;
1886                 int g= (d>>5)&0x3F;
1887                 int r= (d>>11)&0x1F;
1888
1889                 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1890         }
1891 }
1892
/* Convert two vertically adjacent lines of RGB565 to one line of U and V,
   averaging each 2x2 pixel block.  Two horizontally neighbouring 16-bit
   pixels are loaded at once as a 32-bit word and the four source pixels are
   summed channel-wise with bit-parallel (SWAR) masked adds, so no per-pixel
   unpacking is needed. */
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
        int i;
        for(i=0; i<width; i++)
        {
                int d0= ((uint32_t*)src1)[i];   // 2 pixels of the upper line
                int d1= ((uint32_t*)src2)[i];   // 2 pixels of the lower line
                
                /* sum complementary channel fields of the two packed pixels
                   under masks, then recombine via dh2 so that the 2x2 block
                   sums for B, R and G end up in separate bit ranges of d */
                int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
                int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

                int dh2= (dh>>11) + (dh<<21);
                int d= dh2 + dl;

                /* 5-bit channel sums are at most 4*31 = 124, so 7 bits suffice */
                int b= d&0x7F;
                int r= (d>>11)&0x7F;
                int g= d>>21;
                /* shift: +2 for the 4-pixel sum, -2 since the channels are
                   ~2 bits narrower than 8-bit ones; R/B doubled to match the
                   6-bit green scale -- NOTE(review): verify against bgr16ToY */
                dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
                dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
        }
}
1914
1915 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1916 {
1917         int i;
1918         for(i=0; i<width; i++)
1919         {
1920                 int d= ((uint16_t*)src)[i];
1921                 int b= d&0x1F;
1922                 int g= (d>>5)&0x1F;
1923                 int r= (d>>10)&0x1F;
1924
1925                 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1926         }
1927 }
1928
/* Convert two vertically adjacent lines of RGB555 to one line of U and V,
   averaging each 2x2 pixel block.  Same SWAR trick as bgr16ToUV: two packed
   pixels are loaded per 32-bit word and the four source pixels are summed
   channel-wise under masks without unpacking. */
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
        int i;
        for(i=0; i<width; i++)
        {
                int d0= ((uint32_t*)src1)[i];   // 2 pixels of the upper line
                int d1= ((uint32_t*)src2)[i];   // 2 pixels of the lower line
                
                /* masked adds keep the channel fields from carrying into one
                   another; dh2 recombines so B/R/G block sums land in
                   disjoint bit ranges of d */
                int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
                int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);

                int dh2= (dh>>11) + (dh<<21);
                int d= dh2 + dl;

                /* 5-bit channel sums are at most 4*31 = 124, so 7 bits suffice */
                int b= d&0x7F;
                int r= (d>>10)&0x7F;
                int g= d>>21;
                /* shift: +2 for the 4-pixel sum, -3 since 5-bit channels are
                   3 bits narrower than 8-bit ones */
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
        }
}
1950
1951
1952 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1953 {
1954         int i;
1955         for(i=0; i<width; i++)
1956         {
1957                 int r=  ((uint32_t*)src)[i]&0xFF;
1958                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1959                 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1960
1961                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1962         }
1963 }
1964
1965 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1966 {
1967         int i;
1968         for(i=0; i<width; i++)
1969         {
1970                 const int a= ((uint32_t*)src1)[2*i+0];
1971                 const int e= ((uint32_t*)src1)[2*i+1];
1972                 const int c= ((uint32_t*)src2)[2*i+0];
1973                 const int d= ((uint32_t*)src2)[2*i+1];
1974                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1975                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1976                 const int r=  l&0x3FF;
1977                 const int g=  h>>8;
1978                 const int b=  l>>16;
1979
1980                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1981                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1982         }
1983 }
1984
1985 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
1986 {
1987         int i;
1988         for(i=0; i<width; i++)
1989         {
1990                 int r= src[i*3+0];
1991                 int g= src[i*3+1];
1992                 int b= src[i*3+2];
1993
1994                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1995         }
1996 }
1997
1998 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1999 {
2000         int i;
2001         for(i=0; i<width; i++)
2002         {
2003                 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2004                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2005                 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2006
2007                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2008                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2009         }
2010 }
2011
2012
// Bilinear / Bicubic scaling
/*
 * Horizontally scale one 8-bit line with an arbitrary FIR filter.
 *
 * dst:        output samples, roughly 15-bit (the C path clips to 0..2^15-1)
 * dstW:       number of output samples
 * src:        8-bit input line
 * xInc:       unused here (only the fast-bilinear paths of the callers need it)
 * filter:     filterSize fixed-point coefficients per output sample
 * filterPos:  per output sample, index of the first input pixel it reads
 * filterSize: coefficients per output sample; must be a positive multiple of 4
 *
 * The MMX variants specialize filterSize 4 and 8 and fall back to a generic
 * inner loop otherwise; without MMX, AltiVec or plain C is used.
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
        assert(filterSize % 4 == 0 && filterSize>0);
        if(filterSize==4) // allways true for upscaling, sometimes for down too
        {
                /* counter walks from -2*dstW up to 0 in byte units of dst;
                   the pointers are pre-biased so ptr+counter indexes them */
                int counter= -2*dstW;
                filter-= counter*2;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t" // zero, for byte unpacking
                        "movq "MANGLE(w02)", %%mm6      \n\t"
                        "pushl %%ebp                    \n\t" // we use 7 regs here ...
                        "movl %%eax, %%ebp              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        "movzwl (%2, %%ebp), %%eax      \n\t" // filterPos[i], filterPos[i+1]
                        "movzwl 2(%2, %%ebp), %%ebx     \n\t"
                        "movq (%1, %%ebp, 4), %%mm1     \n\t" // 4 coeffs for each sample
                        "movq 8(%1, %%ebp, 4), %%mm3    \n\t"
                        "movd (%3, %%eax), %%mm0        \n\t" // 4 src bytes each
                        "movd (%3, %%ebx), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm0         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm0           \n\t" // src*coeff, pairwise
                        "pmaddwd %%mm2, %%mm3           \n\t"
                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm3                \n\t"
                        "packssdw %%mm3, %%mm0          \n\t"
                        "pmaddwd %%mm6, %%mm0           \n\t" // horizontal add via w02
                        "packssdw %%mm0, %%mm0          \n\t"
                        "movd %%mm0, (%4, %%ebp)        \n\t" // store 2 output samples
                        "addl $4, %%ebp                 \n\t"
                        " jnc 1b                        \n\t" // loop until counter wraps to 0

                        "popl %%ebp                     \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
                        : "%ebx"
                );
        }
        else if(filterSize==8)
        {
                int counter= -2*dstW;
                filter-= counter*4;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "movq "MANGLE(w02)", %%mm6      \n\t"
                        "pushl %%ebp                    \n\t" // we use 7 regs here ...
                        "movl %%eax, %%ebp              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        "movzwl (%2, %%ebp), %%eax      \n\t" // filterPos[i], filterPos[i+1]
                        "movzwl 2(%2, %%ebp), %%ebx     \n\t"
                        "movq (%1, %%ebp, 8), %%mm1     \n\t" // first 4 of 8 coeffs
                        "movq 16(%1, %%ebp, 8), %%mm3   \n\t"
                        "movd (%3, %%eax), %%mm0        \n\t"
                        "movd (%3, %%ebx), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm0         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm0           \n\t"
                        "pmaddwd %%mm2, %%mm3           \n\t"

                        /* second half of the 8-tap filter, accumulated on top */
                        "movq 8(%1, %%ebp, 8), %%mm1    \n\t"
                        "movq 24(%1, %%ebp, 8), %%mm5   \n\t"
                        "movd 4(%3, %%eax), %%mm4       \n\t"
                        "movd 4(%3, %%ebx), %%mm2       \n\t"
                        "punpcklbw %%mm7, %%mm4         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm4           \n\t"
                        "pmaddwd %%mm2, %%mm5           \n\t"
                        "paddd %%mm4, %%mm0             \n\t"
                        "paddd %%mm5, %%mm3             \n\t"

                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm3                \n\t"
                        "packssdw %%mm3, %%mm0          \n\t"
                        "pmaddwd %%mm6, %%mm0           \n\t"
                        "packssdw %%mm0, %%mm0          \n\t"
                        "movd %%mm0, (%4, %%ebp)        \n\t"
                        "addl $4, %%ebp                 \n\t"
                        " jnc 1b                        \n\t"

                        "popl %%ebp                     \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
                        : "%ebx"
                );
        }
        else
        {
                /* generic filterSize: inner loop (label 2) walks the taps in
                   chunks of 8 coefficients for two output samples at a time */
                int counter= -2*dstW;
//              filter-= counter*filterSize/2;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "movq "MANGLE(w02)", %%mm6      \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        "movl %2, %%ecx                 \n\t"
                        "movzwl (%%ecx, %0), %%eax      \n\t" // filterPos[i], filterPos[i+1]
                        "movzwl 2(%%ecx, %0), %%ebx     \n\t"
                        "movl %5, %%ecx                 \n\t"
                        "pxor %%mm4, %%mm4              \n\t" // accumulators
                        "pxor %%mm5, %%mm5              \n\t"
                        "2:                             \n\t"
                        "movq (%1), %%mm1               \n\t" // coeffs for sample i
                        "movq (%1, %6), %%mm3           \n\t" // coeffs for sample i+1
                        "movd (%%ecx, %%eax), %%mm0     \n\t"
                        "movd (%%ecx, %%ebx), %%mm2     \n\t"
                        "punpcklbw %%mm7, %%mm0         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm0           \n\t"
                        "pmaddwd %%mm2, %%mm3           \n\t"
                        "paddd %%mm3, %%mm5             \n\t"
                        "paddd %%mm0, %%mm4             \n\t"
                        "addl $8, %1                    \n\t"
                        "addl $4, %%ecx                 \n\t"
                        "cmpl %4, %%ecx                 \n\t" // until src+filterSize reached
                        " jb 2b                         \n\t"
                        "addl %6, %1                    \n\t" // skip sample i+1's coeffs
                        "psrad $8, %%mm4                \n\t"
                        "psrad $8, %%mm5                \n\t"
                        "packssdw %%mm5, %%mm4          \n\t"
                        "pmaddwd %%mm6, %%mm4           \n\t"
                        "packssdw %%mm4, %%mm4          \n\t"
                        "movl %3, %%eax                 \n\t"
                        "movd %%mm4, (%%eax, %0)        \n\t" // store 2 output samples
                        "addl $4, %0                    \n\t"
                        " jnc 1b                        \n\t"

                        : "+r" (counter), "+r" (filter)
                        : "m" (filterPos), "m" (dst), "m"(src+filterSize),
                          "m" (src), "r" (filterSize*2)
                        : "%ebx", "%eax", "%ecx"
                );
        }
#else
#ifdef HAVE_ALTIVEC
        hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
        /* plain C reference: straight dot product per output sample */
        int i;
        for(i=0; i<dstW; i++)
        {
                int j;
                int srcPos= filterPos[i];
                int val=0;
//              printf("filterPos: %d\n", filterPos[i]);
                for(j=0; j<filterSize; j++)
                {
//                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
                }
//              filter += hFilterSize;
                dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//              dst[i] = val>>7;
        }
#endif
#endif
}
      // *** horizontal scale Y line to temp buffer
/*
 * Horizontally scale one luma line into the 16-bit temp buffer dst.
 * Packed YUV / RGB inputs are first reduced to an 8-bit luma line in
 * formatConvBuffer.  Unless SWS_FAST_BILINEAR is requested (and usable),
 * the generic hScale() FIR path runs; otherwise a fast-bilinear path is
 * taken: runtime-generated MMX2 code (funnyYCode) when canMMX2BeUsed,
 * plain x86 asm or plain C otherwise.
 */
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos)
{
    /* convert non-planar inputs to an 8-bit luma line first */
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
        // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "movl %0, %%ecx                 \n\t" // src
                        "movl %1, %%edi                 \n\t" // dst
                        "movl %2, %%edx                 \n\t" // mmx2Filter
                        "movl %3, %%ebx                 \n\t" // mmx2FilterPos
                        "xorl %%eax, %%eax              \n\t" // i
                        PREFETCH" (%%ecx)               \n\t"
                        PREFETCH" 32(%%ecx)             \n\t"
                        PREFETCH" 64(%%ecx)             \n\t"

/* Run one chunk of the runtime-generated scaler, then advance the src
   pointer.  NOTE(review): the exact calling convention of funnyYCode (what
   it leaves in %eax and at (%ebx,%eax)) is defined by the code generator
   elsewhere -- confirm there before touching this. */
#define FUNNY_Y_CODE \
                        "movl (%%ebx), %%esi            \n\t"\
                        "call *%4                       \n\t"\
                        "addl (%%ebx, %%eax), %%ecx     \n\t"\
                        "addl %%eax, %%edi              \n\t"\
                        "xorl %%eax, %%eax              \n\t"\

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                        :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                        "m" (funnyYCode)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
                );
                /* fix up the tail samples that would read past src[srcW-1] */
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
        //NO MMX just normal asm ...
        /* 16.16 fixed-point bilinear: %ebx is the integer source position,
           %ecx the fraction; two output samples are produced per iteration */
        asm volatile(
                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
                ".balign 16                     \n\t"
                "1:                             \n\t"
                "movzbl  (%0, %%ebx), %%edi     \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, (%%edi, %%eax, 2)   \n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
                "adcl %3, %%ebx                 \n\t" //xx+= xInc>>8 + carry

                "movzbl (%0, %%ebx), %%edi      \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, 2(%%edi, %%eax, 2)  \n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
                "adcl %3, %%ebx                 \n\t" //xx+= xInc>>8 + carry


                "addl $2, %%eax                 \n\t"
                "cmpl %2, %%eax                 \n\t"
                " jb 1b                         \n\t"


                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* portable C fallback: 16.16 fixed-point walk over the source line;
           output keeps 7 fractional bits */
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
                xpos+=xInc;
        }
#endif
    }
}
2335
/*
 * Horizontally scale one pair of chroma lines (U into dst, V into dst+2048)
 * into the 16-bit temp buffer.  Mirrors hyscale(): packed inputs are first
 * converted into formatConvBuffer (U plane at +0, V plane at +2048); gray
 * input has no chroma and returns early.  Then either the generic hScale()
 * FIR path runs once per plane, or the fast-bilinear path (runtime-generated
 * MMX2 funnyUVCode / plain x86 asm / plain C) handles both planes at once.
 */
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos)
{
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
        /* no chroma in the source; leave dst untouched */
        return;
    }

#ifdef HAVE_MMX
        // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "movl %0, %%ecx                 \n\t" // src1 (U)
                        "movl %1, %%edi                 \n\t" // dst
                        "movl %2, %%edx                 \n\t" // mmx2Filter
                        "movl %3, %%ebx                 \n\t" // mmx2FilterPos
                        "xorl %%eax, %%eax              \n\t" // i
                        PREFETCH" (%%ecx)               \n\t"
                        PREFETCH" 32(%%ecx)             \n\t"
                        PREFETCH" 64(%%ecx)             \n\t"

/* Same contract as FUNNY_Y_CODE in hyscale(): run one chunk of the
   runtime-generated scaler, then advance the src pointer.  NOTE(review):
   the calling convention is defined by the code generator elsewhere. */
#define FUNNY_UV_CODE \
                        "movl (%%ebx), %%esi            \n\t"\
                        "call *%4                       \n\t"\
                        "addl (%%ebx, %%eax), %%ecx     \n\t"\
                        "addl %%eax, %%edi              \n\t"\
                        "xorl %%eax, %%eax              \n\t"\

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
                        /* second pass: V plane into dst+2048 (byte offset 4096) */
                        "xorl %%eax, %%eax              \n\t" // i
                        "movl %5, %%ecx                 \n\t" // src
                        "movl %1, %%edi                 \n\t" // buf1
                        "addl $4096, %%edi              \n\t"
                        PREFETCH" (%%ecx)               \n\t"
                        PREFETCH" 32(%%ecx)             \n\t"
                        PREFETCH" 64(%%ecx)             \n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

                        :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                        "m" (funnyUVCode), "m" (src2)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
                );
                /* fix up the tail samples that would read past the line end */
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
                {
//                      printf("%d %d %d\n", dstWidth, i, srcW);
                        dst[i] = src1[srcW-1]*128;
                        dst[i+2048] = src2[srcW-1]*128;
                }
        }
        else
        {
#endif
        /* 16.16 fixed-point bilinear on both planes; one output sample per
           plane per iteration (V stored at byte offset 4096 == dst+2048) */
        asm volatile(
                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
                ".balign 16                     \n\t"
                "1:                             \n\t"
                "movl %0, %%esi                 \n\t"
                "movzbl  (%%esi, %%ebx), %%edi  \n\t" //src[xx]
                "movzbl 1(%%esi, %%ebx), %%esi  \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, (%%edi, %%eax, 2)   \n\t"

                "movzbl  (%5, %%ebx), %%edi     \n\t" //src[xx]
                "movzbl 1(%5, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"

                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
                "adcl %3, %%ebx                 \n\t" //xx+= xInc>>8 + carry
                "addl $1, %%eax                 \n\t"
                "cmpl %2, %%eax                 \n\t"
                " jb 1b                         \n\t"

                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
                "r" (src2)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* portable C fallback: 16.16 fixed-point walk, 7 fractional output bits */
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
                xpos+=xInc;
        }
#endif
   }
}
2519
/*
 * Main scaling entry point for this CPU-specific template (RENAME expands to
 * a per-CPU suffix, e.g. swScale_MMX).  Horizontally scales the srcSliceH
 * input lines starting at srcSliceY into the luma/chroma ring buffers, then
 * vertically scales/converts complete destination lines into dst[].
 * Slices may arrive incrementally; partial state lives in the SwsContext.
 *
 * Returns the number of destination lines output for this slice
 * (dstY - lastDstY), which may be 0 if more input lines are needed.
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){

        /* load a few things into local vars to make the code more readable? and faster */
        const int srcW= c->srcW;
        const int dstW= c->dstW;
        const int dstH= c->dstH;
        const int chrDstW= c->chrDstW;
        const int chrSrcW= c->chrSrcW;
        const int lumXInc= c->lumXInc;
        const int chrXInc= c->chrXInc;
        const int dstFormat= c->dstFormat;
        const int srcFormat= c->srcFormat;
        const int flags= c->flags;
        const int canMMX2BeUsed= c->canMMX2BeUsed;
        int16_t *vLumFilterPos= c->vLumFilterPos;
        int16_t *vChrFilterPos= c->vChrFilterPos;
        int16_t *hLumFilterPos= c->hLumFilterPos;
        int16_t *hChrFilterPos= c->hChrFilterPos;
        int16_t *vLumFilter= c->vLumFilter;
        int16_t *vChrFilter= c->vChrFilter;
        int16_t *hLumFilter= c->hLumFilter;
        int16_t *hChrFilter= c->hChrFilter;
        int32_t *lumMmxFilter= c->lumMmxFilter;
        int32_t *chrMmxFilter= c->chrMmxFilter;
        const int vLumFilterSize= c->vLumFilterSize;
        const int vChrFilterSize= c->vChrFilterSize;
        const int hLumFilterSize= c->hLumFilterSize;
        const int hChrFilterSize= c->hChrFilterSize;
        int16_t **lumPixBuf= c->lumPixBuf;   // ring buffer of horizontally-scaled luma lines
        int16_t **chrPixBuf= c->chrPixBuf;   // ring buffer of horizontally-scaled chroma lines
        const int vLumBufSize= c->vLumBufSize;
        const int vChrBufSize= c->vChrBufSize;
        uint8_t *funnyYCode= c->funnyYCode;
        uint8_t *funnyUVCode= c->funnyUVCode;
        uint8_t *formatConvBuffer= c->formatConvBuffer;
        const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
        /* -((-x)>>n) == ceil(x / 2^n): round the chroma slice height up */
        const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
        int lastDstY;

        /* vars which will change and which we need to store back in the context */
        int dstY= c->dstY;
        int lumBufIndex= c->lumBufIndex;
        int chrBufIndex= c->chrBufIndex;
        int lastInLumBuf= c->lastInLumBuf;
        int lastInChrBuf= c->lastInChrBuf;
        
        /* packed input: all three "planes" alias the same buffer/stride */
        if(isPacked(c->srcFormat)){
                src[0]=
                src[1]=
                src[2]= src[0];
                srcStride[0]=
                srcStride[1]=
                srcStride[2]= srcStride[0];
        }
        /* vChrDrop: skip chroma lines by inflating the chroma strides */
        srcStride[1]<<= c->vChrDrop;
        srcStride[2]<<= c->vChrDrop;

//      printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
//              (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
{
static volatile int i=0;
i++;
if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
i--;
}
#endif

//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//dstStride[0],dstStride[1],dstStride[2]);

        /* the MMX output code stores 8 bytes at a time; warn once if the
           destination strides break that alignment */
        if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
        {
                static int firstTime=1; //FIXME move this into the context perhaps
                if(flags & SWS_PRINT_INFO && firstTime)
                {
                        MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
                                        "SwScaler:          ->cannot do aligned memory acesses anymore\n");
                        firstTime=0;
                }
        }

        /* Note the user might start scaling the picture in the middle so this will not get executed
           this is not really intended but works currently, so ppl might do it */
        if(srcSliceY ==0){
                /* first slice of a new frame: reset all ring-buffer state */
                lumBufIndex=0;
                chrBufIndex=0;
                dstY=0; 
                lastInLumBuf= -1;
                lastInChrBuf= -1;
        }

        lastDstY= dstY;

        for(;dstY < dstH; dstY++){
                unsigned char *dest =dst[0]+dstStride[0]*dstY;
                const int chrDstY= dstY>>c->chrDstVSubSample;
                unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
                unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

                const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
                const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
                const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
                const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
                //handle holes (FAST_BILINEAR & weird filters)
                if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
                if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
                ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

                // Do we have enough lines in this slice to output the dstY line
                if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
                {
                        //Do horizontal scaling
                        while(lastInLumBuf < lastLumSrcY)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
//                              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//                              printf("%d %d\n", lumBufIndex, vLumBufSize);
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf < lastChrSrcY)
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
                                //FIXME replace parameters through context struct (some at least)

                                /* gray-only conversions need no chroma scaling */
                                if(!(isGray(srcFormat) || isGray(dstFormat)))
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                }
                else // not enough lines left in this slice -> load the rest in the buffer
                {
/*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
                        firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
                        lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
                        vChrBufSize, vLumBufSize);*/

                        //Do horizontal scaling
                        while(lastInLumBuf+1 < srcSliceY + srcSliceH)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)

                                if(!(isGray(srcFormat) || isGray(dstFormat)))
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                        break; //we can't output a dstY line so let's try with the next slice
                }

#ifdef HAVE_MMX
                /* per-line dither row selection for the 15/16-bit RGB output paths */
                b5Dither= dither8[dstY&1];
                g6Dither= dither4[dstY&1];
                g5Dither= dither8[dstY&1];
                r5Dither= dither8[(dstY+1)&1];
#endif
            if(dstY < dstH-2)
            {
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
                /* build the interleaved pointer/coefficient table consumed by
                   the MMX vertical-scaling asm: slot 0 = source line pointer,
                   slots 2,3 = the 16-bit coefficient replicated (*0x10001).
                   NOTE(review): casting a pointer to int32_t only works on
                   32-bit targets -- this template predates x86-64. */
                int i;
                for(i=0; i<vLumFilterSize; i++)
                {
                        lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                        lumMmxFilter[4*i+2]= 
                        lumMmxFilter[4*i+3]= 
                                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                }
                for(i=0; i<vChrFilterSize; i++)
                {
                        chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                        chrMmxFilter[4*i+2]= 
                        chrMmxFilter[4*i+3]= 
                                ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
#endif
                if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
                {
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                        {
                                int16_t *lumBuf = lumPixBuf[0];
                                int16_t *chrBuf= chrPixBuf[0];
                                RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                        }
                        else //General YV12
                        {
                                RENAME(yuv2yuvX)(c,
                                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, uDest, vDest, dstW, chrDstW);
                        }
                }
                else
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                        {
                                int chrAlpha= vChrFilter[2*dstY+1];
                                RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, chrAlpha, dstFormat, flags, dstY);
                        }
                        else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                        {
                                int lumAlpha= vLumFilter[2*dstY+1];
                                int chrAlpha= vChrFilter[2*dstY+1];
                                RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, lumAlpha, chrAlpha, dstY);
                        }
                        else //General RGB
                        {
                                RENAME(yuv2packedX)(c,
                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, dstW, dstY);
                        }
                }
            }
            else // hmm looks like we can't use MMX here without overwriting this array's tail
            {
                /* last two output lines: fall back to the plain-C vertical
                   scalers to avoid the MMX code writing past the buffer end */
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
                {
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, uDest, vDest, dstW, chrDstW);
                }
                else
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        yuv2packedXinC(c, 
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, dstW, dstY);
                }
            }
        }

#ifdef HAVE_MMX
        /* flush non-temporal stores (MOVNTQ) and leave MMX state */
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* store changed local vars back in the context */
        c->dstY= dstY;
        c->lumBufIndex= lumBufIndex;
        c->chrBufIndex= chrBufIndex;
        c->lastInLumBuf= lastInLumBuf;
        c->lastInChrBuf= lastInChrBuf;

        return dstY - lastDstY;
}