1 /*
2     Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 #undef REAL_MOVNTQ
20 #undef MOVNTQ
21 #undef PAVGB
22 #undef PREFETCH
23 #undef PREFETCHW
24 #undef EMMS
25 #undef SFENCE
26
27 #ifdef HAVE_3DNOW
28 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
29 #define EMMS     "femms"
30 #else
31 #define EMMS     "emms"
32 #endif
33
34 #ifdef HAVE_3DNOW
35 #define PREFETCH  "prefetch"
36 #define PREFETCHW "prefetchw"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #else
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
43 #endif
44
45 #ifdef HAVE_MMX2
46 #define SFENCE "sfence"
47 #else
48 #define SFENCE "/nop"
49 #endif
50
51 #ifdef HAVE_MMX2
52 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53 #elif defined (HAVE_3DNOW)
54 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
55 #endif
56
57 #ifdef HAVE_MMX2
58 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
59 #else
60 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
61 #endif
62 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
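/* MOVNTQ (and the other plain/REAL_ macro pairs below) adds one level of
   indirection so that arguments such as register-name macros are expanded
   by the preprocessor before REAL_MOVNTQ stringizes them with '#'. */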
63
64 #ifdef HAVE_ALTIVEC
65 #include "swscale_altivec_template.c"
66 #endif
67
68 #define YSCALEYUV2YV12X(x, offset) \
69                         "xor %%"REG_a", %%"REG_a"       \n\t"\
70                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71                         "movq %%mm3, %%mm4              \n\t"\
72                         "lea " offset "(%0), %%"REG_d"  \n\t"\
73                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
74                         ".balign 16                     \n\t" /* FIXME Unroll? */\
75                         "1:                             \n\t"\
76                         "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
77                         "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78                         "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79                         "add $16, %%"REG_d"             \n\t"\
80                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
81                         "test %%"REG_S", %%"REG_S"      \n\t"\
82                         "pmulhw %%mm0, %%mm2            \n\t"\
83                         "pmulhw %%mm0, %%mm5            \n\t"\
84                         "paddw %%mm2, %%mm3             \n\t"\
85                         "paddw %%mm5, %%mm4             \n\t"\
86                         " jnz 1b                        \n\t"\
87                         "psraw $3, %%mm3                \n\t"\
88                         "psraw $3, %%mm4                \n\t"\
89                         "packuswb %%mm4, %%mm3          \n\t"\
90                         MOVNTQ(%%mm3, (%1, %%REGa))\
91                         "add $8, %%"REG_a"              \n\t"\
92                         "cmp %2, %%"REG_a"              \n\t"\
93                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94                         "movq %%mm3, %%mm4              \n\t"\
95                         "lea " offset "(%0), %%"REG_d"  \n\t"\
96                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
97                         "jb 1b                          \n\t"
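/* Roughly what the YSCALEYUV2YV12X loop above computes, as a plain C sketch
   close to the generic C fallback (the name and the exact fixed-point scaling
   are illustrative only; the real code keeps everything in 16-bit registers
   and takes its rounder from VROUNDER_OFFSET in the context): */
#if 0
static inline void yuv2yv12X_ref(int16_t **src, int16_t *filter, int filterSize,
                                 uint8_t *dst, int dstW)
{
        int i, j;
        for(i=0; i<dstW; i++)
        {
                int val= 1<<18; // rounding
                for(j=0; j<filterSize; j++)
                        val += src[j][i]*filter[j];
                val>>= 19;
                if(val<0) val=0;
                else if(val>255) val=255;
                dst[i]= val;
        }
}
#endif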
98
99 #define YSCALEYUV2YV121 \
100                         "mov %2, %%"REG_a"              \n\t"\
101                         ".balign 16                     \n\t" /* FIXME Unroll? */\
102                         "1:                             \n\t"\
103                         "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104                         "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
105                         "psraw $7, %%mm0                \n\t"\
106                         "psraw $7, %%mm1                \n\t"\
107                         "packuswb %%mm1, %%mm0          \n\t"\
108                         MOVNTQ(%%mm0, (%1, %%REGa))\
109                         "add $8, %%"REG_a"              \n\t"\
110                         "jnc 1b                         \n\t"
111
112 /*
113                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
114                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115                            "r" (dest), "m" (dstW),
116                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
118 */
119 #define YSCALEYUV2PACKEDX \
120                 "xor %%"REG_a", %%"REG_a"       \n\t"\
121                 ".balign 16                     \n\t"\
122                 "nop                            \n\t"\
123                 "1:                             \n\t"\
124                 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
126                 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127                 "movq %%mm3, %%mm4              \n\t"\
128                 ".balign 16                     \n\t"\
129                 "2:                             \n\t"\
130                 "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
131                 "movq (%%"REG_S", %%"REG_a"), %%mm2     \n\t" /* UsrcData */\
132                 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133                 "add $16, %%"REG_d"             \n\t"\
134                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
135                 "pmulhw %%mm0, %%mm2            \n\t"\
136                 "pmulhw %%mm0, %%mm5            \n\t"\
137                 "paddw %%mm2, %%mm3             \n\t"\
138                 "paddw %%mm5, %%mm4             \n\t"\
139                 "test %%"REG_S", %%"REG_S"      \n\t"\
140                 " jnz 2b                        \n\t"\
141 \
142                 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
144                 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145                 "movq %%mm1, %%mm7              \n\t"\
146                 ".balign 16                     \n\t"\
147                 "2:                             \n\t"\
148                 "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
149                 "movq (%%"REG_S", %%"REG_a", 2), %%mm2  \n\t" /* Y1srcData */\
150                 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151                 "add $16, %%"REG_d"             \n\t"\
152                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
153                 "pmulhw %%mm0, %%mm2            \n\t"\
154                 "pmulhw %%mm0, %%mm5            \n\t"\
155                 "paddw %%mm2, %%mm1             \n\t"\
156                 "paddw %%mm5, %%mm7             \n\t"\
157                 "test %%"REG_S", %%"REG_S"      \n\t"\
158                 " jnz 2b                        \n\t"\
159
160
161 #define YSCALEYUV2RGBX \
162                 YSCALEYUV2PACKEDX\
163                 "psubw "U_OFFSET"(%0), %%mm3    \n\t" /* (U-128)8*/\
164                 "psubw "V_OFFSET"(%0), %%mm4    \n\t" /* (V-128)8*/\
165                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
166                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
167                 "pmulhw "UG_COEFF"(%0), %%mm3   \n\t"\
168                 "pmulhw "VG_COEFF"(%0), %%mm4   \n\t"\
169         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
170                 "pmulhw "UB_COEFF"(%0), %%mm2   \n\t"\
171                 "pmulhw "VR_COEFF"(%0), %%mm5   \n\t"\
172                 "psubw "Y_OFFSET"(%0), %%mm1    \n\t" /* 8(Y-16)*/\
173                 "psubw "Y_OFFSET"(%0), %%mm7    \n\t" /* 8(Y-16)*/\
174                 "pmulhw "Y_COEFF"(%0), %%mm1    \n\t"\
175                 "pmulhw "Y_COEFF"(%0), %%mm7    \n\t"\
176         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177                 "paddw %%mm3, %%mm4             \n\t"\
178                 "movq %%mm2, %%mm0              \n\t"\
179                 "movq %%mm5, %%mm6              \n\t"\
180                 "movq %%mm4, %%mm3              \n\t"\
181                 "punpcklwd %%mm2, %%mm2         \n\t"\
182                 "punpcklwd %%mm5, %%mm5         \n\t"\
183                 "punpcklwd %%mm4, %%mm4         \n\t"\
184                 "paddw %%mm1, %%mm2             \n\t"\
185                 "paddw %%mm1, %%mm5             \n\t"\
186                 "paddw %%mm1, %%mm4             \n\t"\
187                 "punpckhwd %%mm0, %%mm0         \n\t"\
188                 "punpckhwd %%mm6, %%mm6         \n\t"\
189                 "punpckhwd %%mm3, %%mm3         \n\t"\
190                 "paddw %%mm7, %%mm0             \n\t"\
191                 "paddw %%mm7, %%mm6             \n\t"\
192                 "paddw %%mm7, %%mm3             \n\t"\
193                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194                 "packuswb %%mm0, %%mm2          \n\t"\
195                 "packuswb %%mm6, %%mm5          \n\t"\
196                 "packuswb %%mm3, %%mm4          \n\t"\
197                 "pxor %%mm7, %%mm7              \n\t"
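/* The coefficients fetched from the context above implement the usual BT.601
   YUV->RGB transform in 16-bit fixed point, approximately
       R = 1.164*(Y-16)                 + 1.596*(V-128)
       G = 1.164*(Y-16) - 0.391*(U-128) - 0.813*(V-128)
       B = 1.164*(Y-16) + 2.018*(U-128)
   with the final packuswb providing the saturation to 0..255. */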
198 #if 0
199 #define FULL_YSCALEYUV2RGB \
200                 "pxor %%mm7, %%mm7              \n\t"\
201                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
202                 "punpcklwd %%mm6, %%mm6         \n\t"\
203                 "punpcklwd %%mm6, %%mm6         \n\t"\
204                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
205                 "punpcklwd %%mm5, %%mm5         \n\t"\
206                 "punpcklwd %%mm5, %%mm5         \n\t"\
207                 "xor %%"REG_a", %%"REG_a"               \n\t"\
208                 ".balign 16                     \n\t"\
209                 "1:                             \n\t"\
210                 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211                 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212                 "movq (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
213                 "movq (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
214                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
215                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216                 "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217                 "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
219                 "movq 4096(%2, %%"REG_a",2), %%mm4      \n\t" /* uvbuf0[eax+2048]*/\
220                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
222                 "movq 4096(%3, %%"REG_a",2), %%mm0      \n\t" /* uvbuf1[eax+2048]*/\
223                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224                 "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
225                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
226                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
227                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
228 \
229 \
230                 "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
232                 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
233                 "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
234                 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
235                 "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
236                 "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
237 \
238 \
239                 "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
240                 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
242                 "paddw %%mm1, %%mm3             \n\t" /* B*/\
243                 "paddw %%mm1, %%mm0             \n\t" /* R*/\
244                 "packuswb %%mm3, %%mm3          \n\t"\
245 \
246                 "packuswb %%mm0, %%mm0          \n\t"\
247                 "paddw %%mm4, %%mm2             \n\t"\
248                 "paddw %%mm2, %%mm1             \n\t" /* G*/\
249 \
250                 "packuswb %%mm1, %%mm1          \n\t"
251 #endif
252
253 #define REAL_YSCALEYUV2PACKED(index, c) \
254                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255                 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256                 "psraw $3, %%mm0                \n\t"\
257                 "psraw $3, %%mm1                \n\t"\
258                 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259                 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
260                 "xor "#index", "#index"         \n\t"\
261                 ".balign 16                     \n\t"\
262                 "1:                             \n\t"\
263                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
264                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
265                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
267                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
269                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
270                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272                 "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
273                 "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
274                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
275                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
276                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
277                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
278                 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279                 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
280                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
281                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
282                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
284                 "psraw $7, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>7*/\
285                 "psraw $7, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>7*/\
286                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
288                 
289 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
290                 
291 #define REAL_YSCALEYUV2RGB(index, c) \
292                 "xor "#index", "#index" \n\t"\
293                 ".balign 16                     \n\t"\
294                 "1:                             \n\t"\
295                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
296                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
297                 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298                 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
299                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
301                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
302                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
307                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
308                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
309                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
310                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
311                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
312                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
314         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
316                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
317                 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318                 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
319                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
320                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
321                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
323                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
327                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
330                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
331                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
332                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
333         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334                 "paddw %%mm3, %%mm4             \n\t"\
335                 "movq %%mm2, %%mm0              \n\t"\
336                 "movq %%mm5, %%mm6              \n\t"\
337                 "movq %%mm4, %%mm3              \n\t"\
338                 "punpcklwd %%mm2, %%mm2         \n\t"\
339                 "punpcklwd %%mm5, %%mm5         \n\t"\
340                 "punpcklwd %%mm4, %%mm4         \n\t"\
341                 "paddw %%mm1, %%mm2             \n\t"\
342                 "paddw %%mm1, %%mm5             \n\t"\
343                 "paddw %%mm1, %%mm4             \n\t"\
344                 "punpckhwd %%mm0, %%mm0         \n\t"\
345                 "punpckhwd %%mm6, %%mm6         \n\t"\
346                 "punpckhwd %%mm3, %%mm3         \n\t"\
347                 "paddw %%mm7, %%mm0             \n\t"\
348                 "paddw %%mm7, %%mm6             \n\t"\
349                 "paddw %%mm7, %%mm3             \n\t"\
350                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351                 "packuswb %%mm0, %%mm2          \n\t"\
352                 "packuswb %%mm6, %%mm5          \n\t"\
353                 "packuswb %%mm3, %%mm4          \n\t"\
354                 "pxor %%mm7, %%mm7              \n\t"
355 #define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
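/* YSCALEYUV2RGB (and YSCALEYUV2PACKED above) blend the two input lines before
   converting; per sample they compute roughly
       val = (buf1[i]>>4) + (((buf0[i]-buf1[i])*alpha)>>16)
   i.e. a fixed-point linear interpolation, with alpha taken from the MMX
   filter area of the context (the PACKED variant shifts by 7 instead of 4). */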
356                 
357 #define REAL_YSCALEYUV2PACKED1(index, c) \
358                 "xor "#index", "#index"         \n\t"\
359                 ".balign 16                     \n\t"\
360                 "1:                             \n\t"\
361                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
362                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
363                 "psraw $7, %%mm3                \n\t" \
364                 "psraw $7, %%mm4                \n\t" \
365                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
366                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
367                 "psraw $7, %%mm1                \n\t" \
368                 "psraw $7, %%mm7                \n\t" \
369                 
370 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
371                 
372 #define REAL_YSCALEYUV2RGB1(index, c) \
373                 "xor "#index", "#index" \n\t"\
374                 ".balign 16                     \n\t"\
375                 "1:                             \n\t"\
376                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
377                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
378                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
380                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
381                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
382                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
383                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
384                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
386         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
387                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
388                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
389                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
394                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
395                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
396                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
397         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398                 "paddw %%mm3, %%mm4             \n\t"\
399                 "movq %%mm2, %%mm0              \n\t"\
400                 "movq %%mm5, %%mm6              \n\t"\
401                 "movq %%mm4, %%mm3              \n\t"\
402                 "punpcklwd %%mm2, %%mm2         \n\t"\
403                 "punpcklwd %%mm5, %%mm5         \n\t"\
404                 "punpcklwd %%mm4, %%mm4         \n\t"\
405                 "paddw %%mm1, %%mm2             \n\t"\
406                 "paddw %%mm1, %%mm5             \n\t"\
407                 "paddw %%mm1, %%mm4             \n\t"\
408                 "punpckhwd %%mm0, %%mm0         \n\t"\
409                 "punpckhwd %%mm6, %%mm6         \n\t"\
410                 "punpckhwd %%mm3, %%mm3         \n\t"\
411                 "paddw %%mm7, %%mm0             \n\t"\
412                 "paddw %%mm7, %%mm6             \n\t"\
413                 "paddw %%mm7, %%mm3             \n\t"\
414                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415                 "packuswb %%mm0, %%mm2          \n\t"\
416                 "packuswb %%mm6, %%mm5          \n\t"\
417                 "packuswb %%mm3, %%mm4          \n\t"\
418                 "pxor %%mm7, %%mm7              \n\t"
419 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
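/* The *1 variants below are the fast path used when the second source line
   carries (nearly) zero weight: the two-line blend is skipped and buf0 /
   uvbuf0 are used directly.  The *1b variants additionally average the two
   chroma buffers (see the comment before REAL_YSCALEYUV2RGB1b). */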
420
421 #define REAL_YSCALEYUV2PACKED1b(index, c) \
422                 "xor "#index", "#index"         \n\t"\
423                 ".balign 16                     \n\t"\
424                 "1:                             \n\t"\
425                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
426                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
427                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431                 "psrlw $8, %%mm3                \n\t" \
432                 "psrlw $8, %%mm4                \n\t" \
433                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
434                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
435                 "psraw $7, %%mm1                \n\t" \
436                 "psraw $7, %%mm7                \n\t" 
437 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
438                 
439 // do vertical chrominance interpolation
440 #define REAL_YSCALEYUV2RGB1b(index, c) \
441                 "xor "#index", "#index"         \n\t"\
442                 ".balign 16                     \n\t"\
443                 "1:                             \n\t"\
444                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
445                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
446                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
448                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
450                 "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
451                 "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
452                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
453                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
454                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
455                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
456                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
458         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
459                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
460                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
461                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
463                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
466                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
467                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
468                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
469         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470                 "paddw %%mm3, %%mm4             \n\t"\
471                 "movq %%mm2, %%mm0              \n\t"\
472                 "movq %%mm5, %%mm6              \n\t"\
473                 "movq %%mm4, %%mm3              \n\t"\
474                 "punpcklwd %%mm2, %%mm2         \n\t"\
475                 "punpcklwd %%mm5, %%mm5         \n\t"\
476                 "punpcklwd %%mm4, %%mm4         \n\t"\
477                 "paddw %%mm1, %%mm2             \n\t"\
478                 "paddw %%mm1, %%mm5             \n\t"\
479                 "paddw %%mm1, %%mm4             \n\t"\
480                 "punpckhwd %%mm0, %%mm0         \n\t"\
481                 "punpckhwd %%mm6, %%mm6         \n\t"\
482                 "punpckhwd %%mm3, %%mm3         \n\t"\
483                 "paddw %%mm7, %%mm0             \n\t"\
484                 "paddw %%mm7, %%mm6             \n\t"\
485                 "paddw %%mm7, %%mm3             \n\t"\
486                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487                 "packuswb %%mm0, %%mm2          \n\t"\
488                 "packuswb %%mm6, %%mm5          \n\t"\
489                 "packuswb %%mm3, %%mm4          \n\t"\
490                 "pxor %%mm7, %%mm7              \n\t"
491 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
492
493 #define REAL_WRITEBGR32(dst, dstw, index) \
494                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495                         "movq %%mm2, %%mm1              \n\t" /* B */\
496                         "movq %%mm5, %%mm6              \n\t" /* R */\
497                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
498                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
499                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
500                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
501                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
502                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
503                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
504                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
505                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
506                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
507 \
508                         MOVNTQ(%%mm0, (dst, index, 4))\
509                         MOVNTQ(%%mm2, 8(dst, index, 4))\
510                         MOVNTQ(%%mm1, 16(dst, index, 4))\
511                         MOVNTQ(%%mm3, 24(dst, index, 4))\
512 \
513                         "add $8, "#index"               \n\t"\
514                         "cmp "#dstw", "#index"          \n\t"\
515                         " jb 1b                         \n\t"
516 #define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
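/* WRITEBGR32 interleaves the byte-packed B, G and R registers into 32-bit
   pixels; per pixel the bytes land in memory as
       dst[4*i+0]=B, dst[4*i+1]=G, dst[4*i+2]=R, dst[4*i+3]=0
   with the punpck passes building the eight pixels that are then streamed
   out via MOVNTQ. */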
517
518 #define REAL_WRITEBGR16(dst, dstw, index) \
519                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
520                         "pand "MANGLE(bFC)", %%mm4      \n\t" /* G */\
521                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
522                         "psrlq $3, %%mm2                \n\t"\
523 \
524                         "movq %%mm2, %%mm1              \n\t"\
525                         "movq %%mm4, %%mm3              \n\t"\
526 \
527                         "punpcklbw %%mm7, %%mm3         \n\t"\
528                         "punpcklbw %%mm5, %%mm2         \n\t"\
529                         "punpckhbw %%mm7, %%mm4         \n\t"\
530                         "punpckhbw %%mm5, %%mm1         \n\t"\
531 \
532                         "psllq $3, %%mm3                \n\t"\
533                         "psllq $3, %%mm4                \n\t"\
534 \
535                         "por %%mm3, %%mm2               \n\t"\
536                         "por %%mm4, %%mm1               \n\t"\
537 \
538                         MOVNTQ(%%mm2, (dst, index, 2))\
539                         MOVNTQ(%%mm1, 8(dst, index, 2))\
540 \
541                         "add $8, "#index"               \n\t"\
542                         "cmp "#dstw", "#index"          \n\t"\
543                         " jb 1b                         \n\t"
544 #define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
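/* Scalar equivalent of the 16bpp packing done by WRITEBGR16 (WRITEBGR15 below
   is the same idea with 5 bits of green and red pre-shifted right by one);
   a sketch only, assuming r/g/b are already clipped to 0..255: */
#if 0
static inline uint16_t pack_rgb565(int r, int g, int b)
{
        return ((r&0xF8)<<8) | ((g&0xFC)<<3) | (b>>3);
}
static inline uint16_t pack_rgb555(int r, int g, int b)
{
        return ((r&0xF8)<<7) | ((g&0xF8)<<2) | (b>>3);
}
#endif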
545
546 #define REAL_WRITEBGR15(dst, dstw, index) \
547                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
548                         "pand "MANGLE(bF8)", %%mm4      \n\t" /* G */\
549                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
550                         "psrlq $3, %%mm2                \n\t"\
551                         "psrlq $1, %%mm5                \n\t"\
552 \
553                         "movq %%mm2, %%mm1              \n\t"\
554                         "movq %%mm4, %%mm3              \n\t"\
555 \
556                         "punpcklbw %%mm7, %%mm3         \n\t"\
557                         "punpcklbw %%mm5, %%mm2         \n\t"\
558                         "punpckhbw %%mm7, %%mm4         \n\t"\
559                         "punpckhbw %%mm5, %%mm1         \n\t"\
560 \
561                         "psllq $2, %%mm3                \n\t"\
562                         "psllq $2, %%mm4                \n\t"\
563 \
564                         "por %%mm3, %%mm2               \n\t"\
565                         "por %%mm4, %%mm1               \n\t"\
566 \
567                         MOVNTQ(%%mm2, (dst, index, 2))\
568                         MOVNTQ(%%mm1, 8(dst, index, 2))\
569 \
570                         "add $8, "#index"               \n\t"\
571                         "cmp "#dstw", "#index"          \n\t"\
572                         " jb 1b                         \n\t"
573 #define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
574
575 #define WRITEBGR24OLD(dst, dstw, index) \
576                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577                         "movq %%mm2, %%mm1              \n\t" /* B */\
578                         "movq %%mm5, %%mm6              \n\t" /* R */\
579                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
580                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
581                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
582                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
583                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
584                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
585                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
586                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
587                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
588                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
589 \
590                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
591                         "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
592                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593                         "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
594                         "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
595                         "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
596                         "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
597                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
598 \
599                         "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
600                         "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
601                         "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
602                         "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
603                         "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
604                         "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
605                         "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
606                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607                         "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
608                         "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
609                         "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
610                         "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
611                         "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
612 \
613                         "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
614                         "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
615                         "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
616                         "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617                         "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
618                         "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
619                         "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
620                         "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
621 \
622                         MOVNTQ(%%mm0, (dst))\
623                         MOVNTQ(%%mm2, 8(dst))\
624                         MOVNTQ(%%mm3, 16(dst))\
625                         "add $24, "#dst"                \n\t"\
626 \
627                         "add $8, "#index"               \n\t"\
628                         "cmp "#dstw", "#index"          \n\t"\
629                         " jb 1b                         \n\t"
630
631 #define WRITEBGR24MMX(dst, dstw, index) \
632                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633                         "movq %%mm2, %%mm1              \n\t" /* B */\
634                         "movq %%mm5, %%mm6              \n\t" /* R */\
635                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
636                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
637                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
638                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
639                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
640                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
641                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
642                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
643                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
644                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
645 \
646                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
647                         "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
648                         "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
649                         "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
650 \
651                         "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
652                         "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
653                         "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
654                         "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
655 \
656                         "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
657                         "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
658                         "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
659                         "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
660 \
661                         "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
662                         "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
663                         "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
664                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
665                         MOVNTQ(%%mm0, (dst))\
666 \
667                         "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
668                         "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
669                         "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
670                         "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
671                         MOVNTQ(%%mm6, 8(dst))\
672 \
673                         "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
674                         "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
675                         "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
676                         MOVNTQ(%%mm5, 16(dst))\
677 \
678                         "add $24, "#dst"                \n\t"\
679 \
680                         "add $8, "#index"                       \n\t"\
681                         "cmp "#dstw", "#index"                  \n\t"\
682                         " jb 1b                         \n\t"
683
684 #define WRITEBGR24MMX2(dst, dstw, index) \
685                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686                         "movq "MANGLE(M24A)", %%mm0     \n\t"\
687                         "movq "MANGLE(M24C)", %%mm7     \n\t"\
688                         "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
689                         "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
690                         "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
691 \
692                         "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
693                         "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
694                         "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
695 \
696                         "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
697                         "por %%mm1, %%mm6               \n\t"\
698                         "por %%mm3, %%mm6               \n\t"\
699                         MOVNTQ(%%mm6, (dst))\
700 \
701                         "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
702                         "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
703                         "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
704                         "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
705 \
706                         "pand "MANGLE(M24B)", %%mm1     \n\t" /* B5       B4        B3    */\
707                         "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
708                         "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
709 \
710                         "por %%mm1, %%mm3               \n\t" /* B5    G4 B4     G3 B3    */\
711                         "por %%mm3, %%mm6               \n\t"\
712                         MOVNTQ(%%mm6, 8(dst))\
713 \
714                         "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
715                         "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
716                         "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
717 \
718                         "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
719                         "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
720                         "pand "MANGLE(M24B)", %%mm6     \n\t" /* R7       R6        R5    */\
721 \
722                         "por %%mm1, %%mm3               \n\t"\
723                         "por %%mm3, %%mm6               \n\t"\
724                         MOVNTQ(%%mm6, 16(dst))\
725 \
726                         "add $24, "#dst"                \n\t"\
727 \
728                         "add $8, "#index"               \n\t"\
729                         "cmp "#dstw", "#index"          \n\t"\
730                         " jb 1b                         \n\t"
731
732 #ifdef HAVE_MMX2
733 #undef WRITEBGR24
734 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
735 #else
736 #undef WRITEBGR24
737 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
738 #endif
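/* Both WRITEBGR24 variants emit tightly packed 3-byte pixels (24 bytes per 8
   pixels) in memory order B, G, R.  The plain MMX version assembles them by
   shifting and ORing the 0RGB dwords together; the MMX2 version uses pshufw
   to gather the bytes of each output qword directly. */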
739
740 #define REAL_WRITEYUY2(dst, dstw, index) \
741                         "packuswb %%mm3, %%mm3          \n\t"\
742                         "packuswb %%mm4, %%mm4          \n\t"\
743                         "packuswb %%mm7, %%mm1          \n\t"\
744                         "punpcklbw %%mm4, %%mm3         \n\t"\
745                         "movq %%mm1, %%mm7              \n\t"\
746                         "punpcklbw %%mm3, %%mm1         \n\t"\
747                         "punpckhbw %%mm3, %%mm7         \n\t"\
748 \
749                         MOVNTQ(%%mm1, (dst, index, 2))\
750                         MOVNTQ(%%mm7, 8(dst, index, 2))\
751 \
752                         "add $8, "#index"               \n\t"\
753                         "cmp "#dstw", "#index"          \n\t"\
754                         " jb 1b                         \n\t"
755 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
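/* WRITEYUY2 stores packed 4:2:2, i.e. the byte sequence Y0 U0 Y1 V0 Y2 U1 ...:
   mm3/mm4 carry the horizontally subsampled U and V, mm1/mm7 the eight luma
   samples, and the punpcklbw/punpckhbw pair interleaves them. */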
756
757
758 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
759                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
760                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
761 {
762 #ifdef HAVE_MMX
763         if(uDest != NULL)
764         {
765                 asm volatile(
766                                 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
767                                 :: "r" (&c->redDither),
768                                 "r" (uDest), "m" ((long)chrDstW)
769                                 : "%"REG_a, "%"REG_d, "%"REG_S
770                         );
771
772                 asm volatile(
773                                 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
774                                 :: "r" (&c->redDither),
775                                 "r" (vDest), "m" ((long)chrDstW)
776                                 : "%"REG_a, "%"REG_d, "%"REG_S
777                         );
778         }
779
780         asm volatile(
781                         YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
782                         :: "r" (&c->redDither),
783                            "r" (dest), "m" ((long)dstW)
784                         : "%"REG_a, "%"REG_d, "%"REG_S
785                 );
786 #else
787 #ifdef HAVE_ALTIVEC
788 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
789                       chrFilter, chrSrc, chrFilterSize,
790                       dest, uDest, vDest, dstW, chrDstW);
791 #else //HAVE_ALTIVEC
792 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
793             chrFilter, chrSrc, chrFilterSize,
794             dest, uDest, vDest, dstW, chrDstW);
795 #endif //!HAVE_ALTIVEC
796 #endif
797 }
798
799 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
800                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
801 {
802 #ifdef HAVE_MMX
803         if(uDest != NULL)
804         {
805                 asm volatile(
806                                 YSCALEYUV2YV121
807                                 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
808                                 "g" ((long)-chrDstW)
809                                 : "%"REG_a
810                         );
811
812                 asm volatile(
813                                 YSCALEYUV2YV121
814                                 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
815                                 "g" ((long)-chrDstW)
816                                 : "%"REG_a
817                         );
818         }
819
820         asm volatile(
821                 YSCALEYUV2YV121
822                 :: "r" (lumSrc + dstW), "r" (dest + dstW),
823                 "g" ((long)-dstW)
824                 : "%"REG_a
825         );
826 #else
827         int i;
828         for(i=0; i<dstW; i++)
829         {
830                 int val= lumSrc[i]>>7;
831                 
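                /* val is in the range -256..255 here, so bit 8 is set
                   exactly when val lies outside 0..255 and must be clipped */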
832                 if(val&256){
833                         if(val<0) val=0;
834                         else      val=255;
835                 }
836
837                 dest[i]= val;
838         }
839
840         if(uDest != NULL)
841                 for(i=0; i<chrDstW; i++)
842                 {
843                         int u=chrSrc[i]>>7;
844                         int v=chrSrc[i + 2048]>>7;
845
846                         if((u|v)&256){
847                                 if(u<0)         u=0;
848                                 else if (u>255) u=255;
849                                 if(v<0)         v=0;
850                                 else if (v>255) v=255;
851                         }
852
853                         uDest[i]= u;
854                         vDest[i]= v;
855                 }
856 #endif
857 }
858
859
860 /**
861  * vertical scale YV12 to RGB and other packed formats
862  */
863 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
864                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
865                             uint8_t *dest, int dstW, int dstY)
866 {
867         int dummy=0;
868         switch(c->dstFormat)
869         {
870 #ifdef HAVE_MMX
871         case IMGFMT_BGR32:
872                 {
873                         asm volatile(
874                                 YSCALEYUV2RGBX
875                                 WRITEBGR32(%4, %5, %%REGa)
876
877                         :: "r" (&c->redDither), 
878                            "m" (dummy), "m" (dummy), "m" (dummy),
879                            "r" (dest), "m" (dstW)
880                         : "%"REG_a, "%"REG_d, "%"REG_S
881                         );
882                 }
883                 break;
884         case IMGFMT_BGR24:
885                 {
886                         asm volatile(
887                                 YSCALEYUV2RGBX
888                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
889                                 "add %4, %%"REG_b"                      \n\t"
890                                 WRITEBGR24(%%REGb, %5, %%REGa)
891
892                         :: "r" (&c->redDither), 
893                            "m" (dummy), "m" (dummy), "m" (dummy),
894                            "r" (dest), "m" (dstW)
895                         : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
896                         );
897                 }
898                 break;
899         case IMGFMT_BGR15:
900                 {
901                         asm volatile(
902                                 YSCALEYUV2RGBX
903                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
904 #ifdef DITHER1XBPP
905                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
906                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
907                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
908 #endif
909
910                                 WRITEBGR15(%4, %5, %%REGa)
911
912                         :: "r" (&c->redDither), 
913                            "m" (dummy), "m" (dummy), "m" (dummy),
914                            "r" (dest), "m" (dstW)
915                         : "%"REG_a, "%"REG_d, "%"REG_S
916                         );
917                 }
918                 break;
919         case IMGFMT_BGR16:
920                 {
921                         asm volatile(
922                                 YSCALEYUV2RGBX
923                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
924 #ifdef DITHER1XBPP
925                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
926                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
927                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
928 #endif
929
930                                 WRITEBGR16(%4, %5, %%REGa)
931
932                         :: "r" (&c->redDither), 
933                            "m" (dummy), "m" (dummy), "m" (dummy),
934                            "r" (dest), "m" (dstW)
935                         : "%"REG_a, "%"REG_d, "%"REG_S
936                         );
937                 }
938                 break;
939         case IMGFMT_YUY2:
940                 {
941                         asm volatile(
942                                 YSCALEYUV2PACKEDX
943                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
944
945                                 "psraw $3, %%mm3                \n\t"
946                                 "psraw $3, %%mm4                \n\t"
947                                 "psraw $3, %%mm1                \n\t"
948                                 "psraw $3, %%mm7                \n\t"
949                                 WRITEYUY2(%4, %5, %%REGa)
950
951                         :: "r" (&c->redDither), 
952                            "m" (dummy), "m" (dummy), "m" (dummy),
953                            "r" (dest), "m" (dstW)
954                         : "%"REG_a, "%"REG_d, "%"REG_S
955                         );
956                 }
957                 break;
958 #endif
959         default:
960 #ifdef HAVE_ALTIVEC
961                 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
962                             chrFilter, chrSrc, chrFilterSize,
963                             dest, dstW, dstY);
964 #else
965                 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
966                             chrFilter, chrSrc, chrFilterSize,
967                             dest, dstW, dstY);
968 #endif
969                 break;
970         }
971 }
972
973 /**
974  * vertical bilinear scale YV12 to RGB
975  */
976 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
977                             uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
978 {
979         int yalpha1=yalpha^4095;
980         int uvalpha1=uvalpha^4095;
981         int i;
982
983 #if 0 //isn't used
984         if(flags&SWS_FULL_CHR_H_INT)
985         {
986                 switch(dstFormat)
987                 {
988 #ifdef HAVE_MMX
989                 case IMGFMT_BGR32:
990                         asm volatile(
991
992
993 FULL_YSCALEYUV2RGB
994                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
995                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
996
997                         "movq %%mm3, %%mm1              \n\t"
998                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
999                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1000
1001                         MOVNTQ(%%mm3, (%4, %%REGa, 4))
1002                         MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1003
1004                         "add $4, %%"REG_a"              \n\t"
1005                         "cmp %5, %%"REG_a"              \n\t"
1006                         " jb 1b                         \n\t"
1007
1008
1009                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1010                         "m" (yalpha1), "m" (uvalpha1)
1011                         : "%"REG_a
1012                         );
1013                         break;
1014                 case IMGFMT_BGR24:
1015                         asm volatile(
1016
1017 FULL_YSCALEYUV2RGB
1018
1019                                                                 // lsb ... msb
1020                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
1021                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
1022
1023                         "movq %%mm3, %%mm1              \n\t"
1024                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
1025                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1026
1027                         "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
1028                         "psrlq $8, %%mm3                \n\t" // GR0BGR00
1029                         "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1030                         "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1031                         "por %%mm2, %%mm3               \n\t" // BGRBGR00
1032                         "movq %%mm1, %%mm2              \n\t"
1033                         "psllq $48, %%mm1               \n\t" // 000000BG
1034                         "por %%mm1, %%mm3               \n\t" // BGRBGRBG
1035
1036                         "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
1037                         "psrld $16, %%mm2               \n\t" // R000R000
1038                         "psrlq $24, %%mm1               \n\t" // 0BGR0000
1039                         "por %%mm2, %%mm1               \n\t" // RBGRR000
1040
1041                         "mov %4, %%"REG_b"              \n\t"
1042                         "add %%"REG_a", %%"REG_b"       \n\t"
1043
1044 #ifdef HAVE_MMX2
1045                         //FIXME Alignment
1046                         "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1047                         "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1048 #else
1049                         "movd %%mm3, (%%"REG_b", %%"REG_a", 2)  \n\t"
1050                         "psrlq $32, %%mm3               \n\t"
1051                         "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1052                         "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1053 #endif
1054                         "add $4, %%"REG_a"              \n\t"
1055                         "cmp %5, %%"REG_a"              \n\t"
1056                         " jb 1b                         \n\t"
1057
1058                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1059                         "m" (yalpha1), "m" (uvalpha1)
1060                         : "%"REG_a, "%"REG_b
1061                         );
1062                         break;
1063                 case IMGFMT_BGR15:
1064                         asm volatile(
1065
1066 FULL_YSCALEYUV2RGB
1067 #ifdef DITHER1XBPP
1068                         "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1069                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1070                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1071 #endif
1072                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1073                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1074                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1075
1076                         "psrlw $3, %%mm3                \n\t"
1077                         "psllw $2, %%mm1                \n\t"
1078                         "psllw $7, %%mm0                \n\t"
1079                         "pand "MANGLE(g15Mask)", %%mm1  \n\t"
1080                         "pand "MANGLE(r15Mask)", %%mm0  \n\t"
1081
1082                         "por %%mm3, %%mm1               \n\t"
1083                         "por %%mm1, %%mm0               \n\t"
1084
1085                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1086
1087                         "add $4, %%"REG_a"              \n\t"
1088                         "cmp %5, %%"REG_a"              \n\t"
1089                         " jb 1b                         \n\t"
1090
1091                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1092                         "m" (yalpha1), "m" (uvalpha1)
1093                         : "%"REG_a
1094                         );
1095                         break;
1096                 case IMGFMT_BGR16:
1097                         asm volatile(
1098
1099 FULL_YSCALEYUV2RGB
1100 #ifdef DITHER1XBPP
1101                         "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1102                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1103                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1104 #endif
1105                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1106                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1107                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1108
1109                         "psrlw $3, %%mm3                \n\t"
1110                         "psllw $3, %%mm1                \n\t"
1111                         "psllw $8, %%mm0                \n\t"
1112                         "pand "MANGLE(g16Mask)", %%mm1  \n\t"
1113                         "pand "MANGLE(r16Mask)", %%mm0  \n\t"
1114
1115                         "por %%mm3, %%mm1               \n\t"
1116                         "por %%mm1, %%mm0               \n\t"
1117
1118                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1119
1120                         "add $4, %%"REG_a"              \n\t"
1121                         "cmp %5, %%"REG_a"              \n\t"
1122                         " jb 1b                         \n\t"
1123
1124                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1125                         "m" (yalpha1), "m" (uvalpha1)
1126                         : "%"REG_a
1127                         );
1128                 break;
1129 #endif
1130                 case IMGFMT_RGB32:
1131 #ifndef HAVE_MMX
1132                 case IMGFMT_BGR32:
1133 #endif
1134                 if(dstFormat==IMGFMT_BGR32)
1135                 {
1136                         int i;
1137 #ifdef WORDS_BIGENDIAN
1138                         dest++;
1139 #endif
1140                         for(i=0;i<dstW;i++){
1141                                 // vertical linear interpolation && yuv2rgb in a single step:
1142                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1143                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1144                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1145                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1146                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1147                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1148                                 dest+= 4;
1149                         }
1150                 }
1151                 else if(dstFormat==IMGFMT_BGR24)
1152                 {
1153                         int i;
1154                         for(i=0;i<dstW;i++){
1155                                 // vertical linear interpolation && yuv2rgb in a single step:
1156                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1157                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1158                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1159                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1160                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1161                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1162                                 dest+= 3;
1163                         }
1164                 }
1165                 else if(dstFormat==IMGFMT_BGR16)
1166                 {
1167                         int i;
1168                         for(i=0;i<dstW;i++){
1169                                 // vertical linear interpolation && yuv2rgb in a single step:
1170                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1171                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1172                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1173
1174                                 ((uint16_t*)dest)[i] =
1175                                         clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1176                                         clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1177                                         clip_table16r[(Y + yuvtab_3343[V]) >>13];
1178                         }
1179                 }
1180                 else if(dstFormat==IMGFMT_BGR15)
1181                 {
1182                         int i;
1183                         for(i=0;i<dstW;i++){
1184                                 // vertical linear interpolation && yuv2rgb in a single step:
1185                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1186                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1187                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1188
1189                                 ((uint16_t*)dest)[i] =
1190                                         clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1191                                         clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1192                                         clip_table15r[(Y + yuvtab_3343[V]) >>13];
1193                         }
1194                 }
1195         }//FULL_UV_IPOL
1196         else
1197         {
1198 #endif // if 0
1199 #ifdef HAVE_MMX
1200         switch(c->dstFormat)
1201         {
1202 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
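// In the MMX blocks below %REG_SP is saved into the context (ESP_OFFSET) and temporarily
// reused as the dest pointer, which frees up one more general purpose register; it is
// restored from ESP_OFFSET before each block ends.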
1203         case IMGFMT_BGR32:
1204                         asm volatile(
1205                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1206                                 "mov %4, %%"REG_SP"                     \n\t"
1207                                 YSCALEYUV2RGB(%%REGa, %5)
1208                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1209                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1210
1211                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1212                         "r" (&c->redDither)
1213                         : "%"REG_a
1214                         );
1215                         return;
1216         case IMGFMT_BGR24:
1217                         asm volatile(
1218                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1219                                 "mov %4, %%"REG_SP"                     \n\t"
1220                                 YSCALEYUV2RGB(%%REGa, %5)
1221                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1222                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1223                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1224                         "r" (&c->redDither)
1225                         : "%"REG_a
1226                         );
1227                         return;
1228         case IMGFMT_BGR15:
1229                         asm volatile(
1230                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1231                                 "mov %4, %%"REG_SP"                     \n\t"
1232                                 YSCALEYUV2RGB(%%REGa, %5)
1233                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1234 #ifdef DITHER1XBPP
1235                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1236                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1237                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1238 #endif
1239
1240                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1241                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1242
1243                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1244                         "r" (&c->redDither)
1245                         : "%"REG_a
1246                         );
1247                         return;
1248         case IMGFMT_BGR16:
1249                         asm volatile(
1250                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1251                                 "mov %4, %%"REG_SP"                     \n\t"
1252                                 YSCALEYUV2RGB(%%REGa, %5)
1253                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1254 #ifdef DITHER1XBPP
1255                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1256                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1257                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1258 #endif
1259
1260                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1261                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1262                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1263                         "r" (&c->redDither)
1264                         : "%"REG_a
1265                         );
1266                         return;
1267         case IMGFMT_YUY2:
1268                         asm volatile(
1269                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1270                                 "mov %4, %%"REG_SP"                     \n\t"
1271                                 YSCALEYUV2PACKED(%%REGa, %5)
1272                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1273                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1274                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1275                         "r" (&c->redDither)
1276                         : "%"REG_a
1277                         );
1278                         return;
1279         default: break;
1280         }
1281 #endif //HAVE_MMX
1282 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1283 }
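/* A minimal illustrative sketch (never compiled in, helper name made up): the plain-C
   per-pixel vertical blend that the MMX paths above and YSCALE_YUV_2_RGB2_C perform.
   It assumes the line buffers hold 8-bit samples with 7 fractional bits (see hScale
   below, val>>7 clipped to (1<<15)-1) and 12-bit blend weights, hence the >>19. */
#if 0
static inline int vblend_sample(uint16_t *line0, uint16_t *line1, int i, int alpha)
{
        int alpha1= alpha^4095;                         // == 4095-alpha, the complementary weight
        return (line0[i]*alpha1 + line1[i]*alpha)>>19;  // back to the 8-bit range
}
#endif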
1284
1285 /**
1286  * YV12 to RGB without scaling or interpolating
1287  */
1288 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1289                             uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1290 {
1291         const int yalpha1=0;
1292         int i;
1293         
1294         uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1295         const int yalpha= 4096; //FIXME ...
1296
1297         if(flags&SWS_FULL_CHR_H_INT)
1298         {
1299                 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1300                 return;
1301         }
1302
1303 #ifdef HAVE_MMX
1304         if( uvalpha < 2048 ) // note: this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster
1305         {
1306                 switch(dstFormat)
1307                 {
1308                 case IMGFMT_BGR32:
1309                         asm volatile(
1310                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1311                                 "mov %4, %%"REG_SP"                     \n\t"
1312                                 YSCALEYUV2RGB1(%%REGa, %5)
1313                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1314                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1315
1316                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1317                         "r" (&c->redDither)
1318                         : "%"REG_a
1319                         );
1320                         return;
1321                 case IMGFMT_BGR24:
1322                         asm volatile(
1323                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1324                                 "mov %4, %%"REG_SP"                     \n\t"
1325                                 YSCALEYUV2RGB1(%%REGa, %5)
1326                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1327                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1328
1329                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1330                         "r" (&c->redDither)
1331                         : "%"REG_a
1332                         );
1333                         return;
1334                 case IMGFMT_BGR15:
1335                         asm volatile(
1336                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1337                                 "mov %4, %%"REG_SP"                     \n\t"
1338                                 YSCALEYUV2RGB1(%%REGa, %5)
1339                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1340 #ifdef DITHER1XBPP
1341                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1342                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1343                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1344 #endif
1345                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1346                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1347
1348                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1349                         "r" (&c->redDither)
1350                         : "%"REG_a
1351                         );
1352                         return;
1353                 case IMGFMT_BGR16:
1354                         asm volatile(
1355                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1356                                 "mov %4, %%"REG_SP"                     \n\t"
1357                                 YSCALEYUV2RGB1(%%REGa, %5)
1358                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1359 #ifdef DITHER1XBPP
1360                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1361                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1362                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1363 #endif
1364
1365                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1366                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1367
1368                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1369                         "r" (&c->redDither)
1370                         : "%"REG_a
1371                         );
1372                         return;
1373                 case IMGFMT_YUY2:
1374                         asm volatile(
1375                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1376                                 "mov %4, %%"REG_SP"                     \n\t"
1377                                 YSCALEYUV2PACKED1(%%REGa, %5)
1378                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1379                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1380
1381                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1382                         "r" (&c->redDither)
1383                         : "%"REG_a
1384                         );
1385                         return;
1386                 }
1387         }
1388         else
1389         {
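                // uvalpha is not below the midpoint, so the *1b variants are used;
                // these blend uvbuf0 and uvbuf1 instead of relying on uvbuf0 alone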
1390                 switch(dstFormat)
1391                 {
1392                 case IMGFMT_BGR32:
1393                         asm volatile(
1394                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1395                                 "mov %4, %%"REG_SP"                     \n\t"
1396                                 YSCALEYUV2RGB1b(%%REGa, %5)
1397                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1398                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1399
1400                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1401                         "r" (&c->redDither)
1402                         : "%"REG_a
1403                         );
1404                         return;
1405                 case IMGFMT_BGR24:
1406                         asm volatile(
1407                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1408                                 "mov %4, %%"REG_SP"                     \n\t"
1409                                 YSCALEYUV2RGB1b(%%REGa, %5)
1410                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1411                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1412
1413                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1414                         "r" (&c->redDither)
1415                         : "%"REG_a
1416                         );
1417                         return;
1418                 case IMGFMT_BGR15:
1419                         asm volatile(
1420                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1421                                 "mov %4, %%"REG_SP"                     \n\t"
1422                                 YSCALEYUV2RGB1b(%%REGa, %5)
1423                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1424 #ifdef DITHER1XBPP
1425                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1426                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1427                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1428 #endif
1429                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1430                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1431
1432                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1433                         "r" (&c->redDither)
1434                         : "%"REG_a
1435                         );
1436                         return;
1437                 case IMGFMT_BGR16:
1438                         asm volatile(
1439                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1440                                 "mov %4, %%"REG_SP"                     \n\t"
1441                                 YSCALEYUV2RGB1b(%%REGa, %5)
1442                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1443 #ifdef DITHER1XBPP
1444                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1445                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1446                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1447 #endif
1448
1449                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1450                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1451
1452                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1453                         "r" (&c->redDither)
1454                         : "%"REG_a
1455                         );
1456                         return;
1457                 case IMGFMT_YUY2:
1458                         asm volatile(
1459                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1460                                 "mov %4, %%"REG_SP"                     \n\t"
1461                                 YSCALEYUV2PACKED1b(%%REGa, %5)
1462                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1463                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1464
1465                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1466                         "r" (&c->redDither)
1467                         : "%"REG_a
1468                         );
1469                         return;
1470                 }
1471         }
1472 #endif
1473         if( uvalpha < 2048 )
1474         {
1475                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1476         }else{
1477                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1478         }
1479 }
1480
1481 //FIXME yuy2* can read up to 7 samples too many
1482
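/* Packed 4:2:2 byte order, as assumed by the converters below:
   YUY2: Y0 U0 Y1 V0 ...   (luma at even bytes, U/V at odd bytes)
   UYVY: U0 Y0 V0 Y1 ...   (luma at odd bytes, U/V at even bytes) */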
1483 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1484 {
1485 #ifdef HAVE_MMX
1486         asm volatile(
1487                 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1488                 "mov %0, %%"REG_a"              \n\t"
1489                 "1:                             \n\t"
1490                 "movq (%1, %%"REG_a",2), %%mm0  \n\t"
1491                 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1492                 "pand %%mm2, %%mm0              \n\t"
1493                 "pand %%mm2, %%mm1              \n\t"
1494                 "packuswb %%mm1, %%mm0          \n\t"
1495                 "movq %%mm0, (%2, %%"REG_a")    \n\t"
1496                 "add $8, %%"REG_a"              \n\t"
1497                 " js 1b                         \n\t"
1498                 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1499                 : "%"REG_a
1500         );
1501 #else
1502         int i;
1503         for(i=0; i<width; i++)
1504                 dst[i]= src[2*i];
1505 #endif
1506 }
1507
1508 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1509 {
1510 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1511         asm volatile(
1512                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1513                 "mov %0, %%"REG_a"              \n\t"
1514                 "1:                             \n\t"
1515                 "movq (%1, %%"REG_a",4), %%mm0  \n\t"
1516                 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1517                 "movq (%2, %%"REG_a",4), %%mm2  \n\t"
1518                 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1519                 PAVGB(%%mm2, %%mm0)
1520                 PAVGB(%%mm3, %%mm1)
1521                 "psrlw $8, %%mm0                \n\t"
1522                 "psrlw $8, %%mm1                \n\t"
1523                 "packuswb %%mm1, %%mm0          \n\t"
1524                 "movq %%mm0, %%mm1              \n\t"
1525                 "psrlw $8, %%mm0                \n\t"
1526                 "pand %%mm4, %%mm1              \n\t"
1527                 "packuswb %%mm0, %%mm0          \n\t"
1528                 "packuswb %%mm1, %%mm1          \n\t"
1529                 "movd %%mm0, (%4, %%"REG_a")    \n\t"
1530                 "movd %%mm1, (%3, %%"REG_a")    \n\t"
1531                 "add $4, %%"REG_a"              \n\t"
1532                 " js 1b                         \n\t"
1533                 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1534                 : "%"REG_a
1535         );
1536 #else
1537         int i;
1538         for(i=0; i<width; i++)
1539         {
1540                 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1541                 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1542         }
1543 #endif
1544 }
1545
1546 //this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1547 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
1548 {
1549 #ifdef HAVE_MMX
1550         asm volatile(
1551                 "mov %0, %%"REG_a"              \n\t"
1552                 "1:                             \n\t"
1553                 "movq (%1, %%"REG_a",2), %%mm0  \n\t"
1554                 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1555                 "psrlw $8, %%mm0                \n\t"
1556                 "psrlw $8, %%mm1                \n\t"
1557                 "packuswb %%mm1, %%mm0          \n\t"
1558                 "movq %%mm0, (%2, %%"REG_a")    \n\t"
1559                 "add $8, %%"REG_a"              \n\t"
1560                 " js 1b                         \n\t"
1561                 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1562                 : "%"REG_a
1563         );
1564 #else
1565         int i;
1566         for(i=0; i<width; i++)
1567                 dst[i]= src[2*i+1];
1568 #endif
1569 }
1570
1571 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1572 {
1573 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1574         asm volatile(
1575                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1576                 "mov %0, %%"REG_a"              \n\t"
1577                 "1:                             \n\t"
1578                 "movq (%1, %%"REG_a",4), %%mm0  \n\t"
1579                 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1580                 "movq (%2, %%"REG_a",4), %%mm2  \n\t"
1581                 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1582                 PAVGB(%%mm2, %%mm0)
1583                 PAVGB(%%mm3, %%mm1)
1584                 "pand %%mm4, %%mm0              \n\t"
1585                 "pand %%mm4, %%mm1              \n\t"
1586                 "packuswb %%mm1, %%mm0          \n\t"
1587                 "movq %%mm0, %%mm1              \n\t"
1588                 "psrlw $8, %%mm0                \n\t"
1589                 "pand %%mm4, %%mm1              \n\t"
1590                 "packuswb %%mm0, %%mm0          \n\t"
1591                 "packuswb %%mm1, %%mm1          \n\t"
1592                 "movd %%mm0, (%4, %%"REG_a")    \n\t"
1593                 "movd %%mm1, (%3, %%"REG_a")    \n\t"
1594                 "add $4, %%"REG_a"              \n\t"
1595                 " js 1b                         \n\t"
1596                 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1597                 : "%"REG_a
1598         );
1599 #else
1600         int i;
1601         for(i=0; i<width; i++)
1602         {
1603                 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1604                 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
1605         }
1606 #endif
1607 }
1608
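/* The plain-C luma conversions below compute
   Y = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
   the 33<<(RGB2YUV_SHIFT-1) term equals 16.5<<RGB2YUV_SHIFT, i.e. the +16 luma
   offset plus 0.5 of rounding. */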
1609 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1610 {
1611 #ifdef HAVE_MMXFIXME
1612 #else
1613         int i;
1614         for(i=0; i<width; i++)
1615         {
1616                 int b=  ((uint32_t*)src)[i]&0xFF;
1617                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1618                 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1619
1620                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1621         }
1622 #endif
1623 }
1624
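/* The chroma C fall-backs average a 2x2 block (two pixels from each source line).
   The masked sums keep red+blue (l) and green (h) in separate bit fields, so the four
   per-channel sums (at most 4*255 = 1020, i.e. 10 bits) cannot spill into each other;
   the extra +2 in the shift divides the sums back down by 4. */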
1625 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1626 {
1627 #ifdef HAVE_MMXFIXME
1628 #else
1629         int i;
1630         for(i=0; i<width; i++)
1631         {
1632                 const int a= ((uint32_t*)src1)[2*i+0];
1633                 const int e= ((uint32_t*)src1)[2*i+1];
1634                 const int c= ((uint32_t*)src2)[2*i+0];
1635                 const int d= ((uint32_t*)src2)[2*i+1];
1636                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1637                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1638                 const int b=  l&0x3FF;
1639                 const int g=  h>>8;
1640                 const int r=  l>>16;
1641
1642                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1643                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1644         }
1645 #endif
1646 }
1647
1648 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1649 {
1650 #ifdef HAVE_MMX
1651         asm volatile(
1652                 "mov %2, %%"REG_a"              \n\t"
1653                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1654                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1655                 "pxor %%mm7, %%mm7              \n\t"
1656                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1657                 ".balign 16                     \n\t"
1658                 "1:                             \n\t"
1659                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
1660                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
1661                 "movd 3(%0, %%"REG_b"), %%mm1   \n\t"
1662                 "punpcklbw %%mm7, %%mm0         \n\t"
1663                 "punpcklbw %%mm7, %%mm1         \n\t"
1664                 "movd 6(%0, %%"REG_b"), %%mm2   \n\t"
1665                 "movd 9(%0, %%"REG_b"), %%mm3   \n\t"
1666                 "punpcklbw %%mm7, %%mm2         \n\t"
1667                 "punpcklbw %%mm7, %%mm3         \n\t"
1668                 "pmaddwd %%mm6, %%mm0           \n\t"
1669                 "pmaddwd %%mm6, %%mm1           \n\t"
1670                 "pmaddwd %%mm6, %%mm2           \n\t"
1671                 "pmaddwd %%mm6, %%mm3           \n\t"
1672 #ifndef FAST_BGR2YV12
1673                 "psrad $8, %%mm0                \n\t"
1674                 "psrad $8, %%mm1                \n\t"
1675                 "psrad $8, %%mm2                \n\t"
1676                 "psrad $8, %%mm3                \n\t"
1677 #endif
1678                 "packssdw %%mm1, %%mm0          \n\t"
1679                 "packssdw %%mm3, %%mm2          \n\t"
1680                 "pmaddwd %%mm5, %%mm0           \n\t"
1681                 "pmaddwd %%mm5, %%mm2           \n\t"
1682                 "packssdw %%mm2, %%mm0          \n\t"
1683                 "psraw $7, %%mm0                \n\t"
1684
1685                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
1686                 "movd 15(%0, %%"REG_b"), %%mm1  \n\t"
1687                 "punpcklbw %%mm7, %%mm4         \n\t"
1688                 "punpcklbw %%mm7, %%mm1         \n\t"
1689                 "movd 18(%0, %%"REG_b"), %%mm2  \n\t"
1690                 "movd 21(%0, %%"REG_b"), %%mm3  \n\t"
1691                 "punpcklbw %%mm7, %%mm2         \n\t"
1692                 "punpcklbw %%mm7, %%mm3         \n\t"
1693                 "pmaddwd %%mm6, %%mm4           \n\t"
1694                 "pmaddwd %%mm6, %%mm1           \n\t"
1695                 "pmaddwd %%mm6, %%mm2           \n\t"
1696                 "pmaddwd %%mm6, %%mm3           \n\t"
1697 #ifndef FAST_BGR2YV12
1698                 "psrad $8, %%mm4                \n\t"
1699                 "psrad $8, %%mm1                \n\t"
1700                 "psrad $8, %%mm2                \n\t"
1701                 "psrad $8, %%mm3                \n\t"
1702 #endif
1703                 "packssdw %%mm1, %%mm4          \n\t"
1704                 "packssdw %%mm3, %%mm2          \n\t"
1705                 "pmaddwd %%mm5, %%mm4           \n\t"
1706                 "pmaddwd %%mm5, %%mm2           \n\t"
1707                 "add $24, %%"REG_b"             \n\t"
1708                 "packssdw %%mm2, %%mm4          \n\t"
1709                 "psraw $7, %%mm4                \n\t"
1710
1711                 "packuswb %%mm4, %%mm0          \n\t"
1712                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1713
1714                 "movq %%mm0, (%1, %%"REG_a")    \n\t"
1715                 "add $8, %%"REG_a"              \n\t"
1716                 " js 1b                         \n\t"
1717                 : : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
1718                 : "%"REG_a, "%"REG_b
1719         );
1720 #else
1721         int i;
1722         for(i=0; i<width; i++)
1723         {
1724                 int b= src[i*3+0];
1725                 int g= src[i*3+1];
1726                 int r= src[i*3+2];
1727
1728                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1729         }
1730 #endif
1731 }
1732
1733 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1734 {
1735 #ifdef HAVE_MMX
1736         asm volatile(
1737                 "mov %4, %%"REG_a"              \n\t"
1738                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1739                 "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1740                 "pxor %%mm7, %%mm7              \n\t"
1741                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"       \n\t"
1742                 "add %%"REG_b", %%"REG_b"       \n\t"
1743                 ".balign 16                     \n\t"
1744                 "1:                             \n\t"
1745                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
1746                 PREFETCH" 64(%1, %%"REG_b")     \n\t"
1747 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1748                 "movq (%0, %%"REG_b"), %%mm0    \n\t"
1749                 "movq (%1, %%"REG_b"), %%mm1    \n\t"
1750                 "movq 6(%0, %%"REG_b"), %%mm2   \n\t"
1751                 "movq 6(%1, %%"REG_b"), %%mm3   \n\t"
1752                 PAVGB(%%mm1, %%mm0)
1753                 PAVGB(%%mm3, %%mm2)
1754                 "movq %%mm0, %%mm1              \n\t"
1755                 "movq %%mm2, %%mm3              \n\t"
1756                 "psrlq $24, %%mm0               \n\t"
1757                 "psrlq $24, %%mm2               \n\t"
1758                 PAVGB(%%mm1, %%mm0)
1759                 PAVGB(%%mm3, %%mm2)
1760                 "punpcklbw %%mm7, %%mm0         \n\t"
1761                 "punpcklbw %%mm7, %%mm2         \n\t"
1762 #else
1763                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
1764                 "movd (%1, %%"REG_b"), %%mm1    \n\t"
1765                 "movd 3(%0, %%"REG_b"), %%mm2   \n\t"
1766                 "movd 3(%1, %%"REG_b"), %%mm3   \n\t"
1767                 "punpcklbw %%mm7, %%mm0         \n\t"
1768                 "punpcklbw %%mm7, %%mm1         \n\t"
1769                 "punpcklbw %%mm7, %%mm2         \n\t"
1770                 "punpcklbw %%mm7, %%mm3         \n\t"
1771                 "paddw %%mm1, %%mm0             \n\t"
1772                 "paddw %%mm3, %%mm2             \n\t"
1773                 "paddw %%mm2, %%mm0             \n\t"
1774                 "movd 6(%0, %%"REG_b"), %%mm4   \n\t"
1775                 "movd 6(%1, %%"REG_b"), %%mm1   \n\t"
1776                 "movd 9(%0, %%"REG_b"), %%mm2   \n\t"
1777                 "movd 9(%1, %%"REG_b"), %%mm3   \n\t"
1778                 "punpcklbw %%mm7, %%mm4         \n\t"
1779                 "punpcklbw %%mm7, %%mm1         \n\t"
1780                 "punpcklbw %%mm7, %%mm2         \n\t"
1781                 "punpcklbw %%mm7, %%mm3         \n\t"
1782                 "paddw %%mm1, %%mm4             \n\t"
1783                 "paddw %%mm3, %%mm2             \n\t"
1784                 "paddw %%mm4, %%mm2             \n\t"
1785                 "psrlw $2, %%mm0                \n\t"
1786                 "psrlw $2, %%mm2                \n\t"
1787 #endif
1788                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1789                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1790                 
1791                 "pmaddwd %%mm0, %%mm1           \n\t"
1792                 "pmaddwd %%mm2, %%mm3           \n\t"
1793                 "pmaddwd %%mm6, %%mm0           \n\t"
1794                 "pmaddwd %%mm6, %%mm2           \n\t"
1795 #ifndef FAST_BGR2YV12
1796                 "psrad $8, %%mm0                \n\t"
1797                 "psrad $8, %%mm1                \n\t"
1798                 "psrad $8, %%mm2                \n\t"
1799                 "psrad $8, %%mm3                \n\t"
1800 #endif
1801                 "packssdw %%mm2, %%mm0          \n\t"
1802                 "packssdw %%mm3, %%mm1          \n\t"
1803                 "pmaddwd %%mm5, %%mm0           \n\t"
1804                 "pmaddwd %%mm5, %%mm1           \n\t"
1805                 "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1806                 "psraw $7, %%mm0                \n\t"
1807
1808 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1809                 "movq 12(%0, %%"REG_b"), %%mm4  \n\t"
1810                 "movq 12(%1, %%"REG_b"), %%mm1  \n\t"
1811                 "movq 18(%0, %%"REG_b"), %%mm2  \n\t"
1812                 "movq 18(%1, %%"REG_b"), %%mm3  \n\t"
1813                 PAVGB(%%mm1, %%mm4)
1814                 PAVGB(%%mm3, %%mm2)
1815                 "movq %%mm4, %%mm1              \n\t"
1816                 "movq %%mm2, %%mm3              \n\t"
1817                 "psrlq $24, %%mm4               \n\t"
1818                 "psrlq $24, %%mm2               \n\t"
1819                 PAVGB(%%mm1, %%mm4)
1820                 PAVGB(%%mm3, %%mm2)
1821                 "punpcklbw %%mm7, %%mm4         \n\t"
1822                 "punpcklbw %%mm7, %%mm2         \n\t"
1823 #else
1824                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
1825                 "movd 12(%1, %%"REG_b"), %%mm1  \n\t"
1826                 "movd 15(%0, %%"REG_b"), %%mm2  \n\t"
1827                 "movd 15(%1, %%"REG_b"), %%mm3  \n\t"
1828                 "punpcklbw %%mm7, %%mm4         \n\t"
1829                 "punpcklbw %%mm7, %%mm1         \n\t"
1830                 "punpcklbw %%mm7, %%mm2         \n\t"
1831                 "punpcklbw %%mm7, %%mm3         \n\t"
1832                 "paddw %%mm1, %%mm4             \n\t"
1833                 "paddw %%mm3, %%mm2             \n\t"
1834                 "paddw %%mm2, %%mm4             \n\t"
1835                 "movd 18(%0, %%"REG_b"), %%mm5  \n\t"
1836                 "movd 18(%1, %%"REG_b"), %%mm1  \n\t"
1837                 "movd 21(%0, %%"REG_b"), %%mm2  \n\t"
1838                 "movd 21(%1, %%"REG_b"), %%mm3  \n\t"
1839                 "punpcklbw %%mm7, %%mm5         \n\t"
1840                 "punpcklbw %%mm7, %%mm1         \n\t"
1841                 "punpcklbw %%mm7, %%mm2         \n\t"
1842                 "punpcklbw %%mm7, %%mm3         \n\t"
1843                 "paddw %%mm1, %%mm5             \n\t"
1844                 "paddw %%mm3, %%mm2             \n\t"
1845                 "paddw %%mm5, %%mm2             \n\t"
1846                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1847                 "psrlw $2, %%mm4                \n\t"
1848                 "psrlw $2, %%mm2                \n\t"
1849 #endif
1850                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1851                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1852                 
1853                 "pmaddwd %%mm4, %%mm1           \n\t"
1854                 "pmaddwd %%mm2, %%mm3           \n\t"
1855                 "pmaddwd %%mm6, %%mm4           \n\t"
1856                 "pmaddwd %%mm6, %%mm2           \n\t"
1857 #ifndef FAST_BGR2YV12
1858                 "psrad $8, %%mm4                \n\t"
1859                 "psrad $8, %%mm1                \n\t"
1860                 "psrad $8, %%mm2                \n\t"
1861                 "psrad $8, %%mm3                \n\t"
1862 #endif
1863                 "packssdw %%mm2, %%mm4          \n\t"
1864                 "packssdw %%mm3, %%mm1          \n\t"
1865                 "pmaddwd %%mm5, %%mm4           \n\t"
1866                 "pmaddwd %%mm5, %%mm1           \n\t"
1867                 "add $24, %%"REG_b"             \n\t"
1868                 "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1869                 "psraw $7, %%mm4                \n\t"
1870                 
1871                 "movq %%mm0, %%mm1              \n\t"
1872                 "punpckldq %%mm4, %%mm0         \n\t"
1873                 "punpckhdq %%mm4, %%mm1         \n\t"
1874                 "packsswb %%mm1, %%mm0          \n\t"
1875                 "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
1876
1877                 "movd %%mm0, (%2, %%"REG_a")    \n\t"
1878                 "punpckhdq %%mm0, %%mm0         \n\t"
1879                 "movd %%mm0, (%3, %%"REG_a")    \n\t"
1880                 "add $4, %%"REG_a"              \n\t"
1881                 " js 1b                         \n\t"
1882                 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
1883                 : "%"REG_a, "%"REG_b
1884         );
1885 #else
1886         int i;
1887         for(i=0; i<width; i++)
1888         {
1889                 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1890                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1891                 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1892
1893                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1894                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1895         }
1896 #endif
1897 }
1898
1899 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1900 {
1901         int i;
1902         for(i=0; i<width; i++)
1903         {
1904                 int d= ((uint16_t*)src)[i];
1905                 int b= d&0x1F;
1906                 int g= (d>>5)&0x3F;
1907                 int r= (d>>11)&0x1F;
1908
1909                 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1910         }
1911 }
1912
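/* bgr16ToUV/bgr15ToUV below also average a 2x2 block per chroma sample: each 32-bit load
   fetches two adjacent 16-bit pixels, and the masked additions keep the per-channel sums
   in separate bit fields until they are extracted from d; the coefficient scaling and the
   adjusted shifts compensate for the 5/6-bit channel depth and the 4-pixel sum. */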
1913 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1914 {
1915         int i;
1916         for(i=0; i<width; i++)
1917         {
1918                 int d0= ((uint32_t*)src1)[i];
1919                 int d1= ((uint32_t*)src2)[i];
1920                 
1921                 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1922                 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1923
1924                 int dh2= (dh>>11) + (dh<<21);
1925                 int d= dh2 + dl;
1926
1927                 int b= d&0x7F;
1928                 int r= (d>>11)&0x7F;
1929                 int g= d>>21;
1930                 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1931                 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1932         }
1933 }
1934
1935 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1936 {
1937         int i;
1938         for(i=0; i<width; i++)
1939         {
1940                 int d= ((uint16_t*)src)[i];
1941                 int b= d&0x1F;
1942                 int g= (d>>5)&0x1F;
1943                 int r= (d>>10)&0x1F;
1944
1945                 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1946         }
1947 }
1948
1949 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1950 {
1951         int i;
1952         for(i=0; i<width; i++)
1953         {
1954                 int d0= ((uint32_t*)src1)[i];
1955                 int d1= ((uint32_t*)src2)[i];
1956                 
1957                 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1958                 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1959
1960                 int dh2= (dh>>11) + (dh<<21);
1961                 int d= dh2 + dl;
1962
1963                 int b= d&0x7F;
1964                 int r= (d>>10)&0x7F;
1965                 int g= d>>21;
1966                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1967                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1968         }
1969 }
1970
1971
1972 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1973 {
1974         int i;
1975         for(i=0; i<width; i++)
1976         {
1977                 int r=  ((uint32_t*)src)[i]&0xFF;
1978                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1979                 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1980
1981                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1982         }
1983 }
1984
1985 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1986 {
1987         int i;
1988         for(i=0; i<width; i++)
1989         {
1990                 const int a= ((uint32_t*)src1)[2*i+0];
1991                 const int e= ((uint32_t*)src1)[2*i+1];
1992                 const int c= ((uint32_t*)src2)[2*i+0];
1993                 const int d= ((uint32_t*)src2)[2*i+1];
1994                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1995                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1996                 const int r=  l&0x3FF;
1997                 const int g=  h>>8;
1998                 const int b=  l>>16;
1999
2000                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2001                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2002         }
2003 }
2004
2005 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2006 {
2007         int i;
2008         for(i=0; i<width; i++)
2009         {
2010                 int r= src[i*3+0];
2011                 int g= src[i*3+1];
2012                 int b= src[i*3+2];
2013
2014                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2015         }
2016 }
2017
2018 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2019 {
2020         int i;
2021         for(i=0; i<width; i++)
2022         {
2023                 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2024                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2025                 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2026
2027                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2028                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2029         }
2030 }
2031
2032
2033 // Bilinear / Bicubic scaling
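/* For each output pixel i, filterSize signed 16-bit coefficients starting at
   filter[filterSize*i] are applied to the source samples starting at filterPos[i];
   the accumulated value is shifted right by 7 and clipped to 0..(1<<15)-1, so the
   destination line keeps 7 fractional bits. The MMX code special-cases filterSize
   4 and 8 and uses a generic inner loop otherwise. */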
2034 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2035                                   int16_t *filter, int16_t *filterPos, int filterSize)
2036 {
2037 #ifdef HAVE_MMX
2038         assert(filterSize % 4 == 0 && filterSize>0);
2039         if(filterSize==4) // always true for upscaling, sometimes for downscaling too
2040         {
2041                 long counter= -2*dstW;
2042                 filter-= counter*2;
2043                 filterPos-= counter/2;
2044                 dst-= counter/2;
2045                 asm volatile(
2046                         "pxor %%mm7, %%mm7              \n\t"
2047                         "movq "MANGLE(w02)", %%mm6      \n\t"
2048                         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2049                         "mov %%"REG_a", %%"REG_BP"      \n\t"
2050                         ".balign 16                     \n\t"
2051                         "1:                             \n\t"
2052                         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2053                         "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2054                         "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2055                         "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2056                         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2057                         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2058                         "punpcklbw %%mm7, %%mm0         \n\t"
2059                         "punpcklbw %%mm7, %%mm2         \n\t"
2060                         "pmaddwd %%mm1, %%mm0           \n\t"
2061                         "pmaddwd %%mm2, %%mm3           \n\t"
2062                         "psrad $8, %%mm0                \n\t"
2063                         "psrad $8, %%mm3                \n\t"
2064                         "packssdw %%mm3, %%mm0          \n\t"
2065                         "pmaddwd %%mm6, %%mm0           \n\t"
2066                         "packssdw %%mm0, %%mm0          \n\t"
2067                         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2068                         "add $4, %%"REG_BP"             \n\t"
2069                         " jnc 1b                        \n\t"
2070
2071                         "pop %%"REG_BP"                 \n\t"
2072                         : "+a" (counter)
2073                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2074                         : "%"REG_b
2075                 );
2076         }
2077         else if(filterSize==8)
2078         {
2079                 long counter= -2*dstW;
2080                 filter-= counter*4;
2081                 filterPos-= counter/2;
2082                 dst-= counter/2;
2083                 asm volatile(
2084                         "pxor %%mm7, %%mm7              \n\t"
2085                         "movq "MANGLE(w02)", %%mm6      \n\t"
2086                         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2087                         "mov %%"REG_a", %%"REG_BP"      \n\t"
2088                         ".balign 16                     \n\t"
2089                         "1:                             \n\t"
2090                         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2091                         "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2092                         "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2093                         "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2094                         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2095                         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2096                         "punpcklbw %%mm7, %%mm0         \n\t"
2097                         "punpcklbw %%mm7, %%mm2         \n\t"
2098                         "pmaddwd %%mm1, %%mm0           \n\t"
2099                         "pmaddwd %%mm2, %%mm3           \n\t"
2100
2101                         "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2102                         "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2103                         "movd 4(%3, %%"REG_a"), %%mm4   \n\t"
2104                         "movd 4(%3, %%"REG_b"), %%mm2   \n\t"
2105                         "punpcklbw %%mm7, %%mm4         \n\t"
2106                         "punpcklbw %%mm7, %%mm2         \n\t"
2107                         "pmaddwd %%mm1, %%mm4           \n\t"
2108                         "pmaddwd %%mm2, %%mm5           \n\t"
2109                         "paddd %%mm4, %%mm0             \n\t"
2110                         "paddd %%mm5, %%mm3             \n\t"
2111                                                 
2112                         "psrad $8, %%mm0                \n\t"
2113                         "psrad $8, %%mm3                \n\t"
2114                         "packssdw %%mm3, %%mm0          \n\t"
2115                         "pmaddwd %%mm6, %%mm0           \n\t"
2116                         "packssdw %%mm0, %%mm0          \n\t"
2117                         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2118                         "add $4, %%"REG_BP"             \n\t"
2119                         " jnc 1b                        \n\t"
2120
2121                         "pop %%"REG_BP"                 \n\t"
2122                         : "+a" (counter)
2123                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2124                         : "%"REG_b
2125                 );
2126         }
2127         else
2128         {
2129                 long counter= -2*dstW;
2130 //              filter-= counter*filterSize/2;
2131                 filterPos-= counter/2;
2132                 dst-= counter/2;
2133                 asm volatile(
2134                         "pxor %%mm7, %%mm7              \n\t"
2135                         "movq "MANGLE(w02)", %%mm6      \n\t"
2136                         ".balign 16                     \n\t"
2137                         "1:                             \n\t"
2138                         "mov %2, %%"REG_c"              \n\t"
2139                         "movzwl (%%"REG_c", %0), %%eax  \n\t"
2140                         "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2141                         "mov %5, %%"REG_c"              \n\t"
2142                         "pxor %%mm4, %%mm4              \n\t"
2143                         "pxor %%mm5, %%mm5              \n\t"
2144                         "2:                             \n\t"
2145                         "movq (%1), %%mm1               \n\t"
2146                         "movq (%1, %6), %%mm3           \n\t"
2147                         "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2148                         "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2149                         "punpcklbw %%mm7, %%mm0         \n\t"
2150                         "punpcklbw %%mm7, %%mm2         \n\t"
2151                         "pmaddwd %%mm1, %%mm0           \n\t"
2152                         "pmaddwd %%mm2, %%mm3           \n\t"
2153                         "paddd %%mm3, %%mm5             \n\t"
2154                         "paddd %%mm0, %%mm4             \n\t"
2155                         "add $8, %1                     \n\t"
2156                         "add $4, %%"REG_c"              \n\t"
2157                         "cmp %4, %%"REG_c"              \n\t"
2158                         " jb 2b                         \n\t"
2159                         "add %6, %1                     \n\t"
2160                         "psrad $8, %%mm4                \n\t"
2161                         "psrad $8, %%mm5                \n\t"
2162                         "packssdw %%mm5, %%mm4          \n\t"
2163                         "pmaddwd %%mm6, %%mm4           \n\t"
2164                         "packssdw %%mm4, %%mm4          \n\t"
2165                         "mov %3, %%"REG_a"              \n\t"
2166                         "movd %%mm4, (%%"REG_a", %0)    \n\t"
2167                         "add $4, %0                     \n\t"
2168                         " jnc 1b                        \n\t"
2169
2170                         : "+r" (counter), "+r" (filter)
2171                         : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2172                           "m" (src), "r" ((long)filterSize*2)
2173                         : "%"REG_b, "%"REG_a, "%"REG_c
2174                 );
2175         }
2176 #else
2177 #ifdef HAVE_ALTIVEC
2178         hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2179 #else
2180         int i;
2181         for(i=0; i<dstW; i++)
2182         {
2183                 int j;
2184                 int srcPos= filterPos[i];
2185                 int val=0;
2186 //              printf("filterPos: %d\n", filterPos[i]);
2187                 for(j=0; j<filterSize; j++)
2188                 {
2189 //                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2190                         val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2191                 }
2192 //              filter += hFilterSize;
2193                 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2194 //              dst[i] = val>>7;
2195         }
2196 #endif
2197 #endif
2198 }
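/* For reference, the plain-C hScale() fallback above computes an FIR filter per
   output sample, val = sum_j src[filterPos[i] + j] * filter[filterSize*i + j],
   then shifts by 7 and clamps to [0, 2^15-1]; the asm paths do the equivalent with
   pmaddwd/psrad and packssdw saturation.  A hedged standalone sketch of one output
   sample (the helper name is illustrative only, it is not part of this file):

   static inline int hScaleOneRef(const uint8_t *src, const int16_t *coeff,
                                  int srcPos, int filterSize)
   {
           int j, val= 0;
           for(j=0; j<filterSize; j++)
                   val += ((int)src[srcPos + j]) * coeff[j];
           return MIN(MAX(0, val>>7), (1<<15)-1); /* the filter can overshoot, so clamp */
   }
*/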
2199       // *** horizontal scale Y line to temp buffer
2200 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2201                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2202                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
2203                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2204                                    int32_t *mmx2FilterPos)
2205 {
2206     if(srcFormat==IMGFMT_YUY2)
2207     {
2208         RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2209         src= formatConvBuffer;
2210     }
2211     else if(srcFormat==IMGFMT_UYVY)
2212     {
2213         RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2214         src= formatConvBuffer;
2215     }
2216     else if(srcFormat==IMGFMT_BGR32)
2217     {
2218         RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2219         src= formatConvBuffer;
2220     }
2221     else if(srcFormat==IMGFMT_BGR24)
2222     {
2223         RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2224         src= formatConvBuffer;
2225     }
2226     else if(srcFormat==IMGFMT_BGR16)
2227     {
2228         RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2229         src= formatConvBuffer;
2230     }
2231     else if(srcFormat==IMGFMT_BGR15)
2232     {
2233         RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2234         src= formatConvBuffer;
2235     }
2236     else if(srcFormat==IMGFMT_RGB32)
2237     {
2238         RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2239         src= formatConvBuffer;
2240     }
2241     else if(srcFormat==IMGFMT_RGB24)
2242     {
2243         RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2244         src= formatConvBuffer;
2245     }
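    /* packed YUV and RGB/BGR inputs were converted above into an 8-bit luma line in
       formatConvBuffer, so the scalers below only ever see a plain Y plane. */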
2246
2247 #ifdef HAVE_MMX
2248         // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2249     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2250 #else
2251     if(!(flags&SWS_FAST_BILINEAR))
2252 #endif
2253     {
2254         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2255     }
2256     else // Fast Bilinear upscale / crap downscale
2257     {
2258 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2259 #ifdef HAVE_MMX2
2260         int i;
2261         if(canMMX2BeUsed)
2262         {
2263                 asm volatile(
2264                         "pxor %%mm7, %%mm7              \n\t"
2265                         "mov %0, %%"REG_c"              \n\t"
2266                         "mov %1, %%"REG_D"              \n\t"
2267                         "mov %2, %%"REG_d"              \n\t"
2268                         "mov %3, %%"REG_b"              \n\t"
2269                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2270                         PREFETCH" (%%"REG_c")           \n\t"
2271                         PREFETCH" 32(%%"REG_c")         \n\t"
2272                         PREFETCH" 64(%%"REG_c")         \n\t"
2273
2274 #ifdef ARCH_X86_64
2275
2276 #define FUNNY_Y_CODE \
2277                         "movl (%%"REG_b"), %%esi        \n\t"\
2278                         "call *%4                       \n\t"\
2279                         "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2280                         "add %%"REG_S", %%"REG_c"       \n\t"\
2281                         "add %%"REG_a", %%"REG_D"       \n\t"\
2282                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2283
2284 #else
2285
2286 #define FUNNY_Y_CODE \
2287                         "movl (%%"REG_b"), %%esi        \n\t"\
2288                         "call *%4                       \n\t"\
2289                         "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2290                         "add %%"REG_a", %%"REG_D"       \n\t"\
2291                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2292
2293 #endif
2294
2295 FUNNY_Y_CODE
2296 FUNNY_Y_CODE
2297 FUNNY_Y_CODE
2298 FUNNY_Y_CODE
2299 FUNNY_Y_CODE
2300 FUNNY_Y_CODE
2301 FUNNY_Y_CODE
2302 FUNNY_Y_CODE
2303
2304                         :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2305                         "m" (funnyYCode)
2306                         : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2307                 );
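                /* fix up the last few output pixels: any position that maps at or past the
                   last source pixel is rewritten as src[srcW-1]<<7, since the scaler above
                   cannot safely interpolate beyond the end of the line. */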
2308                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2309         }
2310         else
2311         {
2312 #endif
2313         // no MMX/MMX2 used here, just plain x86 integer asm ...
2314         asm volatile(
2315                 "xor %%"REG_a", %%"REG_a"       \n\t" // i
2316                 "xor %%"REG_b", %%"REG_b"       \n\t" // xx
2317                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2318                 ".balign 16                     \n\t"
2319                 "1:                             \n\t"
2320                 "movzbl  (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2321                 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2322                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2323                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2324                 "shll $16, %%edi                \n\t"
2325                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2326                 "mov %1, %%"REG_D"              \n\t"
2327                 "shrl $9, %%esi                 \n\t"
2328                 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2329                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2330                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2331
2332                 "movzbl (%0, %%"REG_b"), %%edi  \n\t" //src[xx]
2333                 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2334                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2335                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2336                 "shll $16, %%edi                \n\t"
2337                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2338                 "mov %1, %%"REG_D"              \n\t"
2339                 "shrl $9, %%esi                 \n\t"
2340                 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2341                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2342                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2343
2344
2345                 "add $2, %%"REG_a"              \n\t"
2346                 "cmp %2, %%"REG_a"              \n\t"
2347                 " jb 1b                         \n\t"
2348
2349
2350                 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2351                 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2352                 );
2353 #ifdef HAVE_MMX2
2354         } //if MMX2 can't be used
2355 #endif
2356 #else
2357         int i;
2358         unsigned int xpos=0;
2359         for(i=0;i<dstWidth;i++)
2360         {
2361                 register unsigned int xx=xpos>>16;
2362                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2363                 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2364                 xpos+=xInc;
2365         }
2366 #endif
2367     }
2368 }
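/* All of the fast-bilinear variants above walk the source in 16.16 fixed point:
   xx = xpos>>16 is the integer pixel and (in the C fallback) xalpha = (xpos&0xFFFF)>>9
   is a 7-bit blend weight, so the output is scaled by 128 (<<7), matching the 15-bit
   range produced by hScale().  Worked example with illustrative values: src[3]=100,
   src[4]=160, xalpha=32 (0.25) gives (100<<7) + (160-100)*32 = 14720 = 115*128,
   i.e. a quarter of the way from 100 to 160. */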
2369
2370 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2371                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2372                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2373                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2374                                    int32_t *mmx2FilterPos)
2375 {
2376     if(srcFormat==IMGFMT_YUY2)
2377     {
2378         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2379         src1= formatConvBuffer;
2380         src2= formatConvBuffer+2048;
2381     }
2382     else if(srcFormat==IMGFMT_UYVY)
2383     {
2384         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2385         src1= formatConvBuffer;
2386         src2= formatConvBuffer+2048;
2387     }
2388     else if(srcFormat==IMGFMT_BGR32)
2389     {
2390         RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2391         src1= formatConvBuffer;
2392         src2= formatConvBuffer+2048;
2393     }
2394     else if(srcFormat==IMGFMT_BGR24)
2395     {
2396         RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2397         src1= formatConvBuffer;
2398         src2= formatConvBuffer+2048;
2399     }
2400     else if(srcFormat==IMGFMT_BGR16)
2401     {
2402         RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2403         src1= formatConvBuffer;
2404         src2= formatConvBuffer+2048;
2405     }
2406     else if(srcFormat==IMGFMT_BGR15)
2407     {
2408         RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2409         src1= formatConvBuffer;
2410         src2= formatConvBuffer+2048;
2411     }
2412     else if(srcFormat==IMGFMT_RGB32)
2413     {
2414         RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2415         src1= formatConvBuffer;
2416         src2= formatConvBuffer+2048;
2417     }
2418     else if(srcFormat==IMGFMT_RGB24)
2419     {
2420         RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2421         src1= formatConvBuffer;
2422         src2= formatConvBuffer+2048;
2423     }
2424     else if(isGray(srcFormat))
2425     {
2426         return;
2427     }
2428
2429 #ifdef HAVE_MMX
2430         // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2431     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2432 #else
2433     if(!(flags&SWS_FAST_BILINEAR))
2434 #endif
2435     {
2436         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2437         RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2438     }
2439     else // Fast Bilinear upscale / crap downscale
2440     {
2441 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2442 #ifdef HAVE_MMX2
2443         int i;
2444         if(canMMX2BeUsed)
2445         {
2446                 asm volatile(
2447                         "pxor %%mm7, %%mm7              \n\t"
2448                         "mov %0, %%"REG_c"              \n\t"
2449                         "mov %1, %%"REG_D"              \n\t"
2450                         "mov %2, %%"REG_d"              \n\t"
2451                         "mov %3, %%"REG_b"              \n\t"
2452                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2453                         PREFETCH" (%%"REG_c")           \n\t"
2454                         PREFETCH" 32(%%"REG_c")         \n\t"
2455                         PREFETCH" 64(%%"REG_c")         \n\t"
2456
2457 #ifdef ARCH_X86_64
2458
2459 #define FUNNY_UV_CODE \
2460                         "movl (%%"REG_b"), %%esi        \n\t"\
2461                         "call *%4                       \n\t"\
2462                         "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2463                         "add %%"REG_S", %%"REG_c"       \n\t"\
2464                         "add %%"REG_a", %%"REG_D"       \n\t"\
2465                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2466
2467 #else
2468
2469 #define FUNNY_UV_CODE \
2470                         "movl (%%"REG_b"), %%esi        \n\t"\
2471                         "call *%4                       \n\t"\
2472                         "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2473                         "add %%"REG_a", %%"REG_D"       \n\t"\
2474                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2475
2476 #endif
2477
2478 FUNNY_UV_CODE
2479 FUNNY_UV_CODE
2480 FUNNY_UV_CODE
2481 FUNNY_UV_CODE
2482                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2483                         "mov %5, %%"REG_c"              \n\t" // src
2484                         "mov %1, %%"REG_D"              \n\t" // buf1
2485                         "add $4096, %%"REG_D"           \n\t"
2486                         PREFETCH" (%%"REG_c")           \n\t"
2487                         PREFETCH" 32(%%"REG_c")         \n\t"
2488                         PREFETCH" 64(%%"REG_c")         \n\t"
2489
2490 FUNNY_UV_CODE
2491 FUNNY_UV_CODE
2492 FUNNY_UV_CODE
2493 FUNNY_UV_CODE
2494
2495                         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2496                         "m" (funnyUVCode), "m" (src2)
2497                         : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2498                 );
2499                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2500                 {
2501 //                      printf("%d %d %d\n", dstWidth, i, srcW);
2502                         dst[i] = src1[srcW-1]*128;
2503                         dst[i+2048] = src2[srcW-1]*128;
2504                 }
2505         }
2506         else
2507         {
2508 #endif
2509         asm volatile(
2510                 "xor %%"REG_a", %%"REG_a"       \n\t" // i
2511                 "xor %%"REG_b", %%"REG_b"               \n\t" // xx
2512                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2513                 ".balign 16                     \n\t"
2514                 "1:                             \n\t"
2515                 "mov %0, %%"REG_S"              \n\t"
2516                 "movzbl  (%%"REG_S", %%"REG_b"), %%edi  \n\t" //src[xx]
2517                 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi  \n\t" //src[xx+1]
2518                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2519                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2520                 "shll $16, %%edi                \n\t"
2521                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2522                 "mov %1, %%"REG_D"              \n\t"
2523                 "shrl $9, %%esi                 \n\t"
2524                 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2525
2526                 "movzbl  (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2527                 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2528                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2529                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2530                 "shll $16, %%edi                \n\t"
2531                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2532                 "mov %1, %%"REG_D"              \n\t"
2533                 "shrl $9, %%esi                 \n\t"
2534                 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2535
2536                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2537                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2538                 "add $1, %%"REG_a"              \n\t"
2539                 "cmp %2, %%"REG_a"              \n\t"
2540                 " jb 1b                         \n\t"
2541
2542                 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" ((long)(xInc>>16)), "m" ((xInc&0xFFFF)),
2543                 "r" (src2)
2544                 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2545                 );
2546 #ifdef HAVE_MMX2
2547         } //if MMX2 can't be used
2548 #endif
2549 #else
2550         int i;
2551         unsigned int xpos=0;
2552         for(i=0;i<dstWidth;i++)
2553         {
2554                 register unsigned int xx=xpos>>16;
2555                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2556                 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2557                 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2558 /* slower
2559           dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2560           dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2561 */
2562                 xpos+=xInc;
2563         }
2564 #endif
2565    }
2566 }
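/* The plain-C chroma fallback above writes U results to dst[i] and V results to
   dst[i+2048] (both chroma planes share one buffer, 2048 elements apart).  For the
   7-bit weight, xalpha^127 equals 127-xalpha, so src[xx]*(xalpha^127) + src[xx+1]*xalpha
   is a cheaper approximation of the exact form kept in the "slower" comment above;
   the weights sum to 127 instead of 128. */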
2567
2568 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2569              int srcSliceH, uint8_t* dst[], int dstStride[]){
2570
2571         /* load a few things into local vars to make the code more readable (and faster) */
2572         const int srcW= c->srcW;
2573         const int dstW= c->dstW;
2574         const int dstH= c->dstH;
2575         const int chrDstW= c->chrDstW;
2576         const int chrSrcW= c->chrSrcW;
2577         const int lumXInc= c->lumXInc;
2578         const int chrXInc= c->chrXInc;
2579         const int dstFormat= c->dstFormat;
2580         const int srcFormat= c->srcFormat;
2581         const int flags= c->flags;
2582         const int canMMX2BeUsed= c->canMMX2BeUsed;
2583         int16_t *vLumFilterPos= c->vLumFilterPos;
2584         int16_t *vChrFilterPos= c->vChrFilterPos;
2585         int16_t *hLumFilterPos= c->hLumFilterPos;
2586         int16_t *hChrFilterPos= c->hChrFilterPos;
2587         int16_t *vLumFilter= c->vLumFilter;
2588         int16_t *vChrFilter= c->vChrFilter;
2589         int16_t *hLumFilter= c->hLumFilter;
2590         int16_t *hChrFilter= c->hChrFilter;
2591         int32_t *lumMmxFilter= c->lumMmxFilter;
2592         int32_t *chrMmxFilter= c->chrMmxFilter;
2593         const int vLumFilterSize= c->vLumFilterSize;
2594         const int vChrFilterSize= c->vChrFilterSize;
2595         const int hLumFilterSize= c->hLumFilterSize;
2596         const int hChrFilterSize= c->hChrFilterSize;
2597         int16_t **lumPixBuf= c->lumPixBuf;
2598         int16_t **chrPixBuf= c->chrPixBuf;
2599         const int vLumBufSize= c->vLumBufSize;
2600         const int vChrBufSize= c->vChrBufSize;
2601         uint8_t *funnyYCode= c->funnyYCode;
2602         uint8_t *funnyUVCode= c->funnyUVCode;
2603         uint8_t *formatConvBuffer= c->formatConvBuffer;
2604         const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2605         const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
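        /* -((-x)>>s) rounds upwards, so chrSrcSliceH = ceil(srcSliceH / 2^chrSrcVSubSample) */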
2606         int lastDstY;
2607
2608         /* vars which will change and which we need to store back in the context */
2609         int dstY= c->dstY;
2610         int lumBufIndex= c->lumBufIndex;
2611         int chrBufIndex= c->chrBufIndex;
2612         int lastInLumBuf= c->lastInLumBuf;
2613         int lastInChrBuf= c->lastInChrBuf;
2614         
2615         if(isPacked(c->srcFormat)){
2616                 src[0]=
2617                 src[1]=
2618                 src[2]= src[0];
2619                 srcStride[0]=
2620                 srcStride[1]=
2621                 srcStride[2]= srcStride[0];
2622         }
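        /* packed source formats keep all their data in plane 0, so the block above aliases
           all three plane pointers and strides; the rest of the code can then treat packed
           and planar input uniformly. */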
2623         srcStride[1]<<= c->vChrDrop;
2624         srcStride[2]<<= c->vChrDrop;
2625
2626 //      printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2627 //              (int)dst[0], (int)dst[1], (int)dst[2]);
2628
2629 #if 0 //self test FIXME move to a vfilter or something
2630 {
2631 static volatile int i=0;
2632 i++;
2633 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2634         selfTest(src, srcStride, c->srcW, c->srcH);
2635 i--;
2636 }
2637 #endif
2638
2639 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2640 //dstStride[0],dstStride[1],dstStride[2]);
2641
2642         if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2643         {
2644                 static int firstTime=1; //FIXME move this into the context perhaps
2645                 if(flags & SWS_PRINT_INFO && firstTime)
2646                 {
2647                         MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2648                                         "SwScaler:          ->cannot do aligned memory accesses anymore\n");
2649                         firstTime=0;
2650                 }
2651         }
2652
2653         /* Note: the user might start scaling the picture in the middle, so this will not get executed.
2654            This is not really intended, but it works currently, so people might do it. */
2655         if(srcSliceY ==0){
2656                 lumBufIndex=0;
2657                 chrBufIndex=0;
2658                 dstY=0; 
2659                 lastInLumBuf= -1;
2660                 lastInChrBuf= -1;
2661         }
2662
2663         lastDstY= dstY;
2664
2665         for(;dstY < dstH; dstY++){
2666                 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2667                 const int chrDstY= dstY>>c->chrDstVSubSample;
2668                 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2669                 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2670
2671                 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2672                 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2673                 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2674                 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2675
2676 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2677 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
2678                 //handle holes (FAST_BILINEAR & weird filters)
2679                 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2680                 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2681 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2682                 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2683                 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2684
2685                 // Do we have enough lines in this slice to output the dstY line?
2686                 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2687                 {
2688                         //Do horizontal scaling
2689                         while(lastInLumBuf < lastLumSrcY)
2690                         {
2691                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2692                                 lumBufIndex++;
2693 //                              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
2694                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2695                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2696                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2697 //                              printf("%d %d\n", lumBufIndex, vLumBufSize);
2698                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2699                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2700                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2701                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2702                                 lastInLumBuf++;
2703                         }
2704                         while(lastInChrBuf < lastChrSrcY)
2705                         {
2706                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2707                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2708                                 chrBufIndex++;
2709                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2710                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2711                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2712                                 //FIXME pass at least some of these parameters through the context struct
2713
2714                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2715                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2716                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2717                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2718                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2719                                 lastInChrBuf++;
2720                         }
2721                         //wrap buf index around to stay inside the ring buffer
2722                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2723                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2724                 }
2725                 else // not enough lines left in this slice -> load the rest into the buffer
2726                 {
2727 /*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2728                         firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2729                         lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2730                         vChrBufSize, vLumBufSize);*/
2731
2732                         //Do horizontal scaling
2733                         while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2734                         {
2735                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2736                                 lumBufIndex++;
2737                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2738                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2739                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2740                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2741                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2742                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2743                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2744                                 lastInLumBuf++;
2745                         }
2746                         while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2747                         {
2748                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2749                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2750                                 chrBufIndex++;
2751                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2752                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2753                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2754
2755                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2756                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2757                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2758                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2759                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2760                                 lastInChrBuf++;
2761                         }
2762                         //wrap buf index around to stay inside the ring buffer
2763                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2764                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2765                         break; //we can't output a dstY line so let's try with the next slice
2766                 }
2767
2768 #ifdef HAVE_MMX
2769                 b5Dither= dither8[dstY&1];
2770                 g6Dither= dither4[dstY&1];
2771                 g5Dither= dither8[dstY&1];
2772                 r5Dither= dither8[(dstY+1)&1];
2773 #endif
2774             if(dstY < dstH-2)
2775             {
2776                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2777                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
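                /* lumSrcPtr/chrSrcPtr point at the window of vLumFilterSize (resp. vChrFilterSize)
                   input-line pointers needed for this output line; the "+ vLumBufSize" bias keeps
                   the computed index non-negative after the ring index has wrapped, and the ASSERTs
                   further down check that the window stays within the 2*vLumBufSize-entry pointer
                   table, so no per-tap modulo is needed. */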
2778 #ifdef HAVE_MMX
2779                 int i;
2780                 for(i=0; i<vLumFilterSize; i++)
2781                 {
2782                         lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2783                         lumMmxFilter[4*i+2]= 
2784                         lumMmxFilter[4*i+3]= 
2785                                 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2786                 }
2787                 for(i=0; i<vChrFilterSize; i++)
2788                 {
2789                         chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2790                         chrMmxFilter[4*i+2]= 
2791                         chrMmxFilter[4*i+3]= 
2792                                 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2793                 }
2794 #endif
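                /* layout of the per-tap MMX filter records built above: each tap takes four
                   int32 slots; slot [0] holds the source-line pointer (cast to int32_t), and
                   slots [2] and [3] hold the 16-bit vertical coefficient duplicated into both
                   halves via *0x10001, so one 8-byte load sees the coefficient broadcast across
                   four int16 lanes for the MMX vertical scaler. */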
2795                 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2796                 {
2797                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2798                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions into luma / chroma
2799                         if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2800                         {
2801                                 int16_t *lumBuf = lumPixBuf[0];
2802                                 int16_t *chrBuf= chrPixBuf[0];
2803                                 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2804                         }
2805                         else //General YV12
2806                         {
2807                                 RENAME(yuv2yuvX)(c,
2808                                         vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2809                                         vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2810                                         dest, uDest, vDest, dstW, chrDstW);
2811                         }
2812                 }
2813                 else
2814                 {
2815                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2816                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2817                         if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2818                         {
2819                                 int chrAlpha= vChrFilter[2*dstY+1];
2820                                 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2821                                                  dest, dstW, chrAlpha, dstFormat, flags, dstY);
2822                         }
2823                         else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2824                         {
2825                                 int lumAlpha= vLumFilter[2*dstY+1];
2826                                 int chrAlpha= vChrFilter[2*dstY+1];
2827                                 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2828                                                  dest, dstW, lumAlpha, chrAlpha, dstY);
2829                         }
2830                         else //General RGB
2831                         {
2832                                 RENAME(yuv2packedX)(c,
2833                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2834                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2835                                         dest, dstW, dstY);
2836                         }
2837                 }
2838             }
2839             else // hmm looks like we can't use MMX here without overwriting this array's tail
2840             {
2841                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2842                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2843                 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2844                 {
2845                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2846                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions into luma / chroma
2847                         yuv2yuvXinC(
2848                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2849                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2850                                 dest, uDest, vDest, dstW, chrDstW);
2851                 }
2852                 else
2853                 {
2854                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2855                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2856                         yuv2packedXinC(c, 
2857                                 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2858                                 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2859                                 dest, dstW, dstY);
2860                 }
2861             }
2862         }
2863
2864 #ifdef HAVE_MMX
2865         __asm __volatile(SFENCE:::"memory");
2866         __asm __volatile(EMMS:::"memory");
2867 #endif
2868         /* store changed local vars back in the context */
2869         c->dstY= dstY;
2870         c->lumBufIndex= lumBufIndex;
2871         c->chrBufIndex= chrBufIndex;
2872         c->lastInLumBuf= lastInLumBuf;
2873         c->lastInChrBuf= lastInChrBuf;
2874
2875         return dstY - lastDstY;
2876 }