1 /*
2     Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 #undef REAL_MOVNTQ
20 #undef MOVNTQ
21 #undef PAVGB
22 #undef PREFETCH
23 #undef PREFETCHW
24 #undef EMMS
25 #undef SFENCE
26
27 #ifdef HAVE_3DNOW
28 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
29 #define EMMS     "femms"
30 #else
31 #define EMMS     "emms"
32 #endif
33
34 #ifdef HAVE_3DNOW
35 #define PREFETCH  "prefetch"
36 #define PREFETCHW "prefetchw"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #else
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
43 #endif
44
45 #ifdef HAVE_MMX2
46 #define SFENCE "sfence"
47 #else
48 #define SFENCE "/nop"
49 #endif
50
51 #ifdef HAVE_MMX2
52 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53 #elif defined (HAVE_3DNOW)
54 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
55 #endif
56
57 #ifdef HAVE_MMX2
58 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
59 #else
60 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
61 #endif
62 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
63
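/* The wrappers above select cache-control instructions by CPU capability:
   non-temporal stores (movntq), prefetches and sfence exist only on MMX2/3DNow
   parts, so all code below goes through MOVNTQ/PREFETCH/PREFETCHW/SFENCE/EMMS
   and degrades to plain movq / nops otherwise.  A minimal, hypothetical use of
   the pattern (illustration only, not part of this file; a real caller would
   finish the MMX section with SFENCE/EMMS): */
#if 0
static inline void example_copy8bytes(uint8_t *dst, const uint8_t *src)
{
        asm volatile(
                PREFETCH" 64(%0)        \n\t" /* hint: fetch ahead of the read pointer */
                "movq (%0), %%mm0       \n\t"
                MOVNTQ(%%mm0, (%1))           /* non-temporal store on MMX2, movq otherwise */
                :: "r" (src), "r" (dst)
                : "memory");
}
#endif
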
64 #ifdef HAVE_ALTIVEC
65 #include "swscale_altivec_template.c"
66 #endif
67
68 #define YSCALEYUV2YV12X(x, offset) \
69                         "xor %%"REG_a", %%"REG_a"       \n\t"\
70                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71                         "movq %%mm3, %%mm4              \n\t"\
72                         "lea " offset "(%0), %%"REG_d"  \n\t"\
73                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
74                         ".balign 16                     \n\t" /* FIXME Unroll? */\
75                         "1:                             \n\t"\
76                         "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
77                         "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78                         "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79                         "add $16, %%"REG_d"             \n\t"\
80                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
81                         "test %%"REG_S", %%"REG_S"      \n\t"\
82                         "pmulhw %%mm0, %%mm2            \n\t"\
83                         "pmulhw %%mm0, %%mm5            \n\t"\
84                         "paddw %%mm2, %%mm3             \n\t"\
85                         "paddw %%mm5, %%mm4             \n\t"\
86                         " jnz 1b                        \n\t"\
87                         "psraw $3, %%mm3                \n\t"\
88                         "psraw $3, %%mm4                \n\t"\
89                         "packuswb %%mm4, %%mm3          \n\t"\
90                         MOVNTQ(%%mm3, (%1, %%REGa))\
91                         "add $8, %%"REG_a"              \n\t"\
92                         "cmp %2, %%"REG_a"              \n\t"\
93                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94                         "movq %%mm3, %%mm4              \n\t"\
95                         "lea " offset "(%0), %%"REG_d"  \n\t"\
96                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
97                         "jb 1b                          \n\t"
98
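/* Scalar model of what YSCALEYUV2YV12X computes (a sketch only: the MMX loop
   does pmulhw per tap, i.e. (src*coeff)>>16, accumulates on top of the
   VROUNDER_OFFSET rounding constant, shifts right by 3 and saturates to 0..255
   with packuswb; the names below are illustrative): */
#if 0
static inline void example_yuv2yuvX_scalar(int16_t **src, int16_t *filter, int filterSize,
                                           uint8_t *dest, int dstW)
{
        int i, j;
        for(i=0; i<dstW; i++)
        {
                int val= 1<<18;                         /* rounding term, playing the role of VROUNDER_OFFSET */
                for(j=0; j<filterSize; j++)
                        val += src[j][i]*filter[j];     /* full precision here, >>16 per tap in the MMX code */
                val>>= 19;                              /* (>>16) and (>>3) combined */
                dest[i]= val<0 ? 0 : (val>255 ? 255 : val);
        }
}
#endif
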
99 #define YSCALEYUV2YV121 \
100                         "mov %2, %%"REG_a"              \n\t"\
101                         ".balign 16                     \n\t" /* FIXME Unroll? */\
102                         "1:                             \n\t"\
103                         "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104                         "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
105                         "psraw $7, %%mm0                \n\t"\
106                         "psraw $7, %%mm1                \n\t"\
107                         "packuswb %%mm1, %%mm0          \n\t"\
108                         MOVNTQ(%%mm0, (%1, %%REGa))\
109                         "add $8, %%"REG_a"              \n\t"\
110                         "jnc 1b                         \n\t"
111
112 /*
113                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
114                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115                            "r" (dest), "m" (dstW),
116                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
118 */
119 #define YSCALEYUV2PACKEDX \
120                 "xor %%"REG_a", %%"REG_a"       \n\t"\
121                 ".balign 16                     \n\t"\
122                 "nop                            \n\t"\
123                 "1:                             \n\t"\
124                 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
126                 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127                 "movq %%mm3, %%mm4              \n\t"\
128                 ".balign 16                     \n\t"\
129                 "2:                             \n\t"\
130                 "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
131                 "movq (%%"REG_S", %%"REG_a"), %%mm2     \n\t" /* UsrcData */\
132                 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133                 "add $16, %%"REG_d"             \n\t"\
134                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
135                 "pmulhw %%mm0, %%mm2            \n\t"\
136                 "pmulhw %%mm0, %%mm5            \n\t"\
137                 "paddw %%mm2, %%mm3             \n\t"\
138                 "paddw %%mm5, %%mm4             \n\t"\
139                 "test %%"REG_S", %%"REG_S"      \n\t"\
140                 " jnz 2b                        \n\t"\
141 \
142                 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
144                 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145                 "movq %%mm1, %%mm7              \n\t"\
146                 ".balign 16                     \n\t"\
147                 "2:                             \n\t"\
148                 "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
149                 "movq (%%"REG_S", %%"REG_a", 2), %%mm2  \n\t" /* Y1srcData */\
150                 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151                 "add $16, %%"REG_d"             \n\t"\
152                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
153                 "pmulhw %%mm0, %%mm2            \n\t"\
154                 "pmulhw %%mm0, %%mm5            \n\t"\
155                 "paddw %%mm2, %%mm1             \n\t"\
156                 "paddw %%mm5, %%mm7             \n\t"\
157                 "test %%"REG_S", %%"REG_S"      \n\t"\
158                 " jnz 2b                        \n\t"\
159
160
161 #define YSCALEYUV2RGBX \
162                 YSCALEYUV2PACKEDX\
163                 "psubw "U_OFFSET"(%0), %%mm3    \n\t" /* (U-128)8*/\
164                 "psubw "V_OFFSET"(%0), %%mm4    \n\t" /* (V-128)8*/\
165                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
166                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
167                 "pmulhw "UG_COEFF"(%0), %%mm3   \n\t"\
168                 "pmulhw "VG_COEFF"(%0), %%mm4   \n\t"\
169         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
170                 "pmulhw "UB_COEFF"(%0), %%mm2   \n\t"\
171                 "pmulhw "VR_COEFF"(%0), %%mm5   \n\t"\
172                 "psubw "Y_OFFSET"(%0), %%mm1    \n\t" /* 8(Y-16)*/\
173                 "psubw "Y_OFFSET"(%0), %%mm7    \n\t" /* 8(Y-16)*/\
174                 "pmulhw "Y_COEFF"(%0), %%mm1    \n\t"\
175                 "pmulhw "Y_COEFF"(%0), %%mm7    \n\t"\
176         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177                 "paddw %%mm3, %%mm4             \n\t"\
178                 "movq %%mm2, %%mm0              \n\t"\
179                 "movq %%mm5, %%mm6              \n\t"\
180                 "movq %%mm4, %%mm3              \n\t"\
181                 "punpcklwd %%mm2, %%mm2         \n\t"\
182                 "punpcklwd %%mm5, %%mm5         \n\t"\
183                 "punpcklwd %%mm4, %%mm4         \n\t"\
184                 "paddw %%mm1, %%mm2             \n\t"\
185                 "paddw %%mm1, %%mm5             \n\t"\
186                 "paddw %%mm1, %%mm4             \n\t"\
187                 "punpckhwd %%mm0, %%mm0         \n\t"\
188                 "punpckhwd %%mm6, %%mm6         \n\t"\
189                 "punpckhwd %%mm3, %%mm3         \n\t"\
190                 "paddw %%mm7, %%mm0             \n\t"\
191                 "paddw %%mm7, %%mm6             \n\t"\
192                 "paddw %%mm7, %%mm3             \n\t"\
193                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194                 "packuswb %%mm0, %%mm2          \n\t"\
195                 "packuswb %%mm6, %%mm5          \n\t"\
196                 "packuswb %%mm3, %%mm4          \n\t"\
197                 "pxor %%mm7, %%mm7              \n\t"
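/* In scalar terms, YSCALEYUV2RGBX performs the usual fixed-point YUV->RGB
   conversion per pixel pair (a sketch of the intent, not the exact rounding;
   the coefficients and offsets live in the SwsContext slots referenced above,
   and the green coefficients are stored negative, so the paddw instructions
   effectively subtract the green contributions):

        Y' = (Y - y_offset) * y_coeff
        B  = Y' + ub_coeff*(U-128)
        G  = Y' + ug_coeff*(U-128) + vg_coeff*(V-128)
        R  = Y' + vr_coeff*(V-128)

   The final unsigned-saturating packs leave B in mm2, G in mm4, R in mm5 and
   clear mm7, which is the register layout the WRITE* macros below expect. */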
198 #if 0
199 #define FULL_YSCALEYUV2RGB \
200                 "pxor %%mm7, %%mm7              \n\t"\
201                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
202                 "punpcklwd %%mm6, %%mm6         \n\t"\
203                 "punpcklwd %%mm6, %%mm6         \n\t"\
204                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
205                 "punpcklwd %%mm5, %%mm5         \n\t"\
206                 "punpcklwd %%mm5, %%mm5         \n\t"\
207                 "xor %%"REG_a", %%"REG_a"               \n\t"\
208                 ".balign 16                     \n\t"\
209                 "1:                             \n\t"\
210                 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211                 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212                 "movq (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
213                 "movq (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
214                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
215                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216                 "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217                 "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
219                 "movq 4096(%2, %%"REG_a",2), %%mm4      \n\t" /* uvbuf0[eax+2048]*/\
220                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
222                 "movq 4096(%3, %%"REG_a",2), %%mm0      \n\t" /* uvbuf1[eax+2048]*/\
223                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224                 "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
225                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
226                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
227                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
228 \
229 \
230                 "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
232                 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
233                 "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
234                 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
235                 "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
236                 "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
237 \
238 \
239                 "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
240                 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
242                 "paddw %%mm1, %%mm3             \n\t" /* B*/\
243                 "paddw %%mm1, %%mm0             \n\t" /* R*/\
244                 "packuswb %%mm3, %%mm3          \n\t"\
245 \
246                 "packuswb %%mm0, %%mm0          \n\t"\
247                 "paddw %%mm4, %%mm2             \n\t"\
248                 "paddw %%mm2, %%mm1             \n\t" /* G*/\
249 \
250                 "packuswb %%mm1, %%mm1          \n\t"
251 #endif
252
253 #define REAL_YSCALEYUV2PACKED(index, c) \
254                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255                 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256                 "psraw $3, %%mm0                \n\t"\
257                 "psraw $3, %%mm1                \n\t"\
258                 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259                 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
260                 "xor "#index", "#index"         \n\t"\
261                 ".balign 16                     \n\t"\
262                 "1:                             \n\t"\
263                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
264                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
265                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
267                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
269                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
270                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272                 "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
273                 "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
274                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
275                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
276                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
277                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
278                 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279                 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
280                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
281                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
282                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
284                 "psraw $7, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>7*/\
285                 "psraw $7, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>7*/\
286                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
288                 
289 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
290                 
291 #define REAL_YSCALEYUV2RGB(index, c) \
292                 "xor "#index", "#index" \n\t"\
293                 ".balign 16                     \n\t"\
294                 "1:                             \n\t"\
295                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
296                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
297                 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298                 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
299                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
301                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
302                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
307                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
308                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
309                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
310                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
311                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
312                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
314         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
316                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
317                 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318                 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
319                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
320                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
321                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
323                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
327                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
330                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
331                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
332                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
333         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334                 "paddw %%mm3, %%mm4             \n\t"\
335                 "movq %%mm2, %%mm0              \n\t"\
336                 "movq %%mm5, %%mm6              \n\t"\
337                 "movq %%mm4, %%mm3              \n\t"\
338                 "punpcklwd %%mm2, %%mm2         \n\t"\
339                 "punpcklwd %%mm5, %%mm5         \n\t"\
340                 "punpcklwd %%mm4, %%mm4         \n\t"\
341                 "paddw %%mm1, %%mm2             \n\t"\
342                 "paddw %%mm1, %%mm5             \n\t"\
343                 "paddw %%mm1, %%mm4             \n\t"\
344                 "punpckhwd %%mm0, %%mm0         \n\t"\
345                 "punpckhwd %%mm6, %%mm6         \n\t"\
346                 "punpckhwd %%mm3, %%mm3         \n\t"\
347                 "paddw %%mm7, %%mm0             \n\t"\
348                 "paddw %%mm7, %%mm6             \n\t"\
349                 "paddw %%mm7, %%mm3             \n\t"\
350                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351                 "packuswb %%mm0, %%mm2          \n\t"\
352                 "packuswb %%mm6, %%mm5          \n\t"\
353                 "packuswb %%mm3, %%mm4          \n\t"\
354                 "pxor %%mm7, %%mm7              \n\t"
355 #define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
356                 
357 #define REAL_YSCALEYUV2PACKED1(index, c) \
358                 "xor "#index", "#index"         \n\t"\
359                 ".balign 16                     \n\t"\
360                 "1:                             \n\t"\
361                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
362                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
363                 "psraw $7, %%mm3                \n\t" \
364                 "psraw $7, %%mm4                \n\t" \
365                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
366                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
367                 "psraw $7, %%mm1                \n\t" \
368                 "psraw $7, %%mm7                \n\t" \
369                 
370 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
371                 
372 #define REAL_YSCALEYUV2RGB1(index, c) \
373                 "xor "#index", "#index" \n\t"\
374                 ".balign 16                     \n\t"\
375                 "1:                             \n\t"\
376                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
377                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
378                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
380                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
381                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
382                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
383                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
384                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
386         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
387                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
388                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
389                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
394                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
395                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
396                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
397         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398                 "paddw %%mm3, %%mm4             \n\t"\
399                 "movq %%mm2, %%mm0              \n\t"\
400                 "movq %%mm5, %%mm6              \n\t"\
401                 "movq %%mm4, %%mm3              \n\t"\
402                 "punpcklwd %%mm2, %%mm2         \n\t"\
403                 "punpcklwd %%mm5, %%mm5         \n\t"\
404                 "punpcklwd %%mm4, %%mm4         \n\t"\
405                 "paddw %%mm1, %%mm2             \n\t"\
406                 "paddw %%mm1, %%mm5             \n\t"\
407                 "paddw %%mm1, %%mm4             \n\t"\
408                 "punpckhwd %%mm0, %%mm0         \n\t"\
409                 "punpckhwd %%mm6, %%mm6         \n\t"\
410                 "punpckhwd %%mm3, %%mm3         \n\t"\
411                 "paddw %%mm7, %%mm0             \n\t"\
412                 "paddw %%mm7, %%mm6             \n\t"\
413                 "paddw %%mm7, %%mm3             \n\t"\
414                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415                 "packuswb %%mm0, %%mm2          \n\t"\
416                 "packuswb %%mm6, %%mm5          \n\t"\
417                 "packuswb %%mm3, %%mm4          \n\t"\
418                 "pxor %%mm7, %%mm7              \n\t"
419 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
420
421 #define REAL_YSCALEYUV2PACKED1b(index, c) \
422                 "xor "#index", "#index"         \n\t"\
423                 ".balign 16                     \n\t"\
424                 "1:                             \n\t"\
425                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
426                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
427                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431                 "psrlw $8, %%mm3                \n\t" \
432                 "psrlw $8, %%mm4                \n\t" \
433                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
434                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
435                 "psraw $7, %%mm1                \n\t" \
436                 "psraw $7, %%mm7                \n\t" 
437 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
438                 
439 // do vertical chrominance interpolation
440 #define REAL_YSCALEYUV2RGB1b(index, c) \
441                 "xor "#index", "#index"         \n\t"\
442                 ".balign 16                     \n\t"\
443                 "1:                             \n\t"\
444                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
445                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
446                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
448                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
450                 "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
451                 "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
452                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
453                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
454                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
455                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
456                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
458         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
459                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
460                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
461                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
463                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
466                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
467                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
468                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
469         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470                 "paddw %%mm3, %%mm4             \n\t"\
471                 "movq %%mm2, %%mm0              \n\t"\
472                 "movq %%mm5, %%mm6              \n\t"\
473                 "movq %%mm4, %%mm3              \n\t"\
474                 "punpcklwd %%mm2, %%mm2         \n\t"\
475                 "punpcklwd %%mm5, %%mm5         \n\t"\
476                 "punpcklwd %%mm4, %%mm4         \n\t"\
477                 "paddw %%mm1, %%mm2             \n\t"\
478                 "paddw %%mm1, %%mm5             \n\t"\
479                 "paddw %%mm1, %%mm4             \n\t"\
480                 "punpckhwd %%mm0, %%mm0         \n\t"\
481                 "punpckhwd %%mm6, %%mm6         \n\t"\
482                 "punpckhwd %%mm3, %%mm3         \n\t"\
483                 "paddw %%mm7, %%mm0             \n\t"\
484                 "paddw %%mm7, %%mm6             \n\t"\
485                 "paddw %%mm7, %%mm3             \n\t"\
486                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487                 "packuswb %%mm0, %%mm2          \n\t"\
488                 "packuswb %%mm6, %%mm5          \n\t"\
489                 "packuswb %%mm3, %%mm4          \n\t"\
490                 "pxor %%mm7, %%mm7              \n\t"
491 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
492
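/* Note on the unscaled variants: YSCALEYUV2PACKED1/YSCALEYUV2RGB1 read chroma
   from a single buffer, while the ...1b forms above average two chroma buffers
   (hence the paddw followed by a shift that is one bit larger); luma always
   comes from buf0 alone in both cases. */
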
493 #define REAL_WRITEBGR32(dst, dstw, index) \
494                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495                         "movq %%mm2, %%mm1              \n\t" /* B */\
496                         "movq %%mm5, %%mm6              \n\t" /* R */\
497                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
498                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
499                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
500                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
501                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
502                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
503                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
504                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
505                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
506                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
507 \
508                         MOVNTQ(%%mm0, (dst, index, 4))\
509                         MOVNTQ(%%mm2, 8(dst, index, 4))\
510                         MOVNTQ(%%mm1, 16(dst, index, 4))\
511                         MOVNTQ(%%mm3, 24(dst, index, 4))\
512 \
513                         "add $8, "#index"               \n\t"\
514                         "cmp "#dstw", "#index"          \n\t"\
515                         " jb 1b                         \n\t"
516 #define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
517
518 #define REAL_WRITEBGR16(dst, dstw, index) \
519                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
520                         "pand "MANGLE(bFC)", %%mm4      \n\t" /* G */\
521                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
522                         "psrlq $3, %%mm2                \n\t"\
523 \
524                         "movq %%mm2, %%mm1              \n\t"\
525                         "movq %%mm4, %%mm3              \n\t"\
526 \
527                         "punpcklbw %%mm7, %%mm3         \n\t"\
528                         "punpcklbw %%mm5, %%mm2         \n\t"\
529                         "punpckhbw %%mm7, %%mm4         \n\t"\
530                         "punpckhbw %%mm5, %%mm1         \n\t"\
531 \
532                         "psllq $3, %%mm3                \n\t"\
533                         "psllq $3, %%mm4                \n\t"\
534 \
535                         "por %%mm3, %%mm2               \n\t"\
536                         "por %%mm4, %%mm1               \n\t"\
537 \
538                         MOVNTQ(%%mm2, (dst, index, 2))\
539                         MOVNTQ(%%mm1, 8(dst, index, 2))\
540 \
541                         "add $8, "#index"               \n\t"\
542                         "cmp "#dstw", "#index"          \n\t"\
543                         " jb 1b                         \n\t"
544 #define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
545
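/* Scalar view of the 5:6:5 packing done in WRITEBGR16 above (illustration
   only): each output word ends up as
        (R & 0xF8)<<8 | (G & 0xFC)<<3 | (B & 0xF8)>>3
   i.e. blue in bits 0-4, green in bits 5-10, red in bits 11-15. */
#if 0
static inline uint16_t example_pack_rgb565(uint8_t r, uint8_t g, uint8_t b)
{
        return (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | ((b & 0xF8) >> 3));
}
#endif
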
546 #define REAL_WRITEBGR15(dst, dstw, index) \
547                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
548                         "pand "MANGLE(bF8)", %%mm4      \n\t" /* G */\
549                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
550                         "psrlq $3, %%mm2                \n\t"\
551                         "psrlq $1, %%mm5                \n\t"\
552 \
553                         "movq %%mm2, %%mm1              \n\t"\
554                         "movq %%mm4, %%mm3              \n\t"\
555 \
556                         "punpcklbw %%mm7, %%mm3         \n\t"\
557                         "punpcklbw %%mm5, %%mm2         \n\t"\
558                         "punpckhbw %%mm7, %%mm4         \n\t"\
559                         "punpckhbw %%mm5, %%mm1         \n\t"\
560 \
561                         "psllq $2, %%mm3                \n\t"\
562                         "psllq $2, %%mm4                \n\t"\
563 \
564                         "por %%mm3, %%mm2               \n\t"\
565                         "por %%mm4, %%mm1               \n\t"\
566 \
567                         MOVNTQ(%%mm2, (dst, index, 2))\
568                         MOVNTQ(%%mm1, 8(dst, index, 2))\
569 \
570                         "add $8, "#index"               \n\t"\
571                         "cmp "#dstw", "#index"          \n\t"\
572                         " jb 1b                         \n\t"
573 #define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
574
575 #define WRITEBGR24OLD(dst, dstw, index) \
576                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577                         "movq %%mm2, %%mm1              \n\t" /* B */\
578                         "movq %%mm5, %%mm6              \n\t" /* R */\
579                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
580                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
581                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
582                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
583                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
584                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
585                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
586                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
587                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
588                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
589 \
590                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
591                         "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
592                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593                         "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
594                         "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
595                         "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
596                         "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
597                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
598 \
599                         "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
600                         "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
601                         "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
602                         "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
603                         "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
604                         "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
605                         "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
606                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607                         "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
608                         "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
609                         "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
610                         "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
611                         "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
612 \
613                         "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
614                         "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
615                         "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
616                         "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617                         "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
618                         "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
619                         "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
620                         "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
621 \
622                         MOVNTQ(%%mm0, (dst))\
623                         MOVNTQ(%%mm2, 8(dst))\
624                         MOVNTQ(%%mm3, 16(dst))\
625                         "add $24, "#dst"                \n\t"\
626 \
627                         "add $8, "#index"               \n\t"\
628                         "cmp "#dstw", "#index"          \n\t"\
629                         " jb 1b                         \n\t"
630
631 #define WRITEBGR24MMX(dst, dstw, index) \
632                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633                         "movq %%mm2, %%mm1              \n\t" /* B */\
634                         "movq %%mm5, %%mm6              \n\t" /* R */\
635                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
636                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
637                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
638                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
639                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
640                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
641                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
642                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
643                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
644                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
645 \
646                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
647                         "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
648                         "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
649                         "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
650 \
651                         "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
652                         "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
653                         "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
654                         "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
655 \
656                         "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
657                         "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
658                         "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
659                         "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
660 \
661                         "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
662                         "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
663                         "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
664                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
665                         MOVNTQ(%%mm0, (dst))\
666 \
667                         "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
668                         "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
669                         "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
670                         "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
671                         MOVNTQ(%%mm6, 8(dst))\
672 \
673                         "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
674                         "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
675                         "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
676                         MOVNTQ(%%mm5, 16(dst))\
677 \
678                         "add $24, "#dst"                \n\t"\
679 \
680                         "add $8, "#index"                       \n\t"\
681                         "cmp "#dstw", "#index"                  \n\t"\
682                         " jb 1b                         \n\t"
683
684 #define WRITEBGR24MMX2(dst, dstw, index) \
685                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686                         "movq "MANGLE(M24A)", %%mm0     \n\t"\
687                         "movq "MANGLE(M24C)", %%mm7     \n\t"\
688                         "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
689                         "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
690                         "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
691 \
692                         "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
693                         "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
694                         "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
695 \
696                         "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
697                         "por %%mm1, %%mm6               \n\t"\
698                         "por %%mm3, %%mm6               \n\t"\
699                         MOVNTQ(%%mm6, (dst))\
700 \
701                         "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
702                         "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
703                         "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
704                         "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
705 \
706                         "pand "MANGLE(M24B)", %%mm1     \n\t" /* B5       B4        B3    */\
707                         "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
708                         "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
709 \
710                         "por %%mm1, %%mm3               \n\t" /* B5    G4 B4     G3 B3    */\
711                         "por %%mm3, %%mm6               \n\t"\
712                         MOVNTQ(%%mm6, 8(dst))\
713 \
714                         "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
715                         "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
716                         "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
717 \
718                         "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
719                         "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
720                         "pand "MANGLE(M24B)", %%mm6     \n\t" /* R7       R6        R5    */\
721 \
722                         "por %%mm1, %%mm3               \n\t"\
723                         "por %%mm3, %%mm6               \n\t"\
724                         MOVNTQ(%%mm6, 16(dst))\
725 \
726                         "add $24, "#dst"                \n\t"\
727 \
728                         "add $8, "#index"               \n\t"\
729                         "cmp "#dstw", "#index"          \n\t"\
730                         " jb 1b                         \n\t"
731
732 #ifdef HAVE_MMX2
733 #undef WRITEBGR24
734 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
735 #else
736 #undef WRITEBGR24
737 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
738 #endif
739
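/* All three 24 bpp writers produce the same memory layout: 3 bytes per pixel,
   blue first (B G R  B G R ...), with no padding byte; the MMX2 variant just
   gets there with pshufw instead of shift/or chains.  Scalar equivalent for a
   single pixel (illustration only): */
#if 0
static inline void example_write_bgr24(uint8_t *dst, uint8_t r, uint8_t g, uint8_t b)
{
        dst[0]= b; dst[1]= g; dst[2]= r;
}
#endif
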
740 #define REAL_WRITEYUY2(dst, dstw, index) \
741                         "packuswb %%mm3, %%mm3          \n\t"\
742                         "packuswb %%mm4, %%mm4          \n\t"\
743                         "packuswb %%mm7, %%mm1          \n\t"\
744                         "punpcklbw %%mm4, %%mm3         \n\t"\
745                         "movq %%mm1, %%mm7              \n\t"\
746                         "punpcklbw %%mm3, %%mm1         \n\t"\
747                         "punpckhbw %%mm3, %%mm7         \n\t"\
748 \
749                         MOVNTQ(%%mm1, (dst, index, 2))\
750                         MOVNTQ(%%mm7, 8(dst, index, 2))\
751 \
752                         "add $8, "#index"               \n\t"\
753                         "cmp "#dstw", "#index"          \n\t"\
754                         " jb 1b                         \n\t"
755 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
756
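/* WRITEYUY2 interleaves the packed luma and chroma bytes into the YUY2/YUYV
   memory order  Y0 U0 Y1 V0  Y2 U1 Y3 V1 ..., one U/V pair shared by two luma
   samples.  Scalar equivalent for one 4-byte group (sketch only): */
#if 0
static inline void example_write_yuy2(uint8_t *dst, uint8_t y0, uint8_t y1, uint8_t u, uint8_t v)
{
        dst[0]= y0; dst[1]= u; dst[2]= y1; dst[3]= v;
}
#endif
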
757
758 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
759                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
760                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
761 {
762 #ifdef HAVE_MMX
763         if(uDest != NULL)
764         {
765                 asm volatile(
766                                 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
767                                 :: "r" (&c->redDither),
768                                 "r" (uDest), "m" ((long)chrDstW)
769                                 : "%"REG_a, "%"REG_d, "%"REG_S
770                         );
771
772                 asm volatile(
773                                 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
774                                 :: "r" (&c->redDither),
775                                 "r" (vDest), "m" ((long)chrDstW)
776                                 : "%"REG_a, "%"REG_d, "%"REG_S
777                         );
778         }
779
780         asm volatile(
781                         YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
782                         :: "r" (&c->redDither),
783                            "r" (dest), "m" ((long)dstW)
784                         : "%"REG_a, "%"REG_d, "%"REG_S
785                 );
786 #else
787 #ifdef HAVE_ALTIVEC
788 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
789                       chrFilter, chrSrc, chrFilterSize,
790                       dest, uDest, vDest, dstW, chrDstW);
791 #else //HAVE_ALTIVEC
792 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
793             chrFilter, chrSrc, chrFilterSize,
794             dest, uDest, vDest, dstW, chrDstW);
795 #endif //!HAVE_ALTIVEC
796 #endif
797 }
798
799 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
800                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
801 {
802 #ifdef HAVE_MMX
803         if(uDest != NULL)
804         {
805                 asm volatile(
806                                 YSCALEYUV2YV121
807                                 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
808                                 "g" ((long)-chrDstW)
809                                 : "%"REG_a
810                         );
811
812                 asm volatile(
813                                 YSCALEYUV2YV121
814                                 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
815                                 "g" ((long)-chrDstW)
816                                 : "%"REG_a
817                         );
818         }
819
820         asm volatile(
821                 YSCALEYUV2YV121
822                 :: "r" (lumSrc + dstW), "r" (dest + dstW),
823                 "g" ((long)-dstW)
824                 : "%"REG_a
825         );
826 #else
827         int i;
828         for(i=0; i<dstW; i++)
829         {
830                 int val= lumSrc[i]>>7;
831                 
832                 if(val&256){                    /* bit 8 set: val is outside 0..255 */
833                         if(val<0) val=0;
834                         else      val=255;
835                 }
836
837                 dest[i]= val;
838         }
839
840         if(uDest != NULL)
841                 for(i=0; i<chrDstW; i++)
842                 {
843                         int u=chrSrc[i]>>7;
844                         int v=chrSrc[i + 2048]>>7;
845
846                         if((u|v)&256){
847                                 if(u<0)         u=0;
848                                 else if (u>255) u=255;
849                                 if(v<0)         v=0;
850                                 else if (v>255) v=255;
851                         }
852
853                         uDest[i]= u;
854                         vDest[i]= v;
855                 }
856 #endif
857 }
858
859
860 /**
861  * vertical scale YV12 to RGB
862  */
863 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
864                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
865                             uint8_t *dest, int dstW, int dstY)
866 {
867         int dummy=0;
868         switch(c->dstFormat)
869         {
870 #ifdef HAVE_MMX
871         case IMGFMT_BGR32:
872                 {
873                         asm volatile(
874                                 YSCALEYUV2RGBX
875                                 WRITEBGR32(%4, %5, %%REGa)
876
877                         :: "r" (&c->redDither), 
878                            "m" (dummy), "m" (dummy), "m" (dummy),
879                            "r" (dest), "m" (dstW)
880                         : "%"REG_a, "%"REG_d, "%"REG_S
881                         );
882                 }
883                 break;
884         case IMGFMT_BGR24:
885                 {
886                         asm volatile(
887                                 YSCALEYUV2RGBX
888                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
889                                 "add %4, %%"REG_b"                      \n\t"
890                                 WRITEBGR24(%%REGb, %5, %%REGa)
891
892                         :: "r" (&c->redDither), 
893                            "m" (dummy), "m" (dummy), "m" (dummy),
894                            "r" (dest), "m" (dstW)
895                         : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
896                         );
897                 }
898                 break;
899         case IMGFMT_BGR15:
900                 {
901                         asm volatile(
902                                 YSCALEYUV2RGBX
903                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
904 #ifdef DITHER1XBPP
905                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
906                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
907                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
908 #endif
909
910                                 WRITEBGR15(%4, %5, %%REGa)
911
912                         :: "r" (&c->redDither), 
913                            "m" (dummy), "m" (dummy), "m" (dummy),
914                            "r" (dest), "m" (dstW)
915                         : "%"REG_a, "%"REG_d, "%"REG_S
916                         );
917                 }
918                 break;
919         case IMGFMT_BGR16:
920                 {
921                         asm volatile(
922                                 YSCALEYUV2RGBX
923                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
924 #ifdef DITHER1XBPP
925                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
926                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
927                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
928 #endif
929
930                                 WRITEBGR16(%4, %5, %%REGa)
931
932                         :: "r" (&c->redDither), 
933                            "m" (dummy), "m" (dummy), "m" (dummy),
934                            "r" (dest), "m" (dstW)
935                         : "%"REG_a, "%"REG_d, "%"REG_S
936                         );
937                 }
938                 break;
939         case IMGFMT_YUY2:
940                 {
941                         asm volatile(
942                                 YSCALEYUV2PACKEDX
943                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
944
945                                 "psraw $3, %%mm3                \n\t"
946                                 "psraw $3, %%mm4                \n\t"
947                                 "psraw $3, %%mm1                \n\t"
948                                 "psraw $3, %%mm7                \n\t"
949                                 WRITEYUY2(%4, %5, %%REGa)
950
951                         :: "r" (&c->redDither), 
952                            "m" (dummy), "m" (dummy), "m" (dummy),
953                            "r" (dest), "m" (dstW)
954                         : "%"REG_a, "%"REG_d, "%"REG_S
955                         );
956                 }
957                 break;
958 #endif
959         default:
960 #ifdef HAVE_ALTIVEC
961                 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
962                             chrFilter, chrSrc, chrFilterSize,
963                             dest, dstW, dstY);
964 #else
965                 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
966                             chrFilter, chrSrc, chrFilterSize,
967                             dest, dstW, dstY);
968 #endif
969                 break;
970         }
971 }
972
973 /**
974  * vertical bilinear scale YV12 to RGB
975  */
976 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
977                             uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
978 {
979         int yalpha1=yalpha^4095;        /* == 4095-yalpha, the complementary 12-bit blend weight */
980         int uvalpha1=uvalpha^4095;      /* == 4095-uvalpha */
981         int i;
982
983 #if 0 //isn't used
984         if(flags&SWS_FULL_CHR_H_INT)
985         {
986                 switch(dstFormat)
987                 {
988 #ifdef HAVE_MMX
989                 case IMGFMT_BGR32:
990                         asm volatile(
991
992
993 FULL_YSCALEYUV2RGB
994                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
995                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
996
997                         "movq %%mm3, %%mm1              \n\t"
998                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
999                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1000
1001                         MOVNTQ(%%mm3, (%4, %%REGa, 4))
1002                         MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1003
1004                         "add $4, %%"REG_a"              \n\t"
1005                         "cmp %5, %%"REG_a"              \n\t"
1006                         " jb 1b                         \n\t"
1007
1008
1009                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1010                         "m" (yalpha1), "m" (uvalpha1)
1011                         : "%"REG_a
1012                         );
1013                         break;
1014                 case IMGFMT_BGR24:
1015                         asm volatile(
1016
1017 FULL_YSCALEYUV2RGB
1018
1019                                                                 // lsb ... msb
1020                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
1021                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
1022
1023                         "movq %%mm3, %%mm1              \n\t"
1024                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
1025                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1026
1027                         "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
1028                         "psrlq $8, %%mm3                \n\t" // GR0BGR00
1029                         "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1030                         "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1031                         "por %%mm2, %%mm3               \n\t" // BGRBGR00
1032                         "movq %%mm1, %%mm2              \n\t"
1033                         "psllq $48, %%mm1               \n\t" // 000000BG
1034                         "por %%mm1, %%mm3               \n\t" // BGRBGRBG
1035
1036                         "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
1037                         "psrld $16, %%mm2               \n\t" // R000R000
1038                         "psrlq $24, %%mm1               \n\t" // 0BGR0000
1039                         "por %%mm2, %%mm1               \n\t" // RBGRR000
1040
1041                         "mov %4, %%"REG_b"              \n\t"
1042                         "add %%"REG_a", %%"REG_b"       \n\t"
1043
1044 #ifdef HAVE_MMX2
1045                         //FIXME Alignment
1046                         "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1047                         "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1048 #else
1049                         "movd %%mm3, (%%"REG_b", %%"REG_a", 2)  \n\t"
1050                         "psrlq $32, %%mm3               \n\t"
1051                         "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1052                         "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1053 #endif
1054                         "add $4, %%"REG_a"              \n\t"
1055                         "cmp %5, %%"REG_a"              \n\t"
1056                         " jb 1b                         \n\t"
1057
1058                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1059                         "m" (yalpha1), "m" (uvalpha1)
1060                         : "%"REG_a, "%"REG_b
1061                         );
1062                         break;
1063                 case IMGFMT_BGR15:
1064                         asm volatile(
1065
1066 FULL_YSCALEYUV2RGB
1067 #ifdef DITHER1XBPP
1068                         "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1069                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1070                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1071 #endif
1072                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1073                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1074                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1075
1076                         "psrlw $3, %%mm3                \n\t"
1077                         "psllw $2, %%mm1                \n\t"
1078                         "psllw $7, %%mm0                \n\t"
1079                         "pand "MANGLE(g15Mask)", %%mm1  \n\t"
1080                         "pand "MANGLE(r15Mask)", %%mm0  \n\t"
1081
1082                         "por %%mm3, %%mm1               \n\t"
1083                         "por %%mm1, %%mm0               \n\t"
1084
1085                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1086
1087                         "add $4, %%"REG_a"              \n\t"
1088                         "cmp %5, %%"REG_a"              \n\t"
1089                         " jb 1b                         \n\t"
1090
1091                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1092                         "m" (yalpha1), "m" (uvalpha1)
1093                         : "%"REG_a
1094                         );
1095                         break;
1096                 case IMGFMT_BGR16:
1097                         asm volatile(
1098
1099 FULL_YSCALEYUV2RGB
1100 #ifdef DITHER1XBPP
1101                         "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1102                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1103                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1104 #endif
1105                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1106                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1107                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1108
1109                         "psrlw $3, %%mm3                \n\t"
1110                         "psllw $3, %%mm1                \n\t"
1111                         "psllw $8, %%mm0                \n\t"
1112                         "pand "MANGLE(g16Mask)", %%mm1  \n\t"
1113                         "pand "MANGLE(r16Mask)", %%mm0  \n\t"
1114
1115                         "por %%mm3, %%mm1               \n\t"
1116                         "por %%mm1, %%mm0               \n\t"
1117
1118                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1119
1120                         "add $4, %%"REG_a"              \n\t"
1121                         "cmp %5, %%"REG_a"              \n\t"
1122                         " jb 1b                         \n\t"
1123
1124                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1125                         "m" (yalpha1), "m" (uvalpha1)
1126                         : "%"REG_a
1127                         );
1128                 break;
1129 #endif
1130                 case IMGFMT_RGB32:
1131 #ifndef HAVE_MMX
1132                 case IMGFMT_BGR32:
1133 #endif
1134                 if(dstFormat==IMGFMT_BGR32)
1135                 {
1136                         int i;
1137 #ifdef WORDS_BIGENDIAN
1138                         dest++;
1139 #endif
1140                         for(i=0;i<dstW;i++){
1141                                 // vertical linear interpolation && yuv2rgb in a single step:
1142                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1143                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1144                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1145                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1146                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1147                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1148                                 dest+= 4;
1149                         }
1150                 }
1151                 else if(dstFormat==IMGFMT_BGR24)
1152                 {
1153                         int i;
1154                         for(i=0;i<dstW;i++){
1155                                 // vertical linear interpolation && yuv2rgb in a single step:
1156                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1157                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1158                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1159                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1160                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1161                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1162                                 dest+= 3;
1163                         }
1164                 }
1165                 else if(dstFormat==IMGFMT_BGR16)
1166                 {
1167                         int i;
1168                         for(i=0;i<dstW;i++){
1169                                 // vertical linear interpolation && yuv2rgb in a single step:
1170                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1171                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1172                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1173
1174                                 ((uint16_t*)dest)[i] =
1175                                         clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1176                                         clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1177                                         clip_table16r[(Y + yuvtab_3343[V]) >>13];
1178                         }
1179                 }
1180                 else if(dstFormat==IMGFMT_BGR15)
1181                 {
1182                         int i;
1183                         for(i=0;i<dstW;i++){
1184                                 // vertical linear interpolation && yuv2rgb in a single step:
1185                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1186                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1187                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1188
1189                                 ((uint16_t*)dest)[i] =
1190                                         clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1191                                         clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1192                                         clip_table15r[(Y + yuvtab_3343[V]) >>13];
1193                         }
1194                 }
1195         }//FULL_UV_IPOL
1196         else
1197         {
1198 #endif // if 0
1199 #ifdef HAVE_MMX
1200         switch(c->dstFormat)
1201         {
1202 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
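/*
 * The asm below is short on general purpose registers, so the stack pointer is
 * temporarily repurposed to address dest: the real %esp/%rsp is stashed in the
 * SwsContext at ESP_OFFSET and restored right after the WRITE* macro.
 */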
1203         case IMGFMT_BGR32:
1204                         asm volatile(
1205                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1206                                 "mov %4, %%"REG_SP"                     \n\t"
1207                                 YSCALEYUV2RGB(%%REGa, %5)
1208                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1209                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1210
1211                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1212                         "r" (&c->redDither)
1213                         : "%"REG_a
1214                         );
1215                         return;
1216         case IMGFMT_BGR24:
1217                         asm volatile(
1218                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1219                                 "mov %4, %%"REG_SP"                     \n\t"
1220                                 YSCALEYUV2RGB(%%REGa, %5)
1221                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1222                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1223                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1224                         "r" (&c->redDither)
1225                         : "%"REG_a
1226                         );
1227                         return;
1228         case IMGFMT_BGR15:
1229                         asm volatile(
1230                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1231                                 "mov %4, %%"REG_SP"                     \n\t"
1232                                 YSCALEYUV2RGB(%%REGa, %5)
1233                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1234 #ifdef DITHER1XBPP
1235                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1236                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1237                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1238 #endif
1239
1240                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1241                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1242
1243                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1244                         "r" (&c->redDither)
1245                         : "%"REG_a
1246                         );
1247                         return;
1248         case IMGFMT_BGR16:
1249                         asm volatile(
1250                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1251                                 "mov %4, %%"REG_SP"                     \n\t"
1252                                 YSCALEYUV2RGB(%%REGa, %5)
1253                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1254 #ifdef DITHER1XBPP
1255                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1256                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1257                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1258 #endif
1259
1260                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1261                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1262                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1263                         "r" (&c->redDither)
1264                         : "%"REG_a
1265                         );
1266                         return;
1267         case IMGFMT_YUY2:
1268                         asm volatile(
1269                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1270                                 "mov %4, %%"REG_SP"                     \n\t"
1271                                 YSCALEYUV2PACKED(%%REGa, %5)
1272                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1273                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1274                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1275                         "r" (&c->redDither)
1276                         : "%"REG_a
1277                         );
1278                         return;
1279         default: break;
1280         }
1281 #endif //HAVE_MMX
1282 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1283 }
1284
1285 /**
1286  * YV12 to RGB without scaling or interpolating
1287  */
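/*
 * Only buf0 contributes to luma here (buf1 is aliased to buf0 below).  For
 * chroma, uvalpha < 2048 picks the plain YSCALEYUV2*1 variants, which read
 * just uvbuf0 (hence the half-pixel chroma shift noted below); otherwise the
 * *1b variants blend uvbuf0 and uvbuf1.
 */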
1288 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1289                             uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1290 {
1291         const int yalpha1=0;
1292         int i;
1293         
1294         uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1295         const int yalpha= 4096; //FIXME ...
1296
1297         if(flags&SWS_FULL_CHR_H_INT)
1298         {
1299                 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1300                 return;
1301         }
1302
1303 #ifdef HAVE_MMX
1304         if( uvalpha < 2048 ) // note this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster
1305         {
1306                 switch(dstFormat)
1307                 {
1308                 case IMGFMT_BGR32:
1309                         asm volatile(
1310                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1311                                 "mov %4, %%"REG_SP"                     \n\t"
1312                                 YSCALEYUV2RGB1(%%REGa, %5)
1313                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1314                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1315
1316                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1317                         "r" (&c->redDither)
1318                         : "%"REG_a
1319                         );
1320                         return;
1321                 case IMGFMT_BGR24:
1322                         asm volatile(
1323                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1324                                 "mov %4, %%"REG_SP"                     \n\t"
1325                                 YSCALEYUV2RGB1(%%REGa, %5)
1326                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1327                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1328
1329                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1330                         "r" (&c->redDither)
1331                         : "%"REG_a
1332                         );
1333                         return;
1334                 case IMGFMT_BGR15:
1335                         asm volatile(
1336                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1337                                 "mov %4, %%"REG_SP"                     \n\t"
1338                                 YSCALEYUV2RGB1(%%REGa, %5)
1339                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1340 #ifdef DITHER1XBPP
1341                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1342                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1343                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1344 #endif
1345                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1346                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1347
1348                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1349                         "r" (&c->redDither)
1350                         : "%"REG_a
1351                         );
1352                         return;
1353                 case IMGFMT_BGR16:
1354                         asm volatile(
1355                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1356                                 "mov %4, %%"REG_SP"                     \n\t"
1357                                 YSCALEYUV2RGB1(%%REGa, %5)
1358                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1359 #ifdef DITHER1XBPP
1360                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1361                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1362                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1363 #endif
1364
1365                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1366                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1367
1368                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1369                         "r" (&c->redDither)
1370                         : "%"REG_a
1371                         );
1372                         return;
1373                 case IMGFMT_YUY2:
1374                         asm volatile(
1375                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1376                                 "mov %4, %%"REG_SP"                     \n\t"
1377                                 YSCALEYUV2PACKED1(%%REGa, %5)
1378                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1379                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1380
1381                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1382                         "r" (&c->redDither)
1383                         : "%"REG_a
1384                         );
1385                         return;
1386                 }
1387         }
1388         else
1389         {
1390                 switch(dstFormat)
1391                 {
1392                 case IMGFMT_BGR32:
1393                         asm volatile(
1394                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1395                                 "mov %4, %%"REG_SP"                     \n\t"
1396                                 YSCALEYUV2RGB1b(%%REGa, %5)
1397                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1398                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1399
1400                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1401                         "r" (&c->redDither)
1402                         : "%"REG_a
1403                         );
1404                         return;
1405                 case IMGFMT_BGR24:
1406                         asm volatile(
1407                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1408                                 "mov %4, %%"REG_SP"                     \n\t"
1409                                 YSCALEYUV2RGB1b(%%REGa, %5)
1410                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1411                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1412
1413                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1414                         "r" (&c->redDither)
1415                         : "%"REG_a
1416                         );
1417                         return;
1418                 case IMGFMT_BGR15:
1419                         asm volatile(
1420                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1421                                 "mov %4, %%"REG_SP"                     \n\t"
1422                                 YSCALEYUV2RGB1b(%%REGa, %5)
1423                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1424 #ifdef DITHER1XBPP
1425                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1426                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1427                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1428 #endif
1429                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1430                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1431
1432                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1433                         "r" (&c->redDither)
1434                         : "%"REG_a
1435                         );
1436                         return;
1437                 case IMGFMT_BGR16:
1438                         asm volatile(
1439                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1440                                 "mov %4, %%"REG_SP"                     \n\t"
1441                                 YSCALEYUV2RGB1b(%%REGa, %5)
1442                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1443 #ifdef DITHER1XBPP
1444                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1445                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1446                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1447 #endif
1448
1449                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1450                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1451
1452                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1453                         "r" (&c->redDither)
1454                         : "%"REG_a
1455                         );
1456                         return;
1457                 case IMGFMT_YUY2:
1458                         asm volatile(
1459                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1460                                 "mov %4, %%"REG_SP"                     \n\t"
1461                                 YSCALEYUV2PACKED1b(%%REGa, %5)
1462                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1463                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1464
1465                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1466                         "r" (&c->redDither)
1467                         : "%"REG_a
1468                         );
1469                         return;
1470                 }
1471         }
1472 #endif
1473         if( uvalpha < 2048 )
1474         {
1475                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1476         }else{
1477                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1478         }
1479 }
1480
1481 //FIXME yuy2* can read up to 7 samples too much
1482
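// YUY2 stores pixels as Y0 U0 Y1 V0, so luma sits in the even bytes; the MMX
// path masks them out with bm01010101 and packs, the C fallback is simply
// dst[i]= src[2*i].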
1483 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1484 {
1485 #ifdef HAVE_MMX
1486         asm volatile(
1487                 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1488                 "mov %0, %%"REG_a"              \n\t"
1489                 "1:                             \n\t"
1490                 "movq (%1, %%"REG_a",2), %%mm0  \n\t"
1491                 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1492                 "pand %%mm2, %%mm0              \n\t"
1493                 "pand %%mm2, %%mm1              \n\t"
1494                 "packuswb %%mm1, %%mm0          \n\t"
1495                 "movq %%mm0, (%2, %%"REG_a")    \n\t"
1496                 "add $8, %%"REG_a"              \n\t"
1497                 " js 1b                         \n\t"
1498                 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1499                 : "%"REG_a
1500         );
1501 #else
1502         int i;
1503         for(i=0; i<width; i++)
1504                 dst[i]= src[2*i];
1505 #endif
1506 }
1507
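// U and V sit in the odd bytes (4*i+1 and 4*i+3); the two source lines are
// averaged (PAVGB in the MMX2/3DNow path, (a+b)>>1 in C) to downsample the
// chroma vertically.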
1508 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1509 {
1510 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1511         asm volatile(
1512                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1513                 "mov %0, %%"REG_a"              \n\t"
1514                 "1:                             \n\t"
1515                 "movq (%1, %%"REG_a",4), %%mm0  \n\t"
1516                 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1517                 "movq (%2, %%"REG_a",4), %%mm2  \n\t"
1518                 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1519                 PAVGB(%%mm2, %%mm0)
1520                 PAVGB(%%mm3, %%mm1)
1521                 "psrlw $8, %%mm0                \n\t"
1522                 "psrlw $8, %%mm1                \n\t"
1523                 "packuswb %%mm1, %%mm0          \n\t"
1524                 "movq %%mm0, %%mm1              \n\t"
1525                 "psrlw $8, %%mm0                \n\t"
1526                 "pand %%mm4, %%mm1              \n\t"
1527                 "packuswb %%mm0, %%mm0          \n\t"
1528                 "packuswb %%mm1, %%mm1          \n\t"
1529                 "movd %%mm0, (%4, %%"REG_a")    \n\t"
1530                 "movd %%mm1, (%3, %%"REG_a")    \n\t"
1531                 "add $4, %%"REG_a"              \n\t"
1532                 " js 1b                         \n\t"
1533                 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1534                 : "%"REG_a
1535         );
1536 #else
1537         int i;
1538         for(i=0; i<width; i++)
1539         {
1540                 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1541                 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1542         }
1543 #endif
1544 }
1545
1546 // this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1547 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
1548 {
1549 #ifdef HAVE_MMX
1550         asm volatile(
1551                 "mov %0, %%"REG_a"              \n\t"
1552                 "1:                             \n\t"
1553                 "movq (%1, %%"REG_a",2), %%mm0  \n\t"
1554                 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1555                 "psrlw $8, %%mm0                \n\t"
1556                 "psrlw $8, %%mm1                \n\t"
1557                 "packuswb %%mm1, %%mm0          \n\t"
1558                 "movq %%mm0, (%2, %%"REG_a")    \n\t"
1559                 "add $8, %%"REG_a"              \n\t"
1560                 " js 1b                         \n\t"
1561                 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1562                 : "%"REG_a
1563         );
1564 #else
1565         int i;
1566         for(i=0; i<width; i++)
1567                 dst[i]= src[2*i+1];
1568 #endif
1569 }
1570
1571 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1572 {
1573 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1574         asm volatile(
1575                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1576                 "mov %0, %%"REG_a"              \n\t"
1577                 "1:                             \n\t"
1578                 "movq (%1, %%"REG_a",4), %%mm0  \n\t"
1579                 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1580                 "movq (%2, %%"REG_a",4), %%mm2  \n\t"
1581                 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1582                 PAVGB(%%mm2, %%mm0)
1583                 PAVGB(%%mm3, %%mm1)
1584                 "pand %%mm4, %%mm0              \n\t"
1585                 "pand %%mm4, %%mm1              \n\t"
1586                 "packuswb %%mm1, %%mm0          \n\t"
1587                 "movq %%mm0, %%mm1              \n\t"
1588                 "psrlw $8, %%mm0                \n\t"
1589                 "pand %%mm4, %%mm1              \n\t"
1590                 "packuswb %%mm0, %%mm0          \n\t"
1591                 "packuswb %%mm1, %%mm1          \n\t"
1592                 "movd %%mm0, (%4, %%"REG_a")    \n\t"
1593                 "movd %%mm1, (%3, %%"REG_a")    \n\t"
1594                 "add $4, %%"REG_a"              \n\t"
1595                 " js 1b                         \n\t"
1596                 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1597                 : "%"REG_a
1598         );
1599 #else
1600         int i;
1601         for(i=0; i<width; i++)
1602         {
1603                 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1604                 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
1605         }
1606 #endif
1607 }
1608
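// BGR32: on little-endian each pixel is stored as B,G,R,X.  The constant
// (33<<(RGB2YUV_SHIFT-1)) is 16.5 in this fixed-point scale, i.e. the +16 luma
// offset plus 0.5 for rounding folded into one term.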
1609 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1610 {
1611 #ifdef HAVE_MMXFIXME
1612 #else
1613         int i;
1614         for(i=0; i<width; i++)
1615         {
1616                 int b=  ((uint32_t*)src)[i]&0xFF;
1617                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1618                 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1619
1620                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1621         }
1622 #endif
1623 }
1624
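// Chroma is taken from a 2x2 block (two adjacent pixels from each line).  The
// 0xFF00FF / 0x00FF00 masks keep the B and R byte sums in separate lanes of l
// and the G sums in h, so four pixels can be summed with plain 32-bit adds;
// the >>(RGB2YUV_SHIFT+2) then also divides the four-sample sums by 4.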
1625 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1626 {
1627 #ifdef HAVE_MMXFIXME
1628 #else
1629         int i;
1630         for(i=0; i<width; i++)
1631         {
1632                 const int a= ((uint32_t*)src1)[2*i+0];
1633                 const int e= ((uint32_t*)src1)[2*i+1];
1634                 const int c= ((uint32_t*)src2)[2*i+0];
1635                 const int d= ((uint32_t*)src2)[2*i+1];
1636                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1637                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1638                 const int b=  l&0x3FF;
1639                 const int g=  h>>8;
1640                 const int r=  l>>16;
1641
1642                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1643                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1644         }
1645 #endif
1646 }
1647
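// MMX path: 8 pixels per loop iteration.  Each movd picks up one 3-byte pixel
// (plus a stray byte of the next one), and the weighted sum is formed by
// pmaddwd against the bgr2YCoeff constants; the C fallback below spells out
// the formula.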
1648 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1649 {
1650 #ifdef HAVE_MMX
1651         asm volatile(
1652                 "mov %2, %%"REG_a"              \n\t"
1653                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1654                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1655                 "pxor %%mm7, %%mm7              \n\t"
1656                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1657                 ".balign 16                     \n\t"
1658                 "1:                             \n\t"
1659                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
1660                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
1661                 "movd 3(%0, %%"REG_b"), %%mm1   \n\t"
1662                 "punpcklbw %%mm7, %%mm0         \n\t"
1663                 "punpcklbw %%mm7, %%mm1         \n\t"
1664                 "movd 6(%0, %%"REG_b"), %%mm2   \n\t"
1665                 "movd 9(%0, %%"REG_b"), %%mm3   \n\t"
1666                 "punpcklbw %%mm7, %%mm2         \n\t"
1667                 "punpcklbw %%mm7, %%mm3         \n\t"
1668                 "pmaddwd %%mm6, %%mm0           \n\t"
1669                 "pmaddwd %%mm6, %%mm1           \n\t"
1670                 "pmaddwd %%mm6, %%mm2           \n\t"
1671                 "pmaddwd %%mm6, %%mm3           \n\t"
1672 #ifndef FAST_BGR2YV12
1673                 "psrad $8, %%mm0                \n\t"
1674                 "psrad $8, %%mm1                \n\t"
1675                 "psrad $8, %%mm2                \n\t"
1676                 "psrad $8, %%mm3                \n\t"
1677 #endif
1678                 "packssdw %%mm1, %%mm0          \n\t"
1679                 "packssdw %%mm3, %%mm2          \n\t"
1680                 "pmaddwd %%mm5, %%mm0           \n\t"
1681                 "pmaddwd %%mm5, %%mm2           \n\t"
1682                 "packssdw %%mm2, %%mm0          \n\t"
1683                 "psraw $7, %%mm0                \n\t"
1684
1685                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
1686                 "movd 15(%0, %%"REG_b"), %%mm1  \n\t"
1687                 "punpcklbw %%mm7, %%mm4         \n\t"
1688                 "punpcklbw %%mm7, %%mm1         \n\t"
1689                 "movd 18(%0, %%"REG_b"), %%mm2  \n\t"
1690                 "movd 21(%0, %%"REG_b"), %%mm3  \n\t"
1691                 "punpcklbw %%mm7, %%mm2         \n\t"
1692                 "punpcklbw %%mm7, %%mm3         \n\t"
1693                 "pmaddwd %%mm6, %%mm4           \n\t"
1694                 "pmaddwd %%mm6, %%mm1           \n\t"
1695                 "pmaddwd %%mm6, %%mm2           \n\t"
1696                 "pmaddwd %%mm6, %%mm3           \n\t"
1697 #ifndef FAST_BGR2YV12
1698                 "psrad $8, %%mm4                \n\t"
1699                 "psrad $8, %%mm1                \n\t"
1700                 "psrad $8, %%mm2                \n\t"
1701                 "psrad $8, %%mm3                \n\t"
1702 #endif
1703                 "packssdw %%mm1, %%mm4          \n\t"
1704                 "packssdw %%mm3, %%mm2          \n\t"
1705                 "pmaddwd %%mm5, %%mm4           \n\t"
1706                 "pmaddwd %%mm5, %%mm2           \n\t"
1707                 "add $24, %%"REG_b"             \n\t"
1708                 "packssdw %%mm2, %%mm4          \n\t"
1709                 "psraw $7, %%mm4                \n\t"
1710
1711                 "packuswb %%mm4, %%mm0          \n\t"
1712                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1713
1714                 "movq %%mm0, (%1, %%"REG_a")    \n\t"
1715                 "add $8, %%"REG_a"              \n\t"
1716                 " js 1b                         \n\t"
1717                 : : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
1718                 : "%"REG_a, "%"REG_b
1719         );
1720 #else
1721         int i;
1722         for(i=0; i<width; i++)
1723         {
1724                 int b= src[i*3+0];
1725                 int g= src[i*3+1];
1726                 int r= src[i*3+2];
1727
1728                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1729         }
1730 #endif
1731 }
1732
1733 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1734 {
1735 #ifdef HAVE_MMX
1736         asm volatile(
1737                 "mov %4, %%"REG_a"              \n\t"
1738                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1739                 "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1740                 "pxor %%mm7, %%mm7              \n\t"
1741                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"       \n\t"
1742                 "add %%"REG_b", %%"REG_b"       \n\t"
1743                 ".balign 16                     \n\t"
1744                 "1:                             \n\t"
1745                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
1746                 PREFETCH" 64(%1, %%"REG_b")     \n\t"
1747 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1748                 "movq (%0, %%"REG_b"), %%mm0    \n\t"
1749                 "movq (%1, %%"REG_b"), %%mm1    \n\t"
1750                 "movq 6(%0, %%"REG_b"), %%mm2   \n\t"
1751                 "movq 6(%1, %%"REG_b"), %%mm3   \n\t"
1752                 PAVGB(%%mm1, %%mm0)
1753                 PAVGB(%%mm3, %%mm2)
1754                 "movq %%mm0, %%mm1              \n\t"
1755                 "movq %%mm2, %%mm3              \n\t"
1756                 "psrlq $24, %%mm0               \n\t"
1757                 "psrlq $24, %%mm2               \n\t"
1758                 PAVGB(%%mm1, %%mm0)
1759                 PAVGB(%%mm3, %%mm2)
1760                 "punpcklbw %%mm7, %%mm0         \n\t"
1761                 "punpcklbw %%mm7, %%mm2         \n\t"
1762 #else
1763                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
1764                 "movd (%1, %%"REG_b"), %%mm1    \n\t"
1765                 "movd 3(%0, %%"REG_b"), %%mm2   \n\t"
1766                 "movd 3(%1, %%"REG_b"), %%mm3   \n\t"
1767                 "punpcklbw %%mm7, %%mm0         \n\t"
1768                 "punpcklbw %%mm7, %%mm1         \n\t"
1769                 "punpcklbw %%mm7, %%mm2         \n\t"
1770                 "punpcklbw %%mm7, %%mm3         \n\t"
1771                 "paddw %%mm1, %%mm0             \n\t"
1772                 "paddw %%mm3, %%mm2             \n\t"
1773                 "paddw %%mm2, %%mm0             \n\t"
1774                 "movd 6(%0, %%"REG_b"), %%mm4   \n\t"
1775                 "movd 6(%1, %%"REG_b"), %%mm1   \n\t"
1776                 "movd 9(%0, %%"REG_b"), %%mm2   \n\t"
1777                 "movd 9(%1, %%"REG_b"), %%mm3   \n\t"
1778                 "punpcklbw %%mm7, %%mm4         \n\t"
1779                 "punpcklbw %%mm7, %%mm1         \n\t"
1780                 "punpcklbw %%mm7, %%mm2         \n\t"
1781                 "punpcklbw %%mm7, %%mm3         \n\t"
1782                 "paddw %%mm1, %%mm4             \n\t"
1783                 "paddw %%mm3, %%mm2             \n\t"
1784                 "paddw %%mm4, %%mm2             \n\t"
1785                 "psrlw $2, %%mm0                \n\t"
1786                 "psrlw $2, %%mm2                \n\t"
1787 #endif
1788                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1789                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1790                 
1791                 "pmaddwd %%mm0, %%mm1           \n\t"
1792                 "pmaddwd %%mm2, %%mm3           \n\t"
1793                 "pmaddwd %%mm6, %%mm0           \n\t"
1794                 "pmaddwd %%mm6, %%mm2           \n\t"
1795 #ifndef FAST_BGR2YV12
1796                 "psrad $8, %%mm0                \n\t"
1797                 "psrad $8, %%mm1                \n\t"
1798                 "psrad $8, %%mm2                \n\t"
1799                 "psrad $8, %%mm3                \n\t"
1800 #endif
1801                 "packssdw %%mm2, %%mm0          \n\t"
1802                 "packssdw %%mm3, %%mm1          \n\t"
1803                 "pmaddwd %%mm5, %%mm0           \n\t"
1804                 "pmaddwd %%mm5, %%mm1           \n\t"
1805                 "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1806                 "psraw $7, %%mm0                \n\t"
1807
1808 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1809                 "movq 12(%0, %%"REG_b"), %%mm4  \n\t"
1810                 "movq 12(%1, %%"REG_b"), %%mm1  \n\t"
1811                 "movq 18(%0, %%"REG_b"), %%mm2  \n\t"
1812                 "movq 18(%1, %%"REG_b"), %%mm3  \n\t"
1813                 PAVGB(%%mm1, %%mm4)
1814                 PAVGB(%%mm3, %%mm2)
1815                 "movq %%mm4, %%mm1              \n\t"
1816                 "movq %%mm2, %%mm3              \n\t"
1817                 "psrlq $24, %%mm4               \n\t"
1818                 "psrlq $24, %%mm2               \n\t"
1819                 PAVGB(%%mm1, %%mm4)
1820                 PAVGB(%%mm3, %%mm2)
1821                 "punpcklbw %%mm7, %%mm4         \n\t"
1822                 "punpcklbw %%mm7, %%mm2         \n\t"
1823 #else
1824                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
1825                 "movd 12(%1, %%"REG_b"), %%mm1  \n\t"
1826                 "movd 15(%0, %%"REG_b"), %%mm2  \n\t"
1827                 "movd 15(%1, %%"REG_b"), %%mm3  \n\t"
1828                 "punpcklbw %%mm7, %%mm4         \n\t"
1829                 "punpcklbw %%mm7, %%mm1         \n\t"
1830                 "punpcklbw %%mm7, %%mm2         \n\t"
1831                 "punpcklbw %%mm7, %%mm3         \n\t"
1832                 "paddw %%mm1, %%mm4             \n\t"
1833                 "paddw %%mm3, %%mm2             \n\t"
1834                 "paddw %%mm2, %%mm4             \n\t"
1835                 "movd 18(%0, %%"REG_b"), %%mm5  \n\t"
1836                 "movd 18(%1, %%"REG_b"), %%mm1  \n\t"
1837                 "movd 21(%0, %%"REG_b"), %%mm2  \n\t"
1838                 "movd 21(%1, %%"REG_b"), %%mm3  \n\t"
1839                 "punpcklbw %%mm7, %%mm5         \n\t"
1840                 "punpcklbw %%mm7, %%mm1         \n\t"
1841                 "punpcklbw %%mm7, %%mm2         \n\t"
1842                 "punpcklbw %%mm7, %%mm3         \n\t"
1843                 "paddw %%mm1, %%mm5             \n\t"
1844                 "paddw %%mm3, %%mm2             \n\t"
1845                 "paddw %%mm5, %%mm2             \n\t"
1846                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1847                 "psrlw $2, %%mm4                \n\t"
1848                 "psrlw $2, %%mm2                \n\t"
1849 #endif
1850                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1851                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1852                 
1853                 "pmaddwd %%mm4, %%mm1           \n\t"
1854                 "pmaddwd %%mm2, %%mm3           \n\t"
1855                 "pmaddwd %%mm6, %%mm4           \n\t"
1856                 "pmaddwd %%mm6, %%mm2           \n\t"
1857 #ifndef FAST_BGR2YV12
1858                 "psrad $8, %%mm4                \n\t"
1859                 "psrad $8, %%mm1                \n\t"
1860                 "psrad $8, %%mm2                \n\t"
1861                 "psrad $8, %%mm3                \n\t"
1862 #endif
1863                 "packssdw %%mm2, %%mm4          \n\t"
1864                 "packssdw %%mm3, %%mm1          \n\t"
1865                 "pmaddwd %%mm5, %%mm4           \n\t"
1866                 "pmaddwd %%mm5, %%mm1           \n\t"
1867                 "add $24, %%"REG_b"             \n\t"
1868                 "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1869                 "psraw $7, %%mm4                \n\t"
1870                 
1871                 "movq %%mm0, %%mm1              \n\t"
1872                 "punpckldq %%mm4, %%mm0         \n\t"
1873                 "punpckhdq %%mm4, %%mm1         \n\t"
1874                 "packsswb %%mm1, %%mm0          \n\t"
1875                 "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
1876
1877                 "movd %%mm0, (%2, %%"REG_a")    \n\t"
1878                 "punpckhdq %%mm0, %%mm0         \n\t"
1879                 "movd %%mm0, (%3, %%"REG_a")    \n\t"
1880                 "add $4, %%"REG_a"              \n\t"
1881                 " js 1b                         \n\t"
1882                 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
1883                 : "%"REG_a, "%"REG_b
1884         );
1885 #else
1886         int i;
1887         for(i=0; i<width; i++)
1888         {
1889                 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1890                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1891                 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1892
1893                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1894                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1895         }
1896 #endif
1897 }
1898
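// RGB565: 5-bit B, 6-bit G, 5-bit R.  The doubled R/B coefficients together
// with the smaller shift (RGB2YUV_SHIFT-2) scale the 5/6-bit fields up to
// roughly 8-bit range (x8 for R/B, x4 for G) before the usual weighting.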
1899 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1900 {
1901         int i;
1902         for(i=0; i<width; i++)
1903         {
1904                 int d= ((uint16_t*)src)[i];
1905                 int b= d&0x1F;
1906                 int g= (d>>5)&0x3F;
1907                 int r= (d>>11)&0x1F;
1908
1909                 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1910         }
1911 }
1912
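// The mask juggling below accumulates the B, G and R fields of a 2x2 block
// (two adjacent RGB565 pixels from each of the two lines) into separate lanes
// of d, so b, r and g end up as four-sample sums; the combined shift folds the
// /4 averaging together with the 5/6-bit to 8-bit scaling.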
1913 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1914 {
1915         int i;
1916         for(i=0; i<width; i++)
1917         {
1918                 int d0= ((uint32_t*)src1)[i];
1919                 int d1= ((uint32_t*)src2)[i];
1920                 
1921                 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1922                 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1923
1924                 int dh2= (dh>>11) + (dh<<21);
1925                 int d= dh2 + dl;
1926
1927                 int b= d&0x7F;
1928                 int r= (d>>11)&0x7F;
1929                 int g= d>>21;
1930                 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1931                 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1932         }
1933 }
1934
1935 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1936 {
1937         int i;
1938         for(i=0; i<width; i++)
1939         {
1940                 int d= ((uint16_t*)src)[i];
1941                 int b= d&0x1F;
1942                 int g= (d>>5)&0x1F;
1943                 int r= (d>>10)&0x1F;
1944
1945                 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1946         }
1947 }
1948
1949 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1950 {
1951         int i;
1952         for(i=0; i<width; i++)
1953         {
1954                 int d0= ((uint32_t*)src1)[i];
1955                 int d1= ((uint32_t*)src2)[i];
1956                 
1957                 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1958                 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1959
1960                 int dh2= (dh>>11) + (dh<<21);
1961                 int d= dh2 + dl;
1962
1963                 int b= d&0x7F;
1964                 int r= (d>>10)&0x7F;
1965                 int g= d>>21;
1966                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1967                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1968         }
1969 }
1970
1971
1972 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1973 {
1974         int i;
1975         for(i=0; i<width; i++)
1976         {
1977                 int r=  ((uint32_t*)src)[i]&0xFF;
1978                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1979                 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1980
1981                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1982         }
1983 }
1984
1985 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1986 {
1987         int i;
1988         for(i=0; i<width; i++)
1989         {
1990                 const int a= ((uint32_t*)src1)[2*i+0];
1991                 const int e= ((uint32_t*)src1)[2*i+1];
1992                 const int c= ((uint32_t*)src2)[2*i+0];
1993                 const int d= ((uint32_t*)src2)[2*i+1];
1994                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1995                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1996                 const int r=  l&0x3FF;
1997                 const int g=  h>>8;
1998                 const int b=  l>>16;
1999
2000                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2001                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2002         }
2003 }
2004
2005 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2006 {
2007         int i;
2008         for(i=0; i<width; i++)
2009         {
2010                 int r= src[i*3+0];
2011                 int g= src[i*3+1];
2012                 int b= src[i*3+2];
2013
2014                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2015         }
2016 }
2017
2018 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2019 {
2020         int i;
2021         for(i=0; i<width; i++)
2022         {
2023                 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2024                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2025                 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2026
2027                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2028                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2029         }
2030 }
2031
2032
2033 // Bilinear / Bicubic scaling
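// hScale is a horizontal FIR filter: for each output sample i it accumulates
// filterSize input samples starting at filterPos[i], weighted by the int16_t
// coefficients in filter[], then shifts right by 7 and clips to 0..(1<<15)-1
// (see the plain C loop at the bottom).  The MMX code special-cases
// filterSize 4 and 8 and falls back to a generic inner loop otherwise.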
2034 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2035                                   int16_t *filter, int16_t *filterPos, int filterSize)
2036 {
2037 #ifdef HAVE_MMX
2038         assert(filterSize % 4 == 0 && filterSize>0);
2039         if(filterSize==4) // always true for upscaling, sometimes for downscaling too
2040         {
2041                 long counter= -2*dstW;
2042                 filter-= counter*2;
2043                 filterPos-= counter/2;
2044                 dst-= counter/2;
2045                 asm volatile(
2046                         "pxor %%mm7, %%mm7              \n\t"
2047                         "movq "MANGLE(w02)", %%mm6      \n\t"
2048                         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2049                         "mov %%"REG_a", %%"REG_BP"      \n\t"
2050                         ".balign 16                     \n\t"
2051                         "1:                             \n\t"
2052                         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2053                         "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2054                         "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2055                         "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2056                         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2057                         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2058                         "punpcklbw %%mm7, %%mm0         \n\t"
2059                         "punpcklbw %%mm7, %%mm2         \n\t"
2060                         "pmaddwd %%mm1, %%mm0           \n\t"
2061                         "pmaddwd %%mm2, %%mm3           \n\t"
2062                         "psrad $8, %%mm0                \n\t"
2063                         "psrad $8, %%mm3                \n\t"
2064                         "packssdw %%mm3, %%mm0          \n\t"
2065                         "pmaddwd %%mm6, %%mm0           \n\t"
2066                         "packssdw %%mm0, %%mm0          \n\t"
2067                         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2068                         "add $4, %%"REG_BP"             \n\t"
2069                         " jnc 1b                        \n\t"
2070
2071                         "pop %%"REG_BP"                 \n\t"
2072                         : "+a" (counter)
2073                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2074                         : "%"REG_b
2075                 );
2076         }
2077         else if(filterSize==8)
2078         {
2079                 long counter= -2*dstW;
2080                 filter-= counter*4;
2081                 filterPos-= counter/2;
2082                 dst-= counter/2;
2083                 asm volatile(
2084                         "pxor %%mm7, %%mm7              \n\t"
2085                         "movq "MANGLE(w02)", %%mm6      \n\t"
2086                         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2087                         "mov %%"REG_a", %%"REG_BP"      \n\t"
2088                         ".balign 16                     \n\t"
2089                         "1:                             \n\t"
2090                         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2091                         "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2092                         "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2093                         "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2094                         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2095                         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2096                         "punpcklbw %%mm7, %%mm0         \n\t"
2097                         "punpcklbw %%mm7, %%mm2         \n\t"
2098                         "pmaddwd %%mm1, %%mm0           \n\t"
2099                         "pmaddwd %%mm2, %%mm3           \n\t"
2100
2101                         "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2102                         "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2103                         "movd 4(%3, %%"REG_a"), %%mm4   \n\t"
2104                         "movd 4(%3, %%"REG_b"), %%mm2   \n\t"
2105                         "punpcklbw %%mm7, %%mm4         \n\t"
2106                         "punpcklbw %%mm7, %%mm2         \n\t"
2107                         "pmaddwd %%mm1, %%mm4           \n\t"
2108                         "pmaddwd %%mm2, %%mm5           \n\t"
2109                         "paddd %%mm4, %%mm0             \n\t"
2110                         "paddd %%mm5, %%mm3             \n\t"
2111                                                 
2112                         "psrad $8, %%mm0                \n\t"
2113                         "psrad $8, %%mm3                \n\t"
2114                         "packssdw %%mm3, %%mm0          \n\t"
2115                         "pmaddwd %%mm6, %%mm0           \n\t"
2116                         "packssdw %%mm0, %%mm0          \n\t"
2117                         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2118                         "add $4, %%"REG_BP"             \n\t"
2119                         " jnc 1b                        \n\t"
2120
2121                         "pop %%"REG_BP"                 \n\t"
2122                         : "+a" (counter)
2123                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2124                         : "%"REG_b
2125                 );
2126         }
2127         else
2128         {
2129                 long counter= -2*dstW;
2130 //              filter-= counter*filterSize/2;
2131                 filterPos-= counter/2;
2132                 dst-= counter/2;
2133                 asm volatile(
2134                         "pxor %%mm7, %%mm7              \n\t"
2135                         "movq "MANGLE(w02)", %%mm6      \n\t"
2136                         ".balign 16                     \n\t"
2137                         "1:                             \n\t"
2138                         "mov %2, %%"REG_c"              \n\t"
2139                         "movzwl (%%"REG_c", %0), %%eax  \n\t"
2140                         "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2141                         "mov %5, %%"REG_c"              \n\t"
2142                         "pxor %%mm4, %%mm4              \n\t"
2143                         "pxor %%mm5, %%mm5              \n\t"
2144                         "2:                             \n\t"
2145                         "movq (%1), %%mm1               \n\t"
2146                         "movq (%1, %6), %%mm3           \n\t"
2147                         "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2148                         "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2149                         "punpcklbw %%mm7, %%mm0         \n\t"
2150                         "punpcklbw %%mm7, %%mm2         \n\t"
2151                         "pmaddwd %%mm1, %%mm0           \n\t"
2152                         "pmaddwd %%mm2, %%mm3           \n\t"
2153                         "paddd %%mm3, %%mm5             \n\t"
2154                         "paddd %%mm0, %%mm4             \n\t"
2155                         "add $8, %1                     \n\t"
2156                         "add $4, %%"REG_c"              \n\t"
2157                         "cmp %4, %%"REG_c"              \n\t"
2158                         " jb 2b                         \n\t"
2159                         "add %6, %1                     \n\t"
2160                         "psrad $8, %%mm4                \n\t"
2161                         "psrad $8, %%mm5                \n\t"
2162                         "packssdw %%mm5, %%mm4          \n\t"
2163                         "pmaddwd %%mm6, %%mm4           \n\t"
2164                         "packssdw %%mm4, %%mm4          \n\t"
2165                         "mov %3, %%"REG_a"              \n\t"
2166                         "movd %%mm4, (%%"REG_a", %0)    \n\t"
2167                         "add $4, %0                     \n\t"
2168                         " jnc 1b                        \n\t"
2169
2170                         : "+r" (counter), "+r" (filter)
2171                         : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2172                           "m" (src), "r" ((long)filterSize*2)
2173                         : "%"REG_b, "%"REG_a, "%"REG_c
2174                 );
2175         }
2176 #else
2177 #ifdef HAVE_ALTIVEC
2178         hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2179 #else
2180         int i;
2181         for(i=0; i<dstW; i++)
2182         {
2183                 int j;
2184                 int srcPos= filterPos[i];
2185                 int val=0;
2186 //              printf("filterPos: %d\n", filterPos[i]);
2187                 for(j=0; j<filterSize; j++)
2188                 {
2189 //                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2190                         val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2191                 }
2192 //              filter += hFilterSize;
2193                 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2194 //              dst[i] = val>>7;
2195         }
2196 #endif
2197 #endif
2198 }
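
/* Editor's sketch (not from the original source): the scalar fall-back above is the
   reference for the asm paths in hScale. For every output sample it computes

       dst[i] = clip( (sum over j of src[filterPos[i]+j] * filter[filterSize*i+j]) >> 7,
                      0, (1<<15)-1 )

   e.g. assuming filterSize==2 and a hypothetical coefficient pair {8192, 8192},
   input bytes {100, 200} give val = 100*8192 + 200*8192 = 2457600 and
   dst[i] = 2457600>>7 = 19200 = 150<<7, i.e. the averaged pixel carried with 7 extra
   fractional bits for the vertical scaler. The MMX paths form the same products with
   pmaddwd and only differ in how the partial sums are reduced and rounded. */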
2199 // *** horizontal scale Y line to temp buffer
2200 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2201                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2202                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
2203                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2204                                    int32_t *mmx2FilterPos)
2205 {
2206     if(srcFormat==IMGFMT_YUY2)
2207     {
2208         RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2209         src= formatConvBuffer;
2210     }
2211     else if(srcFormat==IMGFMT_UYVY)
2212     {
2213         RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2214         src= formatConvBuffer;
2215     }
2216     else if(srcFormat==IMGFMT_BGR32)
2217     {
2218         RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2219         src= formatConvBuffer;
2220     }
2221     else if(srcFormat==IMGFMT_BGR24)
2222     {
2223         RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2224         src= formatConvBuffer;
2225     }
2226     else if(srcFormat==IMGFMT_BGR16)
2227     {
2228         RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2229         src= formatConvBuffer;
2230     }
2231     else if(srcFormat==IMGFMT_BGR15)
2232     {
2233         RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2234         src= formatConvBuffer;
2235     }
2236     else if(srcFormat==IMGFMT_RGB32)
2237     {
2238         RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2239         src= formatConvBuffer;
2240     }
2241     else if(srcFormat==IMGFMT_RGB24)
2242     {
2243         RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2244         src= formatConvBuffer;
2245     }
2246
2247 #ifdef HAVE_MMX
2248         // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2249     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2250 #else
2251     if(!(flags&SWS_FAST_BILINEAR))
2252 #endif
2253     {
2254         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2255     }
2256     else // Fast Bilinear upscale / crap downscale
2257     {
2258 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2259 #ifdef HAVE_MMX2
2260         int i;
2261         if(canMMX2BeUsed)
2262         {
2263                 asm volatile(
2264                         "pxor %%mm7, %%mm7              \n\t"
2265                         "mov %0, %%"REG_c"              \n\t"
2266                         "mov %1, %%"REG_D"              \n\t"
2267                         "mov %2, %%"REG_d"              \n\t"
2268                         "mov %3, %%"REG_b"              \n\t"
2269                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2270                         PREFETCH" (%%"REG_c")           \n\t"
2271                         PREFETCH" 32(%%"REG_c")         \n\t"
2272                         PREFETCH" 64(%%"REG_c")         \n\t"
2273
2274 #define FUNNY_Y_CODE \
2275                         "mov (%%"REG_b"), %%"REG_S"     \n\t"\
2276                         "call *%4                       \n\t"\
2277                         "addl (%%"REG_b", %%"REG_a"), %%ecx\n\t"\
2278                         "add %%"REG_a", %%"REG_d"       \n\t"\
2279                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2280
2281 FUNNY_Y_CODE
2282 FUNNY_Y_CODE
2283 FUNNY_Y_CODE
2284 FUNNY_Y_CODE
2285 FUNNY_Y_CODE
2286 FUNNY_Y_CODE
2287 FUNNY_Y_CODE
2288 FUNNY_Y_CODE
2289
2290                         :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2291                         "m" (funnyYCode)
2292                         : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2293                 );
2294                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
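                /* Editor's note: the loop above patches the tail of the line. For the last few
                   output pixels (i*xInc)>>16 reaches srcW-1, so the bilinear pair src[xx]/src[xx+1]
                   would read past the end of the input line; those outputs are instead filled with
                   the last source pixel pre-scaled by 128, the same <<7 fixed point used by the
                   rest of the horizontal scaler. */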
2295         }
2296         else
2297         {
2298 #endif
2299         // no MMX2, just plain x86 asm ...
2300         asm volatile(
2301                 "xor %%"REG_a", %%"REG_a"       \n\t" // i
2302                 "xor %%"REG_b", %%"REG_b"       \n\t" // xx
2303                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2304                 ".balign 16                     \n\t"
2305                 "1:                             \n\t"
2306                 "movzbl  (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2307                 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2308                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2309                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2310                 "shll $16, %%edi                \n\t"
2311                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2312                 "mov %1, %%"REG_D"              \n\t"
2313                 "shrl $9, %%esi                 \n\t"
2314                 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2315                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2316                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2317
2318                 "movzbl (%0, %%"REG_b"), %%edi  \n\t" //src[xx]
2319                 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2320                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2321                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2322                 "shll $16, %%edi                \n\t"
2323                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2324                 "mov %1, %%"REG_D"              \n\t"
2325                 "shrl $9, %%esi                 \n\t"
2326                 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2327                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2328                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2329
2330
2331                 "add $2, %%"REG_a"              \n\t"
2332                 "cmp %2, %%"REG_a"              \n\t"
2333                 " jb 1b                         \n\t"
2334
2335
2336                 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2337                 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2338                 );
2339 #ifdef HAVE_MMX2
2340         } //if MMX2 can't be used
2341 #endif
2342 #else
2343         int i;
2344         unsigned int xpos=0;
2345         for(i=0;i<dstWidth;i++)
2346         {
2347                 register unsigned int xx=xpos>>16;
2348                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2349                 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2350                 xpos+=xInc;
2351         }
2352 #endif
2353     }
2354 }
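
/* Editor's sketch of the fast-bilinear stepping used above: xInc is a 16.16 fixed-point
   step, so after i steps xpos = i*xInc, the integer source index is xx = xpos>>16 and the
   7-bit blend weight is xalpha = (xpos & 0xFFFF) >> 9. For a roughly 2x upscale
   (xInc ~ 0x8000), dst[3] has xx=1, xalpha=64 and becomes
   (src[1]<<7) + (src[2]-src[1])*64, i.e. the midpoint of src[1] and src[2] scaled by 128.
   The x86 asm keeps the fractional part in %cx (called 2*xalpha there) and advances xx
   with an add/adc pair instead of recomputing xpos every iteration. */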
2355
2356 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2357                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2358                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2359                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2360                                    int32_t *mmx2FilterPos)
2361 {
2362     if(srcFormat==IMGFMT_YUY2)
2363     {
2364         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2365         src1= formatConvBuffer;
2366         src2= formatConvBuffer+2048;
2367     }
2368     else if(srcFormat==IMGFMT_UYVY)
2369     {
2370         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2371         src1= formatConvBuffer;
2372         src2= formatConvBuffer+2048;
2373     }
2374     else if(srcFormat==IMGFMT_BGR32)
2375     {
2376         RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2377         src1= formatConvBuffer;
2378         src2= formatConvBuffer+2048;
2379     }
2380     else if(srcFormat==IMGFMT_BGR24)
2381     {
2382         RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2383         src1= formatConvBuffer;
2384         src2= formatConvBuffer+2048;
2385     }
2386     else if(srcFormat==IMGFMT_BGR16)
2387     {
2388         RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2389         src1= formatConvBuffer;
2390         src2= formatConvBuffer+2048;
2391     }
2392     else if(srcFormat==IMGFMT_BGR15)
2393     {
2394         RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2395         src1= formatConvBuffer;
2396         src2= formatConvBuffer+2048;
2397     }
2398     else if(srcFormat==IMGFMT_RGB32)
2399     {
2400         RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2401         src1= formatConvBuffer;
2402         src2= formatConvBuffer+2048;
2403     }
2404     else if(srcFormat==IMGFMT_RGB24)
2405     {
2406         RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2407         src1= formatConvBuffer;
2408         src2= formatConvBuffer+2048;
2409     }
2410     else if(isGray(srcFormat))
2411     {
2412         return;
2413     }
2414
2415 #ifdef HAVE_MMX
2416         // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2417     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2418 #else
2419     if(!(flags&SWS_FAST_BILINEAR))
2420 #endif
2421     {
2422         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2423         RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2424     }
2425     else // Fast Bilinear upscale / crap downscale
2426     {
2427 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2428 #ifdef HAVE_MMX2
2429         int i;
2430         if(canMMX2BeUsed)
2431         {
2432                 asm volatile(
2433                         "pxor %%mm7, %%mm7              \n\t"
2434                         "mov %0, %%"REG_c"              \n\t"
2435                         "mov %1, %%"REG_D"              \n\t"
2436                         "mov %2, %%"REG_d"              \n\t"
2437                         "mov %3, %%"REG_b"              \n\t"
2438                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2439                         PREFETCH" (%%"REG_c")           \n\t"
2440                         PREFETCH" 32(%%"REG_c")         \n\t"
2441                         PREFETCH" 64(%%"REG_c")         \n\t"
2442
2443 #define FUNNY_UV_CODE \
2444                         "movl (%%"REG_b"), %%esi        \n\t"\
2445                         "call *%4                       \n\t"\
2446                         "addl (%%"REG_b", %%"REG_a"), %%ecx\n\t"\
2447                         "add %%"REG_a", %%"REG_D"       \n\t"\
2448                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2449
2450 FUNNY_UV_CODE
2451 FUNNY_UV_CODE
2452 FUNNY_UV_CODE
2453 FUNNY_UV_CODE
2454                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2455                         "mov %5, %%"REG_c"              \n\t" // src
2456                         "mov %1, %%"REG_D"              \n\t" // buf1
2457                         "add $4096, %%"REG_D"           \n\t"
2458                         PREFETCH" (%%"REG_c")           \n\t"
2459                         PREFETCH" 32(%%"REG_c")         \n\t"
2460                         PREFETCH" 64(%%"REG_c")         \n\t"
2461
2462 FUNNY_UV_CODE
2463 FUNNY_UV_CODE
2464 FUNNY_UV_CODE
2465 FUNNY_UV_CODE
2466
2467                         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2468                         "m" (funnyUVCode), "m" (src2)
2469                         : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%esi", "%"REG_D
2470                 );
2471                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2472                 {
2473 //                      printf("%d %d %d\n", dstWidth, i, srcW);
2474                         dst[i] = src1[srcW-1]*128;
2475                         dst[i+2048] = src2[srcW-1]*128;
2476                 }
2477         }
2478         else
2479         {
2480 #endif
2481         asm volatile(
2482                 "xor %%"REG_a", %%"REG_a"       \n\t" // i
2483                 "xor %%"REG_b", %%"REG_b"               \n\t" // xx
2484                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2485                 ".balign 16                     \n\t"
2486                 "1:                             \n\t"
2487                 "mov %0, %%"REG_S"              \n\t"
2488                 "movzbl  (%%"REG_S", %%"REG_b"), %%edi  \n\t" //src[xx]
2489                 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi  \n\t" //src[xx+1]
2490                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2491                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2492                 "shll $16, %%edi                \n\t"
2493                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2494                 "mov %1, %%"REG_D"              \n\t"
2495                 "shrl $9, %%esi                 \n\t"
2496                 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2497
2498                 "movzbl  (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2499                 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2500                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2501                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2502                 "shll $16, %%edi                \n\t"
2503                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2504                 "mov %1, %%"REG_D"              \n\t"
2505                 "shrl $9, %%esi                 \n\t"
2506                 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2507
2508                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2509                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2510                 "add $1, %%"REG_a"              \n\t"
2511                 "cmp %2, %%"REG_a"              \n\t"
2512                 " jb 1b                         \n\t"
2513
2514                 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" ((long)(xInc>>16)), "m" ((xInc&0xFFFF)),
2515                 "r" (src2)
2516                 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2517                 );
2518 #ifdef HAVE_MMX2
2519         } //if MMX2 can't be used
2520 #endif
2521 #else
2522         int i;
2523         unsigned int xpos=0;
2524         for(i=0;i<dstWidth;i++)
2525         {
2526                 register unsigned int xx=xpos>>16;
2527                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2528                 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2529                 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2530 /* slower
2531           dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2532           dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2533 */
2534                 xpos+=xInc;
2535         }
2536 #endif
2537    }
2538 }
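
/* Editor's note on the chroma layout assumed above: hcscale writes U to dst[0..dstWidth-1]
   and V to dst[2048..2048+dstWidth-1]. The same 2048-sample split shows up as
   formatConvBuffer+2048 for the converted 8-bit V plane and as the 4096-byte offsets
   ("add $4096", "4096(...)") in the asm, since the output samples are 16 bits wide. */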
2539
2540 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2541              int srcSliceH, uint8_t* dst[], int dstStride[]){
2542
2543         /* load a few things into local vars to make the code more readable and faster */
2544         const int srcW= c->srcW;
2545         const int dstW= c->dstW;
2546         const int dstH= c->dstH;
2547         const int chrDstW= c->chrDstW;
2548         const int chrSrcW= c->chrSrcW;
2549         const int lumXInc= c->lumXInc;
2550         const int chrXInc= c->chrXInc;
2551         const int dstFormat= c->dstFormat;
2552         const int srcFormat= c->srcFormat;
2553         const int flags= c->flags;
2554         const int canMMX2BeUsed= c->canMMX2BeUsed;
2555         int16_t *vLumFilterPos= c->vLumFilterPos;
2556         int16_t *vChrFilterPos= c->vChrFilterPos;
2557         int16_t *hLumFilterPos= c->hLumFilterPos;
2558         int16_t *hChrFilterPos= c->hChrFilterPos;
2559         int16_t *vLumFilter= c->vLumFilter;
2560         int16_t *vChrFilter= c->vChrFilter;
2561         int16_t *hLumFilter= c->hLumFilter;
2562         int16_t *hChrFilter= c->hChrFilter;
2563         int32_t *lumMmxFilter= c->lumMmxFilter;
2564         int32_t *chrMmxFilter= c->chrMmxFilter;
2565         const int vLumFilterSize= c->vLumFilterSize;
2566         const int vChrFilterSize= c->vChrFilterSize;
2567         const int hLumFilterSize= c->hLumFilterSize;
2568         const int hChrFilterSize= c->hChrFilterSize;
2569         int16_t **lumPixBuf= c->lumPixBuf;
2570         int16_t **chrPixBuf= c->chrPixBuf;
2571         const int vLumBufSize= c->vLumBufSize;
2572         const int vChrBufSize= c->vChrBufSize;
2573         uint8_t *funnyYCode= c->funnyYCode;
2574         uint8_t *funnyUVCode= c->funnyUVCode;
2575         uint8_t *formatConvBuffer= c->formatConvBuffer;
2576         const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2577         const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2578         int lastDstY;
2579
2580         /* vars which will change and which we need to store back in the context */
2581         int dstY= c->dstY;
2582         int lumBufIndex= c->lumBufIndex;
2583         int chrBufIndex= c->chrBufIndex;
2584         int lastInLumBuf= c->lastInLumBuf;
2585         int lastInChrBuf= c->lastInChrBuf;
2586         
2587         if(isPacked(c->srcFormat)){
2588                 src[0]=
2589                 src[1]=
2590                 src[2]= src[0];
2591                 srcStride[0]=
2592                 srcStride[1]=
2593                 srcStride[2]= srcStride[0];
2594         }
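        /* Editor's note: for packed input every component lives in plane 0, so the three
           plane pointers and strides are simply duplicated; hyscale/hcscale later extract
           Y resp. U/V out of that single plane via the *ToY / *ToUV converters. */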
2595         srcStride[1]<<= c->vChrDrop;
2596         srcStride[2]<<= c->vChrDrop;
2597
2598 //      printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2599 //              (int)dst[0], (int)dst[1], (int)dst[2]);
2600
2601 #if 0 //self test FIXME move to a vfilter or something
2602 {
2603 static volatile int i=0;
2604 i++;
2605 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2606         selfTest(src, srcStride, c->srcW, c->srcH);
2607 i--;
2608 }
2609 #endif
2610
2611 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2612 //dstStride[0],dstStride[1],dstStride[2]);
2613
2614         if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2615         {
2616                 static int firstTime=1; //FIXME move this into the context perhaps
2617                 if(flags & SWS_PRINT_INFO && firstTime)
2618                 {
2619                         MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2620                                         "SwScaler:          ->cannot do aligned memory accesses anymore\n");
2621                         firstTime=0;
2622                 }
2623         }
2624
2625         /* Note: the user might start scaling in the middle of the picture, so this will not get
2626            executed. This is not really intended, but it currently works, so people might rely on it. */
2627         if(srcSliceY ==0){
2628                 lumBufIndex=0;
2629                 chrBufIndex=0;
2630                 dstY=0; 
2631                 lastInLumBuf= -1;
2632                 lastInChrBuf= -1;
2633         }
2634
2635         lastDstY= dstY;
2636
2637         for(;dstY < dstH; dstY++){
2638                 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2639                 const int chrDstY= dstY>>c->chrDstVSubSample;
2640                 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2641                 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2642
2643                 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2644                 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2645                 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2646                 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
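                /* Editor's sketch: vLumFilterPos[dstY] is the first source line the vertical filter
                   needs for output line dstY and the filter spans vLumFilterSize lines, so e.g. with
                   vLumFilterSize==4 and vLumFilterPos[dstY]==10 the needed window is source lines
                   10..13. The code below only has to make sure those lines are present in the ring
                   buffers before the vertical scale can run. */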
2647
2648 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2649 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
2650                 //handle holes (FAST_BILINEAR & weird filters)
2651                 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2652                 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2653 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2654                 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2655                 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2656
2657                 // Do we have enough lines in this slice to output the dstY line?
2658                 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2659                 {
2660                         //Do horizontal scaling
2661                         while(lastInLumBuf < lastLumSrcY)
2662                         {
2663                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2664                                 lumBufIndex++;
2665 //                              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
2666                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2667                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2668                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2669 //                              printf("%d %d\n", lumBufIndex, vLumBufSize);
2670                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2671                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2672                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2673                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2674                                 lastInLumBuf++;
2675                         }
2676                         while(lastInChrBuf < lastChrSrcY)
2677                         {
2678                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2679                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2680                                 chrBufIndex++;
2681                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2682                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2683                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2684                                 //FIXME replace parameters through context struct (some at least)
2685
2686                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2687                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2688                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2689                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2690                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2691                                 lastInChrBuf++;
2692                         }
2693                         //wrap buf index around to stay inside the ring buffer
2694                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2695                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2696                 }
2697                 else // not enough lines left in this slice -> load the rest in the buffer
2698                 {
2699 /*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2700                         firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2701                         lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2702                         vChrBufSize, vLumBufSize);*/
2703
2704                         //Do horizontal scaling
2705                         while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2706                         {
2707                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2708                                 lumBufIndex++;
2709                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2710                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2711                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2712                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2713                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2714                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2715                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2716                                 lastInLumBuf++;
2717                         }
2718                         while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2719                         {
2720                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2721                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2722                                 chrBufIndex++;
2723                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2724                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2725                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2726
2727                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2728                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2729                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2730                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2731                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2732                                 lastInChrBuf++;
2733                         }
2734                         //wrap buf index around to stay inside the ring buffer
2735                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2736                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2737                         break; //we can't output a dstY line so let's try with the next slice
2738                 }
2739
2740 #ifdef HAVE_MMX
2741                 b5Dither= dither8[dstY&1];
2742                 g6Dither= dither4[dstY&1];
2743                 g5Dither= dither8[dstY&1];
2744                 r5Dither= dither8[(dstY+1)&1];
2745 #endif
2746             if(dstY < dstH-2)
2747             {
2748                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2749                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
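                /* Editor's note: lumSrcPtr/chrSrcPtr address the window of line pointers inside the
                   ring buffers. Adding vLumBufSize/vChrBufSize keeps the vLumFilterSize-line window
                   contiguous even when it logically wraps around the ring; this assumes the pointer
                   arrays hold 2*vLumBufSize / 2*vChrBufSize entries with the upper half mirroring
                   the lower, and the ASSERTs further down at least bound the window to that doubled
                   size. */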
2750 #ifdef HAVE_MMX
2751                 int i;
2752                 for(i=0; i<vLumFilterSize; i++)
2753                 {
2754                         lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2755                         lumMmxFilter[4*i+2]= 
2756                         lumMmxFilter[4*i+3]= 
2757                                 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2758                 }
2759                 for(i=0; i<vChrFilterSize; i++)
2760                 {
2761                         chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2762                         chrMmxFilter[4*i+2]= 
2763                         chrMmxFilter[4*i+3]= 
2764                                 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2765                 }
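                /* Editor's sketch of the packing above: each vertical filter tap occupies four
                   int32_t slots (16 bytes). Slot 0 holds the pointer to the source line and slots
                   2..3 hold the 16-bit coefficient replicated via *0x10001, so one 64-bit load sees
                   the same coefficient in all four 16-bit lanes for a packed multiply. Note that the
                   (int32_t) cast of the line pointer assumes the address fits in 32 bits. */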
2766 #endif
2767                 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2768                 {
2769                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2770                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2771                         if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2772                         {
2773                                 int16_t *lumBuf = lumPixBuf[0];
2774                                 int16_t *chrBuf= chrPixBuf[0];
2775                                 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2776                         }
2777                         else //General YV12
2778                         {
2779                                 RENAME(yuv2yuvX)(c,
2780                                         vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2781                                         vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2782                                         dest, uDest, vDest, dstW, chrDstW);
2783                         }
2784                 }
2785                 else
2786                 {
2787                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2788                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2789                         if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2790                         {
2791                                 int chrAlpha= vChrFilter[2*dstY+1];
2792                                 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2793                                                  dest, dstW, chrAlpha, dstFormat, flags, dstY);
2794                         }
2795                         else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2796                         {
2797                                 int lumAlpha= vLumFilter[2*dstY+1];
2798                                 int chrAlpha= vChrFilter[2*dstY+1];
2799                                 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2800                                                  dest, dstW, lumAlpha, chrAlpha, dstY);
2801                         }
2802                         else //General RGB
2803                         {
2804                                 RENAME(yuv2packedX)(c,
2805                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2806                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2807                                         dest, dstW, dstY);
2808                         }
2809                 }
2810             }
2811             else // hmm looks like we can't use MMX here without overwriting this array's tail
2812             {
2813                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2814                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2815                 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2816                 {
2817                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2818                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2819                         yuv2yuvXinC(
2820                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2821                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2822                                 dest, uDest, vDest, dstW, chrDstW);
2823                 }
2824                 else
2825                 {
2826                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2827                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2828                         yuv2packedXinC(c, 
2829                                 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2830                                 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2831                                 dest, dstW, dstY);
2832                 }
2833             }
2834         }
2835
2836 #ifdef HAVE_MMX
2837         __asm __volatile(SFENCE:::"memory");
2838         __asm __volatile(EMMS:::"memory");
2839 #endif
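        /* Editor's note: the sfence drains any pending non-temporal / write-combining stores
           issued by the MMX2 paths, and emms (femms on 3DNow) leaves MMX state so that callers
           can safely use x87 floating point again. */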
2840         /* store changed local vars back in the context */
2841         c->dstY= dstY;
2842         c->lumBufIndex= lumBufIndex;
2843         c->chrBufIndex= chrBufIndex;
2844         c->lastInLumBuf= lastInLumBuf;
2845         c->lastInChrBuf= lastInChrBuf;
2846
2847         return dstY - lastDstY;
2848 }