/* postproc/swscale_template.c */
1 /*
2     Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
/* This template can be included several times with different optimization
 * macros (HAVE_MMX2, HAVE_3DNOW, ...) defined, so drop any definitions
 * left over from a previous pass before redefining them below. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
/* EMMS clears the MMX state so subsequent x87 FPU code works.
   On K6 femms is faster than emms; on K7 femms maps directly to emms. */
#ifndef HAVE_3DNOW
#define EMMS     "emms"
#else
#define EMMS     "femms"
#endif
33
/* Data-prefetch mnemonics: 3DNow! and MMX2 provide real prefetch
   instructions; otherwise substitute a harmless "/nop" comment into
   the generated asm stream. */
#if defined(HAVE_3DNOW)
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined(HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  "/nop"
#define PREFETCHW "/nop"
#endif
44
/* Store fence after non-temporal stores; a "/nop" placeholder when
   MMX2 (and thus movntq/sfence) is unavailable. */
#ifndef HAVE_MMX2
#define SFENCE "/nop"
#else
#define SFENCE "sfence"
#endif
50
/* PAVGB(a,b): packed unsigned-byte average, result in b.
   Uses pavgb (MMX2) or pavgusb (3DNow!); intentionally left undefined
   when neither instruction set is available. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
56
/* Quadword store: non-temporal movntq with MMX2, plain movq otherwise. */
#ifndef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#endif
/* Extra indirection so macro arguments are expanded before being
   stringified by REAL_MOVNTQ. */
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
63
64 #ifdef HAVE_ALTIVEC
65 #include "swscale_altivec_template.c"
66 #endif
67
/*
 * Vertical scaling pass for planar output: multiply-accumulate a list of
 * filter coefficients against 16-bit intermediate source lines, starting
 * from the rounder constant, shift down by 3, clip to unsigned bytes and
 * store 8 output pixels per outer iteration.
 *   %0 = context base (filter/rounder offsets), %1 = dst, %2 = dstW,
 *   x  = extra byte offset into each source line, offset = filter list.
 * The filter list is a sequence of (line pointer, coefficient) pairs,
 * 16 bytes apart, terminated by a NULL pointer (the test/jnz on REG_S).
 */
#define YSCALEYUV2YV12X(x, offset) \
                        "xor %%"REG_a", %%"REG_a"       \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                        "movq %%mm3, %%mm4              \n\t"\
                        "lea " offset "(%0), %%"REG_d"  \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                        ".balign 16                     \n\t" /* FIXME Unroll? */\
                        "1:                             \n\t"\
                        "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
                        "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
                        "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
                        "add $16, %%"REG_d"             \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                        "test %%"REG_S", %%"REG_S"      \n\t"\
                        "pmulhw %%mm0, %%mm2            \n\t"\
                        "pmulhw %%mm0, %%mm5            \n\t"\
                        "paddw %%mm2, %%mm3             \n\t"\
                        "paddw %%mm5, %%mm4             \n\t"\
                        " jnz 1b                        \n\t"\
                        "psraw $3, %%mm3                \n\t"\
                        "psraw $3, %%mm4                \n\t"\
                        "packuswb %%mm4, %%mm3          \n\t"\
                        MOVNTQ(%%mm3, (%1, %%REGa))\
                        "add $8, %%"REG_a"              \n\t"\
                        "cmp %2, %%"REG_a"              \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                        "movq %%mm3, %%mm4              \n\t"\
                        "lea " offset "(%0), %%"REG_d"  \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                        "jb 1b                          \n\t"
98
/*
 * 1:1 vertical pass (single source line, no filtering): shift the 16-bit
 * intermediates right by 7, clip to unsigned bytes, store 8 pixels per
 * iteration.  %0 = src, %1 = dst, %2 = start index; the index register
 * counts upward by 8 until the add overflows (jnc), so %2 is expected to
 * be negative with src/dst pre-offset accordingly.
 */
#define YSCALEYUV2YV121 \
                        "mov %2, %%"REG_a"              \n\t"\
                        ".balign 16                     \n\t" /* FIXME Unroll? */\
                        "1:                             \n\t"\
                        "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
                        "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
                        "psraw $7, %%mm0                \n\t"\
                        "psraw $7, %%mm1                \n\t"\
                        "packuswb %%mm1, %%mm0          \n\t"\
                        MOVNTQ(%%mm0, (%1, %%REGa))\
                        "add $8, %%"REG_a"              \n\t"\
                        "jnc 1b                         \n\t"
111
112 /*
113                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
114                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115                            "r" (dest), "m" (dstW),
116                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
118 */
/*
 * Vertical scale for packed output: first run the chroma filter list at
 * CHR_MMX_FILTER_OFFSET, accumulating U into mm3 and V into mm4 (the V
 * plane of each chroma line lives 4096 bytes after the U part), then run
 * the luma filter list at LUM_MMX_FILTER_OFFSET, accumulating Y1 into
 * mm1 and Y2 into mm7.  Each list is (line pointer, coefficient) pairs
 * terminated by a NULL pointer.  Outer loop label "1:", inner filter
 * loops both use label "2:".  %0 = context base.
 */
#define YSCALEYUV2PACKEDX \
                "xor %%"REG_a", %%"REG_a"       \n\t"\
                ".balign 16                     \n\t"\
                "nop                            \n\t"\
                "1:                             \n\t"\
                "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                "movq %%mm3, %%mm4              \n\t"\
                ".balign 16                     \n\t"\
                "2:                             \n\t"\
                "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a"), %%mm2     \n\t" /* UsrcData */\
                "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
                "add $16, %%"REG_d"             \n\t"\
                "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                "pmulhw %%mm0, %%mm2            \n\t"\
                "pmulhw %%mm0, %%mm5            \n\t"\
                "paddw %%mm2, %%mm3             \n\t"\
                "paddw %%mm5, %%mm4             \n\t"\
                "test %%"REG_S", %%"REG_S"      \n\t"\
                " jnz 2b                        \n\t"\
\
                "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
                "movq %%mm1, %%mm7              \n\t"\
                ".balign 16                     \n\t"\
                "2:                             \n\t"\
                "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a", 2), %%mm2  \n\t" /* Y1srcData */\
                "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
                "add $16, %%"REG_d"             \n\t"\
                "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                "pmulhw %%mm0, %%mm2            \n\t"\
                "pmulhw %%mm0, %%mm5            \n\t"\
                "paddw %%mm2, %%mm1             \n\t"\
                "paddw %%mm5, %%mm7             \n\t"\
                "test %%"REG_S", %%"REG_S"      \n\t"\
                " jnz 2b                        \n\t"\
160
/*
 * YSCALEYUV2PACKEDX followed by YUV -> RGB conversion using the
 * coefficient/offset table at %0: subtract the U/V/Y offsets, multiply
 * by the per-component coefficients, add the Y terms to the B/G/R
 * contributions and pack to bytes.  On exit: mm2 = B, mm4 = G, mm5 = R
 * (8 pixels each) and mm7 = 0, ready for the WRITEBGR* store macros.
 */
#define YSCALEYUV2RGBX \
                YSCALEYUV2PACKEDX\
                "psubw "U_OFFSET"(%0), %%mm3    \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"(%0), %%mm4    \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"(%0), %%mm3   \n\t"\
                "pmulhw "VG_COEFF"(%0), %%mm4   \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "pmulhw "UB_COEFF"(%0), %%mm2   \n\t"\
                "pmulhw "VR_COEFF"(%0), %%mm5   \n\t"\
                "psubw "Y_OFFSET"(%0), %%mm1    \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"(%0), %%mm7    \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"(%0), %%mm1    \n\t"\
                "pmulhw "Y_COEFF"(%0), %%mm7    \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4             \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "movq %%mm5, %%mm6              \n\t"\
                "movq %%mm4, %%mm3              \n\t"\
                "punpcklwd %%mm2, %%mm2         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm4, %%mm4         \n\t"\
                "paddw %%mm1, %%mm2             \n\t"\
                "paddw %%mm1, %%mm5             \n\t"\
                "paddw %%mm1, %%mm4             \n\t"\
                "punpckhwd %%mm0, %%mm0         \n\t"\
                "punpckhwd %%mm6, %%mm6         \n\t"\
                "punpckhwd %%mm3, %%mm3         \n\t"\
                "paddw %%mm7, %%mm0             \n\t"\
                "paddw %%mm7, %%mm6             \n\t"\
                "paddw %%mm7, %%mm3             \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2          \n\t"\
                "packuswb %%mm6, %%mm5          \n\t"\
                "packuswb %%mm3, %%mm4          \n\t"\
                "pxor %%mm7, %%mm7              \n\t"
/* Disabled legacy YUV -> RGB kernel (never compiled, see #if 0);
   kept for reference only. */
#if 0
#define FULL_YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7              \n\t"\
                "movd %6, %%mm6                 \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6         \n\t"\
                "punpcklwd %%mm6, %%mm6         \n\t"\
                "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "xor %%"REG_a", %%"REG_a"               \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
                "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
                "movq (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "movq 4096(%2, %%"REG_a",2), %%mm4      \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "movq 4096(%3, %%"REG_a",2), %%mm0      \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
                "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
                "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
                "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
                "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
                "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
                "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
\
\
                "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
                "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
                "paddw %%mm1, %%mm3             \n\t" /* B*/\
                "paddw %%mm1, %%mm0             \n\t" /* R*/\
                "packuswb %%mm3, %%mm3          \n\t"\
\
                "packuswb %%mm0, %%mm0          \n\t"\
                "paddw %%mm4, %%mm2             \n\t"\
                "paddw %%mm2, %%mm1             \n\t" /* G*/\
\
                "packuswb %%mm1, %%mm1          \n\t"
#endif
252
/*
 * Bilinear vertical interpolation between two luma lines (%0/%1) and two
 * chroma lines (%2/%3) for packed non-RGB output.  The blend factors
 * stored at CHR/LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 in
 * place once before the loop; inside the loop the difference between the
 * two lines is scaled by the factor and added back.  Per 8-pixel group
 * this leaves Y in mm1/mm7, U in mm3, V in mm4 (chroma V lives 4096
 * bytes after U in each buffer).
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
                "psraw $3, %%mm0                \n\t"\
                "psraw $3, %%mm1                \n\t"\
                "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "xor "#index", "#index"         \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
                "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
                "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
                "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $7, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $7, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
290                 
/*
 * Bilinear vertical interpolation between two luma lines (%0/%1) and two
 * chroma lines (%2/%3), followed by YUV -> RGB conversion using the
 * coefficient table at (c).  On exit per 8-pixel group: mm2 = B,
 * mm4 = G, mm5 = R, mm7 = 0, ready for the WRITEBGR* store macros.
 */
#define REAL_YSCALEYUV2RGB(index, c) \
                "xor "#index", "#index" \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
                "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
                "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
                "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4             \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "movq %%mm5, %%mm6              \n\t"\
                "movq %%mm4, %%mm3              \n\t"\
                "punpcklwd %%mm2, %%mm2         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm4, %%mm4         \n\t"\
                "paddw %%mm1, %%mm2             \n\t"\
                "paddw %%mm1, %%mm5             \n\t"\
                "paddw %%mm1, %%mm4             \n\t"\
                "punpckhwd %%mm0, %%mm0         \n\t"\
                "punpckhwd %%mm6, %%mm6         \n\t"\
                "punpckhwd %%mm3, %%mm3         \n\t"\
                "paddw %%mm7, %%mm0             \n\t"\
                "paddw %%mm7, %%mm6             \n\t"\
                "paddw %%mm7, %%mm3             \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2          \n\t"\
                "packuswb %%mm6, %%mm5          \n\t"\
                "packuswb %%mm3, %%mm4          \n\t"\
                "pxor %%mm7, %%mm7              \n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
356                 
/*
 * Single-source-line variant (no vertical interpolation) for packed
 * non-RGB output: load one luma line (%0) and one chroma line (%2,
 * V at +4096) and shift the intermediates right by 7.
 * Leaves Y in mm1/mm7, U in mm3, V in mm4.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
                "xor "#index", "#index"         \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $7, %%mm3                \n\t" \
                "psraw $7, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
371                 
/*
 * Single-source-line variant (no vertical interpolation) with
 * YUV -> RGB conversion using the table at (c).  On exit:
 * mm2 = B, mm4 = G, mm5 = R, mm7 = 0, ready for WRITEBGR*.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
                "xor "#index", "#index" \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4             \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "movq %%mm5, %%mm6              \n\t"\
                "movq %%mm4, %%mm3              \n\t"\
                "punpcklwd %%mm2, %%mm2         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm4, %%mm4         \n\t"\
                "paddw %%mm1, %%mm2             \n\t"\
                "paddw %%mm1, %%mm5             \n\t"\
                "paddw %%mm1, %%mm4             \n\t"\
                "punpckhwd %%mm0, %%mm0         \n\t"\
                "punpckhwd %%mm6, %%mm6         \n\t"\
                "punpckhwd %%mm3, %%mm3         \n\t"\
                "paddw %%mm7, %%mm0             \n\t"\
                "paddw %%mm7, %%mm6             \n\t"\
                "paddw %%mm7, %%mm3             \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2          \n\t"\
                "packuswb %%mm6, %%mm5          \n\t"\
                "packuswb %%mm3, %%mm4          \n\t"\
                "pxor %%mm7, %%mm7              \n\t"
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
420
/*
 * Packed-output variant that averages the two chroma lines (%2/%3):
 * paddw then psrlw $8 halves the summed U/V, while luma uses only one
 * line (%0) shifted right by 7.  Leaves Y in mm1/mm7, U in mm3, V in mm4.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
                "xor "#index", "#index"         \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $8, %%mm3                \n\t" \
                "psrlw $8, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t" 
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
438                 
/*
 * YUV -> RGB variant that vertically averages the two chroma lines
 * (%2/%3) (paddw + psrlw $5 — see the overflow FIXME) while luma uses a
 * single line (%0).  On exit: mm2 = B, mm4 = G, mm5 = R, mm7 = 0,
 * ready for the WRITEBGR* store macros.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
                "xor "#index", "#index"         \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
                "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
                "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4             \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "movq %%mm5, %%mm6              \n\t"\
                "movq %%mm4, %%mm3              \n\t"\
                "punpcklwd %%mm2, %%mm2         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm4, %%mm4         \n\t"\
                "paddw %%mm1, %%mm2             \n\t"\
                "paddw %%mm1, %%mm5             \n\t"\
                "paddw %%mm1, %%mm4             \n\t"\
                "punpckhwd %%mm0, %%mm0         \n\t"\
                "punpckhwd %%mm6, %%mm6         \n\t"\
                "punpckhwd %%mm3, %%mm3         \n\t"\
                "paddw %%mm7, %%mm0             \n\t"\
                "paddw %%mm7, %%mm6             \n\t"\
                "paddw %%mm7, %%mm3             \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2          \n\t"\
                "packuswb %%mm6, %%mm5          \n\t"\
                "packuswb %%mm3, %%mm4          \n\t"\
                "pxor %%mm7, %%mm7              \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
492
/*
 * Store 8 pixels as 32bpp: interleave the packed B (mm2), G (mm4) and
 * R (mm5) bytes with the zero register (mm7) into 4-byte pixels, write
 * four quadwords with MOVNTQ, then advance the index by 8 pixels and
 * loop back to label "1:" while index < dstw.
 */
#define REAL_WRITEBGR32(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1              \n\t" /* B */\
                        "movq %%mm5, %%mm6              \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
                        MOVNTQ(%%mm0, (dst, index, 4))\
                        MOVNTQ(%%mm2, 8(dst, index, 4))\
                        MOVNTQ(%%mm1, 16(dst, index, 4))\
                        MOVNTQ(%%mm3, 24(dst, index, 4))\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
517
/*
 * Store 8 pixels as 16bpp 5:6:5: mask B/G/R (mm2/mm4/mm5) to their top
 * 5/6/5 bits (bF8/bFC constants), shift each field into position, OR
 * the fields together and write two quadwords (16 bytes) with MOVNTQ.
 * Advances the index by 8 pixels and loops to label "1:" while
 * index < dstw.  Expects mm7 = 0 (used as zero for the unpacks).
 */
#define REAL_WRITEBGR16(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
                        "pand "MANGLE(bFC)", %%mm4      \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
\
                        "movq %%mm2, %%mm1              \n\t"\
                        "movq %%mm4, %%mm3              \n\t"\
\
                        "punpcklbw %%mm7, %%mm3         \n\t"\
                        "punpcklbw %%mm5, %%mm2         \n\t"\
                        "punpckhbw %%mm7, %%mm4         \n\t"\
                        "punpckhbw %%mm5, %%mm1         \n\t"\
\
                        "psllq $3, %%mm3                \n\t"\
                        "psllq $3, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2               \n\t"\
                        "por %%mm4, %%mm1               \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
545
/*
 * Store 8 pixels as 15bpp 1:5:5:5.  Same structure as REAL_WRITEBGR16, but
 * all three components are masked to 5 bits (bF8); R is pre-shifted right
 * by 1 and G left by only 2 so the packed words use the x5:5:5 layout.
 * Input:  %%mm2 = B, %%mm4 = G, %%mm5 = R, %%mm7 = 0.  Stores two quadwords
 * at dst + index*2, advances "index" by 8, loops to "1:" while index < dstw.
 */
#define REAL_WRITEBGR15(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
                        "pand "MANGLE(bF8)", %%mm4      \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
                        "psrlq $1, %%mm5                \n\t"\
\
                        "movq %%mm2, %%mm1              \n\t"\
                        "movq %%mm4, %%mm3              \n\t"\
\
                        "punpcklbw %%mm7, %%mm3         \n\t"\
                        "punpcklbw %%mm5, %%mm2         \n\t"\
                        "punpckhbw %%mm7, %%mm4         \n\t"\
                        "punpckhbw %%mm5, %%mm1         \n\t"\
\
                        "psllq $2, %%mm3                \n\t"\
                        "psllq $2, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2               \n\t"\
                        "por %%mm4, %%mm1               \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"
/* Indirection so the arguments are macro-expanded before substitution into the asm. */
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
574
/*
 * Old shift/mask packer for 24bpp output: interleaves B/G/R bytes into
 * 0RGB dwords, then squeezes out the zero bytes with psrlq/psllq and the
 * bm000* masks to produce 24 contiguous bytes (8 pixels) stored via three
 * MOVNTQs.  Advances dst by 24 bytes and "index" by 8 pixels, looping to
 * "1:" while index < dstw.
 * NOTE(review): kept for reference only — the WRITEBGR24 dispatch below
 * selects WRITEBGR24MMX/WRITEBGR24MMX2 instead of this macro.
 * Input:  %%mm2 = B, %%mm4 = G, %%mm5 = R, %%mm7 = 0.
 */
#define WRITEBGR24OLD(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1              \n\t" /* B */\
                        "movq %%mm5, %%mm6              \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
                        "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
                        "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
                        "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
\
                        MOVNTQ(%%mm0, (dst))\
                        MOVNTQ(%%mm2, 8(dst))\
                        MOVNTQ(%%mm3, 16(dst))\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"
630
/*
 * Plain-MMX 24bpp packer: interleaves B/G/R into 0RGB dwords, then uses
 * psllq $40 + punpckhdq to form 0RGBRGB0 quadwords and a final shift/or
 * pass to emit 24 contiguous bytes (8 pixels) through three MOVNTQs.
 * Advances dst by 24 bytes and "index" by 8 pixels; loops to "1:" while
 * index < dstw.  Input: %%mm2 = B, %%mm4 = G, %%mm5 = R, %%mm7 = 0.
 * Clobbers all mm registers (including %%mm7).
 */
#define WRITEBGR24MMX(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1              \n\t" /* B */\
                        "movq %%mm5, %%mm6              \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
                        "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
                        "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
                        "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
\
                        "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
                        "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
                        "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
                        "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
\
                        "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
                        "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
                        "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
                        "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
\
                        "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
                        "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
                        MOVNTQ(%%mm0, (dst))\
\
                        "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
                        "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
                        "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
                        MOVNTQ(%%mm5, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                       \n\t"\
                        "cmp "#dstw", "#index"                  \n\t"\
                        " jb 1b                         \n\t"
683
/*
 * MMX2 24bpp packer: uses pshufw (an MMX2 instruction) to replicate the
 * needed B/G/R bytes, then masks them into place with the M24A/M24B/M24C
 * byte-selection constants and ORs three quadwords together per store.
 * Emits 24 contiguous bytes (8 pixels) through three MOVNTQs, advances dst
 * by 24 bytes and "index" by 8 pixels, and loops to "1:" while index < dstw.
 * Input: %%mm2 = B, %%mm4 = G, %%mm5 = R (%%mm7 is overwritten with M24C).
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq "MANGLE(M24A)", %%mm0     \n\t"\
                        "movq "MANGLE(M24C)", %%mm7     \n\t"\
                        "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
                        "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
                        "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
                        "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
                        "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
                        "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
\
                        "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
                        "por %%mm1, %%mm6               \n\t"\
                        "por %%mm3, %%mm6               \n\t"\
                        MOVNTQ(%%mm6, (dst))\
\
                        "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
                        "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
                        "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
                        "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
                        "pand "MANGLE(M24B)", %%mm1     \n\t" /* B5       B4        B3    */\
                        "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
                        "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
\
                        "por %%mm1, %%mm3               \n\t" /* B5    G4 B4     G3 B3    */\
                        "por %%mm3, %%mm6               \n\t"\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
                        "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
                        "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
                        "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
                        "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
                        "pand "MANGLE(M24B)", %%mm6     \n\t" /* R7       R6        R5    */\
\
                        "por %%mm1, %%mm3               \n\t"\
                        "por %%mm3, %%mm6               \n\t"\
                        MOVNTQ(%%mm6, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"
731
/* Select the 24bpp writer at compile time: the pshufw-based variant needs
   MMX2; plain MMX builds fall back to the shift/unpack variant. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
739
/*
 * Store 8 pixels as packed YUY2: packuswb saturates the word components to
 * bytes, the chroma planes are interleaved into %%mm3, and the two
 * punpck*bw steps merge luma and chroma into two quadwords stored at
 * dst + index*2.  Advances "index" by 8 and loops to "1:" while
 * index < dstw.
 * NOTE(review): register roles (luma in %%mm1/%%mm7, chroma in
 * %%mm3/%%mm4) are inferred from the YUY2 case of yuv2packedX below —
 * confirm against the YSCALEYUV2PACKEDX macro defined earlier in the file.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
                        "packuswb %%mm3, %%mm3          \n\t"\
                        "packuswb %%mm4, %%mm4          \n\t"\
                        "packuswb %%mm7, %%mm1          \n\t"\
                        "punpcklbw %%mm4, %%mm3         \n\t"\
                        "movq %%mm1, %%mm7              \n\t"\
                        "punpcklbw %%mm3, %%mm1         \n\t"\
                        "punpckhbw %%mm3, %%mm7         \n\t"\
\
                        MOVNTQ(%%mm1, (dst, index, 2))\
                        MOVNTQ(%%mm7, 8(dst, index, 2))\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"
/* Indirection so the arguments are macro-expanded before substitution into the asm. */
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
756
757
/**
 * Vertically scale one output line of planar YV12: convolve lumFilterSize
 * luma source lines (and chrFilterSize chroma lines) with the given 16-bit
 * filter coefficients and write the clipped 8-bit results to dest/uDest/vDest.
 * Passing uDest == NULL skips both chroma planes (luma-only output).
 * Dispatches to the MMX asm macro, AltiVec, or plain C depending on build.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                /* U plane: filter data starts at CHR_MMX_FILTER_OFFSET inside *c. */
                asm volatile(
                                YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
                                :: "r" (&c->redDither),
                                "r" (uDest), "p" (chrDstW)
                                : "%"REG_a, "%"REG_d, "%"REG_S
                        );

                /* V plane: same filter, source offset by 4096 bytes (2048 int16 samples). */
                asm volatile(
                                YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
                                :: "r" (&c->redDither),
                                "r" (vDest), "p" (chrDstW)
                                : "%"REG_a, "%"REG_d, "%"REG_S
                        );
        }

        /* Y plane. */
        asm volatile(
                        YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
                        :: "r" (&c->redDither),
                           "r" (dest), "p" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                );
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
798
799 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
800                                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
801                                      uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
802 {
803 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
804              chrFilter, chrSrc, chrFilterSize,
805              dest, uDest, dstW, chrDstW, dstFormat);
806 }
807
/**
 * Unscaled (1:1) vertical pass: each output byte is the corresponding
 * 16-bit intermediate sample shifted right by 7 and clipped to 0..255.
 * The V samples live 2048 int16 elements past the U samples in chrSrc.
 * uDest == NULL skips both chroma planes.
 */
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                /* The asm loop runs a negative index up to zero, so pass
                   base pointers advanced by the width and a negated count. */
                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
                                "g" (-chrDstW)
                                : "%"REG_a
                        );

                /* V samples are stored 2048 elements after U in chrSrc. */
                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
                                "g" (-chrDstW)
                                : "%"REG_a
                        );
        }

        asm volatile(
                YSCALEYUV2YV121
                :: "r" (lumSrc + dstW), "r" (dest + dstW),
                "g" (-dstW)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<dstW; i++)
        {
                int val= lumSrc[i]>>7;  /* int16 >> 7 -> range -256..255 */
                
                /* Bit 8 set means out of 0..255; branch resolves the sign.
                   (Only negatives can occur here, but both sides are kept.) */
                if(val&256){
                        if(val<0) val=0;
                        else      val=255;
                }

                dest[i]= val;
        }

        if(uDest != NULL)
                for(i=0; i<chrDstW; i++)
                {
                        int u=chrSrc[i]>>7;
                        int v=chrSrc[i + 2048]>>7;

                        /* Combined range check: clip only when either
                           component fell outside 0..255. */
                        if((u|v)&256){
                                if(u<0)         u=0;
                                else if (u>255) u=255;
                                if(v<0)         v=0;
                                else if (v>255) v=255;
                        }

                        uDest[i]= u;
                        vDest[i]= v;
                }
#endif
}
867
868
869 /**
870  * vertical scale YV12 to RGB
871  */
/**
 * Vertically scale one line of YV12 and convert it to the packed
 * destination format selected by c->dstFormat.  MMX builds handle
 * BGR32/BGR24/BGR15/BGR16/YUY2 with inline asm (YSCALEYUV2RGBX /
 * YSCALEYUV2PACKEDX feeding the WRITE* store macros); everything else
 * falls through to the AltiVec or plain-C converter.
 *
 * Asm operand map: %0 = &c->redDither (context base for the asm macros),
 * %1-%3 = dummy, %4 = dest, %5 = dstW.
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                            uint8_t *dest, int dstW, int dstY)
{
        int dummy=0;
        switch(c->dstFormat)
        {
#ifdef HAVE_MMX
        case IMGFMT_BGR32:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                                WRITEBGR32(%4, %5, %%REGa)

                        :: "r" (&c->redDither), 
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
        case IMGFMT_BGR24:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                                /* 24bpp has no power-of-two stride, so compute
                                   dest + index*3 into REG_b by hand. */
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
                                "add %4, %%"REG_b"                      \n\t"
                                WRITEBGR24(%%REGb, %5, %%REGa)

                        :: "r" (&c->redDither), 
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
                        );
                }
                break;
        case IMGFMT_BGR15:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                /* Ordered dither before truncating to 5:5:5. */
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR15(%4, %5, %%REGa)

                        :: "r" (&c->redDither), 
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
        case IMGFMT_BGR16:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                /* Ordered dither before truncating to 5:6:5
                                   (green uses the 6-bit dither table). */
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%4, %5, %%REGa)

                        :: "r" (&c->redDither), 
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
        case IMGFMT_YUY2:
                {
                        asm volatile(
                                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                                /* Drop the extra precision bits before the
                                   byte pack in WRITEYUY2. */
                                "psraw $3, %%mm3                \n\t"
                                "psraw $3, %%mm4                \n\t"
                                "psraw $3, %%mm1                \n\t"
                                "psraw $3, %%mm7                \n\t"
                                WRITEYUY2(%4, %5, %%REGa)

                        :: "r" (&c->redDither), 
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
#endif
        default:
#ifdef HAVE_ALTIVEC
                altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                            chrFilter, chrSrc, chrFilterSize,
                            dest, dstW, dstY);
#else
                yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                            chrFilter, chrSrc, chrFilterSize,
                            dest, dstW, dstY);
#endif
                break;
        }
}
981
982 /**
983  * vertical bilinear scale YV12 to RGB
984  */
985 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
986                             uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
987 {
988         int yalpha1=yalpha^4095;
989         int uvalpha1=uvalpha^4095;
990         int i;
991
992 #if 0 //isn't used
993         if(flags&SWS_FULL_CHR_H_INT)
994         {
995                 switch(dstFormat)
996                 {
997 #ifdef HAVE_MMX
998                 case IMGFMT_BGR32:
999                         asm volatile(
1000
1001
1002 FULL_YSCALEYUV2RGB
1003                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
1004                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
1005
1006                         "movq %%mm3, %%mm1              \n\t"
1007                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
1008                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1009
1010                         MOVNTQ(%%mm3, (%4, %%REGa, 4))
1011                         MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1012
1013                         "add $4, %%"REG_a"              \n\t"
1014                         "cmp %5, %%"REG_a"              \n\t"
1015                         " jb 1b                         \n\t"
1016
1017
1018                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1019                         "m" (yalpha1), "m" (uvalpha1)
1020                         : "%"REG_a
1021                         );
1022                         break;
1023                 case IMGFMT_BGR24:
1024                         asm volatile(
1025
1026 FULL_YSCALEYUV2RGB
1027
1028                                                                 // lsb ... msb
1029                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
1030                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
1031
1032                         "movq %%mm3, %%mm1              \n\t"
1033                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
1034                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1035
1036                         "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
1037                         "psrlq $8, %%mm3                \n\t" // GR0BGR00
1038                         "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1039                         "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1040                         "por %%mm2, %%mm3               \n\t" // BGRBGR00
1041                         "movq %%mm1, %%mm2              \n\t"
1042                         "psllq $48, %%mm1               \n\t" // 000000BG
1043                         "por %%mm1, %%mm3               \n\t" // BGRBGRBG
1044
1045                         "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
1046                         "psrld $16, %%mm2               \n\t" // R000R000
1047                         "psrlq $24, %%mm1               \n\t" // 0BGR0000
1048                         "por %%mm2, %%mm1               \n\t" // RBGRR000
1049
1050                         "mov %4, %%"REG_b"              \n\t"
1051                         "add %%"REG_a", %%"REG_b"       \n\t"
1052
1053 #ifdef HAVE_MMX2
1054                         //FIXME Alignment
1055                         "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1056                         "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1057 #else
1058                         "movd %%mm3, (%%"REG_b", %%"REG_a", 2)  \n\t"
1059                         "psrlq $32, %%mm3               \n\t"
1060                         "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1061                         "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1062 #endif
1063                         "add $4, %%"REG_a"              \n\t"
1064                         "cmp %5, %%"REG_a"              \n\t"
1065                         " jb 1b                         \n\t"
1066
1067                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1068                         "m" (yalpha1), "m" (uvalpha1)
1069                         : "%"REG_a, "%"REG_b
1070                         );
1071                         break;
1072                 case IMGFMT_BGR15:
1073                         asm volatile(
1074
1075 FULL_YSCALEYUV2RGB
1076 #ifdef DITHER1XBPP
1077                         "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1078                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1079                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1080 #endif
1081                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1082                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1083                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1084
1085                         "psrlw $3, %%mm3                \n\t"
1086                         "psllw $2, %%mm1                \n\t"
1087                         "psllw $7, %%mm0                \n\t"
1088                         "pand "MANGLE(g15Mask)", %%mm1  \n\t"
1089                         "pand "MANGLE(r15Mask)", %%mm0  \n\t"
1090
1091                         "por %%mm3, %%mm1               \n\t"
1092                         "por %%mm1, %%mm0               \n\t"
1093
1094                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1095
1096                         "add $4, %%"REG_a"              \n\t"
1097                         "cmp %5, %%"REG_a"              \n\t"
1098                         " jb 1b                         \n\t"
1099
1100                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1101                         "m" (yalpha1), "m" (uvalpha1)
1102                         : "%"REG_a
1103                         );
1104                         break;
1105                 case IMGFMT_BGR16:
1106                         asm volatile(
1107
1108 FULL_YSCALEYUV2RGB
1109 #ifdef DITHER1XBPP
1110                         "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1111                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1112                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1113 #endif
1114                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1115                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1116                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1117
1118                         "psrlw $3, %%mm3                \n\t"
1119                         "psllw $3, %%mm1                \n\t"
1120                         "psllw $8, %%mm0                \n\t"
1121                         "pand "MANGLE(g16Mask)", %%mm1  \n\t"
1122                         "pand "MANGLE(r16Mask)", %%mm0  \n\t"
1123
1124                         "por %%mm3, %%mm1               \n\t"
1125                         "por %%mm1, %%mm0               \n\t"
1126
1127                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1128
1129                         "add $4, %%"REG_a"              \n\t"
1130                         "cmp %5, %%"REG_a"              \n\t"
1131                         " jb 1b                         \n\t"
1132
1133                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1134                         "m" (yalpha1), "m" (uvalpha1)
1135                         : "%"REG_a
1136                         );
1137                 break;
1138 #endif
1139                 case IMGFMT_RGB32:
1140 #ifndef HAVE_MMX
1141                 case IMGFMT_BGR32:
1142 #endif
1143                 if(dstFormat==IMGFMT_BGR32)
1144                 {
1145                         int i;
1146 #ifdef WORDS_BIGENDIAN
1147                         dest++;
1148 #endif
1149                         for(i=0;i<dstW;i++){
1150                                 // vertical linear interpolation && yuv2rgb in a single step:
1151                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1152                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1153                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1154                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1155                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1156                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1157                                 dest+= 4;
1158                         }
1159                 }
1160                 else if(dstFormat==IMGFMT_BGR24)
1161                 {
1162                         int i;
1163                         for(i=0;i<dstW;i++){
1164                                 // vertical linear interpolation && yuv2rgb in a single step:
1165                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1166                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1167                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1168                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1169                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1170                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1171                                 dest+= 3;
1172                         }
1173                 }
1174                 else if(dstFormat==IMGFMT_BGR16)
1175                 {
1176                         int i;
1177                         for(i=0;i<dstW;i++){
1178                                 // vertical linear interpolation && yuv2rgb in a single step:
1179                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1180                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1181                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1182
1183                                 ((uint16_t*)dest)[i] =
1184                                         clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1185                                         clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1186                                         clip_table16r[(Y + yuvtab_3343[V]) >>13];
1187                         }
1188                 }
1189                 else if(dstFormat==IMGFMT_BGR15)
1190                 {
1191                         int i;
1192                         for(i=0;i<dstW;i++){
1193                                 // vertical linear interpolation && yuv2rgb in a single step:
1194                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1195                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1196                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1197
1198                                 ((uint16_t*)dest)[i] =
1199                                         clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1200                                         clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1201                                         clip_table15r[(Y + yuvtab_3343[V]) >>13];
1202                         }
1203                 }
1204         }//FULL_UV_IPOL
1205         else
1206         {
1207 #endif // if 0
1208 #ifdef HAVE_MMX
1209         switch(c->dstFormat)
1210         {
1211 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1212         case IMGFMT_BGR32:
1213                         asm volatile(
1214                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1215                                 "mov %4, %%"REG_SP"                     \n\t"
1216                                 YSCALEYUV2RGB(%%REGa, %5)
1217                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1218                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1219
1220                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1221                         "r" (&c->redDither)
1222                         : "%"REG_a
1223                         );
1224                         return;
1225         case IMGFMT_BGR24:
1226                         asm volatile(
1227                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1228                                 "mov %4, %%"REG_SP"                     \n\t"
1229                                 YSCALEYUV2RGB(%%REGa, %5)
1230                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1231                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1232                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1233                         "r" (&c->redDither)
1234                         : "%"REG_a
1235                         );
1236                         return;
1237         case IMGFMT_BGR15:
1238                         asm volatile(
1239                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1240                                 "mov %4, %%"REG_SP"                     \n\t"
1241                                 YSCALEYUV2RGB(%%REGa, %5)
1242                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1243 #ifdef DITHER1XBPP
1244                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1245                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1246                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1247 #endif
1248
1249                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1250                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1251
1252                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1253                         "r" (&c->redDither)
1254                         : "%"REG_a
1255                         );
1256                         return;
1257         case IMGFMT_BGR16:
1258                         asm volatile(
1259                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1260                                 "mov %4, %%"REG_SP"                     \n\t"
1261                                 YSCALEYUV2RGB(%%REGa, %5)
1262                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1263 #ifdef DITHER1XBPP
1264                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1265                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1266                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1267 #endif
1268
1269                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1270                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1271                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1272                         "r" (&c->redDither)
1273                         : "%"REG_a
1274                         );
1275                         return;
1276         case IMGFMT_YUY2:
1277                         asm volatile(
1278                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1279                                 "mov %4, %%"REG_SP"                     \n\t"
1280                                 YSCALEYUV2PACKED(%%REGa, %5)
1281                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1282                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1283                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1284                         "r" (&c->redDither)
1285                         : "%"REG_a
1286                         );
1287                         return;
1288         default: break;
1289         }
1290 #endif //HAVE_MMX
1291 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1292 }
1293
1294 /**
1295  * YV12 to RGB without scaling or interpolating
1296  */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
        /* Single-source variant: only buf0 is used for luma (no vertical
           interpolation), hence the fixed blend weights below. */
        const int yalpha1=0;
        int i;

        uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
        const int yalpha= 4096; //FIXME ...

        /* Full-chroma horizontal interpolation has no fast path here;
           fall back to the 2-buffer renderer with a zero luma blend. */
        if(flags&SWS_FULL_CHR_H_INT)
        {
                RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
                return;
        }

#ifdef HAVE_MMX
        /* MMX fast paths. Each asm block saves the real stack pointer into
           the context (ESP_OFFSET slot), repoints REG_SP at dest so the
           WRITE* macros can address the output relative to it, and restores
           the stack pointer afterwards. 8280(%5) is DSTW_OFFSET inside the
           context (the preprocessor cannot expand the symbol there).
           Formats not matched by a switch fall through to the C code. */
        if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
        {
                /* uvbuf0 dominates: use the nearest chroma line only. */
                switch(dstFormat)
                {
                case IMGFMT_BGR32:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
                                "mov %4, %%"REG_SP"                     \n\t"
                                YSCALEYUV2RGB1(%%REGa, %5)
                                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR24:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
                                "mov %4, %%"REG_SP"                     \n\t"
                                YSCALEYUV2RGB1(%%REGa, %5)
                                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR15:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
                                "mov %4, %%"REG_SP"                     \n\t"
                                YSCALEYUV2RGB1(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                /* 5-bit dither on all three channels for 1555 */
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR16:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
                                "mov %4, %%"REG_SP"                     \n\t"
                                YSCALEYUV2RGB1(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                /* 565: green gets the 6-bit dither table */
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_YUY2:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
                                "mov %4, %%"REG_SP"                     \n\t"
                                YSCALEYUV2PACKED1(%%REGa, %5)
                                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                }
        }
        else
        {
                /* Chroma weights near 50/50: use the ...1b variants which
                   average uvbuf0 and uvbuf1. */
                switch(dstFormat)
                {
                case IMGFMT_BGR32:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
                                "mov %4, %%"REG_SP"                     \n\t"
                                YSCALEYUV2RGB1b(%%REGa, %5)
                                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR24:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
                                "mov %4, %%"REG_SP"                     \n\t"
                                YSCALEYUV2RGB1b(%%REGa, %5)
                                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR15:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
                                "mov %4, %%"REG_SP"                     \n\t"
                                YSCALEYUV2RGB1b(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR16:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
                                "mov %4, %%"REG_SP"                     \n\t"
                                YSCALEYUV2RGB1b(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_YUY2:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
                                "mov %4, %%"REG_SP"                     \n\t"
                                YSCALEYUV2PACKED1b(%%REGa, %5)
                                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                }
        }
#endif
        /* Portable C fallback (also reached for formats without an MMX case).
           Same nearest-vs-averaged chroma split as above. */
        if( uvalpha < 2048 )
        {
                YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
        }else{
                YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
        }
}
1489
//FIXME the yuy2* input functions can read up to 7 samples too many
1491
/* Extract the luma plane from one YUY2 (Y0 U Y1 V) line: dst[i] = src[2*i]. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
        /* The index register runs from -width up to 0 so the loop end test
           is just the sign flag of the add (" js 1b"); the base pointers are
           pre-biased by +width accordingly. */
        asm volatile(
                "movq "MANGLE(bm01010101)", %%mm2\n\t"  /* mask keeping the even (luma) bytes */
                "mov %0, %%"REG_a"              \n\t"
                "1:                             \n\t"
                "movq (%1, %%"REG_a",2), %%mm0  \n\t"
                "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
                "pand %%mm2, %%mm0              \n\t"
                "pand %%mm2, %%mm1              \n\t"
                "packuswb %%mm1, %%mm0          \n\t"   /* 16 pixels -> 8 luma bytes */
                "movq %%mm0, (%2, %%"REG_a")    \n\t"
                "add $8, %%"REG_a"              \n\t"
                " js 1b                         \n\t"
                : : "g" (-width), "r" (src+width*2), "r" (dst+width)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<width; i++)
                dst[i]= src[2*i];
#endif
}
1516
/* Extract chroma from two YUY2 lines, averaging them vertically:
   dstU[i] = avg of the U bytes (offset 1), dstV[i] = avg of the V bytes
   (offset 3) of src1/src2. Requires PAVGB, hence MMX2/3DNow only. */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        /* Negative-index loop as in yuy2ToY; pointers pre-biased by width. */
        asm volatile(
                "movq "MANGLE(bm01010101)", %%mm4\n\t"  /* even-byte mask */
                "mov %0, %%"REG_a"              \n\t"
                "1:                             \n\t"
                "movq (%1, %%"REG_a",4), %%mm0  \n\t"
                "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
                "movq (%2, %%"REG_a",4), %%mm2  \n\t"
                "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
                PAVGB(%%mm2, %%mm0)                     /* average the two lines */
                PAVGB(%%mm3, %%mm1)
                "psrlw $8, %%mm0                \n\t"   /* keep the odd (chroma) bytes */
                "psrlw $8, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0          \n\t"   /* U V U V ... */
                "movq %%mm0, %%mm1              \n\t"
                "psrlw $8, %%mm0                \n\t"   /* mm0: V samples */
                "pand %%mm4, %%mm1              \n\t"   /* mm1: U samples */
                "packuswb %%mm0, %%mm0          \n\t"
                "packuswb %%mm1, %%mm1          \n\t"
                "movd %%mm0, (%4, %%"REG_a")    \n\t"   /* 4 V bytes -> dstV */
                "movd %%mm1, (%3, %%"REG_a")    \n\t"   /* 4 U bytes -> dstU */
                "add $4, %%"REG_a"              \n\t"
                " js 1b                         \n\t"
                : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<width; i++)
        {
                dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
                dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
        }
#endif
}
1554
//this is almost identical to the previous function, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
/* Extract the luma plane from one UYVY (U Y0 V Y1) line: dst[i] = src[2*i+1]. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
        /* Same negative-index loop shape as yuy2ToY; here the luma sits in
           the odd bytes, so a shift replaces the mask. */
        asm volatile(
                "mov %0, %%"REG_a"              \n\t"
                "1:                             \n\t"
                "movq (%1, %%"REG_a",2), %%mm0  \n\t"
                "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
                "psrlw $8, %%mm0                \n\t"   /* keep the odd (luma) bytes */
                "psrlw $8, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0          \n\t"
                "movq %%mm0, (%2, %%"REG_a")    \n\t"
                "add $8, %%"REG_a"              \n\t"
                " js 1b                         \n\t"
                : : "g" (-width), "r" (src+width*2), "r" (dst+width)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<width; i++)
                dst[i]= src[2*i+1];
#endif
}
1579
/* Extract chroma from two UYVY lines, averaging them vertically:
   dstU[i] = avg of the U bytes (offset 0), dstV[i] = avg of the V bytes
   (offset 2). Requires PAVGB, hence MMX2/3DNow only. */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "movq "MANGLE(bm01010101)", %%mm4\n\t"  /* even-byte mask */
                "mov %0, %%"REG_a"              \n\t"
                "1:                             \n\t"
                "movq (%1, %%"REG_a",4), %%mm0  \n\t"
                "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
                "movq (%2, %%"REG_a",4), %%mm2  \n\t"
                "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
                PAVGB(%%mm2, %%mm0)                     /* average the two lines */
                PAVGB(%%mm3, %%mm1)
                "pand %%mm4, %%mm0              \n\t"   /* keep the even (chroma) bytes */
                "pand %%mm4, %%mm1              \n\t"
                "packuswb %%mm1, %%mm0          \n\t"   /* U V U V ... */
                "movq %%mm0, %%mm1              \n\t"
                "psrlw $8, %%mm0                \n\t"   /* mm0: V samples */
                "pand %%mm4, %%mm1              \n\t"   /* mm1: U samples */
                "packuswb %%mm0, %%mm0          \n\t"
                "packuswb %%mm1, %%mm1          \n\t"
                "movd %%mm0, (%4, %%"REG_a")    \n\t"   /* 4 V bytes -> dstV */
                "movd %%mm1, (%3, %%"REG_a")    \n\t"   /* 4 U bytes -> dstU */
                "add $4, %%"REG_a"              \n\t"
                " js 1b                         \n\t"
                : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<width; i++)
        {
                dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
                dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
        }
#endif
}
1617
1618 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1619 {
1620         int i;
1621         for(i=0; i<width; i++)
1622         {
1623                 int b=  ((uint32_t*)src)[i]&0xFF;
1624                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1625                 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1626
1627                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1628         }
1629 }
1630
1631 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1632 {
1633         int i;
1634         for(i=0; i<width; i++)
1635         {
1636                 const int a= ((uint32_t*)src1)[2*i+0];
1637                 const int e= ((uint32_t*)src1)[2*i+1];
1638                 const int c= ((uint32_t*)src2)[2*i+0];
1639                 const int d= ((uint32_t*)src2)[2*i+1];
1640                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1641                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1642                 const int b=  l&0x3FF;
1643                 const int g=  h>>8;
1644                 const int r=  l>>16;
1645
1646                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1647                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1648         }
1649 }
1650
/* Convert one BGR24 line to luma. The MMX path processes 8 pixels per
   iteration with pmaddwd against the packed coefficient table bgr2YCoeff,
   then adds the luma offset (bgr2YOffset) with unsigned saturation. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
        /* REG_a counts pixels from -width to 0 (sign-flag loop); REG_b is
           the matching byte offset into the 3-bytes-per-pixel source
           (REG_b = 3*REG_a, advanced by 24 per iteration). */
        asm volatile(
                "mov %2, %%"REG_a"              \n\t"
                "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
                "movq "MANGLE(w1111)", %%mm5            \n\t"
                "pxor %%mm7, %%mm7              \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
                ".balign 16                     \n\t"
                "1:                             \n\t"
                PREFETCH" 64(%0, %%"REG_b")     \n\t"
                /* first 4 pixels: load 4 BGR triplets, widen bytes to words */
                "movd (%0, %%"REG_b"), %%mm0    \n\t"
                "movd 3(%0, %%"REG_b"), %%mm1   \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpcklbw %%mm7, %%mm1         \n\t"
                "movd 6(%0, %%"REG_b"), %%mm2   \n\t"
                "movd 9(%0, %%"REG_b"), %%mm3   \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpcklbw %%mm7, %%mm3         \n\t"
                /* multiply-accumulate against the B/G/R luma coefficients */
                "pmaddwd %%mm6, %%mm0           \n\t"
                "pmaddwd %%mm6, %%mm1           \n\t"
                "pmaddwd %%mm6, %%mm2           \n\t"
                "pmaddwd %%mm6, %%mm3           \n\t"
#ifndef FAST_BGR2YV12
                /* extra precision step skipped in the FAST variant */
                "psrad $8, %%mm0                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                /* horizontal add of the partial dword sums via pmaddwd w1111 */
                "packssdw %%mm1, %%mm0          \n\t"
                "packssdw %%mm3, %%mm2          \n\t"
                "pmaddwd %%mm5, %%mm0           \n\t"
                "pmaddwd %%mm5, %%mm2           \n\t"
                "packssdw %%mm2, %%mm0          \n\t"
                "psraw $7, %%mm0                \n\t"

                /* second 4 pixels, same sequence */
                "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
                "movd 15(%0, %%"REG_b"), %%mm1  \n\t"
                "punpcklbw %%mm7, %%mm4         \n\t"
                "punpcklbw %%mm7, %%mm1         \n\t"
                "movd 18(%0, %%"REG_b"), %%mm2  \n\t"
                "movd 21(%0, %%"REG_b"), %%mm3  \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpcklbw %%mm7, %%mm3         \n\t"
                "pmaddwd %%mm6, %%mm4           \n\t"
                "pmaddwd %%mm6, %%mm1           \n\t"
                "pmaddwd %%mm6, %%mm2           \n\t"
                "pmaddwd %%mm6, %%mm3           \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                "packssdw %%mm1, %%mm4          \n\t"
                "packssdw %%mm3, %%mm2          \n\t"
                "pmaddwd %%mm5, %%mm4           \n\t"
                "pmaddwd %%mm5, %%mm2           \n\t"
                "add $24, %%"REG_b"             \n\t"
                "packssdw %%mm2, %%mm4          \n\t"
                "psraw $7, %%mm4                \n\t"

                /* pack 8 luma values, add the luma offset with saturation */
                "packuswb %%mm4, %%mm0          \n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"

                "movq %%mm0, (%1, %%"REG_a")    \n\t"
                "add $8, %%"REG_a"              \n\t"
                " js 1b                         \n\t"
                : : "r" (src+width*3), "r" (dst+width), "g" (-width)
                : "%"REG_a, "%"REG_b
        );
#else
        /* C fallback: same coefficients, same rounding/offset constant as
           the bgr32 path. */
        int i;
        for(i=0; i<width; i++)
        {
                int b= src[i*3+0];
                int g= src[i*3+1];
                int r= src[i*3+2];

                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
        }
#endif
}
1735
1736 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1737 {
1738 #ifdef HAVE_MMX
1739         asm volatile(
1740                 "mov %4, %%"REG_a"              \n\t"
1741                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1742                 "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1743                 "pxor %%mm7, %%mm7              \n\t"
1744                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"       \n\t"
1745                 "add %%"REG_b", %%"REG_b"       \n\t"
1746                 ".balign 16                     \n\t"
1747                 "1:                             \n\t"
1748                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
1749                 PREFETCH" 64(%1, %%"REG_b")     \n\t"
1750 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1751                 "movq (%0, %%"REG_b"), %%mm0    \n\t"
1752                 "movq (%1, %%"REG_b"), %%mm1    \n\t"
1753                 "movq 6(%0, %%"REG_b"), %%mm2   \n\t"
1754                 "movq 6(%1, %%"REG_b"), %%mm3   \n\t"
1755                 PAVGB(%%mm1, %%mm0)
1756                 PAVGB(%%mm3, %%mm2)
1757                 "movq %%mm0, %%mm1              \n\t"
1758                 "movq %%mm2, %%mm3              \n\t"
1759                 "psrlq $24, %%mm0               \n\t"
1760                 "psrlq $24, %%mm2               \n\t"
1761                 PAVGB(%%mm1, %%mm0)
1762                 PAVGB(%%mm3, %%mm2)
1763                 "punpcklbw %%mm7, %%mm0         \n\t"
1764                 "punpcklbw %%mm7, %%mm2         \n\t"
1765 #else
1766                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
1767                 "movd (%1, %%"REG_b"), %%mm1    \n\t"
1768                 "movd 3(%0, %%"REG_b"), %%mm2   \n\t"
1769                 "movd 3(%1, %%"REG_b"), %%mm3   \n\t"
1770                 "punpcklbw %%mm7, %%mm0         \n\t"
1771                 "punpcklbw %%mm7, %%mm1         \n\t"
1772                 "punpcklbw %%mm7, %%mm2         \n\t"
1773                 "punpcklbw %%mm7, %%mm3         \n\t"
1774                 "paddw %%mm1, %%mm0             \n\t"
1775                 "paddw %%mm3, %%mm2             \n\t"
1776                 "paddw %%mm2, %%mm0             \n\t"
1777                 "movd 6(%0, %%"REG_b"), %%mm4   \n\t"
1778                 "movd 6(%1, %%"REG_b"), %%mm1   \n\t"
1779                 "movd 9(%0, %%"REG_b"), %%mm2   \n\t"
1780                 "movd 9(%1, %%"REG_b"), %%mm3   \n\t"
1781                 "punpcklbw %%mm7, %%mm4         \n\t"
1782                 "punpcklbw %%mm7, %%mm1         \n\t"
1783                 "punpcklbw %%mm7, %%mm2         \n\t"
1784                 "punpcklbw %%mm7, %%mm3         \n\t"
1785                 "paddw %%mm1, %%mm4             \n\t"
1786                 "paddw %%mm3, %%mm2             \n\t"
1787                 "paddw %%mm4, %%mm2             \n\t"
1788                 "psrlw $2, %%mm0                \n\t"
1789                 "psrlw $2, %%mm2                \n\t"
1790 #endif
1791                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1792                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1793                 
1794                 "pmaddwd %%mm0, %%mm1           \n\t"
1795                 "pmaddwd %%mm2, %%mm3           \n\t"
1796                 "pmaddwd %%mm6, %%mm0           \n\t"
1797                 "pmaddwd %%mm6, %%mm2           \n\t"
1798 #ifndef FAST_BGR2YV12
1799                 "psrad $8, %%mm0                \n\t"
1800                 "psrad $8, %%mm1                \n\t"
1801                 "psrad $8, %%mm2                \n\t"
1802                 "psrad $8, %%mm3                \n\t"
1803 #endif
1804                 "packssdw %%mm2, %%mm0          \n\t"
1805                 "packssdw %%mm3, %%mm1          \n\t"
1806                 "pmaddwd %%mm5, %%mm0           \n\t"
1807                 "pmaddwd %%mm5, %%mm1           \n\t"
1808                 "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1809                 "psraw $7, %%mm0                \n\t"
1810
1811 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1812                 "movq 12(%0, %%"REG_b"), %%mm4  \n\t"
1813                 "movq 12(%1, %%"REG_b"), %%mm1  \n\t"
1814                 "movq 18(%0, %%"REG_b"), %%mm2  \n\t"
1815                 "movq 18(%1, %%"REG_b"), %%mm3  \n\t"
1816                 PAVGB(%%mm1, %%mm4)
1817                 PAVGB(%%mm3, %%mm2)
1818                 "movq %%mm4, %%mm1              \n\t"
1819                 "movq %%mm2, %%mm3              \n\t"
1820                 "psrlq $24, %%mm4               \n\t"
1821                 "psrlq $24, %%mm2               \n\t"
1822                 PAVGB(%%mm1, %%mm4)
1823                 PAVGB(%%mm3, %%mm2)
1824                 "punpcklbw %%mm7, %%mm4         \n\t"
1825                 "punpcklbw %%mm7, %%mm2         \n\t"
1826 #else
1827                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
1828                 "movd 12(%1, %%"REG_b"), %%mm1  \n\t"
1829                 "movd 15(%0, %%"REG_b"), %%mm2  \n\t"
1830                 "movd 15(%1, %%"REG_b"), %%mm3  \n\t"
1831                 "punpcklbw %%mm7, %%mm4         \n\t"
1832                 "punpcklbw %%mm7, %%mm1         \n\t"
1833                 "punpcklbw %%mm7, %%mm2         \n\t"
1834                 "punpcklbw %%mm7, %%mm3         \n\t"
1835                 "paddw %%mm1, %%mm4             \n\t"
1836                 "paddw %%mm3, %%mm2             \n\t"
1837                 "paddw %%mm2, %%mm4             \n\t"
1838                 "movd 18(%0, %%"REG_b"), %%mm5  \n\t"
1839                 "movd 18(%1, %%"REG_b"), %%mm1  \n\t"
1840                 "movd 21(%0, %%"REG_b"), %%mm2  \n\t"
1841                 "movd 21(%1, %%"REG_b"), %%mm3  \n\t"
1842                 "punpcklbw %%mm7, %%mm5         \n\t"
1843                 "punpcklbw %%mm7, %%mm1         \n\t"
1844                 "punpcklbw %%mm7, %%mm2         \n\t"
1845                 "punpcklbw %%mm7, %%mm3         \n\t"
1846                 "paddw %%mm1, %%mm5             \n\t"
1847                 "paddw %%mm3, %%mm2             \n\t"
1848                 "paddw %%mm5, %%mm2             \n\t"
1849                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1850                 "psrlw $2, %%mm4                \n\t"
1851                 "psrlw $2, %%mm2                \n\t"
1852 #endif
1853                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1854                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1855                 
1856                 "pmaddwd %%mm4, %%mm1           \n\t"
1857                 "pmaddwd %%mm2, %%mm3           \n\t"
1858                 "pmaddwd %%mm6, %%mm4           \n\t"
1859                 "pmaddwd %%mm6, %%mm2           \n\t"
1860 #ifndef FAST_BGR2YV12
1861                 "psrad $8, %%mm4                \n\t"
1862                 "psrad $8, %%mm1                \n\t"
1863                 "psrad $8, %%mm2                \n\t"
1864                 "psrad $8, %%mm3                \n\t"
1865 #endif
1866                 "packssdw %%mm2, %%mm4          \n\t"
1867                 "packssdw %%mm3, %%mm1          \n\t"
1868                 "pmaddwd %%mm5, %%mm4           \n\t"
1869                 "pmaddwd %%mm5, %%mm1           \n\t"
1870                 "add $24, %%"REG_b"             \n\t"
1871                 "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1872                 "psraw $7, %%mm4                \n\t"
1873                 
1874                 "movq %%mm0, %%mm1              \n\t"
1875                 "punpckldq %%mm4, %%mm0         \n\t"
1876                 "punpckhdq %%mm4, %%mm1         \n\t"
1877                 "packsswb %%mm1, %%mm0          \n\t"
1878                 "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
1879
1880                 "movd %%mm0, (%2, %%"REG_a")    \n\t"
1881                 "punpckhdq %%mm0, %%mm0         \n\t"
1882                 "movd %%mm0, (%3, %%"REG_a")    \n\t"
1883                 "add $4, %%"REG_a"              \n\t"
1884                 " js 1b                         \n\t"
1885                 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1886                 : "%"REG_a, "%"REG_b
1887         );
1888 #else
1889         int i;
1890         for(i=0; i<width; i++)
1891         {
1892                 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1893                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1894                 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1895
1896                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1897                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1898         }
1899 #endif
1900 }
1901
1902 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1903 {
1904         int i;
1905         for(i=0; i<width; i++)
1906         {
1907                 int d= ((uint16_t*)src)[i];
1908                 int b= d&0x1F;
1909                 int g= (d>>5)&0x3F;
1910                 int r= (d>>11)&0x1F;
1911
1912                 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1913         }
1914 }
1915
/*
 * 2x2-average two lines of packed RGB565 pixels down to one line each of
 * 8-bit U and V.  Each 32-bit load grabs two adjacent pixels; the masks
 * split the six channel fields across two words with enough headroom
 * that the per-channel additions cannot carry into a neighbouring field
 * (SWAR-style summing).
 */
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
        int i;
        for(i=0; i<width; i++)
        {
                int d0= ((uint32_t*)src1)[i]; // two pixels of the upper line
                int d1= ((uint32_t*)src2)[i]; // two pixels of the lower line
                
                int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);           // B0 | R0 | G1 fields
                int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F); // G0 | B1 | R1 fields

                int dh2= (dh>>11) + (dh<<21); // realign dh's fields onto dl's layout
                int d= dh2 + dl;              // per-channel sums over all four pixels

                int b= d&0x7F;       // sum of the four 5-bit blue samples (max 124)
                int r= (d>>11)&0x7F; // sum of the four 5-bit red samples
                int g= d>>21;        // sum of the four 6-bit green samples
                // 2x weight on r/b compensates their 5-bit range vs green's 6 bit
                dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
                dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
        }
}
1937
1938 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1939 {
1940         int i;
1941         for(i=0; i<width; i++)
1942         {
1943                 int d= ((uint16_t*)src)[i];
1944                 int b= d&0x1F;
1945                 int g= (d>>5)&0x1F;
1946                 int r= (d>>10)&0x1F;
1947
1948                 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1949         }
1950 }
1951
/*
 * 2x2-average two lines of packed RGB555 pixels down to one line each of
 * 8-bit U and V.  Same SWAR trick as bgr16ToUV, with all-5-bit fields:
 * the masks leave headroom so per-channel sums never carry across fields.
 */
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
        int i;
        for(i=0; i<width; i++)
        {
                int d0= ((uint32_t*)src1)[i]; // two pixels of the upper line
                int d1= ((uint32_t*)src2)[i]; // two pixels of the lower line
                
                int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);           // B0 | R0 | G1 fields
                int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F); // G0 | B1 | R1 fields

                int dh2= (dh>>11) + (dh<<21); // realign dh's fields onto dl's layout
                int d= dh2 + dl;              // per-channel sums over all four pixels

                int b= d&0x7F;       // sum of the four 5-bit blue samples (max 124)
                int r= (d>>10)&0x7F; // sum of the four 5-bit red samples
                int g= d>>21;        // sum of the four 5-bit green samples
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
        }
}
1973
1974
1975 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1976 {
1977         int i;
1978         for(i=0; i<width; i++)
1979         {
1980                 int r=  ((uint32_t*)src)[i]&0xFF;
1981                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1982                 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1983
1984                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1985         }
1986 }
1987
1988 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1989 {
1990         int i;
1991         for(i=0; i<width; i++)
1992         {
1993                 const int a= ((uint32_t*)src1)[2*i+0];
1994                 const int e= ((uint32_t*)src1)[2*i+1];
1995                 const int c= ((uint32_t*)src2)[2*i+0];
1996                 const int d= ((uint32_t*)src2)[2*i+1];
1997                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1998                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1999                 const int r=  l&0x3FF;
2000                 const int g=  h>>8;
2001                 const int b=  l>>16;
2002
2003                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2004                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2005         }
2006 }
2007
2008 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2009 {
2010         int i;
2011         for(i=0; i<width; i++)
2012         {
2013                 int r= src[i*3+0];
2014                 int g= src[i*3+1];
2015                 int b= src[i*3+2];
2016
2017                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2018         }
2019 }
2020
2021 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2022 {
2023         int i;
2024         for(i=0; i<width; i++)
2025         {
2026                 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2027                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2028                 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2029
2030                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2031                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2032         }
2033 }
2034
2035
2036 // Bilinear / Bicubic scaling
/*
 * Horizontal FIR scaling of one 8-bit line into 16-bit intermediates:
 * for each output i, sums src[filterPos[i]+j]*filter[filterSize*i+j]
 * over j, shifts right by 7 and (in the C path) clips to 0..32767.
 * MMX builds take one of three asm paths depending on filterSize
 * (4, 8, or any other multiple of 4); otherwise AltiVec or plain C.
 * NOTE(review): srcW and xInc are unused here — presumably kept for a
 * uniform scaler signature; confirm against callers before removing.
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#ifdef HAVE_MMX
        assert(filterSize % 4 == 0 && filterSize>0);
        if(filterSize==4) // always true for upscaling, sometimes for down too
        {
                long counter= -2*dstW;
                // bias the pointers so the counter can run from -2*dstW up to 0
                filter-= counter*2;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "movq "MANGLE(w02)", %%mm6      \n\t"
                        "push %%"REG_BP"                \n\t" // we use 7 regs here ...
                        "mov %%"REG_a", %%"REG_BP"      \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        "movzwl (%2, %%"REG_BP"), %%eax \n\t"
                        "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
                        "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
                        "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
                        "movd (%3, %%"REG_a"), %%mm0    \n\t"
                        "movd (%3, %%"REG_b"), %%mm2    \n\t"
                        "punpcklbw %%mm7, %%mm0         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm0           \n\t"
                        "pmaddwd %%mm2, %%mm3           \n\t"
                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm3                \n\t"
                        "packssdw %%mm3, %%mm0          \n\t"
                        "pmaddwd %%mm6, %%mm0           \n\t"
                        "packssdw %%mm0, %%mm0          \n\t"
                        "movd %%mm0, (%4, %%"REG_BP")   \n\t"
                        "add $4, %%"REG_BP"             \n\t"
                        " jnc 1b                        \n\t"

                        "pop %%"REG_BP"                 \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
                        : "%"REG_b
                );
        }
        else if(filterSize==8)
        {
                long counter= -2*dstW;
                filter-= counter*4;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "movq "MANGLE(w02)", %%mm6      \n\t"
                        "push %%"REG_BP"                \n\t" // we use 7 regs here ...
                        "mov %%"REG_a", %%"REG_BP"      \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        "movzwl (%2, %%"REG_BP"), %%eax \n\t"
                        "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
                        "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
                        "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
                        "movd (%3, %%"REG_a"), %%mm0    \n\t"
                        "movd (%3, %%"REG_b"), %%mm2    \n\t"
                        "punpcklbw %%mm7, %%mm0         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm0           \n\t"
                        "pmaddwd %%mm2, %%mm3           \n\t"

                        "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
                        "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
                        "movd 4(%3, %%"REG_a"), %%mm4   \n\t"
                        "movd 4(%3, %%"REG_b"), %%mm2   \n\t"
                        "punpcklbw %%mm7, %%mm4         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm4           \n\t"
                        "pmaddwd %%mm2, %%mm5           \n\t"
                        "paddd %%mm4, %%mm0             \n\t"
                        "paddd %%mm5, %%mm3             \n\t"

                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm3                \n\t"
                        "packssdw %%mm3, %%mm0          \n\t"
                        "pmaddwd %%mm6, %%mm0           \n\t"
                        "packssdw %%mm0, %%mm0          \n\t"
                        "movd %%mm0, (%4, %%"REG_BP")   \n\t"
                        "add $4, %%"REG_BP"             \n\t"
                        " jnc 1b                        \n\t"

                        "pop %%"REG_BP"                 \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
                        : "%"REG_b
                );
        }
        else
        {
                // generic path: inner loop (label 2) walks the whole filter
                uint8_t *offset = src+filterSize;
                long counter= -2*dstW;
//              filter-= counter*filterSize/2;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "movq "MANGLE(w02)", %%mm6      \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        "mov %2, %%"REG_c"              \n\t"
                        "movzwl (%%"REG_c", %0), %%eax  \n\t"
                        "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
                        "mov %5, %%"REG_c"              \n\t"
                        "pxor %%mm4, %%mm4              \n\t"
                        "pxor %%mm5, %%mm5              \n\t"
                        "2:                             \n\t"
                        "movq (%1), %%mm1               \n\t"
                        "movq (%1, %6), %%mm3           \n\t"
                        "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
                        "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
                        "punpcklbw %%mm7, %%mm0         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm0           \n\t"
                        "pmaddwd %%mm2, %%mm3           \n\t"
                        "paddd %%mm3, %%mm5             \n\t"
                        "paddd %%mm0, %%mm4             \n\t"
                        "add $8, %1                     \n\t"
                        "add $4, %%"REG_c"              \n\t"
                        "cmp %4, %%"REG_c"              \n\t"
                        " jb 2b                         \n\t"
                        "add %6, %1                     \n\t"
                        "psrad $8, %%mm4                \n\t"
                        "psrad $8, %%mm5                \n\t"
                        "packssdw %%mm5, %%mm4          \n\t"
                        "pmaddwd %%mm6, %%mm4           \n\t"
                        "packssdw %%mm4, %%mm4          \n\t"
                        "mov %3, %%"REG_a"              \n\t"
                        "movd %%mm4, (%%"REG_a", %0)    \n\t"
                        "add $4, %0                     \n\t"
                        " jnc 1b                        \n\t"

                        : "+r" (counter), "+r" (filter)
                        : "m" (filterPos), "m" (dst), "m"(offset),
                          "m" (src), "r" (filterSize*2)
                        : "%"REG_b, "%"REG_a, "%"REG_c
                );
        }
#else
#ifdef HAVE_ALTIVEC
        hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
        // plain C reference implementation
        int i;
        for(i=0; i<dstW; i++)
        {
                int j;
                int srcPos= filterPos[i];
                int val=0;
//              printf("filterPos: %d\n", filterPos[i]);
                for(j=0; j<filterSize; j++)
                {
//                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
                }
//              filter += hFilterSize;
                dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//              dst[i] = val>>7;
        }
#endif
#endif
}
      // *** horizontally scale one luma (Y) line into the temp buffer
/*
 * Horizontally scale one luma (Y) line into the 16-bit temp buffer dst.
 * Packed YUV / RGB sources are first converted to an 8-bit Y line in
 * formatConvBuffer.  Depending on flags and CPU capabilities this runs
 * the filter-based hScale, runtime-generated MMX2 code (funnyYCode),
 * hand-written x86 asm, or a plain C bilinear loop.
 */
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos)
{
    // convert non-planar / RGB sources to an 8-bit Y line first
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
        // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                // drive the runtime-generated scaler (funnyYCode) in 8 chunks
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "mov %0, %%"REG_c"              \n\t"
                        "mov %1, %%"REG_D"              \n\t"
                        "mov %2, %%"REG_d"              \n\t"
                        "mov %3, %%"REG_b"              \n\t"
                        "xor %%"REG_a", %%"REG_a"       \n\t" // i
                        PREFETCH" (%%"REG_c")           \n\t"
                        PREFETCH" 32(%%"REG_c")         \n\t"
                        PREFETCH" 64(%%"REG_c")         \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
                        "movl (%%"REG_b"), %%esi        \n\t"\
                        "call *%4                       \n\t"\
                        "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
                        "add %%"REG_S", %%"REG_c"       \n\t"\
                        "add %%"REG_a", %%"REG_D"       \n\t"\
                        "xor %%"REG_a", %%"REG_a"       \n\t"\

#else

#define FUNNY_Y_CODE \
                        "movl (%%"REG_b"), %%esi        \n\t"\
                        "call *%4                       \n\t"\
                        "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
                        "add %%"REG_a", %%"REG_D"       \n\t"\
                        "xor %%"REG_a", %%"REG_a"       \n\t"\

#endif

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                        :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                        "m" (funnyYCode)
                        : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
                );
                // rewrite the last few samples whose source position would
                // reach past srcW-1 with the replicated edge pixel (*128)
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
        int xInc_shr16 = xInc >> 16;
        int xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ... 16.16 fixed-point bilinear, two pixels per iteration
        asm volatile(
                "xor %%"REG_a", %%"REG_a"       \n\t" // i
                "xor %%"REG_b", %%"REG_b"       \n\t" // xx
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
                ".balign 16                     \n\t"
                "1:                             \n\t"
                "movzbl  (%0, %%"REG_b"), %%edi \n\t" //src[xx]
                "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"              \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
                "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>8 + carry

                "movzbl (%0, %%"REG_b"), %%edi  \n\t" //src[xx]
                "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"              \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
                "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>8 + carry


                "add $2, %%"REG_a"              \n\t"
                "cmp %2, %%"REG_a"              \n\t"
                " jb 1b                         \n\t"


                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
                : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        // portable C: 16.16 fixed-point bilinear interpolation, result scaled by 128
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
                xpos+=xInc;
        }
#endif
    }
}
2375
2376 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2377                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2378                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2379                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2380                                    int32_t *mmx2FilterPos)
2381 {
2382     if(srcFormat==IMGFMT_YUY2)
2383     {
2384         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2385         src1= formatConvBuffer;
2386         src2= formatConvBuffer+2048;
2387     }
2388     else if(srcFormat==IMGFMT_UYVY)
2389     {
2390         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2391         src1= formatConvBuffer;
2392         src2= formatConvBuffer+2048;
2393     }
2394     else if(srcFormat==IMGFMT_BGR32)
2395     {
2396         RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2397         src1= formatConvBuffer;
2398         src2= formatConvBuffer+2048;
2399     }
2400     else if(srcFormat==IMGFMT_BGR24)
2401     {
2402         RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2403         src1= formatConvBuffer;
2404         src2= formatConvBuffer+2048;
2405     }
2406     else if(srcFormat==IMGFMT_BGR16)
2407     {
2408         RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2409         src1= formatConvBuffer;
2410         src2= formatConvBuffer+2048;
2411     }
2412     else if(srcFormat==IMGFMT_BGR15)
2413     {
2414         RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2415         src1= formatConvBuffer;
2416         src2= formatConvBuffer+2048;
2417     }
2418     else if(srcFormat==IMGFMT_RGB32)
2419     {
2420         RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2421         src1= formatConvBuffer;
2422         src2= formatConvBuffer+2048;
2423     }
2424     else if(srcFormat==IMGFMT_RGB24)
2425     {
2426         RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2427         src1= formatConvBuffer;
2428         src2= formatConvBuffer+2048;
2429     }
2430     else if(isGray(srcFormat))
2431     {
2432         return;
2433     }
2434
2435 #ifdef HAVE_MMX
2436         // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2437     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2438 #else
2439     if(!(flags&SWS_FAST_BILINEAR))
2440 #endif
2441     {
2442         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2443         RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2444     }
2445     else // Fast Bilinear upscale / crap downscale
2446     {
2447 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2448 #ifdef HAVE_MMX2
2449         int i;
2450         if(canMMX2BeUsed)
2451         {
2452                 asm volatile(
2453                         "pxor %%mm7, %%mm7              \n\t"
2454                         "mov %0, %%"REG_c"              \n\t"
2455                         "mov %1, %%"REG_D"              \n\t"
2456                         "mov %2, %%"REG_d"              \n\t"
2457                         "mov %3, %%"REG_b"              \n\t"
2458                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2459                         PREFETCH" (%%"REG_c")           \n\t"
2460                         PREFETCH" 32(%%"REG_c")         \n\t"
2461                         PREFETCH" 64(%%"REG_c")         \n\t"
2462
2463 #ifdef ARCH_X86_64
2464
2465 #define FUNNY_UV_CODE \
2466                         "movl (%%"REG_b"), %%esi        \n\t"\
2467                         "call *%4                       \n\t"\
2468                         "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2469                         "add %%"REG_S", %%"REG_c"       \n\t"\
2470                         "add %%"REG_a", %%"REG_D"       \n\t"\
2471                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2472
2473 #else
2474
2475 #define FUNNY_UV_CODE \
2476                         "movl (%%"REG_b"), %%esi        \n\t"\
2477                         "call *%4                       \n\t"\
2478                         "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2479                         "add %%"REG_a", %%"REG_D"       \n\t"\
2480                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2481
2482 #endif
2483
2484 FUNNY_UV_CODE
2485 FUNNY_UV_CODE
2486 FUNNY_UV_CODE
2487 FUNNY_UV_CODE
2488                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2489                         "mov %5, %%"REG_c"              \n\t" // src
2490                         "mov %1, %%"REG_D"              \n\t" // buf1
2491                         "add $4096, %%"REG_D"           \n\t"
2492                         PREFETCH" (%%"REG_c")           \n\t"
2493                         PREFETCH" 32(%%"REG_c")         \n\t"
2494                         PREFETCH" 64(%%"REG_c")         \n\t"
2495
2496 FUNNY_UV_CODE
2497 FUNNY_UV_CODE
2498 FUNNY_UV_CODE
2499 FUNNY_UV_CODE
2500
2501                         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2502                         "m" (funnyUVCode), "m" (src2)
2503                         : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2504                 );
2505                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2506                 {
2507 //                      printf("%d %d %d\n", dstWidth, i, srcW);
2508                         dst[i] = src1[srcW-1]*128;
2509                         dst[i+2048] = src2[srcW-1]*128;
2510                 }
2511         }
2512         else
2513         {
2514 #endif
2515         long xInc_shr16 = (long) (xInc >> 16);
2516         int xInc_mask = xInc & 0xffff; 
2517         asm volatile(
2518                 "xor %%"REG_a", %%"REG_a"       \n\t" // i
2519                 "xor %%"REG_b", %%"REG_b"               \n\t" // xx
2520                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2521                 ".balign 16                     \n\t"
2522                 "1:                             \n\t"
2523                 "mov %0, %%"REG_S"              \n\t"
2524                 "movzbl  (%%"REG_S", %%"REG_b"), %%edi  \n\t" //src[xx]
2525                 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi  \n\t" //src[xx+1]
2526                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2527                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2528                 "shll $16, %%edi                \n\t"
2529                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2530                 "mov %1, %%"REG_D"              \n\t"
2531                 "shrl $9, %%esi                 \n\t"
2532                 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2533
2534                 "movzbl  (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2535                 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2536                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2537                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2538                 "shll $16, %%edi                \n\t"
2539                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2540                 "mov %1, %%"REG_D"              \n\t"
2541                 "shrl $9, %%esi                 \n\t"
2542                 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2543
2544                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
2545                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>8 + carry
2546                 "add $1, %%"REG_a"              \n\t"
2547                 "cmp %2, %%"REG_a"              \n\t"
2548                 " jb 1b                         \n\t"
2549
2550 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2551    which is needed to support GCC-4.0 */
2552 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2553                 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2554 #else
2555                 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2556 #endif
2557                 "r" (src2)
2558                 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2559                 );
2560 #ifdef HAVE_MMX2
2561         } //if MMX2 can't be used
2562 #endif
2563 #else
2564         int i;
2565         unsigned int xpos=0;
2566         for(i=0;i<dstWidth;i++)
2567         {
2568                 register unsigned int xx=xpos>>16;
2569                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2570                 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2571                 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2572 /* slower
2573           dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2574           dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2575 */
2576                 xpos+=xInc;
2577         }
2578 #endif
2579    }
2580 }
2581
/*
 * Main per-CPU scaling entry point (RENAME() appends the instruction-set
 * suffix, e.g. _MMX / _MMX2, at template-instantiation time).
 *
 * Consumes one horizontal slice of the source picture — srcSliceH lines
 * starting at source line srcSliceY — horizontally scales the new lines into
 * the luma/chroma ring buffers (lumPixBuf / chrPixBuf), and vertically
 * scales every output line that can be completed with the data buffered so
 * far. Ring-buffer state (dstY, *BufIndex, lastIn*Buf) persists in the
 * context between calls so the caller may feed the picture in several
 * slices.
 *
 * Returns the number of output lines produced by this call
 * (dstY - lastDstY); 0 if the slice did not yet contain enough input.
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){

        /* load a few things into local vars to make the code more readable? and faster */
        const int srcW= c->srcW;
        const int dstW= c->dstW;
        const int dstH= c->dstH;
        const int chrDstW= c->chrDstW;
        const int chrSrcW= c->chrSrcW;
        const int lumXInc= c->lumXInc;
        const int chrXInc= c->chrXInc;
        const int dstFormat= c->dstFormat;
        const int srcFormat= c->srcFormat;
        const int flags= c->flags;
        const int canMMX2BeUsed= c->canMMX2BeUsed;
        int16_t *vLumFilterPos= c->vLumFilterPos;
        int16_t *vChrFilterPos= c->vChrFilterPos;
        int16_t *hLumFilterPos= c->hLumFilterPos;
        int16_t *hChrFilterPos= c->hChrFilterPos;
        int16_t *vLumFilter= c->vLumFilter;
        int16_t *vChrFilter= c->vChrFilter;
        int16_t *hLumFilter= c->hLumFilter;
        int16_t *hChrFilter= c->hChrFilter;
        int32_t *lumMmxFilter= c->lumMmxFilter;
        int32_t *chrMmxFilter= c->chrMmxFilter;
        const int vLumFilterSize= c->vLumFilterSize;
        const int vChrFilterSize= c->vChrFilterSize;
        const int hLumFilterSize= c->hLumFilterSize;
        const int hChrFilterSize= c->hChrFilterSize;
        int16_t **lumPixBuf= c->lumPixBuf;
        int16_t **chrPixBuf= c->chrPixBuf;
        const int vLumBufSize= c->vLumBufSize;
        const int vChrBufSize= c->vChrBufSize;
        uint8_t *funnyYCode= c->funnyYCode;
        uint8_t *funnyUVCode= c->funnyUVCode;
        uint8_t *formatConvBuffer= c->formatConvBuffer;
        /* chroma slice position/height in chroma lines; the -((-x)>>s) form
           rounds the height UP instead of down after the subsample shift */
        const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
        const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
        int lastDstY;

        /* vars which will change and which we need to store back in the context */
        int dstY= c->dstY;
        int lumBufIndex= c->lumBufIndex;
        int chrBufIndex= c->chrBufIndex;
        int lastInLumBuf= c->lastInLumBuf;
        int lastInChrBuf= c->lastInChrBuf;
        
        /* packed input has a single interleaved plane: alias all three
           plane pointers/strides to plane 0 so the per-plane code below
           reads from the right place */
        if(isPacked(c->srcFormat)){
                src[0]=
                src[1]=
                src[2]= src[0];
                srcStride[0]=
                srcStride[1]=
                srcStride[2]= srcStride[0];
        }
        /* vChrDrop: enlarging the chroma stride makes the reader skip
           chroma source lines (cheap vertical chroma decimation) */
        srcStride[1]<<= c->vChrDrop;
        srcStride[2]<<= c->vChrDrop;

//      printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
//              (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
{
static volatile int i=0;
i++;
if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
i--;
}
#endif

//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//dstStride[0],dstStride[1],dstStride[2]);

        if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
        {
                static int firstTime=1; //FIXME move this into the context perhaps
                if(flags & SWS_PRINT_INFO && firstTime)
                {
                        MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
                                        "SwScaler:          ->cannot do aligned memory acesses anymore\n");
                        firstTime=0;
                }
        }

        /* Note the user might start scaling the picture in the middle so this will not get executed
           this is not really intended but works currently, so ppl might do it */
        if(srcSliceY ==0){
                /* first slice of a new frame: reset the ring-buffer state */
                lumBufIndex=0;
                chrBufIndex=0;
                dstY=0; 
                lastInLumBuf= -1;
                lastInChrBuf= -1;
        }

        lastDstY= dstY;

        /* one iteration per output line; breaks out early when the current
           slice runs out of input lines */
        for(;dstY < dstH; dstY++){
                unsigned char *dest =dst[0]+dstStride[0]*dstY;
                const int chrDstY= dstY>>c->chrDstVSubSample;
                unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
                unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

                const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
                const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
                const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
                const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
                //handle holes (FAST_BILINEAR & weird filters)
                if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
                if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
                ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

                // Do we have enough lines in this slice to output the dstY line
                if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
                {
                        //Do horizontal scaling
                        /* pull every not-yet-buffered luma input line this
                           output line needs into the luma ring buffer */
                        while(lastInLumBuf < lastLumSrcY)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
//                              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//                              printf("%d %d\n", lumBufIndex, vLumBufSize);
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        /* same for chroma (U and V are scaled together) */
                        while(lastInChrBuf < lastChrSrcY)
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
                                //FIXME replace parameters through context struct (some at least)

                                /* gray in or out needs no chroma scaling */
                                if(!(isGray(srcFormat) || isGray(dstFormat)))
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                }
                else // not enough lines left in this slice -> load the rest in the buffer
                {
/*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
                        firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
                        lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
                        vChrBufSize, vLumBufSize);*/

                        //Do horizontal scaling
                        /* buffer whatever input lines this slice still has,
                           so the next slice can finish the output line */
                        while(lastInLumBuf+1 < srcSliceY + srcSliceH)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)

                                if(!(isGray(srcFormat) || isGray(dstFormat)))
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                        break; //we can't output a dstY line so let's try with the next slice
                }

#ifdef HAVE_MMX
                /* per-line ordered-dither constants consumed by the MMX
                   packed-RGB output code */
                b5Dither= dither8[dstY&1];
                g6Dither= dither4[dstY&1];
                g5Dither= dither8[dstY&1];
                r5Dither= dither8[(dstY+1)&1];
#endif
            /* Vertical scaling + output. The last two lines take the
               C-only path below because the MMX writers store past the end
               of the output line (see the else-branch comment). */
            if(dstY < dstH-2)
            {
                /* position in the ring buffer of the first input line the
                   vertical filter needs (+BufSize keeps the index positive;
                   the buffers are allocated 2*BufSize entries deep) */
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
                /* pack line pointers + duplicated 16-bit coefficients into
                   the layout the MMX vertical scalers expect.
                   NOTE(review): the (int32_t) casts truncate the pointers on
                   64-bit targets -- looks unsafe on x86_64, verify upstream */
                int i;
                for(i=0; i<vLumFilterSize; i++)
                {
                        lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                        lumMmxFilter[4*i+2]= 
                        lumMmxFilter[4*i+3]= 
                                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                }
                for(i=0; i<vChrFilterSize; i++)
                {
                        chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                        chrMmxFilter[4*i+2]= 
                        chrMmxFilter[4*i+3]= 
                                ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
#endif
                if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                        if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                        RENAME(yuv2nv12X)(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, uDest, dstW, chrDstW, dstFormat);
                }
                else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
                {
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                        {
                                int16_t *lumBuf = lumPixBuf[0];
                                int16_t *chrBuf= chrPixBuf[0];
                                RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                        }
                        else //General YV12
                        {
                                RENAME(yuv2yuvX)(c,
                                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, uDest, vDest, dstW, chrDstW);
                        }
                }
                else /* packed output (RGB etc.) */
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                        {
                                int chrAlpha= vChrFilter[2*dstY+1];
                                RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, chrAlpha, dstFormat, flags, dstY);
                        }
                        else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                        {
                                int lumAlpha= vLumFilter[2*dstY+1];
                                int chrAlpha= vChrFilter[2*dstY+1];
                                RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, lumAlpha, chrAlpha, dstY);
                        }
                        else //General RGB
                        {
                                RENAME(yuv2packedX)(c,
                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, dstW, dstY);
                        }
                }
            }
            else // hmm looks like we can't use MMX here without overwriting this array's tail
            {
                /* last two output lines: same dispatch as above but through
                   the plain-C vertical scalers (…inC), which never write
                   past the end of the line */
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                        if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                        yuv2nv12XinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, uDest, dstW, chrDstW, dstFormat);
                }
                else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
                {
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, uDest, vDest, dstW, chrDstW);
                }
                else
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        yuv2packedXinC(c, 
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, dstW, dstY);
                }
            }
        }

#ifdef HAVE_MMX
        /* flush the non-temporal stores and leave MMX state so the FPU is
           usable again (SFENCE/EMMS resolve to no-ops on plain-MMX builds,
           see the macro definitions at the top of the file) */
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* store changed local vars back in the context */
        c->dstY= dstY;
        c->lumBufIndex= lumBufIndex;
        c->chrBufIndex= chrBufIndex;
        c->lastInLumBuf= lastInLumBuf;
        c->lastInChrBuf= lastInChrBuf;

        return dstY - lastDstY;
}