1 /*
2     Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 #undef REAL_MOVNTQ
20 #undef MOVNTQ
21 #undef PAVGB
22 #undef PREFETCH
23 #undef PREFETCHW
24 #undef EMMS
25 #undef SFENCE
26
27 #ifdef HAVE_3DNOW
28 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
29 #define EMMS     "femms"
30 #else
31 #define EMMS     "emms"
32 #endif
33
34 #ifdef HAVE_3DNOW
35 #define PREFETCH  "prefetch"
36 #define PREFETCHW "prefetchw"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #else
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
43 #endif
44
45 #ifdef HAVE_MMX2
46 #define SFENCE "sfence"
47 #else
48 #define SFENCE "/nop"
49 #endif
50
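/* Illustrative sketch of how the macros above are meant to be used: PREFETCH*
   hints go ahead of the loads, and every MMX code path is closed with SFENCE
   (to drain the write-combining buffers filled by the non-temporal stores)
   followed by EMMS/femms (to hand the FPU back).  The function name and the
   "64" prefetch distance are made up for the example. */
#if 0
static inline void example_mmx_epilogue(const uint8_t *src)
{
        asm volatile(PREFETCH" 64(%0)           \n\t"
                     :: "r" (src));
        /* ... MMX work, typically storing with MOVNTQ() ... */
        asm volatile(SFENCE"                    \n\t"
                     EMMS"                      \n\t"
                     ::: "memory");
}
#endif
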
51 #ifdef HAVE_MMX2
52 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53 #elif defined (HAVE_3DNOW)
54 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
55 #endif
56
57 #ifdef HAVE_MMX2
58 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
59 #else
60 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
61 #endif
62 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
63
64 #ifdef HAVE_ALTIVEC
65 #include "swscale_altivec_template.c"
66 #endif
67
68 #define YSCALEYUV2YV12X(x, offset) \
69                         "xor %%"REG_a", %%"REG_a"       \n\t"\
70                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71                         "movq %%mm3, %%mm4              \n\t"\
72                         "lea " offset "(%0), %%"REG_d"  \n\t"\
73                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
74                         ".balign 16                     \n\t" /* FIXME Unroll? */\
75                         "1:                             \n\t"\
76                         "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
77                         "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78                         "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79                         "add $16, %%"REG_d"             \n\t"\
80                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
81                         "test %%"REG_S", %%"REG_S"      \n\t"\
82                         "pmulhw %%mm0, %%mm2            \n\t"\
83                         "pmulhw %%mm0, %%mm5            \n\t"\
84                         "paddw %%mm2, %%mm3             \n\t"\
85                         "paddw %%mm5, %%mm4             \n\t"\
86                         " jnz 1b                        \n\t"\
87                         "psraw $3, %%mm3                \n\t"\
88                         "psraw $3, %%mm4                \n\t"\
89                         "packuswb %%mm4, %%mm3          \n\t"\
90                         MOVNTQ(%%mm3, (%1, %%REGa))\
91                         "add $8, %%"REG_a"              \n\t"\
92                         "cmp %2, %%"REG_a"              \n\t"\
93                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94                         "movq %%mm3, %%mm4              \n\t"\
95                         "lea " offset "(%0), %%"REG_d"  \n\t"\
96                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
97                         "jb 1b                          \n\t"
98
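/* Scalar sketch of what YSCALEYUV2YV12X computes: a vertical FIR filter over
   filterSize already horizontally scaled lines with 16 bit coefficients.
   pmulhw yields (src*coeff)>>16 and the final psraw $3 + packuswb add another
   >>3 with unsigned saturation, i.e. 19 bits in total; the constant loaded
   from VROUNDER_OFFSET plays the role of the 1<<18 rounding bias below.
   Function and variable names here are illustrative only. */
#if 0
static inline void yscaleyuv2yv12x_ref(int16_t **src, int16_t *filter,
                                       int filterSize, uint8_t *dest, int dstW)
{
        int i, j;
        for(i=0; i<dstW; i++)
        {
                int val= 1<<18;                         /* rounding bias */
                for(j=0; j<filterSize; j++)
                        val+= src[j][i] * filter[j];
                val>>=19;
                dest[i]= val<0 ? 0 : (val>255 ? 255 : val);
        }
}
#endif
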
99 #define YSCALEYUV2YV121 \
100                         "mov %2, %%"REG_a"              \n\t"\
101                         ".balign 16                     \n\t" /* FIXME Unroll? */\
102                         "1:                             \n\t"\
103                         "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104                         "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
105                         "psraw $7, %%mm0                \n\t"\
106                         "psraw $7, %%mm1                \n\t"\
107                         "packuswb %%mm1, %%mm0          \n\t"\
108                         MOVNTQ(%%mm0, (%1, %%REGa))\
109                         "add $8, %%"REG_a"              \n\t"\
110                         "jnc 1b                         \n\t"
111
112 /*
113                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
114                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115                            "r" (dest), "m" (dstW),
116                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
118 */
119 #define YSCALEYUV2PACKEDX \
120                 "xor %%"REG_a", %%"REG_a"       \n\t"\
121                 ".balign 16                     \n\t"\
122                 "nop                            \n\t"\
123                 "1:                             \n\t"\
124                 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
126                 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127                 "movq %%mm3, %%mm4              \n\t"\
128                 ".balign 16                     \n\t"\
129                 "2:                             \n\t"\
130                 "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
131                 "movq (%%"REG_S", %%"REG_a"), %%mm2     \n\t" /* UsrcData */\
132                 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133                 "add $16, %%"REG_d"             \n\t"\
134                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
135                 "pmulhw %%mm0, %%mm2            \n\t"\
136                 "pmulhw %%mm0, %%mm5            \n\t"\
137                 "paddw %%mm2, %%mm3             \n\t"\
138                 "paddw %%mm5, %%mm4             \n\t"\
139                 "test %%"REG_S", %%"REG_S"      \n\t"\
140                 " jnz 2b                        \n\t"\
141 \
142                 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
144                 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145                 "movq %%mm1, %%mm7              \n\t"\
146                 ".balign 16                     \n\t"\
147                 "2:                             \n\t"\
148                 "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
149                 "movq (%%"REG_S", %%"REG_a", 2), %%mm2  \n\t" /* Y1srcData */\
150                 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151                 "add $16, %%"REG_d"             \n\t"\
152                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
153                 "pmulhw %%mm0, %%mm2            \n\t"\
154                 "pmulhw %%mm0, %%mm5            \n\t"\
155                 "paddw %%mm2, %%mm1             \n\t"\
156                 "paddw %%mm5, %%mm7             \n\t"\
157                 "test %%"REG_S", %%"REG_S"      \n\t"\
158                 " jnz 2b                        \n\t"\
159
160
161 #define YSCALEYUV2RGBX \
162                 YSCALEYUV2PACKEDX\
163                 "psubw "U_OFFSET"(%0), %%mm3    \n\t" /* (U-128)8*/\
164                 "psubw "V_OFFSET"(%0), %%mm4    \n\t" /* (V-128)8*/\
165                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
166                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
167                 "pmulhw "UG_COEFF"(%0), %%mm3   \n\t"\
168                 "pmulhw "VG_COEFF"(%0), %%mm4   \n\t"\
169         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
170                 "pmulhw "UB_COEFF"(%0), %%mm2   \n\t"\
171                 "pmulhw "VR_COEFF"(%0), %%mm5   \n\t"\
172                 "psubw "Y_OFFSET"(%0), %%mm1    \n\t" /* 8(Y-16)*/\
173                 "psubw "Y_OFFSET"(%0), %%mm7    \n\t" /* 8(Y-16)*/\
174                 "pmulhw "Y_COEFF"(%0), %%mm1    \n\t"\
175                 "pmulhw "Y_COEFF"(%0), %%mm7    \n\t"\
176         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177                 "paddw %%mm3, %%mm4             \n\t"\
178                 "movq %%mm2, %%mm0              \n\t"\
179                 "movq %%mm5, %%mm6              \n\t"\
180                 "movq %%mm4, %%mm3              \n\t"\
181                 "punpcklwd %%mm2, %%mm2         \n\t"\
182                 "punpcklwd %%mm5, %%mm5         \n\t"\
183                 "punpcklwd %%mm4, %%mm4         \n\t"\
184                 "paddw %%mm1, %%mm2             \n\t"\
185                 "paddw %%mm1, %%mm5             \n\t"\
186                 "paddw %%mm1, %%mm4             \n\t"\
187                 "punpckhwd %%mm0, %%mm0         \n\t"\
188                 "punpckhwd %%mm6, %%mm6         \n\t"\
189                 "punpckhwd %%mm3, %%mm3         \n\t"\
190                 "paddw %%mm7, %%mm0             \n\t"\
191                 "paddw %%mm7, %%mm6             \n\t"\
192                 "paddw %%mm7, %%mm3             \n\t"\
193                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194                 "packuswb %%mm0, %%mm2          \n\t"\
195                 "packuswb %%mm6, %%mm5          \n\t"\
196                 "packuswb %%mm3, %%mm4          \n\t"\
197                 "pxor %%mm7, %%mm7              \n\t"
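
/* Per-pixel sketch of the colour conversion done by YSCALEYUV2RGBX (and by the
   YSCALEYUV2RGB* variants further down): bias the fixed point Y/U/V values,
   scale them with pmulhw (an implicit >>16) by the per-context coefficients
   and sum.  The c_* parameters stand for the 16 bit values the asm loads from
   Y_COEFF/UB_COEFF/UG_COEFF/VG_COEFF/VR_COEFF and Y_OFFSET/U_OFFSET/V_OFFSET;
   the helper itself is illustrative, not the real API. */
#if 0
static inline void yuv2rgb_pixel_ref(int y, int u, int v,
                                     int c_yCoeff, int c_ubCoeff, int c_ugCoeff,
                                     int c_vgCoeff, int c_vrCoeff,
                                     int c_yOffset, int c_uOffset, int c_vOffset,
                                     int *r, int *g, int *b)
{
        int Y= ((y - c_yOffset) * c_yCoeff) >> 16;
        int U=   u - c_uOffset;
        int V=   v - c_vOffset;

        *b= Y + ((U*c_ubCoeff              ) >> 16);
        *g= Y + ((U*c_ugCoeff + V*c_vgCoeff) >> 16);
        *r= Y + ((              V*c_vrCoeff) >> 16);
        /* the asm finally packs with unsigned saturation (packuswb) to 0..255 */
}
#endif
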
198 #if 0
199 #define FULL_YSCALEYUV2RGB \
200                 "pxor %%mm7, %%mm7              \n\t"\
201                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
202                 "punpcklwd %%mm6, %%mm6         \n\t"\
203                 "punpcklwd %%mm6, %%mm6         \n\t"\
204                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
205                 "punpcklwd %%mm5, %%mm5         \n\t"\
206                 "punpcklwd %%mm5, %%mm5         \n\t"\
207                 "xor %%"REG_a", %%"REG_a"               \n\t"\
208                 ".balign 16                     \n\t"\
209                 "1:                             \n\t"\
210                 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211                 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212                 "movq (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
213                 "movq (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
214                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
215                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216                 "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217                 "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
219                 "movq 4096(%2, %%"REG_a",2), %%mm4      \n\t" /* uvbuf0[eax+2048]*/\
220                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
222                 "movq 4096(%3, %%"REG_a",2), %%mm0      \n\t" /* uvbuf1[eax+2048]*/\
223                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224                 "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
225                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
226                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
227                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
228 \
229 \
230                 "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
232                 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
233                 "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
234                 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
235                 "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
236                 "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
237 \
238 \
239                 "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
240                 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
242                 "paddw %%mm1, %%mm3             \n\t" /* B*/\
243                 "paddw %%mm1, %%mm0             \n\t" /* R*/\
244                 "packuswb %%mm3, %%mm3          \n\t"\
245 \
246                 "packuswb %%mm0, %%mm0          \n\t"\
247                 "paddw %%mm4, %%mm2             \n\t"\
248                 "paddw %%mm2, %%mm1             \n\t" /* G*/\
249 \
250                 "packuswb %%mm1, %%mm1          \n\t"
251 #endif
252
253 #define REAL_YSCALEYUV2PACKED(index, c) \
254                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255                 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256                 "psraw $3, %%mm0                \n\t"\
257                 "psraw $3, %%mm1                \n\t"\
258                 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259                 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
260                 "xor "#index", "#index"         \n\t"\
261                 ".balign 16                     \n\t"\
262                 "1:                             \n\t"\
263                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
264                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
265                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
267                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
269                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
270                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272                         "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
273                         "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
274                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
275                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
276                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
277                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
278                 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279                 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
280                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
281                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
282                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
284                         "psraw $7, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>7*/\
285                         "psraw $7, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>7*/\
286                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
288                 
289 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
290                 
291 #define REAL_YSCALEYUV2RGB(index, c) \
292                 "xor "#index", "#index" \n\t"\
293                 ".balign 16                     \n\t"\
294                 "1:                             \n\t"\
295                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
296                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
297                 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298                 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
299                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
301                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
302                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
307                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
308                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
309                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
310                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
311                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
312                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
314         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
316                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
317                 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318                 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
319                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
320                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
321                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
323                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
327                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
330                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
331                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
332                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
333         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334                 "paddw %%mm3, %%mm4             \n\t"\
335                 "movq %%mm2, %%mm0              \n\t"\
336                 "movq %%mm5, %%mm6              \n\t"\
337                 "movq %%mm4, %%mm3              \n\t"\
338                 "punpcklwd %%mm2, %%mm2         \n\t"\
339                 "punpcklwd %%mm5, %%mm5         \n\t"\
340                 "punpcklwd %%mm4, %%mm4         \n\t"\
341                 "paddw %%mm1, %%mm2             \n\t"\
342                 "paddw %%mm1, %%mm5             \n\t"\
343                 "paddw %%mm1, %%mm4             \n\t"\
344                 "punpckhwd %%mm0, %%mm0         \n\t"\
345                 "punpckhwd %%mm6, %%mm6         \n\t"\
346                 "punpckhwd %%mm3, %%mm3         \n\t"\
347                 "paddw %%mm7, %%mm0             \n\t"\
348                 "paddw %%mm7, %%mm6             \n\t"\
349                 "paddw %%mm7, %%mm3             \n\t"\
350                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351                 "packuswb %%mm0, %%mm2          \n\t"\
352                 "packuswb %%mm6, %%mm5          \n\t"\
353                 "packuswb %%mm3, %%mm4          \n\t"\
354                 "pxor %%mm7, %%mm7              \n\t"
355 #define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
356                 
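/* The two-line macros above (YSCALEYUV2PACKED / YSCALEYUV2RGB) blend a pair of
   already horizontally scaled lines: buf1 is the base and the difference to
   buf0 is scaled by the 16 bit vertical weight via pmulhw, i.e. roughly
   (1-alpha)*buf1 + alpha*buf0.  Scalar sketch; 'frac_shift' is 7 in the PACKED
   path (whose weight was pre-shifted right by 3 above) and 4 in the RGB path,
   and 'alpha' stands for the word read from LUM/CHR_MMX_FILTER_OFFSET+8. */
#if 0
static inline int vblend_ref(int16_t a /*buf0[i]*/, int16_t b /*buf1[i]*/,
                             int alpha, int frac_shift)
{
        return (b >> frac_shift) + (((a - b) * alpha) >> 16);
}
#endif
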
357 #define REAL_YSCALEYUV2PACKED1(index, c) \
358                 "xor "#index", "#index"         \n\t"\
359                 ".balign 16                     \n\t"\
360                 "1:                             \n\t"\
361                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
362                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
363                 "psraw $7, %%mm3                \n\t" \
364                 "psraw $7, %%mm4                \n\t" \
365                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
366                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
367                 "psraw $7, %%mm1                \n\t" \
368                 "psraw $7, %%mm7                \n\t" \
369                 
370 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
371                 
372 #define REAL_YSCALEYUV2RGB1(index, c) \
373                 "xor "#index", "#index" \n\t"\
374                 ".balign 16                     \n\t"\
375                 "1:                             \n\t"\
376                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
377                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
378                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
380                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
381                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
382                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
383                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
384                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
386         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
387                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
388                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
389                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
394                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
395                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
396                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
397         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398                 "paddw %%mm3, %%mm4             \n\t"\
399                 "movq %%mm2, %%mm0              \n\t"\
400                 "movq %%mm5, %%mm6              \n\t"\
401                 "movq %%mm4, %%mm3              \n\t"\
402                 "punpcklwd %%mm2, %%mm2         \n\t"\
403                 "punpcklwd %%mm5, %%mm5         \n\t"\
404                 "punpcklwd %%mm4, %%mm4         \n\t"\
405                 "paddw %%mm1, %%mm2             \n\t"\
406                 "paddw %%mm1, %%mm5             \n\t"\
407                 "paddw %%mm1, %%mm4             \n\t"\
408                 "punpckhwd %%mm0, %%mm0         \n\t"\
409                 "punpckhwd %%mm6, %%mm6         \n\t"\
410                 "punpckhwd %%mm3, %%mm3         \n\t"\
411                 "paddw %%mm7, %%mm0             \n\t"\
412                 "paddw %%mm7, %%mm6             \n\t"\
413                 "paddw %%mm7, %%mm3             \n\t"\
414                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415                 "packuswb %%mm0, %%mm2          \n\t"\
416                 "packuswb %%mm6, %%mm5          \n\t"\
417                 "packuswb %%mm3, %%mm4          \n\t"\
418                 "pxor %%mm7, %%mm7              \n\t"
419 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
420
421 #define REAL_YSCALEYUV2PACKED1b(index, c) \
422                 "xor "#index", "#index"         \n\t"\
423                 ".balign 16                     \n\t"\
424                 "1:                             \n\t"\
425                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
426                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
427                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431                 "psrlw $8, %%mm3                \n\t" \
432                 "psrlw $8, %%mm4                \n\t" \
433                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
434                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
435                 "psraw $7, %%mm1                \n\t" \
436                 "psraw $7, %%mm7                \n\t" 
437 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
438                 
439 // do vertical chrominance interpolation
440 #define REAL_YSCALEYUV2RGB1b(index, c) \
441                 "xor "#index", "#index"         \n\t"\
442                 ".balign 16                     \n\t"\
443                 "1:                             \n\t"\
444                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
445                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
446                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
448                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
450                 "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
451                 "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
452                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
453                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
454                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
455                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
456                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
458         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
459                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
460                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
461                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
463                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
466                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
467                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
468                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
469         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470                 "paddw %%mm3, %%mm4             \n\t"\
471                 "movq %%mm2, %%mm0              \n\t"\
472                 "movq %%mm5, %%mm6              \n\t"\
473                 "movq %%mm4, %%mm3              \n\t"\
474                 "punpcklwd %%mm2, %%mm2         \n\t"\
475                 "punpcklwd %%mm5, %%mm5         \n\t"\
476                 "punpcklwd %%mm4, %%mm4         \n\t"\
477                 "paddw %%mm1, %%mm2             \n\t"\
478                 "paddw %%mm1, %%mm5             \n\t"\
479                 "paddw %%mm1, %%mm4             \n\t"\
480                 "punpckhwd %%mm0, %%mm0         \n\t"\
481                 "punpckhwd %%mm6, %%mm6         \n\t"\
482                 "punpckhwd %%mm3, %%mm3         \n\t"\
483                 "paddw %%mm7, %%mm0             \n\t"\
484                 "paddw %%mm7, %%mm6             \n\t"\
485                 "paddw %%mm7, %%mm3             \n\t"\
486                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487                 "packuswb %%mm0, %%mm2          \n\t"\
488                 "packuswb %%mm6, %%mm5          \n\t"\
489                 "packuswb %%mm3, %%mm4          \n\t"\
490                 "pxor %%mm7, %%mm7              \n\t"
491 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
492
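/* The *1 and *1b variants above skip the vertical blend: *1 reads a single
   chroma line, *1b just averages the two neighbouring chroma lines.  Sketch of
   the *1b chroma step (the psrlw $8 / psrlw $5 in the asm also removes the
   fractional bits the 16 bit samples still carry): */
#if 0
static inline int chroma_avg_ref(int16_t uv0, int16_t uv1)
{
        return (uv0 + uv1) >> 1;
}
#endif
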
493 #define REAL_WRITEBGR32(dst, dstw, index) \
494                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495                         "movq %%mm2, %%mm1              \n\t" /* B */\
496                         "movq %%mm5, %%mm6              \n\t" /* R */\
497                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
498                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
499                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
500                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
501                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
502                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
503                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
504                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
505                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
506                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
507 \
508                         MOVNTQ(%%mm0, (dst, index, 4))\
509                         MOVNTQ(%%mm2, 8(dst, index, 4))\
510                         MOVNTQ(%%mm1, 16(dst, index, 4))\
511                         MOVNTQ(%%mm3, 24(dst, index, 4))\
512 \
513                         "add $8, "#index"               \n\t"\
514                         "cmp "#dstw", "#index"          \n\t"\
515                         " jb 1b                         \n\t"
516 #define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
517
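/* Byte order produced by WRITEBGR32 (little endian BGRA with the fourth byte
   left at 0); scalar sketch, r/g/b being the saturated 8 bit values held in
   mm5/mm4/mm2: */
#if 0
static inline void writebgr32_ref(uint8_t *dst, int i, int r, int g, int b)
{
        dst[4*i+0]= b;
        dst[4*i+1]= g;
        dst[4*i+2]= r;
        dst[4*i+3]= 0;
}
#endif
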
518 #define REAL_WRITEBGR16(dst, dstw, index) \
519                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
520                         "pand "MANGLE(bFC)", %%mm4      \n\t" /* G */\
521                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
522                         "psrlq $3, %%mm2                \n\t"\
523 \
524                         "movq %%mm2, %%mm1              \n\t"\
525                         "movq %%mm4, %%mm3              \n\t"\
526 \
527                         "punpcklbw %%mm7, %%mm3         \n\t"\
528                         "punpcklbw %%mm5, %%mm2         \n\t"\
529                         "punpckhbw %%mm7, %%mm4         \n\t"\
530                         "punpckhbw %%mm5, %%mm1         \n\t"\
531 \
532                         "psllq $3, %%mm3                \n\t"\
533                         "psllq $3, %%mm4                \n\t"\
534 \
535                         "por %%mm3, %%mm2               \n\t"\
536                         "por %%mm4, %%mm1               \n\t"\
537 \
538                         MOVNTQ(%%mm2, (dst, index, 2))\
539                         MOVNTQ(%%mm1, 8(dst, index, 2))\
540 \
541                         "add $8, "#index"               \n\t"\
542                         "cmp "#dstw", "#index"          \n\t"\
543                         " jb 1b                         \n\t"
544 #define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
545
546 #define REAL_WRITEBGR15(dst, dstw, index) \
547                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
548                         "pand "MANGLE(bF8)", %%mm4      \n\t" /* G */\
549                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
550                         "psrlq $3, %%mm2                \n\t"\
551                         "psrlq $1, %%mm5                \n\t"\
552 \
553                         "movq %%mm2, %%mm1              \n\t"\
554                         "movq %%mm4, %%mm3              \n\t"\
555 \
556                         "punpcklbw %%mm7, %%mm3         \n\t"\
557                         "punpcklbw %%mm5, %%mm2         \n\t"\
558                         "punpckhbw %%mm7, %%mm4         \n\t"\
559                         "punpckhbw %%mm5, %%mm1         \n\t"\
560 \
561                         "psllq $2, %%mm3                \n\t"\
562                         "psllq $2, %%mm4                \n\t"\
563 \
564                         "por %%mm3, %%mm2               \n\t"\
565                         "por %%mm4, %%mm1               \n\t"\
566 \
567                         MOVNTQ(%%mm2, (dst, index, 2))\
568                         MOVNTQ(%%mm1, 8(dst, index, 2))\
569 \
570                         "add $8, "#index"               \n\t"\
571                         "cmp "#dstw", "#index"          \n\t"\
572                         " jb 1b                         \n\t"
573 #define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
574
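/* WRITEBGR16 and WRITEBGR15 pack the same saturated r/g/b bytes into 5-6-5 and
   5-5-5 little endian words; scalar sketch of the layout the masks and shifts
   above build: */
#if 0
static inline uint16_t rgb565_ref(int r, int g, int b)
{
        return ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
}

static inline uint16_t rgb555_ref(int r, int g, int b)
{
        return ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | (b >> 3);
}
#endif
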
575 #define WRITEBGR24OLD(dst, dstw, index) \
576                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577                         "movq %%mm2, %%mm1              \n\t" /* B */\
578                         "movq %%mm5, %%mm6              \n\t" /* R */\
579                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
580                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
581                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
582                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
583                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
584                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
585                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
586                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
587                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
588                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
589 \
590                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
591                         "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
592                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593                         "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
594                         "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
595                         "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
596                         "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
597                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
598 \
599                         "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
600                         "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
601                         "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
602                         "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
603                         "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
604                         "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
605                         "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
606                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607                         "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
608                         "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
609                         "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
610                         "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
611                         "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
612 \
613                         "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
614                         "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
615                         "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
616                         "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617                         "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
618                         "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
619                         "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
620                         "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
621 \
622                         MOVNTQ(%%mm0, (dst))\
623                         MOVNTQ(%%mm2, 8(dst))\
624                         MOVNTQ(%%mm3, 16(dst))\
625                         "add $24, "#dst"                \n\t"\
626 \
627                         "add $8, "#index"               \n\t"\
628                         "cmp "#dstw", "#index"          \n\t"\
629                         " jb 1b                         \n\t"
630
631 #define WRITEBGR24MMX(dst, dstw, index) \
632                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633                         "movq %%mm2, %%mm1              \n\t" /* B */\
634                         "movq %%mm5, %%mm6              \n\t" /* R */\
635                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
636                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
637                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
638                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
639                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
640                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
641                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
642                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
643                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
644                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
645 \
646                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
647                         "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
648                         "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
649                         "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
650 \
651                         "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
652                         "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
653                         "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
654                         "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
655 \
656                         "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
657                         "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
658                         "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
659                         "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
660 \
661                         "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
662                         "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
663                         "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
664                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
665                         MOVNTQ(%%mm0, (dst))\
666 \
667                         "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
668                         "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
669                         "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
670                         "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
671                         MOVNTQ(%%mm6, 8(dst))\
672 \
673                         "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
674                         "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
675                         "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
676                         MOVNTQ(%%mm5, 16(dst))\
677 \
678                         "add $24, "#dst"                \n\t"\
679 \
680                         "add $8, "#index"                       \n\t"\
681                         "cmp "#dstw", "#index"                  \n\t"\
682                         " jb 1b                         \n\t"
683
684 #define WRITEBGR24MMX2(dst, dstw, index) \
685                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686                         "movq "MANGLE(M24A)", %%mm0     \n\t"\
687                         "movq "MANGLE(M24C)", %%mm7     \n\t"\
688                         "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
689                         "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
690                         "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
691 \
692                         "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
693                         "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
694                         "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
695 \
696                         "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
697                         "por %%mm1, %%mm6               \n\t"\
698                         "por %%mm3, %%mm6               \n\t"\
699                         MOVNTQ(%%mm6, (dst))\
700 \
701                         "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
702                         "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
703                         "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
704                         "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
705 \
706                         "pand "MANGLE(M24B)", %%mm1     \n\t" /* B5       B4        B3    */\
707                         "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
708                         "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
709 \
710                         "por %%mm1, %%mm3               \n\t" /* B5    G4 B4     G3 B3    */\
711                         "por %%mm3, %%mm6               \n\t"\
712                         MOVNTQ(%%mm6, 8(dst))\
713 \
714                         "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
715                         "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
716                         "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
717 \
718                         "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
719                         "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
720                         "pand "MANGLE(M24B)", %%mm6     \n\t" /* R7       R6        R5    */\
721 \
722                         "por %%mm1, %%mm3               \n\t"\
723                         "por %%mm3, %%mm6               \n\t"\
724                         MOVNTQ(%%mm6, 16(dst))\
725 \
726                         "add $24, "#dst"                \n\t"\
727 \
728                         "add $8, "#index"               \n\t"\
729                         "cmp "#dstw", "#index"          \n\t"\
730                         " jb 1b                         \n\t"
731
732 #ifdef HAVE_MMX2
733 #undef WRITEBGR24
734 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
735 #else
736 #undef WRITEBGR24
737 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
738 #endif
739
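/* All three 24 bit writers above emit the same byte stream, three bytes per
   pixel with no padding; scalar sketch: */
#if 0
static inline void writebgr24_ref(uint8_t *dst, int i, int r, int g, int b)
{
        dst[3*i+0]= b;
        dst[3*i+1]= g;
        dst[3*i+2]= r;
}
#endif
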
740 #define REAL_WRITEYUY2(dst, dstw, index) \
741                         "packuswb %%mm3, %%mm3          \n\t"\
742                         "packuswb %%mm4, %%mm4          \n\t"\
743                         "packuswb %%mm7, %%mm1          \n\t"\
744                         "punpcklbw %%mm4, %%mm3         \n\t"\
745                         "movq %%mm1, %%mm7              \n\t"\
746                         "punpcklbw %%mm3, %%mm1         \n\t"\
747                         "punpckhbw %%mm3, %%mm7         \n\t"\
748 \
749                         MOVNTQ(%%mm1, (dst, index, 2))\
750                         MOVNTQ(%%mm7, 8(dst, index, 2))\
751 \
752                         "add $8, "#index"               \n\t"\
753                         "cmp "#dstw", "#index"          \n\t"\
754                         " jb 1b                         \n\t"
755 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
756
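/* WRITEYUY2 interleaves the packed luma and chroma into YUYV byte order;
   scalar sketch for one pair of horizontally adjacent pixels (y0/y1 come from
   the packed luma in mm1, u/v from the chroma in mm3/mm4): */
#if 0
static inline void writeyuy2_ref(uint8_t *dst, int i, int y0, int u, int y1, int v)
{
        dst[4*i+0]= y0;
        dst[4*i+1]= u;
        dst[4*i+2]= y1;
        dst[4*i+3]= v;
}
#endif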
757
758 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
759                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
760                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
761 {
762 #ifdef HAVE_MMX
763         if(uDest != NULL)
764         {
765                 asm volatile(
766                                 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
767                                 :: "r" (&c->redDither),
768                                 "r" (uDest), "m" ((long)chrDstW)
769                                 : "%"REG_a, "%"REG_d, "%"REG_S
770                         );
771
772                 asm volatile(
773                                 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
774                                 :: "r" (&c->redDither),
775                                 "r" (vDest), "m" ((long)chrDstW)
776                                 : "%"REG_a, "%"REG_d, "%"REG_S
777                         );
778         }
779
780         asm volatile(
781                         YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
782                         :: "r" (&c->redDither),
783                            "r" (dest), "m" ((long)dstW)
784                         : "%"REG_a, "%"REG_d, "%"REG_S
785                 );
786 #else
787 #ifdef HAVE_ALTIVEC
788 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
789                       chrFilter, chrSrc, chrFilterSize,
790                       dest, uDest, vDest, dstW, chrDstW);
791 #else //HAVE_ALTIVEC
792 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
793             chrFilter, chrSrc, chrFilterSize,
794             dest, uDest, vDest, dstW, chrDstW);
795 #endif //!HAVE_ALTIVEC
796 #endif
797 }
798
799 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
800                                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
801                                      uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
802 {
803 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
804              chrFilter, chrSrc, chrFilterSize,
805              dest, uDest, dstW, chrDstW, dstFormat);
806 }
807
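/* The yuv2nv12XinC helper wrapped above fills the luma plane like the planar
   case and interleaves U and V into a single half-resolution chroma plane:
   U,V byte pairs for NV12, V,U for NV21.  Rough sketch of the interleave step
   only (the real routine also applies the vertical filter); names are
   illustrative: */
#if 0
static inline void nv12_chroma_interleave_ref(uint8_t *dst, const uint8_t *u,
                                              const uint8_t *v, int chrDstW,
                                              int isNV12)
{
        int i;
        for(i=0; i<chrDstW; i++)
        {
                dst[2*i+0]= isNV12 ? u[i] : v[i];
                dst[2*i+1]= isNV12 ? v[i] : u[i];
        }
}
#endif
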
808 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
809                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
810 {
811 #ifdef HAVE_MMX
812         if(uDest != NULL)
813         {
814                 asm volatile(
815                                 YSCALEYUV2YV121
816                                 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
817                                 "g" ((long)-chrDstW)
818                                 : "%"REG_a
819                         );
820
821                 asm volatile(
822                                 YSCALEYUV2YV121
823                                 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
824                                 "g" ((long)-chrDstW)
825                                 : "%"REG_a
826                         );
827         }
828
829         asm volatile(
830                 YSCALEYUV2YV121
831                 :: "r" (lumSrc + dstW), "r" (dest + dstW),
832                 "g" ((long)-dstW)
833                 : "%"REG_a
834         );
835 #else
836         int i;
837         for(i=0; i<dstW; i++)
838         {
839                 int val= lumSrc[i]>>7;
840                 
841                 if(val&256){
842                         if(val<0) val=0;
843                         else      val=255;
844                 }
845
846                 dest[i]= val;
847         }
848
849         if(uDest != NULL)
850                 for(i=0; i<chrDstW; i++)
851                 {
852                         int u=chrSrc[i]>>7;
853                         int v=chrSrc[i + 2048]>>7;
854
855                         if((u|v)&256){
856                                 if(u<0)         u=0;
857                                 else if (u>255) u=255;
858                                 if(v<0)         v=0;
859                                 else if (v>255) v=255;
860                         }
861
862                         uDest[i]= u;
863                         vDest[i]= v;
864                 }
865 #endif
866 }
867
868
869 /**
870  * vertical scale YV12 to RGB
871  */
872 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
873                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
874                             uint8_t *dest, int dstW, int dstY)
875 {
876         int dummy=0;
877         switch(c->dstFormat)
878         {
879 #ifdef HAVE_MMX
880         case IMGFMT_BGR32:
881                 {
882                         asm volatile(
883                                 YSCALEYUV2RGBX
884                                 WRITEBGR32(%4, %5, %%REGa)
885
886                         :: "r" (&c->redDither), 
887                            "m" (dummy), "m" (dummy), "m" (dummy),
888                            "r" (dest), "m" (dstW)
889                         : "%"REG_a, "%"REG_d, "%"REG_S
890                         );
891                 }
892                 break;
893         case IMGFMT_BGR24:
894                 {
895                         asm volatile(
896                                 YSCALEYUV2RGBX
897                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
898                                 "add %4, %%"REG_b"                      \n\t"
899                                 WRITEBGR24(%%REGb, %5, %%REGa)
900
901                         :: "r" (&c->redDither), 
902                            "m" (dummy), "m" (dummy), "m" (dummy),
903                            "r" (dest), "m" (dstW)
904                         : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
905                         );
906                 }
907                 break;
908         case IMGFMT_BGR15:
909                 {
910                         asm volatile(
911                                 YSCALEYUV2RGBX
912                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
913 #ifdef DITHER1XBPP
914                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
915                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
916                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
917 #endif
918
919                                 WRITEBGR15(%4, %5, %%REGa)
920
921                         :: "r" (&c->redDither), 
922                            "m" (dummy), "m" (dummy), "m" (dummy),
923                            "r" (dest), "m" (dstW)
924                         : "%"REG_a, "%"REG_d, "%"REG_S
925                         );
926                 }
927                 break;
928         case IMGFMT_BGR16:
929                 {
930                         asm volatile(
931                                 YSCALEYUV2RGBX
932                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
933 #ifdef DITHER1XBPP
934                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
935                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
936                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
937 #endif
938
939                                 WRITEBGR16(%4, %5, %%REGa)
940
941                         :: "r" (&c->redDither), 
942                            "m" (dummy), "m" (dummy), "m" (dummy),
943                            "r" (dest), "m" (dstW)
944                         : "%"REG_a, "%"REG_d, "%"REG_S
945                         );
946                 }
947                 break;
948         case IMGFMT_YUY2:
949                 {
950                         asm volatile(
951                                 YSCALEYUV2PACKEDX
952                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
953
954                                 "psraw $3, %%mm3                \n\t"
955                                 "psraw $3, %%mm4                \n\t"
956                                 "psraw $3, %%mm1                \n\t"
957                                 "psraw $3, %%mm7                \n\t"
958                                 WRITEYUY2(%4, %5, %%REGa)
959
960                         :: "r" (&c->redDither), 
961                            "m" (dummy), "m" (dummy), "m" (dummy),
962                            "r" (dest), "m" (dstW)
963                         : "%"REG_a, "%"REG_d, "%"REG_S
964                         );
965                 }
966                 break;
967 #endif
968         default:
969 #ifdef HAVE_ALTIVEC
970                 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
971                             chrFilter, chrSrc, chrFilterSize,
972                             dest, dstW, dstY);
973 #else
974                 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
975                             chrFilter, chrSrc, chrFilterSize,
976                             dest, dstW, dstY);
977 #endif
978                 break;
979         }
980 }
981
982 /**
983  * vertical bilinear scale YV12 to RGB
984  */
985 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
986                             uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
987 {
988         int yalpha1=yalpha^4095;
989         int uvalpha1=uvalpha^4095;
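        /* XOR with 4095 acts as a cheap 4095-alpha, i.e. the complementary 12 bit blend weight */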
990         int i;
991
992 #if 0 //isn't used
993         if(flags&SWS_FULL_CHR_H_INT)
994         {
995                 switch(dstFormat)
996                 {
997 #ifdef HAVE_MMX
998                 case IMGFMT_BGR32:
999                         asm volatile(
1000
1001
1002 FULL_YSCALEYUV2RGB
1003                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
1004                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
1005
1006                         "movq %%mm3, %%mm1              \n\t"
1007                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
1008                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1009
1010                         MOVNTQ(%%mm3, (%4, %%REGa, 4))
1011                         MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1012
1013                         "add $4, %%"REG_a"              \n\t"
1014                         "cmp %5, %%"REG_a"              \n\t"
1015                         " jb 1b                         \n\t"
1016
1017
1018                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1019                         "m" (yalpha1), "m" (uvalpha1)
1020                         : "%"REG_a
1021                         );
1022                         break;
1023                 case IMGFMT_BGR24:
1024                         asm volatile(
1025
1026 FULL_YSCALEYUV2RGB
1027
1028                                                                 // lsb ... msb
1029                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
1030                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
1031
1032                         "movq %%mm3, %%mm1              \n\t"
1033                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
1034                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1035
1036                         "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
1037                         "psrlq $8, %%mm3                \n\t" // GR0BGR00
1038                         "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1039                         "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1040                         "por %%mm2, %%mm3               \n\t" // BGRBGR00
1041                         "movq %%mm1, %%mm2              \n\t"
1042                         "psllq $48, %%mm1               \n\t" // 000000BG
1043                         "por %%mm1, %%mm3               \n\t" // BGRBGRBG
1044
1045                         "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
1046                         "psrld $16, %%mm2               \n\t" // R000R000
1047                         "psrlq $24, %%mm1               \n\t" // 0BGR0000
1048                         "por %%mm2, %%mm1               \n\t" // RBGRR000
1049
1050                         "mov %4, %%"REG_b"              \n\t"
1051                         "add %%"REG_a", %%"REG_b"       \n\t"
1052
1053 #ifdef HAVE_MMX2
1054                         //FIXME Alignment
1055                         "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1056                         "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1057 #else
1058                         "movd %%mm3, (%%"REG_b", %%"REG_a", 2)  \n\t"
1059                         "psrlq $32, %%mm3               \n\t"
1060                         "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1061                         "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1062 #endif
1063                         "add $4, %%"REG_a"              \n\t"
1064                         "cmp %5, %%"REG_a"              \n\t"
1065                         " jb 1b                         \n\t"
1066
1067                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1068                         "m" (yalpha1), "m" (uvalpha1)
1069                         : "%"REG_a, "%"REG_b
1070                         );
1071                         break;
1072                 case IMGFMT_BGR15:
1073                         asm volatile(
1074
1075 FULL_YSCALEYUV2RGB
1076 #ifdef DITHER1XBPP
1077                         "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1078                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1079                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1080 #endif
1081                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1082                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1083                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1084
1085                         "psrlw $3, %%mm3                \n\t"
1086                         "psllw $2, %%mm1                \n\t"
1087                         "psllw $7, %%mm0                \n\t"
1088                         "pand "MANGLE(g15Mask)", %%mm1  \n\t"
1089                         "pand "MANGLE(r15Mask)", %%mm0  \n\t"
1090
1091                         "por %%mm3, %%mm1               \n\t"
1092                         "por %%mm1, %%mm0               \n\t"
1093
1094                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1095
1096                         "add $4, %%"REG_a"              \n\t"
1097                         "cmp %5, %%"REG_a"              \n\t"
1098                         " jb 1b                         \n\t"
1099
1100                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1101                         "m" (yalpha1), "m" (uvalpha1)
1102                         : "%"REG_a
1103                         );
1104                         break;
1105                 case IMGFMT_BGR16:
1106                         asm volatile(
1107
1108 FULL_YSCALEYUV2RGB
1109 #ifdef DITHER1XBPP
1110                         "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1111                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1112                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1113 #endif
1114                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1115                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1116                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1117
1118                         "psrlw $3, %%mm3                \n\t"
1119                         "psllw $3, %%mm1                \n\t"
1120                         "psllw $8, %%mm0                \n\t"
1121                         "pand "MANGLE(g16Mask)", %%mm1  \n\t"
1122                         "pand "MANGLE(r16Mask)", %%mm0  \n\t"
1123
1124                         "por %%mm3, %%mm1               \n\t"
1125                         "por %%mm1, %%mm0               \n\t"
1126
1127                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1128
1129                         "add $4, %%"REG_a"              \n\t"
1130                         "cmp %5, %%"REG_a"              \n\t"
1131                         " jb 1b                         \n\t"
1132
1133                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1134                         "m" (yalpha1), "m" (uvalpha1)
1135                         : "%"REG_a
1136                         );
1137                 break;
1138 #endif
1139                 case IMGFMT_RGB32:
1140 #ifndef HAVE_MMX
1141                 case IMGFMT_BGR32:
1142 #endif
1143                 if(dstFormat==IMGFMT_BGR32)
1144                 {
1145                         int i;
1146 #ifdef WORDS_BIGENDIAN
1147                         dest++;
1148 #endif
1149                         for(i=0;i<dstW;i++){
1150                                 // vertical linear interpolation && yuv2rgb in a single step:
1151                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1152                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1153                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1154                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1155                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1156                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1157                                 dest+= 4;
1158                         }
1159                 }
1160                 else if(dstFormat==IMGFMT_BGR24)
1161                 {
1162                         int i;
1163                         for(i=0;i<dstW;i++){
1164                                 // vertical linear interpolation && yuv2rgb in a single step:
1165                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1166                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1167                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1168                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1169                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1170                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1171                                 dest+= 3;
1172                         }
1173                 }
1174                 else if(dstFormat==IMGFMT_BGR16)
1175                 {
1176                         int i;
1177                         for(i=0;i<dstW;i++){
1178                                 // vertical linear interpolation && yuv2rgb in a single step:
1179                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1180                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1181                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1182
1183                                 ((uint16_t*)dest)[i] =
1184                                         clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1185                                         clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1186                                         clip_table16r[(Y + yuvtab_3343[V]) >>13];
1187                         }
1188                 }
1189                 else if(dstFormat==IMGFMT_BGR15)
1190                 {
1191                         int i;
1192                         for(i=0;i<dstW;i++){
1193                                 // vertical linear interpolation && yuv2rgb in a single step:
1194                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1195                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1196                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1197
1198                                 ((uint16_t*)dest)[i] =
1199                                         clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1200                                         clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1201                                         clip_table15r[(Y + yuvtab_3343[V]) >>13];
1202                         }
1203                 }
1204         }//FULL_UV_IPOL
1205         else
1206         {
1207 #endif // if 0
1208 #ifdef HAVE_MMX
1209         switch(c->dstFormat)
1210         {
1211 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
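//The asm blocks below temporarily repurpose the stack pointer as an extra data
//register: the real stack pointer is stashed at ESP_OFFSET inside the context
//(operand %5) and REG_SP then holds dest, which is presumably why dstW has to be
//fetched from the context via the hardcoded DSTW_OFFSET (8280) mentioned above.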
1212         case IMGFMT_BGR32:
1213                         asm volatile(
1214                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1215                                 "mov %4, %%"REG_SP"                     \n\t"
1216                                 YSCALEYUV2RGB(%%REGa, %5)
1217                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1218                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1219
1220                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1221                         "r" (&c->redDither)
1222                         : "%"REG_a
1223                         );
1224                         return;
1225         case IMGFMT_BGR24:
1226                         asm volatile(
1227                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1228                                 "mov %4, %%"REG_SP"                     \n\t"
1229                                 YSCALEYUV2RGB(%%REGa, %5)
1230                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1231                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1232                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1233                         "r" (&c->redDither)
1234                         : "%"REG_a
1235                         );
1236                         return;
1237         case IMGFMT_BGR15:
1238                         asm volatile(
1239                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1240                                 "mov %4, %%"REG_SP"                     \n\t"
1241                                 YSCALEYUV2RGB(%%REGa, %5)
1242                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1243 #ifdef DITHER1XBPP
1244                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1245                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1246                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1247 #endif
1248
1249                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1250                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1251
1252                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1253                         "r" (&c->redDither)
1254                         : "%"REG_a
1255                         );
1256                         return;
1257         case IMGFMT_BGR16:
1258                         asm volatile(
1259                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1260                                 "mov %4, %%"REG_SP"                     \n\t"
1261                                 YSCALEYUV2RGB(%%REGa, %5)
1262                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1263 #ifdef DITHER1XBPP
1264                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1265                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1266                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1267 #endif
1268
1269                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1270                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1271                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1272                         "r" (&c->redDither)
1273                         : "%"REG_a
1274                         );
1275                         return;
1276         case IMGFMT_YUY2:
1277                         asm volatile(
1278                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1279                                 "mov %4, %%"REG_SP"                     \n\t"
1280                                 YSCALEYUV2PACKED(%%REGa, %5)
1281                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1282                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1283                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1284                         "r" (&c->redDither)
1285                         : "%"REG_a
1286                         );
1287                         return;
1288         default: break;
1289         }
1290 #endif //HAVE_MMX
1291 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1292 }
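
/* Rough scalar sketch of the per-pixel blend the "2" variants perform
   (hypothetical helper, not compiled in): the two luma lines are mixed with
   weights yalpha and yalpha1 = yalpha^4095, the two chroma lines likewise with
   uvalpha; the >>19 matches the (disabled) C reference further up in this file. */
#if 0
static inline int blend2(uint16_t *line0, uint16_t *line1, int i, int alpha)
{
        int alpha1= alpha^4095; /* alpha1 + alpha == 4095 */
        return (line0[i]*alpha1 + line1[i]*alpha)>>19;
}
#endif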
1293
1294 /**
1295  * YV12 to RGB without scaling or interpolating
1296  */
1297 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1298                             uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1299 {
1300         const int yalpha1=0;
1301         int i;
1302         
1303         uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1304         const int yalpha= 4096; //FIXME ...
1305
1306         if(flags&SWS_FULL_CHR_H_INT)
1307         {
1308                 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1309                 return;
1310         }
1311
1312 #ifdef HAVE_MMX
1313         if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
1314         {
1315                 switch(dstFormat)
1316                 {
1317                 case IMGFMT_BGR32:
1318                         asm volatile(
1319                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1320                                 "mov %4, %%"REG_SP"                     \n\t"
1321                                 YSCALEYUV2RGB1(%%REGa, %5)
1322                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1323                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1324
1325                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1326                         "r" (&c->redDither)
1327                         : "%"REG_a
1328                         );
1329                         return;
1330                 case IMGFMT_BGR24:
1331                         asm volatile(
1332                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1333                                 "mov %4, %%"REG_SP"                     \n\t"
1334                                 YSCALEYUV2RGB1(%%REGa, %5)
1335                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1336                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1337
1338                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1339                         "r" (&c->redDither)
1340                         : "%"REG_a
1341                         );
1342                         return;
1343                 case IMGFMT_BGR15:
1344                         asm volatile(
1345                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1346                                 "mov %4, %%"REG_SP"                     \n\t"
1347                                 YSCALEYUV2RGB1(%%REGa, %5)
1348                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1349 #ifdef DITHER1XBPP
1350                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1351                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1352                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1353 #endif
1354                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1355                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1356
1357                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1358                         "r" (&c->redDither)
1359                         : "%"REG_a
1360                         );
1361                         return;
1362                 case IMGFMT_BGR16:
1363                         asm volatile(
1364                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1365                                 "mov %4, %%"REG_SP"                     \n\t"
1366                                 YSCALEYUV2RGB1(%%REGa, %5)
1367                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1368 #ifdef DITHER1XBPP
1369                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1370                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1371                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1372 #endif
1373
1374                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1375                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1376
1377                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1378                         "r" (&c->redDither)
1379                         : "%"REG_a
1380                         );
1381                         return;
1382                 case IMGFMT_YUY2:
1383                         asm volatile(
1384                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1385                                 "mov %4, %%"REG_SP"                     \n\t"
1386                                 YSCALEYUV2PACKED1(%%REGa, %5)
1387                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1388                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1389
1390                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1391                         "r" (&c->redDither)
1392                         : "%"REG_a
1393                         );
1394                         return;
1395                 }
1396         }
1397         else
1398         {
1399                 switch(dstFormat)
1400                 {
1401                 case IMGFMT_BGR32:
1402                         asm volatile(
1403                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1404                                 "mov %4, %%"REG_SP"                     \n\t"
1405                                 YSCALEYUV2RGB1b(%%REGa, %5)
1406                                 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1407                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1408
1409                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1410                         "r" (&c->redDither)
1411                         : "%"REG_a
1412                         );
1413                         return;
1414                 case IMGFMT_BGR24:
1415                         asm volatile(
1416                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1417                                 "mov %4, %%"REG_SP"                     \n\t"
1418                                 YSCALEYUV2RGB1b(%%REGa, %5)
1419                                 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1420                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1421
1422                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1423                         "r" (&c->redDither)
1424                         : "%"REG_a
1425                         );
1426                         return;
1427                 case IMGFMT_BGR15:
1428                         asm volatile(
1429                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1430                                 "mov %4, %%"REG_SP"                     \n\t"
1431                                 YSCALEYUV2RGB1b(%%REGa, %5)
1432                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1433 #ifdef DITHER1XBPP
1434                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1435                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1436                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1437 #endif
1438                                 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1439                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1440
1441                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1442                         "r" (&c->redDither)
1443                         : "%"REG_a
1444                         );
1445                         return;
1446                 case IMGFMT_BGR16:
1447                         asm volatile(
1448                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1449                                 "mov %4, %%"REG_SP"                     \n\t"
1450                                 YSCALEYUV2RGB1b(%%REGa, %5)
1451                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1452 #ifdef DITHER1XBPP
1453                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1454                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1455                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1456 #endif
1457
1458                                 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1459                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1460
1461                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1462                         "r" (&c->redDither)
1463                         : "%"REG_a
1464                         );
1465                         return;
1466                 case IMGFMT_YUY2:
1467                         asm volatile(
1468                                 "mov %%"REG_SP", "ESP_OFFSET"(%5)       \n\t"
1469                                 "mov %4, %%"REG_SP"                     \n\t"
1470                                 YSCALEYUV2PACKED1b(%%REGa, %5)
1471                                 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1472                                 "mov "ESP_OFFSET"(%5), %%"REG_SP"       \n\t"
1473
1474                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1475                         "r" (&c->redDither)
1476                         : "%"REG_a
1477                         );
1478                         return;
1479                 }
1480         }
1481 #endif
1482         if( uvalpha < 2048 )
1483         {
1484                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1485         }else{
1486                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1487         }
1488 }
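
//In the unscaled case uvalpha only selects a code path: below 2048 the cheaper
//"1" variants are used, which take chroma from a single input line (hence the
//0.5 pixel shift noted above), otherwise the "1b" variants, which average the
//two chroma lines.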
1489
1490 //FIXME yuy2* can read up to 7 samples too many
1491
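//The packed 4:2:2 layouts handled below interleave luma and chroma bytes:
//YUY2 stores Y0 U0 Y1 V0 ..., UYVY stores U0 Y0 V0 Y1 ..., which is why the
//luma extractors pick every 2nd byte (even resp. odd offsets) and the UV
//extractors read every 4th byte and average the two input lines.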
1492 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1493 {
1494 #ifdef HAVE_MMX
1495         asm volatile(
1496                 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1497                 "mov %0, %%"REG_a"              \n\t"
1498                 "1:                             \n\t"
1499                 "movq (%1, %%"REG_a",2), %%mm0  \n\t"
1500                 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1501                 "pand %%mm2, %%mm0              \n\t"
1502                 "pand %%mm2, %%mm1              \n\t"
1503                 "packuswb %%mm1, %%mm0          \n\t"
1504                 "movq %%mm0, (%2, %%"REG_a")    \n\t"
1505                 "add $8, %%"REG_a"              \n\t"
1506                 " js 1b                         \n\t"
1507                 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1508                 : "%"REG_a
1509         );
1510 #else
1511         int i;
1512         for(i=0; i<width; i++)
1513                 dst[i]= src[2*i];
1514 #endif
1515 }
1516
1517 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1518 {
1519 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1520         asm volatile(
1521                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1522                 "mov %0, %%"REG_a"              \n\t"
1523                 "1:                             \n\t"
1524                 "movq (%1, %%"REG_a",4), %%mm0  \n\t"
1525                 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1526                 "movq (%2, %%"REG_a",4), %%mm2  \n\t"
1527                 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1528                 PAVGB(%%mm2, %%mm0)
1529                 PAVGB(%%mm3, %%mm1)
1530                 "psrlw $8, %%mm0                \n\t"
1531                 "psrlw $8, %%mm1                \n\t"
1532                 "packuswb %%mm1, %%mm0          \n\t"
1533                 "movq %%mm0, %%mm1              \n\t"
1534                 "psrlw $8, %%mm0                \n\t"
1535                 "pand %%mm4, %%mm1              \n\t"
1536                 "packuswb %%mm0, %%mm0          \n\t"
1537                 "packuswb %%mm1, %%mm1          \n\t"
1538                 "movd %%mm0, (%4, %%"REG_a")    \n\t"
1539                 "movd %%mm1, (%3, %%"REG_a")    \n\t"
1540                 "add $4, %%"REG_a"              \n\t"
1541                 " js 1b                         \n\t"
1542                 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1543                 : "%"REG_a
1544         );
1545 #else
1546         int i;
1547         for(i=0; i<width; i++)
1548         {
1549                 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1550                 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1551         }
1552 #endif
1553 }
1554
1555 //this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1556 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
1557 {
1558 #ifdef HAVE_MMX
1559         asm volatile(
1560                 "mov %0, %%"REG_a"              \n\t"
1561                 "1:                             \n\t"
1562                 "movq (%1, %%"REG_a",2), %%mm0  \n\t"
1563                 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1564                 "psrlw $8, %%mm0                \n\t"
1565                 "psrlw $8, %%mm1                \n\t"
1566                 "packuswb %%mm1, %%mm0          \n\t"
1567                 "movq %%mm0, (%2, %%"REG_a")    \n\t"
1568                 "add $8, %%"REG_a"              \n\t"
1569                 " js 1b                         \n\t"
1570                 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1571                 : "%"REG_a
1572         );
1573 #else
1574         int i;
1575         for(i=0; i<width; i++)
1576                 dst[i]= src[2*i+1];
1577 #endif
1578 }
1579
1580 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1581 {
1582 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1583         asm volatile(
1584                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1585                 "mov %0, %%"REG_a"              \n\t"
1586                 "1:                             \n\t"
1587                 "movq (%1, %%"REG_a",4), %%mm0  \n\t"
1588                 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1589                 "movq (%2, %%"REG_a",4), %%mm2  \n\t"
1590                 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1591                 PAVGB(%%mm2, %%mm0)
1592                 PAVGB(%%mm3, %%mm1)
1593                 "pand %%mm4, %%mm0              \n\t"
1594                 "pand %%mm4, %%mm1              \n\t"
1595                 "packuswb %%mm1, %%mm0          \n\t"
1596                 "movq %%mm0, %%mm1              \n\t"
1597                 "psrlw $8, %%mm0                \n\t"
1598                 "pand %%mm4, %%mm1              \n\t"
1599                 "packuswb %%mm0, %%mm0          \n\t"
1600                 "packuswb %%mm1, %%mm1          \n\t"
1601                 "movd %%mm0, (%4, %%"REG_a")    \n\t"
1602                 "movd %%mm1, (%3, %%"REG_a")    \n\t"
1603                 "add $4, %%"REG_a"              \n\t"
1604                 " js 1b                         \n\t"
1605                 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1606                 : "%"REG_a
1607         );
1608 #else
1609         int i;
1610         for(i=0; i<width; i++)
1611         {
1612                 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1613                 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
1614         }
1615 #endif
1616 }
1617
1618 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1619 {
1620 #ifdef HAVE_MMXFIXME
1621 #else
1622         int i;
1623         for(i=0; i<width; i++)
1624         {
1625                 int b=  ((uint32_t*)src)[i]&0xFF;
1626                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1627                 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1628
1629                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1630         }
1631 #endif
1632 }
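
//The constant 33<<(RGB2YUV_SHIFT-1) used by the *ToY conversions equals
//16.5<<RGB2YUV_SHIFT, i.e. presumably the +16 luma offset of the limited-range
//YUV representation plus 0.5 for rounding before the final shift.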
1633
1634 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1635 {
1636 #ifdef HAVE_MMXFIXME
1637 #else
1638         int i;
1639         for(i=0; i<width; i++)
1640         {
1641                 const int a= ((uint32_t*)src1)[2*i+0];
1642                 const int e= ((uint32_t*)src1)[2*i+1];
1643                 const int c= ((uint32_t*)src2)[2*i+0];
1644                 const int d= ((uint32_t*)src2)[2*i+1];
1645                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1646                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1647                 const int b=  l&0x3FF;
1648                 const int g=  h>>8;
1649                 const int r=  l>>16;
1650
1651                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1652                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1653         }
1654 #endif
1655 }
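
/* Plain-form equivalent of the packed 2x2 sum above (hypothetical helper, not
   compiled in): the 0xFF00FF / 0x00FF00 masks in the real loop let blue+red
   resp. green of all four pixels be summed inside a single int without the
   fields colliding; a four-pixel sum of 8 bit values needs at most 10 bits,
   hence the l&0x3FF extraction. */
#if 0
static inline void bgr32ToUV_plain(uint8_t *dstU, uint8_t *dstV,
                                   uint8_t *src1, uint8_t *src2, int i)
{
        const uint32_t p[4]= { ((uint32_t*)src1)[2*i+0], ((uint32_t*)src1)[2*i+1],
                               ((uint32_t*)src2)[2*i+0], ((uint32_t*)src2)[2*i+1] };
        int b=0, g=0, r=0, k;
        for(k=0; k<4; k++)
        {
                b+=  p[k]     &0xFF;
                g+= (p[k]>> 8)&0xFF;
                r+= (p[k]>>16)&0xFF;
        }
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
}
#endif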
1656
1657 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1658 {
1659 #ifdef HAVE_MMX
1660         asm volatile(
1661                 "mov %2, %%"REG_a"              \n\t"
1662                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1663                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1664                 "pxor %%mm7, %%mm7              \n\t"
1665                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1666                 ".balign 16                     \n\t"
1667                 "1:                             \n\t"
1668                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
1669                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
1670                 "movd 3(%0, %%"REG_b"), %%mm1   \n\t"
1671                 "punpcklbw %%mm7, %%mm0         \n\t"
1672                 "punpcklbw %%mm7, %%mm1         \n\t"
1673                 "movd 6(%0, %%"REG_b"), %%mm2   \n\t"
1674                 "movd 9(%0, %%"REG_b"), %%mm3   \n\t"
1675                 "punpcklbw %%mm7, %%mm2         \n\t"
1676                 "punpcklbw %%mm7, %%mm3         \n\t"
1677                 "pmaddwd %%mm6, %%mm0           \n\t"
1678                 "pmaddwd %%mm6, %%mm1           \n\t"
1679                 "pmaddwd %%mm6, %%mm2           \n\t"
1680                 "pmaddwd %%mm6, %%mm3           \n\t"
1681 #ifndef FAST_BGR2YV12
1682                 "psrad $8, %%mm0                \n\t"
1683                 "psrad $8, %%mm1                \n\t"
1684                 "psrad $8, %%mm2                \n\t"
1685                 "psrad $8, %%mm3                \n\t"
1686 #endif
1687                 "packssdw %%mm1, %%mm0          \n\t"
1688                 "packssdw %%mm3, %%mm2          \n\t"
1689                 "pmaddwd %%mm5, %%mm0           \n\t"
1690                 "pmaddwd %%mm5, %%mm2           \n\t"
1691                 "packssdw %%mm2, %%mm0          \n\t"
1692                 "psraw $7, %%mm0                \n\t"
1693
1694                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
1695                 "movd 15(%0, %%"REG_b"), %%mm1  \n\t"
1696                 "punpcklbw %%mm7, %%mm4         \n\t"
1697                 "punpcklbw %%mm7, %%mm1         \n\t"
1698                 "movd 18(%0, %%"REG_b"), %%mm2  \n\t"
1699                 "movd 21(%0, %%"REG_b"), %%mm3  \n\t"
1700                 "punpcklbw %%mm7, %%mm2         \n\t"
1701                 "punpcklbw %%mm7, %%mm3         \n\t"
1702                 "pmaddwd %%mm6, %%mm4           \n\t"
1703                 "pmaddwd %%mm6, %%mm1           \n\t"
1704                 "pmaddwd %%mm6, %%mm2           \n\t"
1705                 "pmaddwd %%mm6, %%mm3           \n\t"
1706 #ifndef FAST_BGR2YV12
1707                 "psrad $8, %%mm4                \n\t"
1708                 "psrad $8, %%mm1                \n\t"
1709                 "psrad $8, %%mm2                \n\t"
1710                 "psrad $8, %%mm3                \n\t"
1711 #endif
1712                 "packssdw %%mm1, %%mm4          \n\t"
1713                 "packssdw %%mm3, %%mm2          \n\t"
1714                 "pmaddwd %%mm5, %%mm4           \n\t"
1715                 "pmaddwd %%mm5, %%mm2           \n\t"
1716                 "add $24, %%"REG_b"             \n\t"
1717                 "packssdw %%mm2, %%mm4          \n\t"
1718                 "psraw $7, %%mm4                \n\t"
1719
1720                 "packuswb %%mm4, %%mm0          \n\t"
1721                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1722
1723                 "movq %%mm0, (%1, %%"REG_a")    \n\t"
1724                 "add $8, %%"REG_a"              \n\t"
1725                 " js 1b                         \n\t"
1726                 : : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
1727                 : "%"REG_a, "%"REG_b
1728         );
1729 #else
1730         int i;
1731         for(i=0; i<width; i++)
1732         {
1733                 int b= src[i*3+0];
1734                 int g= src[i*3+1];
1735                 int r= src[i*3+2];
1736
1737                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1738         }
1739 #endif
1740 }
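
//The MMX loop in bgr24ToY above converts 8 pixels per iteration: REG_b walks the
//source in byte steps of 24 (3 bytes per BGR pixel) while REG_a walks the Y
//output in steps of 8; both start at -width so the loop can terminate on "js".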
1741
1742 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1743 {
1744 #ifdef HAVE_MMX
1745         asm volatile(
1746                 "mov %4, %%"REG_a"              \n\t"
1747                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1748                 "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1749                 "pxor %%mm7, %%mm7              \n\t"
1750                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"       \n\t"
1751                 "add %%"REG_b", %%"REG_b"       \n\t"
1752                 ".balign 16                     \n\t"
1753                 "1:                             \n\t"
1754                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
1755                 PREFETCH" 64(%1, %%"REG_b")     \n\t"
1756 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1757                 "movq (%0, %%"REG_b"), %%mm0    \n\t"
1758                 "movq (%1, %%"REG_b"), %%mm1    \n\t"
1759                 "movq 6(%0, %%"REG_b"), %%mm2   \n\t"
1760                 "movq 6(%1, %%"REG_b"), %%mm3   \n\t"
1761                 PAVGB(%%mm1, %%mm0)
1762                 PAVGB(%%mm3, %%mm2)
1763                 "movq %%mm0, %%mm1              \n\t"
1764                 "movq %%mm2, %%mm3              \n\t"
1765                 "psrlq $24, %%mm0               \n\t"
1766                 "psrlq $24, %%mm2               \n\t"
1767                 PAVGB(%%mm1, %%mm0)
1768                 PAVGB(%%mm3, %%mm2)
1769                 "punpcklbw %%mm7, %%mm0         \n\t"
1770                 "punpcklbw %%mm7, %%mm2         \n\t"
1771 #else
1772                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
1773                 "movd (%1, %%"REG_b"), %%mm1    \n\t"
1774                 "movd 3(%0, %%"REG_b"), %%mm2   \n\t"
1775                 "movd 3(%1, %%"REG_b"), %%mm3   \n\t"
1776                 "punpcklbw %%mm7, %%mm0         \n\t"
1777                 "punpcklbw %%mm7, %%mm1         \n\t"
1778                 "punpcklbw %%mm7, %%mm2         \n\t"
1779                 "punpcklbw %%mm7, %%mm3         \n\t"
1780                 "paddw %%mm1, %%mm0             \n\t"
1781                 "paddw %%mm3, %%mm2             \n\t"
1782                 "paddw %%mm2, %%mm0             \n\t"
1783                 "movd 6(%0, %%"REG_b"), %%mm4   \n\t"
1784                 "movd 6(%1, %%"REG_b"), %%mm1   \n\t"
1785                 "movd 9(%0, %%"REG_b"), %%mm2   \n\t"
1786                 "movd 9(%1, %%"REG_b"), %%mm3   \n\t"
1787                 "punpcklbw %%mm7, %%mm4         \n\t"
1788                 "punpcklbw %%mm7, %%mm1         \n\t"
1789                 "punpcklbw %%mm7, %%mm2         \n\t"
1790                 "punpcklbw %%mm7, %%mm3         \n\t"
1791                 "paddw %%mm1, %%mm4             \n\t"
1792                 "paddw %%mm3, %%mm2             \n\t"
1793                 "paddw %%mm4, %%mm2             \n\t"
1794                 "psrlw $2, %%mm0                \n\t"
1795                 "psrlw $2, %%mm2                \n\t"
1796 #endif
1797                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1798                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1799                 
1800                 "pmaddwd %%mm0, %%mm1           \n\t"
1801                 "pmaddwd %%mm2, %%mm3           \n\t"
1802                 "pmaddwd %%mm6, %%mm0           \n\t"
1803                 "pmaddwd %%mm6, %%mm2           \n\t"
1804 #ifndef FAST_BGR2YV12
1805                 "psrad $8, %%mm0                \n\t"
1806                 "psrad $8, %%mm1                \n\t"
1807                 "psrad $8, %%mm2                \n\t"
1808                 "psrad $8, %%mm3                \n\t"
1809 #endif
1810                 "packssdw %%mm2, %%mm0          \n\t"
1811                 "packssdw %%mm3, %%mm1          \n\t"
1812                 "pmaddwd %%mm5, %%mm0           \n\t"
1813                 "pmaddwd %%mm5, %%mm1           \n\t"
1814                 "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1815                 "psraw $7, %%mm0                \n\t"
1816
1817 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1818                 "movq 12(%0, %%"REG_b"), %%mm4  \n\t"
1819                 "movq 12(%1, %%"REG_b"), %%mm1  \n\t"
1820                 "movq 18(%0, %%"REG_b"), %%mm2  \n\t"
1821                 "movq 18(%1, %%"REG_b"), %%mm3  \n\t"
1822                 PAVGB(%%mm1, %%mm4)
1823                 PAVGB(%%mm3, %%mm2)
1824                 "movq %%mm4, %%mm1              \n\t"
1825                 "movq %%mm2, %%mm3              \n\t"
1826                 "psrlq $24, %%mm4               \n\t"
1827                 "psrlq $24, %%mm2               \n\t"
1828                 PAVGB(%%mm1, %%mm4)
1829                 PAVGB(%%mm3, %%mm2)
1830                 "punpcklbw %%mm7, %%mm4         \n\t"
1831                 "punpcklbw %%mm7, %%mm2         \n\t"
1832 #else
1833                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
1834                 "movd 12(%1, %%"REG_b"), %%mm1  \n\t"
1835                 "movd 15(%0, %%"REG_b"), %%mm2  \n\t"
1836                 "movd 15(%1, %%"REG_b"), %%mm3  \n\t"
1837                 "punpcklbw %%mm7, %%mm4         \n\t"
1838                 "punpcklbw %%mm7, %%mm1         \n\t"
1839                 "punpcklbw %%mm7, %%mm2         \n\t"
1840                 "punpcklbw %%mm7, %%mm3         \n\t"
1841                 "paddw %%mm1, %%mm4             \n\t"
1842                 "paddw %%mm3, %%mm2             \n\t"
1843                 "paddw %%mm2, %%mm4             \n\t"
1844                 "movd 18(%0, %%"REG_b"), %%mm5  \n\t"
1845                 "movd 18(%1, %%"REG_b"), %%mm1  \n\t"
1846                 "movd 21(%0, %%"REG_b"), %%mm2  \n\t"
1847                 "movd 21(%1, %%"REG_b"), %%mm3  \n\t"
1848                 "punpcklbw %%mm7, %%mm5         \n\t"
1849                 "punpcklbw %%mm7, %%mm1         \n\t"
1850                 "punpcklbw %%mm7, %%mm2         \n\t"
1851                 "punpcklbw %%mm7, %%mm3         \n\t"
1852                 "paddw %%mm1, %%mm5             \n\t"
1853                 "paddw %%mm3, %%mm2             \n\t"
1854                 "paddw %%mm5, %%mm2             \n\t"
1855                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1856                 "psrlw $2, %%mm4                \n\t"
1857                 "psrlw $2, %%mm2                \n\t"
1858 #endif
1859                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1860                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1861                 
1862                 "pmaddwd %%mm4, %%mm1           \n\t"
1863                 "pmaddwd %%mm2, %%mm3           \n\t"
1864                 "pmaddwd %%mm6, %%mm4           \n\t"
1865                 "pmaddwd %%mm6, %%mm2           \n\t"
1866 #ifndef FAST_BGR2YV12
1867                 "psrad $8, %%mm4                \n\t"
1868                 "psrad $8, %%mm1                \n\t"
1869                 "psrad $8, %%mm2                \n\t"
1870                 "psrad $8, %%mm3                \n\t"
1871 #endif
1872                 "packssdw %%mm2, %%mm4          \n\t"
1873                 "packssdw %%mm3, %%mm1          \n\t"
1874                 "pmaddwd %%mm5, %%mm4           \n\t"
1875                 "pmaddwd %%mm5, %%mm1           \n\t"
1876                 "add $24, %%"REG_b"             \n\t"
1877                 "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1878                 "psraw $7, %%mm4                \n\t"
1879                 
1880                 "movq %%mm0, %%mm1              \n\t"
1881                 "punpckldq %%mm4, %%mm0         \n\t"
1882                 "punpckhdq %%mm4, %%mm1         \n\t"
1883                 "packsswb %%mm1, %%mm0          \n\t"
1884                 "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
1885
1886                 "movd %%mm0, (%2, %%"REG_a")    \n\t"
1887                 "punpckhdq %%mm0, %%mm0         \n\t"
1888                 "movd %%mm0, (%3, %%"REG_a")    \n\t"
1889                 "add $4, %%"REG_a"              \n\t"
1890                 " js 1b                         \n\t"
1891                 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
1892                 : "%"REG_a, "%"REG_b
1893         );
1894 #else
1895         int i;
1896         for(i=0; i<width; i++)
1897         {
1898                 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1899                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1900                 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1901
1902                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1903                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1904         }
1905 #endif
1906 }
1907
1908 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1909 {
1910         int i;
1911         for(i=0; i<width; i++)
1912         {
1913                 int d= ((uint16_t*)src)[i];
1914                 int b= d&0x1F;
1915                 int g= (d>>5)&0x3F;
1916                 int r= (d>>11)&0x1F;
1917
1918                 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1919         }
1920 }
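
//In the 16 bit case the channels are only 5/6 bits wide, so the Y formula above
//scales them back to 8 bit range: the doubled R/B coefficients together with the
//smaller shift (RGB2YUV_SHIFT-2) multiply the 5 bit blue/red by 8 and the 6 bit
//green by 4.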
1921
1922 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1923 {
1924         int i;
1925         for(i=0; i<width; i++)
1926         {
1927                 int d0= ((uint32_t*)src1)[i];
1928                 int d1= ((uint32_t*)src2)[i];
1929                 
1930                 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1931                 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1932
1933                 int dh2= (dh>>11) + (dh<<21);
1934                 int d= dh2 + dl;
1935
1936                 int b= d&0x7F;
1937                 int r= (d>>11)&0x7F;
1938                 int g= d>>21;
1939                 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1940                 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1941         }
1942 }
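
/* Straightforward equivalent of the packed 2x2 chroma sum above (hypothetical
   helper, not compiled in): each uint32_t load carries two 16 bit pixels, and
   the masked adds keep the summed 5/6 bit fields from overflowing into their
   neighbours before b, g and r of the whole 2x2 block are extracted. */
#if 0
static inline void bgr16ToUV_plain(uint8_t *dstU, uint8_t *dstV,
                                   uint8_t *src1, uint8_t *src2, int i)
{
        int b=0, g=0, r=0, k;
        for(k=0; k<2; k++)
        {
                int p0= ((uint16_t*)src1)[2*i+k];
                int p1= ((uint16_t*)src2)[2*i+k];
                b+= ( p0     &0x1F) + ( p1     &0x1F);
                g+= ((p0>> 5)&0x3F) + ((p1>> 5)&0x3F);
                r+= ((p0>>11)&0x1F) + ((p1>>11)&0x1F);
        }
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
}
#endif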
1943
1944 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1945 {
1946         int i;
1947         for(i=0; i<width; i++)
1948         {
1949                 int d= ((uint16_t*)src)[i];
1950                 int b= d&0x1F;
1951                 int g= (d>>5)&0x1F;
1952                 int r= (d>>10)&0x1F;
1953
1954                 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1955         }
1956 }
1957
1958 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1959 {
1960         int i;
1961         for(i=0; i<width; i++)
1962         {
1963                 int d0= ((uint32_t*)src1)[i];
1964                 int d1= ((uint32_t*)src2)[i];
1965                 
1966                 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1967                 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1968
1969                 int dh2= (dh>>11) + (dh<<21);
1970                 int d= dh2 + dl;
1971
1972                 int b= d&0x7F;
1973                 int r= (d>>10)&0x7F;
1974                 int g= d>>21;
1975                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1976                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1977         }
1978 }
1979
1980
1981 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1982 {
1983         int i;
1984         for(i=0; i<width; i++)
1985         {
1986                 int r=  ((uint32_t*)src)[i]&0xFF;
1987                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1988                 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1989
1990                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1991         }
1992 }
1993
1994 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1995 {
1996         int i;
1997         for(i=0; i<width; i++)
1998         {
1999                 const int a= ((uint32_t*)src1)[2*i+0];
2000                 const int e= ((uint32_t*)src1)[2*i+1];
2001                 const int c= ((uint32_t*)src2)[2*i+0];
2002                 const int d= ((uint32_t*)src2)[2*i+1];
2003                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
2004                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
2005                 const int r=  l&0x3FF;
2006                 const int g=  h>>8;
2007                 const int b=  l>>16;
2008
2009                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2010                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2011         }
2012 }
2013
2014 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2015 {
2016         int i;
2017         for(i=0; i<width; i++)
2018         {
2019                 int r= src[i*3+0];
2020                 int g= src[i*3+1];
2021                 int b= src[i*3+2];
2022
2023                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2024         }
2025 }
2026
2027 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2028 {
2029         int i;
2030         for(i=0; i<width; i++)
2031         {
2032                 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2033                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2034                 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2035
2036                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2037                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2038         }
2039 }
2040
2041
2042 // Bilinear / Bicubic scaling
2043 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2044                                   int16_t *filter, int16_t *filterPos, int filterSize)
2045 {
2046 #ifdef HAVE_MMX
2047         assert(filterSize % 4 == 0 && filterSize>0);
2048         if(filterSize==4) // always true for upscaling, sometimes for downscaling too
2049         {
2050                 long counter= -2*dstW;
2051                 filter-= counter*2;
2052                 filterPos-= counter/2;
2053                 dst-= counter/2;
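                // the loop index starts at -2*dstW and counts up by 4; the
                // filter/filterPos/dst pointers were biased by it above, and the
                // loop exits once the addition wraps past zero (the "jnc").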
2054                 asm volatile(
2055                         "pxor %%mm7, %%mm7              \n\t"
2056                         "movq "MANGLE(w02)", %%mm6      \n\t"
2057                         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2058                         "mov %%"REG_a", %%"REG_BP"      \n\t"
2059                         ".balign 16                     \n\t"
2060                         "1:                             \n\t"
2061                         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2062                         "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2063                         "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2064                         "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2065                         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2066                         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2067                         "punpcklbw %%mm7, %%mm0         \n\t"
2068                         "punpcklbw %%mm7, %%mm2         \n\t"
2069                         "pmaddwd %%mm1, %%mm0           \n\t"
2070                         "pmaddwd %%mm2, %%mm3           \n\t"
2071                         "psrad $8, %%mm0                \n\t"
2072                         "psrad $8, %%mm3                \n\t"
2073                         "packssdw %%mm3, %%mm0          \n\t"
2074                         "pmaddwd %%mm6, %%mm0           \n\t"
2075                         "packssdw %%mm0, %%mm0          \n\t"
2076                         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2077                         "add $4, %%"REG_BP"             \n\t"
2078                         " jnc 1b                        \n\t"
2079
2080                         "pop %%"REG_BP"                 \n\t"
2081                         : "+a" (counter)
2082                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2083                         : "%"REG_b
2084                 );
2085         }
2086         else if(filterSize==8)
2087         {
2088                 long counter= -2*dstW;
2089                 filter-= counter*4;
2090                 filterPos-= counter/2;
2091                 dst-= counter/2;
2092                 asm volatile(
2093                         "pxor %%mm7, %%mm7              \n\t"
2094                         "movq "MANGLE(w02)", %%mm6      \n\t"
2095                         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2096                         "mov %%"REG_a", %%"REG_BP"      \n\t"
2097                         ".balign 16                     \n\t"
2098                         "1:                             \n\t"
2099                         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2100                         "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2101                         "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2102                         "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2103                         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2104                         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2105                         "punpcklbw %%mm7, %%mm0         \n\t"
2106                         "punpcklbw %%mm7, %%mm2         \n\t"
2107                         "pmaddwd %%mm1, %%mm0           \n\t"
2108                         "pmaddwd %%mm2, %%mm3           \n\t"
2109
2110                         "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2111                         "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2112                         "movd 4(%3, %%"REG_a"), %%mm4   \n\t"
2113                         "movd 4(%3, %%"REG_b"), %%mm2   \n\t"
2114                         "punpcklbw %%mm7, %%mm4         \n\t"
2115                         "punpcklbw %%mm7, %%mm2         \n\t"
2116                         "pmaddwd %%mm1, %%mm4           \n\t"
2117                         "pmaddwd %%mm2, %%mm5           \n\t"
2118                         "paddd %%mm4, %%mm0             \n\t"
2119                         "paddd %%mm5, %%mm3             \n\t"
2120                                                 
2121                         "psrad $8, %%mm0                \n\t"
2122                         "psrad $8, %%mm3                \n\t"
2123                         "packssdw %%mm3, %%mm0          \n\t"
2124                         "pmaddwd %%mm6, %%mm0           \n\t"
2125                         "packssdw %%mm0, %%mm0          \n\t"
2126                         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2127                         "add $4, %%"REG_BP"             \n\t"
2128                         " jnc 1b                        \n\t"
2129
2130                         "pop %%"REG_BP"                 \n\t"
2131                         : "+a" (counter)
2132                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2133                         : "%"REG_b
2134                 );
2135         }
2136         else
2137         {
2138                 long counter= -2*dstW;
2139 //              filter-= counter*filterSize/2;
2140                 filterPos-= counter/2;
2141                 dst-= counter/2;
2142                 asm volatile(
2143                         "pxor %%mm7, %%mm7              \n\t"
2144                         "movq "MANGLE(w02)", %%mm6      \n\t"
2145                         ".balign 16                     \n\t"
2146                         "1:                             \n\t"
2147                         "mov %2, %%"REG_c"              \n\t"
2148                         "movzwl (%%"REG_c", %0), %%eax  \n\t"
2149                         "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2150                         "mov %5, %%"REG_c"              \n\t"
2151                         "pxor %%mm4, %%mm4              \n\t"
2152                         "pxor %%mm5, %%mm5              \n\t"
2153                         "2:                             \n\t"
2154                         "movq (%1), %%mm1               \n\t"
2155                         "movq (%1, %6), %%mm3           \n\t"
2156                         "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2157                         "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2158                         "punpcklbw %%mm7, %%mm0         \n\t"
2159                         "punpcklbw %%mm7, %%mm2         \n\t"
2160                         "pmaddwd %%mm1, %%mm0           \n\t"
2161                         "pmaddwd %%mm2, %%mm3           \n\t"
2162                         "paddd %%mm3, %%mm5             \n\t"
2163                         "paddd %%mm0, %%mm4             \n\t"
2164                         "add $8, %1                     \n\t"
2165                         "add $4, %%"REG_c"              \n\t"
2166                         "cmp %4, %%"REG_c"              \n\t"
2167                         " jb 2b                         \n\t"
2168                         "add %6, %1                     \n\t"
2169                         "psrad $8, %%mm4                \n\t"
2170                         "psrad $8, %%mm5                \n\t"
2171                         "packssdw %%mm5, %%mm4          \n\t"
2172                         "pmaddwd %%mm6, %%mm4           \n\t"
2173                         "packssdw %%mm4, %%mm4          \n\t"
2174                         "mov %3, %%"REG_a"              \n\t"
2175                         "movd %%mm4, (%%"REG_a", %0)    \n\t"
2176                         "add $4, %0                     \n\t"
2177                         " jnc 1b                        \n\t"
2178
2179                         : "+r" (counter), "+r" (filter)
2180                         : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2181                           "m" (src), "r" ((long)filterSize*2)
2182                         : "%"REG_b, "%"REG_a, "%"REG_c
2183                 );
2184         }
2185 #else
2186 #ifdef HAVE_ALTIVEC
2187         hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2188 #else
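        /* C reference scaler: each output sample is the sum of filterSize
           source pixels starting at filterPos[i], weighted by the 16-bit
           filter coefficients, shifted down by 7 bits and clipped to 0..(1<<15)-1 */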
2189         int i;
2190         for(i=0; i<dstW; i++)
2191         {
2192                 int j;
2193                 int srcPos= filterPos[i];
2194                 int val=0;
2195 //              printf("filterPos: %d\n", filterPos[i]);
2196                 for(j=0; j<filterSize; j++)
2197                 {
2198 //                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2199                         val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2200                 }
2201 //              filter += hFilterSize;
2202                 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2203 //              dst[i] = val>>7;
2204         }
2205 #endif
2206 #endif
2207 }
2208       // *** horizontal scale Y line to temp buffer
2209 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2210                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2211                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
2212                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2213                                    int32_t *mmx2FilterPos)
2214 {
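    /* packed YUV and RGB/BGR inputs are converted to a plain 8-bit luma line
       in formatConvBuffer first, so the scalers below always see planar luma */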
2215     if(srcFormat==IMGFMT_YUY2)
2216     {
2217         RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2218         src= formatConvBuffer;
2219     }
2220     else if(srcFormat==IMGFMT_UYVY)
2221     {
2222         RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2223         src= formatConvBuffer;
2224     }
2225     else if(srcFormat==IMGFMT_BGR32)
2226     {
2227         RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2228         src= formatConvBuffer;
2229     }
2230     else if(srcFormat==IMGFMT_BGR24)
2231     {
2232         RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2233         src= formatConvBuffer;
2234     }
2235     else if(srcFormat==IMGFMT_BGR16)
2236     {
2237         RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2238         src= formatConvBuffer;
2239     }
2240     else if(srcFormat==IMGFMT_BGR15)
2241     {
2242         RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2243         src= formatConvBuffer;
2244     }
2245     else if(srcFormat==IMGFMT_RGB32)
2246     {
2247         RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2248         src= formatConvBuffer;
2249     }
2250     else if(srcFormat==IMGFMT_RGB24)
2251     {
2252         RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2253         src= formatConvBuffer;
2254     }
2255
2256 #ifdef HAVE_MMX
2257                 // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2258     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2259 #else
2260     if(!(flags&SWS_FAST_BILINEAR))
2261 #endif
2262     {
2263         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2264     }
2265     else // fast bilinear upscale / low-quality downscale
2266     {
2267 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2268 #ifdef HAVE_MMX2
2269         int i;
2270         if(canMMX2BeUsed)
2271         {
2272                 asm volatile(
2273                         "pxor %%mm7, %%mm7              \n\t"
2274                         "mov %0, %%"REG_c"              \n\t"
2275                         "mov %1, %%"REG_D"              \n\t"
2276                         "mov %2, %%"REG_d"              \n\t"
2277                         "mov %3, %%"REG_b"              \n\t"
2278                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2279                         PREFETCH" (%%"REG_c")           \n\t"
2280                         PREFETCH" 32(%%"REG_c")         \n\t"
2281                         PREFETCH" 64(%%"REG_c")         \n\t"
2282
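                        /* the inner per-pixel loop lives in generated code reached through
                           funnyYCode (%4); each FUNNY_Y_CODE block below calls it once and
                           then advances the source/destination pointers for the next chunk */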
2283 #ifdef ARCH_X86_64
2284
2285 #define FUNNY_Y_CODE \
2286                         "movl (%%"REG_b"), %%esi        \n\t"\
2287                         "call *%4                       \n\t"\
2288                         "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2289                         "add %%"REG_S", %%"REG_c"       \n\t"\
2290                         "add %%"REG_a", %%"REG_D"       \n\t"\
2291                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2292
2293 #else
2294
2295 #define FUNNY_Y_CODE \
2296                         "movl (%%"REG_b"), %%esi        \n\t"\
2297                         "call *%4                       \n\t"\
2298                         "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2299                         "add %%"REG_a", %%"REG_D"       \n\t"\
2300                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2301
2302 #endif
2303
2304 FUNNY_Y_CODE
2305 FUNNY_Y_CODE
2306 FUNNY_Y_CODE
2307 FUNNY_Y_CODE
2308 FUNNY_Y_CODE
2309 FUNNY_Y_CODE
2310 FUNNY_Y_CODE
2311 FUNNY_Y_CODE
2312
2313                         :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2314                         "m" (funnyYCode)
2315                         : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2316                 );
2317                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2318         }
2319         else
2320         {
2321 #endif
2322         // no MMX2 available at runtime, plain x86 asm ...
2323         asm volatile(
2324                 "xor %%"REG_a", %%"REG_a"       \n\t" // i
2325                 "xor %%"REG_b", %%"REG_b"       \n\t" // xx
2326                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2327                 ".balign 16                     \n\t"
2328                 "1:                             \n\t"
2329                 "movzbl  (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2330                 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2331                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2332                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2333                 "shll $16, %%edi                \n\t"
2334                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2335                 "mov %1, %%"REG_D"              \n\t"
2336                 "shrl $9, %%esi                 \n\t"
2337                 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2338                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2339                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2340
2341                 "movzbl (%0, %%"REG_b"), %%edi  \n\t" //src[xx]
2342                 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2343                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2344                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2345                 "shll $16, %%edi                \n\t"
2346                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2347                 "mov %1, %%"REG_D"              \n\t"
2348                 "shrl $9, %%esi                 \n\t"
2349                 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2350                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2351                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2352
2353
2354                 "add $2, %%"REG_a"              \n\t"
2355                 "cmp %2, %%"REG_a"              \n\t"
2356                 " jb 1b                         \n\t"
2357
2358
2359                 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2360                 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2361                 );
2362 #ifdef HAVE_MMX2
2363         } //if MMX2 can't be used
2364 #endif
2365 #else
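        /* plain C fallback: xpos walks the source in 16.16 fixed point; the
           fraction is reduced to 7 bits (xalpha) and used to blend the two
           neighbouring source pixels into a 15-bit intermediate sample */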
2366         int i;
2367         unsigned int xpos=0;
2368         for(i=0;i<dstWidth;i++)
2369         {
2370                 register unsigned int xx=xpos>>16;
2371                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2372                 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2373                 xpos+=xInc;
2374         }
2375 #endif
2376     }
2377 }
2378
2379 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2380                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2381                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2382                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2383                                    int32_t *mmx2FilterPos)
2384 {
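    /* as in hyscale(): packed YUV and RGB/BGR inputs are converted first, with
       the U line written to formatConvBuffer and the V line to
       formatConvBuffer+2048, and src1/src2 repointed at those buffers */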
2385     if(srcFormat==IMGFMT_YUY2)
2386     {
2387         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2388         src1= formatConvBuffer;
2389         src2= formatConvBuffer+2048;
2390     }
2391     else if(srcFormat==IMGFMT_UYVY)
2392     {
2393         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2394         src1= formatConvBuffer;
2395         src2= formatConvBuffer+2048;
2396     }
2397     else if(srcFormat==IMGFMT_BGR32)
2398     {
2399         RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2400         src1= formatConvBuffer;
2401         src2= formatConvBuffer+2048;
2402     }
2403     else if(srcFormat==IMGFMT_BGR24)
2404     {
2405         RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2406         src1= formatConvBuffer;
2407         src2= formatConvBuffer+2048;
2408     }
2409     else if(srcFormat==IMGFMT_BGR16)
2410     {
2411         RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2412         src1= formatConvBuffer;
2413         src2= formatConvBuffer+2048;
2414     }
2415     else if(srcFormat==IMGFMT_BGR15)
2416     {
2417         RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2418         src1= formatConvBuffer;
2419         src2= formatConvBuffer+2048;
2420     }
2421     else if(srcFormat==IMGFMT_RGB32)
2422     {
2423         RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2424         src1= formatConvBuffer;
2425         src2= formatConvBuffer+2048;
2426     }
2427     else if(srcFormat==IMGFMT_RGB24)
2428     {
2429         RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2430         src1= formatConvBuffer;
2431         src2= formatConvBuffer+2048;
2432     }
2433     else if(isGray(srcFormat))
2434     {
2435         return;
2436     }
2437
2438 #ifdef HAVE_MMX
2439                 // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2440     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2441 #else
2442     if(!(flags&SWS_FAST_BILINEAR))
2443 #endif
2444     {
2445         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2446         RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2447     }
2448     else // fast bilinear upscale / low-quality downscale
2449     {
2450 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2451 #ifdef HAVE_MMX2
2452         int i;
2453         if(canMMX2BeUsed)
2454         {
2455                 asm volatile(
2456                         "pxor %%mm7, %%mm7              \n\t"
2457                         "mov %0, %%"REG_c"              \n\t"
2458                         "mov %1, %%"REG_D"              \n\t"
2459                         "mov %2, %%"REG_d"              \n\t"
2460                         "mov %3, %%"REG_b"              \n\t"
2461                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2462                         PREFETCH" (%%"REG_c")           \n\t"
2463                         PREFETCH" 32(%%"REG_c")         \n\t"
2464                         PREFETCH" 64(%%"REG_c")         \n\t"
2465
2466 #ifdef ARCH_X86_64
2467
2468 #define FUNNY_UV_CODE \
2469                         "movl (%%"REG_b"), %%esi        \n\t"\
2470                         "call *%4                       \n\t"\
2471                         "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2472                         "add %%"REG_S", %%"REG_c"       \n\t"\
2473                         "add %%"REG_a", %%"REG_D"       \n\t"\
2474                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2475
2476 #else
2477
2478 #define FUNNY_UV_CODE \
2479                         "movl (%%"REG_b"), %%esi        \n\t"\
2480                         "call *%4                       \n\t"\
2481                         "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2482                         "add %%"REG_a", %%"REG_D"       \n\t"\
2483                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2484
2485 #endif
2486
2487 FUNNY_UV_CODE
2488 FUNNY_UV_CODE
2489 FUNNY_UV_CODE
2490 FUNNY_UV_CODE
2491                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2492                         "mov %5, %%"REG_c"              \n\t" // src
2493                         "mov %1, %%"REG_D"              \n\t" // buf1
2494                         "add $4096, %%"REG_D"           \n\t"
2495                         PREFETCH" (%%"REG_c")           \n\t"
2496                         PREFETCH" 32(%%"REG_c")         \n\t"
2497                         PREFETCH" 64(%%"REG_c")         \n\t"
2498
2499 FUNNY_UV_CODE
2500 FUNNY_UV_CODE
2501 FUNNY_UV_CODE
2502 FUNNY_UV_CODE
2503
2504                         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2505                         "m" (funnyUVCode), "m" (src2)
2506                         : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2507                 );
2508                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2509                 {
2510 //                      printf("%d %d %d\n", dstWidth, i, srcW);
2511                         dst[i] = src1[srcW-1]*128;
2512                         dst[i+2048] = src2[srcW-1]*128;
2513                 }
2514         }
2515         else
2516         {
2517 #endif
2518         asm volatile(
2519                 "xor %%"REG_a", %%"REG_a"       \n\t" // i
2520                 "xor %%"REG_b", %%"REG_b"               \n\t" // xx
2521                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2522                 ".balign 16                     \n\t"
2523                 "1:                             \n\t"
2524                 "mov %0, %%"REG_S"              \n\t"
2525                 "movzbl  (%%"REG_S", %%"REG_b"), %%edi  \n\t" //src[xx]
2526                 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi  \n\t" //src[xx+1]
2527                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2528                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2529                 "shll $16, %%edi                \n\t"
2530                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2531                 "mov %1, %%"REG_D"              \n\t"
2532                 "shrl $9, %%esi                 \n\t"
2533                 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2534
2535                 "movzbl  (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2536                 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2537                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2538                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2539                 "shll $16, %%edi                \n\t"
2540                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2541                 "mov %1, %%"REG_D"              \n\t"
2542                 "shrl $9, %%esi                 \n\t"
2543                 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2544
2545                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2546                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2547                 "add $1, %%"REG_a"              \n\t"
2548                 "cmp %2, %%"REG_a"              \n\t"
2549                 " jb 1b                         \n\t"
2550
2551                 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" ((long)(xInc>>16)), "m" ((xInc&0xFFFF)),
2552                 "r" (src2)
2553                 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2554                 );
2555 #ifdef HAVE_MMX2
2556         } //if MMX2 can't be used
2557 #endif
2558 #else
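        /* plain C fallback: same 16.16 fixed-point walk as in hyscale(), but
           blending with weights (xalpha^127) and xalpha, i.e. 127-xalpha and
           xalpha, which sum to 127 instead of 128 (cheaper, slightly less exact) */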
2559         int i;
2560         unsigned int xpos=0;
2561         for(i=0;i<dstWidth;i++)
2562         {
2563                 register unsigned int xx=xpos>>16;
2564                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2565                 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2566                 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2567 /* slower
2568           dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2569           dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2570 */
2571                 xpos+=xInc;
2572         }
2573 #endif
2574    }
2575 }
2576
2577 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2578              int srcSliceH, uint8_t* dst[], int dstStride[]){
2579
2580         /* load a few things into local vars to make the code more readable and faster */
2581         const int srcW= c->srcW;
2582         const int dstW= c->dstW;
2583         const int dstH= c->dstH;
2584         const int chrDstW= c->chrDstW;
2585         const int chrSrcW= c->chrSrcW;
2586         const int lumXInc= c->lumXInc;
2587         const int chrXInc= c->chrXInc;
2588         const int dstFormat= c->dstFormat;
2589         const int srcFormat= c->srcFormat;
2590         const int flags= c->flags;
2591         const int canMMX2BeUsed= c->canMMX2BeUsed;
2592         int16_t *vLumFilterPos= c->vLumFilterPos;
2593         int16_t *vChrFilterPos= c->vChrFilterPos;
2594         int16_t *hLumFilterPos= c->hLumFilterPos;
2595         int16_t *hChrFilterPos= c->hChrFilterPos;
2596         int16_t *vLumFilter= c->vLumFilter;
2597         int16_t *vChrFilter= c->vChrFilter;
2598         int16_t *hLumFilter= c->hLumFilter;
2599         int16_t *hChrFilter= c->hChrFilter;
2600         int32_t *lumMmxFilter= c->lumMmxFilter;
2601         int32_t *chrMmxFilter= c->chrMmxFilter;
2602         const int vLumFilterSize= c->vLumFilterSize;
2603         const int vChrFilterSize= c->vChrFilterSize;
2604         const int hLumFilterSize= c->hLumFilterSize;
2605         const int hChrFilterSize= c->hChrFilterSize;
2606         int16_t **lumPixBuf= c->lumPixBuf;
2607         int16_t **chrPixBuf= c->chrPixBuf;
2608         const int vLumBufSize= c->vLumBufSize;
2609         const int vChrBufSize= c->vChrBufSize;
2610         uint8_t *funnyYCode= c->funnyYCode;
2611         uint8_t *funnyUVCode= c->funnyUVCode;
2612         uint8_t *formatConvBuffer= c->formatConvBuffer;
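        /* slice start and height converted to chroma lines; the height is rounded up
           so a slice height that is not a multiple of the vertical subsampling
           still covers all of its chroma lines */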
2613         const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2614         const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2615         int lastDstY;
2616
2617         /* vars which will change and which we need to store back in the context */
2618         int dstY= c->dstY;
2619         int lumBufIndex= c->lumBufIndex;
2620         int chrBufIndex= c->chrBufIndex;
2621         int lastInLumBuf= c->lastInLumBuf;
2622         int lastInChrBuf= c->lastInChrBuf;
2623         
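        /* packed formats keep everything in plane 0, so replicate its pointer
           and stride into planes 1 and 2 to keep the code below uniform */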
2624         if(isPacked(c->srcFormat)){
2625                 src[0]=
2626                 src[1]=
2627                 src[2]= src[0];
2628                 srcStride[0]=
2629                 srcStride[1]=
2630                 srcStride[2]= srcStride[0];
2631         }
2632         srcStride[1]<<= c->vChrDrop;
2633         srcStride[2]<<= c->vChrDrop;
2634
2635 //      printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2636 //              (int)dst[0], (int)dst[1], (int)dst[2]);
2637
2638 #if 0 //self test FIXME move to a vfilter or something
2639 {
2640 static volatile int i=0;
2641 i++;
2642 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2643         selfTest(src, srcStride, c->srcW, c->srcH);
2644 i--;
2645 }
2646 #endif
2647
2648 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2649 //dstStride[0],dstStride[1],dstStride[2]);
2650
2651         if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2652         {
2653                 static int firstTime=1; //FIXME move this into the context perhaps
2654                 if(flags & SWS_PRINT_INFO && firstTime)
2655                 {
2656                         MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2657                                         "SwScaler:          ->cannot do aligned memory accesses anymore\n");
2658                         firstTime=0;
2659                 }
2660         }
2661
2662         /* Note: the user might start scaling in the middle of the picture, so this will not get
2663            executed. This is not really intended, but it currently works, so people may rely on it. */
2664         if(srcSliceY ==0){
2665                 lumBufIndex=0;
2666                 chrBufIndex=0;
2667                 dstY=0; 
2668                 lastInLumBuf= -1;
2669                 lastInChrBuf= -1;
2670         }
2671
2672         lastDstY= dstY;
2673
2674         for(;dstY < dstH; dstY++){
2675                 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2676                 const int chrDstY= dstY>>c->chrDstVSubSample;
2677                 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2678                 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2679
2680                 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2681                 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2682                 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2683                 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2684
2685 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2686 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
2687                 //handle holes (FAST_BILINEAR & weird filters)
2688                 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2689                 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2690 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2691                 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2692                 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2693
2694                 // Do we have enough lines in this slice to output the dstY line?
2695                 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2696                 {
2697                         //Do horizontal scaling
2698                         while(lastInLumBuf < lastLumSrcY)
2699                         {
2700                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2701                                 lumBufIndex++;
2702 //                              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
2703                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2704                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2705                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2706 //                              printf("%d %d\n", lumBufIndex, vLumBufSize);
2707                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2708                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2709                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2710                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2711                                 lastInLumBuf++;
2712                         }
2713                         while(lastInChrBuf < lastChrSrcY)
2714                         {
2715                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2716                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2717                                 chrBufIndex++;
2718                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2719                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2720                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2721                                 //FIXME pass (at least some of) these parameters through the context struct
2722
2723                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2724                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2725                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2726                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2727                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2728                                 lastInChrBuf++;
2729                         }
2730                         //wrap buf index around to stay inside the ring buffer
2731                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2732                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2733                 }
2734                 else // not enough lines left in this slice -> load the rest in the buffer
2735                 {
2736 /*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2737                         firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2738                         lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2739                         vChrBufSize, vLumBufSize);*/
2740
2741                         //Do horizontal scaling
2742                         while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2743                         {
2744                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2745                                 lumBufIndex++;
2746                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2747                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2748                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2749                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2750                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2751                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2752                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2753                                 lastInLumBuf++;
2754                         }
2755                         while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2756                         {
2757                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2758                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2759                                 chrBufIndex++;
2760                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2761                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2762                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2763
2764                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2765                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2766                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2767                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2768                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2769                                 lastInChrBuf++;
2770                         }
2771                         //wrap buf index around to stay inside the ring buffer
2772                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2773                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2774                         break; //we can't output a dstY line so let's try with the next slice
2775                 }
2776
2777 #ifdef HAVE_MMX
2778                 b5Dither= dither8[dstY&1];
2779                 g6Dither= dither4[dstY&1];
2780                 g5Dither= dither8[dstY&1];
2781                 r5Dither= dither8[(dstY+1)&1];
2782 #endif
2783             if(dstY < dstH-2)
2784             {
2785                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2786                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2787 #ifdef HAVE_MMX
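                /* build the per-output-line MMX filter descriptors: each entry stores
                   the source line pointer followed by the 16-bit vertical coefficient
                   duplicated into both halves of a 32-bit word (coeff * 0x10001) */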
2788                 int i;
2789                 for(i=0; i<vLumFilterSize; i++)
2790                 {
2791                         lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2792                         lumMmxFilter[4*i+2]= 
2793                         lumMmxFilter[4*i+3]= 
2794                                 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2795                 }
2796                 for(i=0; i<vChrFilterSize; i++)
2797                 {
2798                         chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2799                         chrMmxFilter[4*i+2]= 
2800                         chrMmxFilter[4*i+3]= 
2801                                 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2802                 }
2803 #endif
2804                 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2805                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2806                         if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2807                         RENAME(yuv2nv12X)(c,
2808                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2809                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2810                                 dest, uDest, dstW, chrDstW, dstFormat);
2811                 }
2812                 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2813                 {
2814                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2815                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2816                         if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2817                         {
2818                                 int16_t *lumBuf = lumPixBuf[0];
2819                                 int16_t *chrBuf= chrPixBuf[0];
2820                                 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2821                         }
2822                         else //General YV12
2823                         {
2824                                 RENAME(yuv2yuvX)(c,
2825                                         vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2826                                         vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2827                                         dest, uDest, vDest, dstW, chrDstW);
2828                         }
2829                 }
2830                 else
2831                 {
2832                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2833                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2834                         if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2835                         {
2836                                 int chrAlpha= vChrFilter[2*dstY+1];
2837                                 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2838                                                  dest, dstW, chrAlpha, dstFormat, flags, dstY);
2839                         }
2840                         else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2841                         {
2842                                 int lumAlpha= vLumFilter[2*dstY+1];
2843                                 int chrAlpha= vChrFilter[2*dstY+1];
2844                                 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2845                                                  dest, dstW, lumAlpha, chrAlpha, dstY);
2846                         }
2847                         else //General RGB
2848                         {
2849                                 RENAME(yuv2packedX)(c,
2850                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2851                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2852                                         dest, dstW, dstY);
2853                         }
2854                 }
2855             }
2856             else // we can't use MMX here without overwriting this array's tail
2857             {
2858                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2859                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2860                 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2861                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2862                         if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2863                         yuv2nv12XinC(
2864                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2865                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2866                                 dest, uDest, dstW, chrDstW, dstFormat);
2867                 }
2868                 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2869                 {
2870                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2871                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2872                         yuv2yuvXinC(
2873                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2874                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2875                                 dest, uDest, vDest, dstW, chrDstW);
2876                 }
2877                 else
2878                 {
2879                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2880                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2881                         yuv2packedXinC(c, 
2882                                 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2883                                 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2884                                 dest, dstW, dstY);
2885                 }
2886             }
2887         }
2888
2889 #ifdef HAVE_MMX
2890         __asm __volatile(SFENCE:::"memory");
2891         __asm __volatile(EMMS:::"memory");
2892 #endif
2893         /* store changed local vars back in the context */
2894         c->dstY= dstY;
2895         c->lumBufIndex= lumBufIndex;
2896         c->chrBufIndex= chrBufIndex;
2897         c->lastInLumBuf= lastInLumBuf;
2898         c->lastInChrBuf= lastInChrBuf;
2899
2900         return dstY - lastDstY;
2901 }