]> git.sesse.net Git - ffmpeg/blob - postproc/swscale_template.c
output shifted by a few pixels on extreme scalings bugfix
[ffmpeg] / postproc / swscale_template.c
1
2 // Software scaling and colorspace conversion routines for MPlayer
3
4 // Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
5 // current version mostly by Michael Niedermayer (michaelni@gmx.at)
6 // the parts written by michael are under GNU GPL
7
8 #undef MOVNTQ
9 #undef PAVGB
10 #undef PREFETCH
11 #undef PREFETCHW
12 #undef EMMS
13 #undef SFENCE
14
15 #ifdef HAVE_3DNOW
16 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
17 #define EMMS     "femms"
18 #else
19 #define EMMS     "emms"
20 #endif
21
22 #ifdef HAVE_3DNOW
23 #define PREFETCH  "prefetch"
24 #define PREFETCHW "prefetchw"
25 #elif defined ( HAVE_MMX2 )
26 #define PREFETCH "prefetchnta"
27 #define PREFETCHW "prefetcht0"
28 #else
29 #define PREFETCH "/nop"
30 #define PREFETCHW "/nop"
31 #endif
32
33 #ifdef HAVE_MMX2
34 #define SFENCE "sfence"
35 #else
36 #define SFENCE "/nop"
37 #endif
38
39 #ifdef HAVE_MMX2
40 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
41 #elif defined (HAVE_3DNOW)
42 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
43 #endif
44
45 #ifdef HAVE_MMX2
46 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
47 #else
48 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
49 #endif
50
51
/*
 * YSCALEYUV2YV12X(x): vertical multi-tap scale of 16-bit intermediate
 * samples down to one 8-bit YV12 plane, 8 output pixels per outer
 * iteration.  The inner loop (label 1 reached via jnz) walks the filter:
 * it loads a source-line pointer and its 4x16-bit coefficient quad, then
 * pmulhw-accumulates two quads of samples (at offset x and x+8) into
 * mm3/mm4; %%edx counts up from a negative tap count to 0.  The sums are
 * >>3 (psraw), saturated/packed to bytes (packuswb) and stored with MOVNTQ.
 * Operand bindings appear to be %0=-filterSize, %1=src pointer array end,
 * %2=coefficient array end, %3=dest, %4=dest width -- NOTE(review): the
 * asm() call sites binding these operands are outside this chunk; confirm.
 */
52 #define YSCALEYUV2YV12X(x) \
53                         "xorl %%eax, %%eax              \n\t"\
54                         "pxor %%mm3, %%mm3              \n\t"\
55                         "pxor %%mm4, %%mm4              \n\t"\
56                         "movl %0, %%edx                 \n\t"\
57                         ".balign 16                     \n\t" /* FIXME Unroll? */\
58                         "1:                             \n\t"\
59                         "movl (%1, %%edx, 4), %%esi     \n\t"\
60                         "movq (%2, %%edx, 8), %%mm0     \n\t" /* filterCoeff */\
61                         "movq " #x "(%%esi, %%eax, 2), %%mm2    \n\t" /* srcData */\
62                         "movq 8+" #x "(%%esi, %%eax, 2), %%mm5  \n\t" /* srcData */\
63                         "pmulhw %%mm0, %%mm2            \n\t"\
64                         "pmulhw %%mm0, %%mm5            \n\t"\
65                         "paddw %%mm2, %%mm3             \n\t"\
66                         "paddw %%mm5, %%mm4             \n\t"\
67                         "addl $1, %%edx                 \n\t"\
68                         " jnz 1b                        \n\t"\
69                         "psraw $3, %%mm3                \n\t"\
70                         "psraw $3, %%mm4                \n\t"\
71                         "packuswb %%mm4, %%mm3          \n\t"\
72                         MOVNTQ(%%mm3, (%3, %%eax))\
73                         "addl $8, %%eax                 \n\t"\
74                         "cmpl %4, %%eax                 \n\t"\
75                         "pxor %%mm3, %%mm3              \n\t"\
76                         "pxor %%mm4, %%mm4              \n\t"\
77                         "movl %0, %%edx                 \n\t"\
78                         "jb 1b                          \n\t"
79
/*
 * YSCALEYUV2YV121: 1:1 (unscaled) conversion of 16-bit intermediate
 * samples to an 8-bit plane: each word is >>7 and saturated to a byte
 * (packuswb), 8 pixels per iteration.  %%eax is loaded from %2 and the
 * loop runs while "addl $8" does not produce a carry, i.e. %2 is
 * presumably the negative byte count and %0/%1 point at the buffer
 * ends -- NOTE(review): confirm against the call site outside this chunk.
 */
80 #define YSCALEYUV2YV121 \
81                         "movl %2, %%eax                 \n\t"\
82                         ".balign 16                     \n\t" /* FIXME Unroll? */\
83                         "1:                             \n\t"\
84                         "movq (%0, %%eax, 2), %%mm0     \n\t"\
85                         "movq 8(%0, %%eax, 2), %%mm1    \n\t"\
86                         "psraw $7, %%mm0                \n\t"\
87                         "psraw $7, %%mm1                \n\t"\
88                         "packuswb %%mm1, %%mm0          \n\t"\
89                         MOVNTQ(%%mm0, (%1, %%eax))\
90                         "addl $8, %%eax                 \n\t"\
91                         "jnc 1b                         \n\t"
92
93 /*
94                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
95                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
96                            "r" (dest), "m" (dstW),
97                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
98                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
99 */
/*
 * YSCALEYUV2RGBX: multi-tap vertical scaling of luma and chroma followed
 * by YUV->RGB conversion, 8 pixels per outer iteration.  Two inner loops
 * (both labelled 2): the first accumulates chroma taps into mm3 (U, at
 * (%%esi,%%eax)) and mm4 (V, at byte offset 4096 = 2048 words into the
 * same buffer); the second accumulates two quads of luma into mm1/mm7.
 * %%edx counts up from a negative tap count to 0 in each loop.  The tail
 * applies the fixed-point colorspace constants (w400/w80 bias, yCoeff,
 * ub/ug/vg/vrCoeff -- declared elsewhere in this file) and, via the
 * punpcklwd/punpckhwd block, duplicates each chroma value for two luma
 * pixels.  Exits with packed bytes B in mm2, R in mm5, G in mm4 and
 * mm7 = 0, the register layout the WRITEBGR* macros below consume.
 * Operand bindings are documented by the commented-out template just
 * above this macro.
 */
100 #define YSCALEYUV2RGBX \
101                 "xorl %%eax, %%eax              \n\t"\
102                 ".balign 16                     \n\t"\
103                 "1:                             \n\t"\
104                 "movl %1, %%edx                 \n\t" /* -chrFilterSize */\
105                 "movl %3, %%ebx                 \n\t" /* chrMmxFilter+lumFilterSize */\
106                 "movl %7, %%ecx                 \n\t" /* chrSrc+lumFilterSize */\
107                 "pxor %%mm3, %%mm3              \n\t"\
108                 "pxor %%mm4, %%mm4              \n\t"\
109                 "2:                             \n\t"\
110                 "movl (%%ecx, %%edx, 4), %%esi  \n\t"\
111                 "movq (%%ebx, %%edx, 8), %%mm0  \n\t" /* filterCoeff */\
112                 "movq (%%esi, %%eax), %%mm2     \n\t" /* UsrcData */\
113                 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
114                 "pmulhw %%mm0, %%mm2            \n\t"\
115                 "pmulhw %%mm0, %%mm5            \n\t"\
116                 "paddw %%mm2, %%mm3             \n\t"\
117                 "paddw %%mm5, %%mm4             \n\t"\
118                 "addl $1, %%edx                 \n\t"\
119                 " jnz 2b                        \n\t"\
120 \
121                 "movl %0, %%edx                 \n\t" /* -lumFilterSize */\
122                 "movl %2, %%ebx                 \n\t" /* lumMmxFilter+lumFilterSize */\
123                 "movl %6, %%ecx                 \n\t" /* lumSrc+lumFilterSize */\
124                 "pxor %%mm1, %%mm1              \n\t"\
125                 "pxor %%mm7, %%mm7              \n\t"\
126                 "2:                             \n\t"\
127                 "movl (%%ecx, %%edx, 4), %%esi  \n\t"\
128                 "movq (%%ebx, %%edx, 8), %%mm0  \n\t" /* filterCoeff */\
129                 "movq (%%esi, %%eax, 2), %%mm2  \n\t" /* Y1srcData */\
130                 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
131                 "pmulhw %%mm0, %%mm2            \n\t"\
132                 "pmulhw %%mm0, %%mm5            \n\t"\
133                 "paddw %%mm2, %%mm1             \n\t"\
134                 "paddw %%mm5, %%mm7             \n\t"\
135                 "addl $1, %%edx                 \n\t"\
136                 " jnz 2b                        \n\t"\
137 \
138                 "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
139                 "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
140                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
141                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
142                 "pmulhw ugCoeff, %%mm3          \n\t"\
143                 "pmulhw vgCoeff, %%mm4          \n\t"\
144         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
145                 "pmulhw ubCoeff, %%mm2          \n\t"\
146                 "pmulhw vrCoeff, %%mm5          \n\t"\
147                 "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
148                 "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
149                 "pmulhw yCoeff, %%mm1           \n\t"\
150                 "pmulhw yCoeff, %%mm7           \n\t"\
151         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
152                 "paddw %%mm3, %%mm4             \n\t"\
153                 "movq %%mm2, %%mm0              \n\t"\
154                 "movq %%mm5, %%mm6              \n\t"\
155                 "movq %%mm4, %%mm3              \n\t"\
156                 "punpcklwd %%mm2, %%mm2         \n\t"\
157                 "punpcklwd %%mm5, %%mm5         \n\t"\
158                 "punpcklwd %%mm4, %%mm4         \n\t"\
159                 "paddw %%mm1, %%mm2             \n\t"\
160                 "paddw %%mm1, %%mm5             \n\t"\
161                 "paddw %%mm1, %%mm4             \n\t"\
162                 "punpckhwd %%mm0, %%mm0         \n\t"\
163                 "punpckhwd %%mm6, %%mm6         \n\t"\
164                 "punpckhwd %%mm3, %%mm3         \n\t"\
165                 "paddw %%mm7, %%mm0             \n\t"\
166                 "paddw %%mm7, %%mm6             \n\t"\
167                 "paddw %%mm7, %%mm3             \n\t"\
168                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
169                 "packuswb %%mm0, %%mm2          \n\t"\
170                 "packuswb %%mm6, %%mm5          \n\t"\
171                 "packuswb %%mm3, %%mm4          \n\t"\
172                 "pxor %%mm7, %%mm7              \n\t"
173
/*
 * FULL_YSCALEYUV2RGB: two-line vertical linear interpolation plus
 * YUV->RGB at full ("full" = per-pixel, non-subsampled) chroma
 * resolution.  Luma is blended between buffers %0/%1 with weight %6
 * (yalpha1), chroma between %2/%3 with weight %7 (uvalpha1); the weight
 * words are broadcast across mm6/mm5 by the double punpcklwd.  V lives
 * at byte offset 4096 within the chroma buffers.  Blending uses the
 * pmulhw trick: new = old + ((a - b) * alpha >> 16), on samples
 * pre-shifted down by 4.  Ends with bytes B in mm3, R in mm0 and G in
 * mm1 (each packuswb'd into the low half) for the caller to interleave.
 */
174 #define FULL_YSCALEYUV2RGB \
175                 "pxor %%mm7, %%mm7              \n\t"\
176                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
177                 "punpcklwd %%mm6, %%mm6         \n\t"\
178                 "punpcklwd %%mm6, %%mm6         \n\t"\
179                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
180                 "punpcklwd %%mm5, %%mm5         \n\t"\
181                 "punpcklwd %%mm5, %%mm5         \n\t"\
182                 "xorl %%eax, %%eax              \n\t"\
183                 ".balign 16                     \n\t"\
184                 "1:                             \n\t"\
185                 "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
186                 "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
187                 "movq (%2, %%eax,2), %%mm2      \n\t" /* uvbuf0[eax]*/\
188                 "movq (%3, %%eax,2), %%mm3      \n\t" /* uvbuf1[eax]*/\
189                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
190                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
191                 "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
192                 "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
193                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
194                 "movq 4096(%2, %%eax,2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
195                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
196                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
197                 "movq 4096(%3, %%eax,2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
198                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
199                 "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
200                 "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
201                 "psubw w400, %%mm3              \n\t" /* 8(U-128)*/\
202                 "pmulhw yCoeff, %%mm1           \n\t"\
203 \
204 \
205                 "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
206                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
207                 "pmulhw ubCoeff, %%mm3          \n\t"\
208                 "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
209                 "pmulhw ugCoeff, %%mm2          \n\t"\
210                 "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
211                 "psubw w400, %%mm0              \n\t" /* (V-128)8*/\
212 \
213 \
214                 "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
215                 "pmulhw vrCoeff, %%mm0          \n\t"\
216                 "pmulhw vgCoeff, %%mm4          \n\t"\
217                 "paddw %%mm1, %%mm3             \n\t" /* B*/\
218                 "paddw %%mm1, %%mm0             \n\t" /* R*/\
219                 "packuswb %%mm3, %%mm3          \n\t"\
220 \
221                 "packuswb %%mm0, %%mm0          \n\t"\
222                 "paddw %%mm4, %%mm2             \n\t"\
223                 "paddw %%mm2, %%mm1             \n\t" /* G*/\
224 \
225                 "packuswb %%mm1, %%mm1          \n\t"
226
/*
 * YSCALEYUV2RGB: two-line vertical interpolation plus YUV->RGB with
 * 2:1 horizontally subsampled chroma (each U/V pair is duplicated for
 * two luma pixels via the punpcklwd/punpckhwd block), 8 pixels per
 * iteration.  Because all 8 MMX registers are needed in the pixel loop,
 * the broadcast yalpha1/uvalpha1 weights are spilled to the memory
 * locations asm_yalpha1/asm_uvalpha1 (declared elsewhere in this file)
 * and reloaded inside the loop.  Same blending scheme as
 * FULL_YSCALEYUV2RGB; exits with packed bytes B in mm2, R in mm5,
 * G in mm4 and mm7 = 0, ready for the WRITEBGR* macros.
 */
227 #define YSCALEYUV2RGB \
228                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
229                 "punpcklwd %%mm6, %%mm6         \n\t"\
230                 "punpcklwd %%mm6, %%mm6         \n\t"\
231                 "movq %%mm6, asm_yalpha1        \n\t"\
232                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
233                 "punpcklwd %%mm5, %%mm5         \n\t"\
234                 "punpcklwd %%mm5, %%mm5         \n\t"\
235                 "movq %%mm5, asm_uvalpha1       \n\t"\
236                 "xorl %%eax, %%eax              \n\t"\
237                 ".balign 16                     \n\t"\
238                 "1:                             \n\t"\
239                 "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
240                 "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
241                 "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
242                 "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
243                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
244                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
245                 "movq asm_uvalpha1, %%mm0       \n\t"\
246                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
247                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
248                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
249                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
250                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
251                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
252                 "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
253                 "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
254                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
255                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
256                 "pmulhw ugCoeff, %%mm3          \n\t"\
257                 "pmulhw vgCoeff, %%mm4          \n\t"\
258         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
259                 "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
260                 "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
261                 "movq 8(%0, %%eax, 2), %%mm6    \n\t" /*buf0[eax]*/\
262                 "movq 8(%1, %%eax, 2), %%mm7    \n\t" /*buf1[eax]*/\
263                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
264                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
265                 "pmulhw asm_yalpha1, %%mm0      \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
266                 "pmulhw asm_yalpha1, %%mm6      \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
267                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
268                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
269                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
270                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
271                 "pmulhw ubCoeff, %%mm2          \n\t"\
272                 "pmulhw vrCoeff, %%mm5          \n\t"\
273                 "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
274                 "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
275                 "pmulhw yCoeff, %%mm1           \n\t"\
276                 "pmulhw yCoeff, %%mm7           \n\t"\
277         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
278                 "paddw %%mm3, %%mm4             \n\t"\
279                 "movq %%mm2, %%mm0              \n\t"\
280                 "movq %%mm5, %%mm6              \n\t"\
281                 "movq %%mm4, %%mm3              \n\t"\
282                 "punpcklwd %%mm2, %%mm2         \n\t"\
283                 "punpcklwd %%mm5, %%mm5         \n\t"\
284                 "punpcklwd %%mm4, %%mm4         \n\t"\
285                 "paddw %%mm1, %%mm2             \n\t"\
286                 "paddw %%mm1, %%mm5             \n\t"\
287                 "paddw %%mm1, %%mm4             \n\t"\
288                 "punpckhwd %%mm0, %%mm0         \n\t"\
289                 "punpckhwd %%mm6, %%mm6         \n\t"\
290                 "punpckhwd %%mm3, %%mm3         \n\t"\
291                 "paddw %%mm7, %%mm0             \n\t"\
292                 "paddw %%mm7, %%mm6             \n\t"\
293                 "paddw %%mm7, %%mm3             \n\t"\
294                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
295                 "packuswb %%mm0, %%mm2          \n\t"\
296                 "packuswb %%mm6, %%mm5          \n\t"\
297                 "packuswb %%mm3, %%mm4          \n\t"\
298                 "pxor %%mm7, %%mm7              \n\t"
299
/*
 * YSCALEYUV2RGB1: single-source-line variant of YSCALEYUV2RGB -- no
 * vertical interpolation at all.  Luma (buf0 = %0) and chroma
 * (uvbuf0 = %2, V at byte offset 4096) are simply >>4 and converted to
 * RGB with the same fixed-point constants and 2:1 chroma duplication as
 * YSCALEYUV2RGB.  Exits with packed bytes B in mm2, R in mm5, G in mm4
 * and mm7 = 0 for the WRITEBGR* macros.
 */
300 #define YSCALEYUV2RGB1 \
301                 "xorl %%eax, %%eax              \n\t"\
302                 ".balign 16                     \n\t"\
303                 "1:                             \n\t"\
304                 "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
305                 "movq 4096(%2, %%eax), %%mm4    \n\t" /* uvbuf0[eax+2048]*/\
306                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
307                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
308                 "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
309                 "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
310                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
311                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
312                 "pmulhw ugCoeff, %%mm3          \n\t"\
313                 "pmulhw vgCoeff, %%mm4          \n\t"\
314         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315                 "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
316                 "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax]*/\
317                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
318                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
319                 "pmulhw ubCoeff, %%mm2          \n\t"\
320                 "pmulhw vrCoeff, %%mm5          \n\t"\
321                 "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
322                 "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
323                 "pmulhw yCoeff, %%mm1           \n\t"\
324                 "pmulhw yCoeff, %%mm7           \n\t"\
325         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
326                 "paddw %%mm3, %%mm4             \n\t"\
327                 "movq %%mm2, %%mm0              \n\t"\
328                 "movq %%mm5, %%mm6              \n\t"\
329                 "movq %%mm4, %%mm3              \n\t"\
330                 "punpcklwd %%mm2, %%mm2         \n\t"\
331                 "punpcklwd %%mm5, %%mm5         \n\t"\
332                 "punpcklwd %%mm4, %%mm4         \n\t"\
333                 "paddw %%mm1, %%mm2             \n\t"\
334                 "paddw %%mm1, %%mm5             \n\t"\
335                 "paddw %%mm1, %%mm4             \n\t"\
336                 "punpckhwd %%mm0, %%mm0         \n\t"\
337                 "punpckhwd %%mm6, %%mm6         \n\t"\
338                 "punpckhwd %%mm3, %%mm3         \n\t"\
339                 "paddw %%mm7, %%mm0             \n\t"\
340                 "paddw %%mm7, %%mm6             \n\t"\
341                 "paddw %%mm7, %%mm3             \n\t"\
342                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
343                 "packuswb %%mm0, %%mm2          \n\t"\
344                 "packuswb %%mm6, %%mm5          \n\t"\
345                 "packuswb %%mm3, %%mm4          \n\t"\
346                 "pxor %%mm7, %%mm7              \n\t"
347
348 // do vertical chrominance interpolation
/*
 * YSCALEYUV2RGB1b: like YSCALEYUV2RGB1 (single luma line, no luma
 * interpolation) but the chroma is the average of two chroma lines:
 * uvbuf0 + uvbuf1 summed, then psrlw $5 (the >>4 scale combined with
 * the /2 of the average).  The unsigned shift carries an overflow
 * caveat, flagged in the inline FIXME below.  Exits with packed bytes
 * B in mm2, R in mm5, G in mm4 and mm7 = 0 for the WRITEBGR* macros.
 */
349 #define YSCALEYUV2RGB1b \
350                 "xorl %%eax, %%eax              \n\t"\
351                 ".balign 16                     \n\t"\
352                 "1:                             \n\t"\
353                 "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
354                 "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
355                 "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
356                 "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
357                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
358                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
359                 "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
360                 "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
361                 "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
362                 "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
363                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
364                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
365                 "pmulhw ugCoeff, %%mm3          \n\t"\
366                 "pmulhw vgCoeff, %%mm4          \n\t"\
367         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
368                 "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
369                 "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax]*/\
370                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
371                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
372                 "pmulhw ubCoeff, %%mm2          \n\t"\
373                 "pmulhw vrCoeff, %%mm5          \n\t"\
374                 "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
375                 "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
376                 "pmulhw yCoeff, %%mm1           \n\t"\
377                 "pmulhw yCoeff, %%mm7           \n\t"\
378         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
379                 "paddw %%mm3, %%mm4             \n\t"\
380                 "movq %%mm2, %%mm0              \n\t"\
381                 "movq %%mm5, %%mm6              \n\t"\
382                 "movq %%mm4, %%mm3              \n\t"\
383                 "punpcklwd %%mm2, %%mm2         \n\t"\
384                 "punpcklwd %%mm5, %%mm5         \n\t"\
385                 "punpcklwd %%mm4, %%mm4         \n\t"\
386                 "paddw %%mm1, %%mm2             \n\t"\
387                 "paddw %%mm1, %%mm5             \n\t"\
388                 "paddw %%mm1, %%mm4             \n\t"\
389                 "punpckhwd %%mm0, %%mm0         \n\t"\
390                 "punpckhwd %%mm6, %%mm6         \n\t"\
391                 "punpckhwd %%mm3, %%mm3         \n\t"\
392                 "paddw %%mm7, %%mm0             \n\t"\
393                 "paddw %%mm7, %%mm6             \n\t"\
394                 "paddw %%mm7, %%mm3             \n\t"\
395                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
396                 "packuswb %%mm0, %%mm2          \n\t"\
397                 "packuswb %%mm6, %%mm5          \n\t"\
398                 "packuswb %%mm3, %%mm4          \n\t"\
399                 "pxor %%mm7, %%mm7              \n\t"
400
/*
 * WRITEBGR32: interleave the packed planes produced by the YSCALEYUV2RGB*
 * macros (B bytes in mm2, G in mm4, R in mm5, mm7 = 0) into eight 4-byte
 * 0RGB pixels (B,G,R,0 byte order in memory) and store them with four
 * MOVNTQs at dest %4 + eax*4.  Advances %%eax by 8 pixels and jumps back
 * to label 1 (inside the converter macro above it) while eax < %5
 * (presumably the destination width -- bound at the asm() call site,
 * which is outside this chunk).
 */
401 #define WRITEBGR32 \
402                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
403                         "movq %%mm2, %%mm1              \n\t" /* B */\
404                         "movq %%mm5, %%mm6              \n\t" /* R */\
405                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
406                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
407                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
408                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
409                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
410                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
411                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
412                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
413                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
414                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
415 \
416                         MOVNTQ(%%mm0, (%4, %%eax, 4))\
417                         MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
418                         MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
419                         MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
420 \
421                         "addl $8, %%eax                 \n\t"\
422                         "cmpl %5, %%eax                 \n\t"\
423                         " jb 1b                         \n\t"
424
/*
 * WRITEBGR16: pack B/G/R bytes (mm2/mm4/mm5, mm7 = 0) into eight 16bpp
 * RGB565 pixels: masks keep the top 5/6/5 bits of B/G/R, the shifts
 * align each field, and the ORs merge them; two MOVNTQs store 8 pixels
 * at dest %4 + eax*2.  Loops back to label 1 (in the preceding
 * converter macro) while eax < %5.
 */
425 #define WRITEBGR16 \
426                         "pand bF8, %%mm2                \n\t" /* B */\
427                         "pand bFC, %%mm4                \n\t" /* G */\
428                         "pand bF8, %%mm5                \n\t" /* R */\
429                         "psrlq $3, %%mm2                \n\t"\
430 \
431                         "movq %%mm2, %%mm1              \n\t"\
432                         "movq %%mm4, %%mm3              \n\t"\
433 \
434                         "punpcklbw %%mm7, %%mm3         \n\t"\
435                         "punpcklbw %%mm5, %%mm2         \n\t"\
436                         "punpckhbw %%mm7, %%mm4         \n\t"\
437                         "punpckhbw %%mm5, %%mm1         \n\t"\
438 \
439                         "psllq $3, %%mm3                \n\t"\
440                         "psllq $3, %%mm4                \n\t"\
441 \
442                         "por %%mm3, %%mm2               \n\t"\
443                         "por %%mm4, %%mm1               \n\t"\
444 \
445                         MOVNTQ(%%mm2, (%4, %%eax, 2))\
446                         MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
447 \
448                         "addl $8, %%eax                 \n\t"\
449                         "cmpl %5, %%eax                 \n\t"\
450                         " jb 1b                         \n\t"
451
/*
 * WRITEBGR15: 15bpp (RGB555) sibling of WRITEBGR16: all three channels
 * are masked to their top 5 bits (bF8), R gets an extra >>1 and the
 * interleave shifts are $2 instead of $3 to place the 5-bit fields;
 * otherwise identical structure -- two MOVNTQs store 8 pixels at
 * dest %4 + eax*2, looping to label 1 while eax < %5.
 */
452 #define WRITEBGR15 \
453                         "pand bF8, %%mm2                \n\t" /* B */\
454                         "pand bF8, %%mm4                \n\t" /* G */\
455                         "pand bF8, %%mm5                \n\t" /* R */\
456                         "psrlq $3, %%mm2                \n\t"\
457                         "psrlq $1, %%mm5                \n\t"\
458 \
459                         "movq %%mm2, %%mm1              \n\t"\
460                         "movq %%mm4, %%mm3              \n\t"\
461 \
462                         "punpcklbw %%mm7, %%mm3         \n\t"\
463                         "punpcklbw %%mm5, %%mm2         \n\t"\
464                         "punpckhbw %%mm7, %%mm4         \n\t"\
465                         "punpckhbw %%mm5, %%mm1         \n\t"\
466 \
467                         "psllq $2, %%mm3                \n\t"\
468                         "psllq $2, %%mm4                \n\t"\
469 \
470                         "por %%mm3, %%mm2               \n\t"\
471                         "por %%mm4, %%mm1               \n\t"\
472 \
473                         MOVNTQ(%%mm2, (%4, %%eax, 2))\
474                         MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
475 \
476                         "addl $8, %%eax                 \n\t"\
477                         "cmpl %5, %%eax                 \n\t"\
478                         " jb 1b                         \n\t"
479
/*
 * WRITEBGR24OLD: legacy 24bpp packer (the "OLD" suffix and the
 * WRITEBGR24MMX variant below suggest it is kept for reference).
 * First builds four 0RGB dwords exactly like WRITEBGR32, then uses the
 * bm* bit-mask constants (declared elsewhere in this file) plus shifts
 * and ORs to squeeze out every 4th padding byte, producing 24
 * contiguous BGR bytes for 8 pixels.  Stores go through %%ebx, which
 * the macro advances by 24 each iteration (the pixel counter %%eax
 * cannot address 3-byte strides with a scale factor); loops to label 1
 * while eax < %5.
 */
480 #define WRITEBGR24OLD \
481                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
482                         "movq %%mm2, %%mm1              \n\t" /* B */\
483                         "movq %%mm5, %%mm6              \n\t" /* R */\
484                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
485                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
486                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
487                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
488                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
489                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
490                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
491                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
492                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
493                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
494 \
495                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
496                         "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
497                         "pand bm00000111, %%mm4         \n\t" /* 00000RGB 0 */\
498                         "pand bm11111000, %%mm0         \n\t" /* 00RGB000 0.5 */\
499                         "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
500                         "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
501                         "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
502                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
503 \
504                         "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
505                         "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
506                         "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
507                         "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
508                         "pand bm00001111, %%mm2         \n\t" /* 0000RGBR 1 */\
509                         "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
510                         "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
511                         "pand bm00000111, %%mm4         \n\t" /* 00000RGB 2 */\
512                         "pand bm11111000, %%mm1         \n\t" /* 00RGB000 2.5 */\
513                         "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
514                         "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
515                         "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
516                         "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
517 \
518                         "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
519                         "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
520                         "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
521                         "pand bm00000111, %%mm5         \n\t" /* 00000RGB 3 */\
522                         "pand bm11111000, %%mm3         \n\t" /* 00RGB000 3.5 */\
523                         "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
524                         "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
525                         "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
526 \
527                         MOVNTQ(%%mm0, (%%ebx))\
528                         MOVNTQ(%%mm2, 8(%%ebx))\
529                         MOVNTQ(%%mm3, 16(%%ebx))\
530                         "addl $24, %%ebx                \n\t"\
531 \
532                         "addl $8, %%eax                 \n\t"\
533                         "cmpl %5, %%eax                 \n\t"\
534                         " jb 1b                         \n\t"
535
/* Pack 8 pixels of planar B/G/R bytes (mm2=B, mm4=G, mm5=R, mm7=0, as left
 * by the YSCALEYUV2RGB* macros) into 24 bytes of interleaved BGR24 at
 * (%%ebx), then advance the output pointer (ebx += 24), the pixel counter
 * (eax += 8) and jump back to local label "1" while eax < %5 (dstW).
 * Plain-MMX variant: builds the 3-byte groups with shift/mask/or only.
 * Clobbers mm0-mm7. */
#define WRITEBGR24MMX \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
			"movq %%mm2, %%mm1		\n\t" /* B */\
			"movq %%mm5, %%mm6		\n\t" /* R */\
			"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
			"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
			"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
			"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
			"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
			"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
			"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
			"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
			"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
			"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
			"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
			"movq %%mm2, %%mm6		\n\t" /* 0RGB0RGB 1 */\
			"movq %%mm1, %%mm5		\n\t" /* 0RGB0RGB 2 */\
			"movq %%mm3, %%mm7		\n\t" /* 0RGB0RGB 3 */\
\
			"psllq $40, %%mm0		\n\t" /* RGB00000 0 */\
			"psllq $40, %%mm2		\n\t" /* RGB00000 1 */\
			"psllq $40, %%mm1		\n\t" /* RGB00000 2 */\
			"psllq $40, %%mm3		\n\t" /* RGB00000 3 */\
\
			"punpckhdq %%mm4, %%mm0		\n\t" /* 0RGBRGB0 0 */\
			"punpckhdq %%mm6, %%mm2		\n\t" /* 0RGBRGB0 1 */\
			"punpckhdq %%mm5, %%mm1		\n\t" /* 0RGBRGB0 2 */\
			"punpckhdq %%mm7, %%mm3		\n\t" /* 0RGBRGB0 3 */\
\
			"psrlq $8, %%mm0		\n\t" /* 00RGBRGB 0 */\
			"movq %%mm2, %%mm6		\n\t" /* 0RGBRGB0 1 */\
			"psllq $40, %%mm2		\n\t" /* GB000000 1 */\
			"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
			MOVNTQ(%%mm0, (%%ebx))\
\
			"psrlq $24, %%mm6		\n\t" /* 0000RGBR 1 */\
			"movq %%mm1, %%mm5		\n\t" /* 0RGBRGB0 2 */\
			"psllq $24, %%mm1		\n\t" /* BRGB0000 2 */\
			"por %%mm1, %%mm6		\n\t" /* BRGBRGBR 1 */\
			MOVNTQ(%%mm6, 8(%%ebx))\
\
			"psrlq $40, %%mm5		\n\t" /* 000000RG 2 */\
			"psllq $8, %%mm3		\n\t" /* RGBRGB00 3 */\
			"por %%mm3, %%mm5		\n\t" /* RGBRGBRG 2 */\
			MOVNTQ(%%mm5, 16(%%ebx))\
\
			"addl $24, %%ebx		\n\t"\
\
			"addl $8, %%eax			\n\t"\
			"cmpl %5, %%eax			\n\t"\
			" jb 1b				\n\t"
588
/* Same contract as WRITEBGR24MMX (mm2=B, mm4=G, mm5=R, mm7=0 -> 24 bytes of
 * BGR24 at (%%ebx), ebx += 24, eax += 8, loop to label "1" while eax < %5),
 * but implemented with the MMX2 "pshufw" word shuffle and the M24A/M24B/M24C
 * byte-select masks (defined elsewhere in this file), needing fewer
 * shift/or steps per output quadword. Clobbers mm0, mm1, mm3, mm4, mm6, mm7. */
#define WRITEBGR24MMX2 \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
			"movq M24A, %%mm0		\n\t"\
			"movq M24C, %%mm7		\n\t"\
			"pshufw $0x50, %%mm2, %%mm1	\n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
			"pshufw $0x50, %%mm4, %%mm3	\n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
			"pshufw $0x00, %%mm5, %%mm6	\n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
			"pand %%mm0, %%mm1		\n\t" /*    B2        B1       B0 */\
			"pand %%mm0, %%mm3		\n\t" /*    G2        G1       G0 */\
			"pand %%mm7, %%mm6		\n\t" /*       R1        R0       */\
\
			"psllq $8, %%mm3		\n\t" /* G2        G1       G0    */\
			"por %%mm1, %%mm6		\n\t"\
			"por %%mm3, %%mm6		\n\t"\
			MOVNTQ(%%mm6, (%%ebx))\
\
			"psrlq $8, %%mm4		\n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
			"pshufw $0xA5, %%mm2, %%mm1	\n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
			"pshufw $0x55, %%mm4, %%mm3	\n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
			"pshufw $0xA5, %%mm5, %%mm6	\n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
			"pand M24B, %%mm1		\n\t" /* B5       B4        B3    */\
			"pand %%mm7, %%mm3		\n\t" /*       G4        G3       */\
			"pand %%mm0, %%mm6		\n\t" /*    R4        R3       R2 */\
\
			"por %%mm1, %%mm3		\n\t" /* B5    G4 B4     G3 B3    */\
			"por %%mm3, %%mm6		\n\t"\
			MOVNTQ(%%mm6, 8(%%ebx))\
\
			"pshufw $0xFF, %%mm2, %%mm1	\n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
			"pshufw $0xFA, %%mm4, %%mm3	\n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
			"pshufw $0xFA, %%mm5, %%mm6	\n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
			"pand %%mm7, %%mm1		\n\t" /*       B7        B6       */\
			"pand %%mm0, %%mm3		\n\t" /*    G7        G6       G5 */\
			"pand M24B, %%mm6		\n\t" /* R7       R6        R5    */\
\
			"por %%mm1, %%mm3		\n\t"\
			"por %%mm3, %%mm6		\n\t"\
			MOVNTQ(%%mm6, 16(%%ebx))\
\
			"addl $24, %%ebx		\n\t"\
\
			"addl $8, %%eax			\n\t"\
			"cmpl %5, %%eax			\n\t"\
			" jb 1b				\n\t"
636
/* Pick the BGR24 writer for this build: the pshufw-based variant requires
 * MMX2; otherwise fall back to the plain-MMX shift/mask version. Both have
 * identical register/operand contracts, so users of WRITEBGR24 need not care. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif
644
/**
 * Vertically scale (multi-tap filter) into planar YV12 output.
 * Luma: dstW pixels filtered from lumFilterSize source lines (lumSrc) with
 * coefficients lumFilter; chroma analogous at half width (dstW>>1), with V
 * stored 4096 int16 elements (8192 bytes) after U in each chrSrc line.
 * uDest==NULL skips the chroma passes entirely.
 * lumMmxFilter/chrMmxFilter are the coefficients pre-expanded for the MMX
 * loop; the asm is passed the *ends* of the pointer/coefficient arrays and
 * a negative count so it can index upward toward zero.
 */
static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
                                    int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
#ifdef HAVE_MMX
	if(uDest != NULL)
	{
		/* U plane: macro argument 0 = byte offset into each chrSrc line */
		asm volatile(
				YSCALEYUV2YV12X(0)
				:: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
				"r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
				: "%eax", "%edx", "%esi"
			);

		/* V plane: +4096 byte offset selects the V half of the chroma line */
		asm volatile(
				YSCALEYUV2YV12X(4096)
				:: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
				"r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
				: "%eax", "%edx", "%esi"
			);
	}

	/* Y plane, full width */
	asm volatile(
			YSCALEYUV2YV12X(0)
			:: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
			   "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
		);
#else
/* portable fallback with identical semantics */
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW);
#endif
}
680
681 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
682                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
683 {
684 #ifdef HAVE_MMX
685         if(uDest != NULL)
686         {
687                 asm volatile(
688                                 YSCALEYUV2YV121
689                                 :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
690                                 "g" (-(dstW>>1))
691                                 : "%eax"
692                         );
693
694                 asm volatile(
695                                 YSCALEYUV2YV121
696                                 :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
697                                 "g" (-(dstW>>1))
698                                 : "%eax"
699                         );
700         }
701
702         asm volatile(
703                 YSCALEYUV2YV121
704                 :: "r" (lumSrc + dstW), "r" (dest + dstW),
705                 "g" (-dstW)
706                 : "%eax"
707         );
708 #else
709         //FIXME Optimize (just quickly writen not opti..)
710         //FIXME replace MINMAX with LUTs
711         int i;
712         for(i=0; i<dstW; i++)
713         {
714                 int val= lumSrc[i]>>7;
715
716                 dest[i]= MIN(MAX(val>>19, 0), 255);
717         }
718
719         if(uDest != NULL)
720                 for(i=0; i<(dstW>>1); i++)
721                 {
722                         int u=chrSrc[i]>>7;
723                         int v=chrSrc[i + 2048]>>7;
724
725                         uDest[i]= MIN(MAX(u>>19, 0), 255);
726                         vDest[i]= MIN(MAX(v>>19, 0), 255);
727                 }
728 #endif
729 }
730
731
732 /**
733  * vertical scale YV12 to RGB
734  */
/*
 * Vertically scale (multi-tap filter) YV12 lines and convert to packed
 * BGR in one pass. dstbpp selects the writer macro (32/24/16/15 bpp).
 * As in yuv2yuvX, the asm receives end-of-array pointers and negated filter
 * sizes so the inner loops count upward toward zero. The fullUVIpol branch
 * is not implemented here (FIXME below); only the subsampled-chroma path
 * exists.
 */
static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                            uint8_t *dest, int dstW, int dstbpp, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
	if(fullUVIpol)
	{
//FIXME
	}//FULL_UV_IPOL
	else
	{
#ifdef HAVE_MMX
		if(dstbpp == 32) //FIXME untested
		{
			asm volatile(
				YSCALEYUV2RGBX
				WRITEBGR32

			:: "m" (-lumFilterSize), "m" (-chrFilterSize),
			   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
			   "r" (dest), "m" (dstW),
			   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
			: "%eax", "%ebx", "%ecx", "%edx", "%esi"
			);
		}
		else if(dstbpp==24) //FIXME untested
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* WRITEBGR24 writes through ebx: point it at
				   dest + 3*eax (eax = current pixel index) */
				"leal (%%eax, %%eax, 2), %%ebx	\n\t" //FIXME optimize
				"addl %4, %%ebx			\n\t"
				WRITEBGR24

			:: "m" (-lumFilterSize), "m" (-chrFilterSize),
			   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
			   "r" (dest), "m" (dstW),
			   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
			: "%eax", "%ebx", "%ecx", "%edx", "%esi"
			);
		}
		else if(dstbpp==15)
		{
			asm volatile(
				YSCALEYUV2RGBX
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				/* add ordered-dither bias before the 5/5/5 truncation */
				"paddusb b5Dither, %%mm2	\n\t"
				"paddusb g5Dither, %%mm4	\n\t"
				"paddusb r5Dither, %%mm5	\n\t"
#endif

				WRITEBGR15

			:: "m" (-lumFilterSize), "m" (-chrFilterSize),
			   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
			   "r" (dest), "m" (dstW),
			   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
			: "%eax", "%ebx", "%ecx", "%edx", "%esi"
			);
		}
		else if(dstbpp==16)
		{
			asm volatile(
				YSCALEYUV2RGBX
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				/* 5/6/5: green uses the 6-bit dither table */
				"paddusb b5Dither, %%mm2	\n\t"
				"paddusb g6Dither, %%mm4	\n\t"
				"paddusb r5Dither, %%mm5	\n\t"
#endif

				WRITEBGR16

			:: "m" (-lumFilterSize), "m" (-chrFilterSize),
			   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
			   "r" (dest), "m" (dstW),
			   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
			: "%eax", "%ebx", "%ecx", "%edx", "%esi"
			);
		}
#else
/* portable fallback with identical semantics */
yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, dstW, dstbpp);

#endif
	} //!FULL_UV_IPOL
}
822
823
824 /**
825  * vertical bilinear scale YV12 to RGB
826  */
/*
 * Blend two source lines with weights yalpha/uvalpha (12-bit, 0..4095;
 * yalpha1/uvalpha1 are the complements via ^4095) and convert to packed BGR.
 * fullUVIpol selects full-resolution chroma interpolation; otherwise chroma
 * is sampled at half horizontal resolution (one U/V pair per 2 luma pixels).
 * NOTE(review): the non-MMX fallbacks loop `i < dstW-1` stepping by 2, so an
 * odd dstW leaves the last pixel unwritten, and the 32 bpp C paths never
 * write the 4th (alpha/pad) byte — confirm callers tolerate both.
 */
static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstbpp)
{
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;

	if(fullUVIpol)
	{

#ifdef HAVE_MMX
		if(dstbpp == 32)
		{
			asm volatile(


FULL_YSCALEYUV2RGB
			"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0

			"movq %%mm3, %%mm1		\n\t"
			"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0

			MOVNTQ(%%mm3, (%4, %%eax, 4))
			MOVNTQ(%%mm1, 8(%4, %%eax, 4))

			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"


			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==24)
		{
			asm volatile(

FULL_YSCALEYUV2RGB

							// lsb ... msb
			"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0

			"movq %%mm3, %%mm1		\n\t"
			"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0

			/* squeeze the two BGR0 dwords into 3-byte groups */
			"movq %%mm3, %%mm2		\n\t" // BGR0BGR0
			"psrlq $8, %%mm3		\n\t" // GR0BGR00
			"pand bm00000111, %%mm2		\n\t" // BGR00000
			"pand bm11111000, %%mm3		\n\t" // 000BGR00
			"por %%mm2, %%mm3		\n\t" // BGRBGR00
			"movq %%mm1, %%mm2		\n\t"
			"psllq $48, %%mm1		\n\t" // 000000BG
			"por %%mm1, %%mm3		\n\t" // BGRBGRBG

			"movq %%mm2, %%mm1		\n\t" // BGR0BGR0
			"psrld $16, %%mm2		\n\t" // R000R000
			"psrlq $24, %%mm1		\n\t" // 0BGR0000
			"por %%mm2, %%mm1		\n\t" // RBGRR000

			/* ebx = dest + eax, so (ebx, eax, 2) addresses dest + 3*eax */
			"movl %4, %%ebx			\n\t"
			"addl %%eax, %%ebx		\n\t"

#ifdef HAVE_MMX2
			//FIXME Alignment
			"movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
			"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
			"movd %%mm3, (%%ebx, %%eax, 2)	\n\t"
			"psrlq $32, %%mm3		\n\t"
			"movd %%mm3, 4(%%ebx, %%eax, 2)	\n\t"
			"movd %%mm1, 8(%%ebx, %%eax, 2)	\n\t"
#endif
			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
		}
		else if(dstbpp==15)
		{
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			/* dither bias before 5/5/5 truncation */
			"paddusb g5Dither, %%mm1	\n\t"
			"paddusb r5Dither, %%mm0	\n\t"
			"paddusb b5Dither, %%mm3	\n\t"
#endif
			"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R

			"psrlw $3, %%mm3		\n\t"
			"psllw $2, %%mm1		\n\t"
			"psllw $7, %%mm0		\n\t"
			"pand g15Mask, %%mm1		\n\t"
			"pand r15Mask, %%mm0		\n\t"

			"por %%mm3, %%mm1		\n\t"
			"por %%mm1, %%mm0		\n\t"

			MOVNTQ(%%mm0, (%4, %%eax, 2))

			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==16)
		{
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			/* 5/6/5: green uses the 6-bit dither table */
			"paddusb g6Dither, %%mm1	\n\t"
			"paddusb r5Dither, %%mm0	\n\t"
			"paddusb b5Dither, %%mm3	\n\t"
#endif
			"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R

			"psrlw $3, %%mm3		\n\t"
			"psllw $3, %%mm1		\n\t"
			"psllw $8, %%mm0		\n\t"
			"pand g16Mask, %%mm1		\n\t"
			"pand r16Mask, %%mm0		\n\t"

			"por %%mm3, %%mm1		\n\t"
			"por %%mm1, %%mm0		\n\t"

			MOVNTQ(%%mm0, (%4, %%eax, 2))

			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
#else
		if(dstbpp==32 || dstbpp==24)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+=dstbpp>>3;	// 3 or 4 bytes; byte 3 untouched at 32 bpp
			}
		}
		else if(dstbpp==16)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table16r[(Y + yuvtab_3343[V]) >>13];
			}
		}
		else if(dstbpp==15)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table15r[(Y + yuvtab_3343[V]) >>13];
			}
		}
#endif
	}//FULL_UV_IPOL
	else
	{
#ifdef HAVE_MMX
		if(dstbpp == 32)
		{
			asm volatile(
				YSCALEYUV2RGB
				WRITEBGR32

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==24)
		{
			asm volatile(
				/* WRITEBGR24 expects the output pointer in ebx */
				"movl %4, %%ebx			\n\t"
				YSCALEYUV2RGB
				WRITEBGR24

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
		}
		else if(dstbpp==15)
		{
			asm volatile(
				YSCALEYUV2RGB
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb b5Dither, %%mm2	\n\t"
				"paddusb g5Dither, %%mm4	\n\t"
				"paddusb r5Dither, %%mm5	\n\t"
#endif

				WRITEBGR15

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==16)
		{
			asm volatile(
				YSCALEYUV2RGB
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb b5Dither, %%mm2	\n\t"
				"paddusb g6Dither, %%mm4	\n\t"
				"paddusb r5Dither, %%mm5	\n\t"
#endif

				WRITEBGR16

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
#else
		if(dstbpp==32)
		{
			int i;
			for(i=0; i<dstW-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				// chroma contributions shared by both luma samples
				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
				dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
				dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

				dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
				dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
				dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
			}
		}
		else if(dstbpp==24)
		{
			int i;
			for(i=0; i<dstW-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				dest[0]=clip_table[((Y1 + Cb) >>13)];
				dest[1]=clip_table[((Y1 + Cg) >>13)];
				dest[2]=clip_table[((Y1 + Cr) >>13)];

				dest[3]=clip_table[((Y2 + Cb) >>13)];
				dest[4]=clip_table[((Y2 + Cg) >>13)];
				dest[5]=clip_table[((Y2 + Cr) >>13)];
				dest+=6;
			}
		}
		else if(dstbpp==16)
		{
			int i;
			for(i=0; i<dstW-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				((uint16_t*)dest)[i] =
					clip_table16b[(Y1 + Cb) >>13] |
					clip_table16g[(Y1 + Cg) >>13] |
					clip_table16r[(Y1 + Cr) >>13];

				((uint16_t*)dest)[i+1] =
					clip_table16b[(Y2 + Cb) >>13] |
					clip_table16g[(Y2 + Cg) >>13] |
					clip_table16r[(Y2 + Cr) >>13];
			}
		}
		else if(dstbpp==15)
		{
			int i;
			for(i=0; i<dstW-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				((uint16_t*)dest)[i] =
					clip_table15b[(Y1 + Cb) >>13] |
					clip_table15g[(Y1 + Cg) >>13] |
					clip_table15r[(Y1 + Cr) >>13];

				((uint16_t*)dest)[i+1] =
					clip_table15b[(Y2 + Cb) >>13] |
					clip_table15g[(Y2 + Cg) >>13] |
					clip_table15r[(Y2 + Cr) >>13];
			}
		}
#endif
	} //!FULL_UV_IPOL
}
1191
/**
 * YV12 to RGB without scaling or interpolating.
 *
 * Converts one output line: luma is taken from buf0 only (yalpha1 is forced
 * to 0, i.e. no vertical luma interpolation), while chroma is blended between
 * uvbuf0 and uvbuf1 with weight uvalpha (12-bit fixed point; uvalpha1 is the
 * complementary weight, uvalpha^4095 == 4095-uvalpha for uvalpha in [0,4095]).
 * dstbpp selects the packed output format: 32/24 bpp BGR, 16 bpp RGB565,
 * 15 bpp RGB555.
 */
static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstW, int uvalpha, int dstbpp)
{
        int uvalpha1=uvalpha^4095;      // complementary chroma weight (4095 - uvalpha)
        const int yalpha1=0;            // no vertical luma interpolation in this path

        // full chroma interpolation requested -> delegate to the generic 2-row version
        if(fullUVIpol || allwaysIpol)
        {
                RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp);
                return;
        }

#ifdef HAVE_MMX
        // uvalpha < 2048: output row is closer to uvbuf0, so use it alone
        // (YSCALEYUV2RGB1); otherwise average both chroma rows (YSCALEYUV2RGB1b).
        if( uvalpha < 2048 ) // note this is not exact (shifts chrominance by 0.5 pixels) but it is a bit faster
        {
                if(dstbpp == 32)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                                WRITEBGR32
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        // WRITEBGR24 expects the destination pointer in %ebx
                        asm volatile(
                                "movl %4, %%ebx                 \n\t"
                                YSCALEYUV2RGB1
                                WRITEBGR24
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g5Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif
                                WRITEBGR15
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g6Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif

                                WRITEBGR16
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
        }
        else
        {
                // uvalpha >= 2048: YSCALEYUV2RGB1b averages uvbuf0 and uvbuf1
                if(dstbpp == 32)
                {
                        asm volatile(
                                YSCALEYUV2RGB1b
                                WRITEBGR32
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        // WRITEBGR24 expects the destination pointer in %ebx
                        asm volatile(
                                "movl %4, %%ebx                 \n\t"
                                YSCALEYUV2RGB1b
                                WRITEBGR24
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(
                                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g5Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif
                                WRITEBGR15
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(
                                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g6Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif

                                WRITEBGR16
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
        }
#else
//FIXME write 2 versions (for even & odd lines)

        // C fallback. NOTE(review): every loop below processes pixel pairs
        // (i < dstW-1, i += 2), so the last pixel is skipped when dstW is odd.
        if(dstbpp==32)
        {
                int i;
                for(i=0; i<dstW-1; i+=2){
                        // yuv2rgb via lookup tables; chroma still blended vertically:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        // NOTE(review): bytes 4*i+3 / 4*i+7 (alpha) are left unwritten
                        dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
                        dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
                        dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

                        dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
                        dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
                        dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
                }
        }
        else if(dstbpp==24)
        {
                int i;
                for(i=0; i<dstW-1; i+=2){
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        dest[0]=clip_table[((Y1 + Cb) >>13)];
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
                        dest[2]=clip_table[((Y1 + Cr) >>13)];

                        dest[3]=clip_table[((Y2 + Cb) >>13)];
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
                        dest+=6;        // 24bpp path advances the pointer instead of indexing
                }
        }
        else if(dstbpp==16)
        {
                int i;
                for(i=0; i<dstW-1; i+=2){
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        // clip_table16* tables return the component pre-shifted into its
                        // RGB565 bit position, so a plain OR assembles the pixel
                        ((uint16_t*)dest)[i] =
                                clip_table16b[(Y1 + Cb) >>13] |
                                clip_table16g[(Y1 + Cg) >>13] |
                                clip_table16r[(Y1 + Cr) >>13];

                        ((uint16_t*)dest)[i+1] =
                                clip_table16b[(Y2 + Cb) >>13] |
                                clip_table16g[(Y2 + Cg) >>13] |
                                clip_table16r[(Y2 + Cr) >>13];
                }
        }
        else if(dstbpp==15)
        {
                int i;
                for(i=0; i<dstW-1; i+=2){
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        ((uint16_t*)dest)[i] =
                                clip_table15b[(Y1 + Cb) >>13] |
                                clip_table15g[(Y1 + Cg) >>13] |
                                clip_table15r[(Y1 + Cr) >>13];

                        ((uint16_t*)dest)[i+1] =
                                clip_table15b[(Y2 + Cb) >>13] |
                                clip_table15g[(Y2 + Cg) >>13] |
                                clip_table15r[(Y2 + Cr) >>13];
                }
        }
#endif
}
1424
// Bilinear / Bicubic horizontal scaling:
// dst[i] = clip( sum_j src[filterPos[i]+j] * filter[i*filterSize+j] >> 7 )
// src is 8-bit, filter coefficients and dst are 16-bit fixed point.
// xInc/srcW are unused by the generic path here (the filter tables already
// encode the scale factor).
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
        // All MMX variants run a counter from -2*dstW up to 0 so the loop can
        // terminate on "jnc" (carry clear after addl); the filter/filterPos/dst
        // pointers are pre-biased so (base + counter) addresses element 0.
        // Two output pixels are produced per iteration.
        if(filterSize==4) // always true for upscaling, sometimes for down too
        {
                int counter= -2*dstW;
                filter-= counter*2;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "movq w02, %%mm6                \n\t"
                        "pushl %%ebp                    \n\t" // we use 7 regs here ...
                        "movl %%eax, %%ebp              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        "movzwl (%2, %%ebp), %%eax      \n\t"
                        "movzwl 2(%2, %%ebp), %%ebx     \n\t"
                        "movq (%1, %%ebp, 4), %%mm1     \n\t"
                        "movq 8(%1, %%ebp, 4), %%mm3    \n\t"
                        "movd (%3, %%eax), %%mm0        \n\t"
                        "movd (%3, %%ebx), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm0         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm0           \n\t"
                        "pmaddwd %%mm2, %%mm3           \n\t"
                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm3                \n\t"
                        "packssdw %%mm3, %%mm0          \n\t"
                        "pmaddwd %%mm6, %%mm0           \n\t"
                        "packssdw %%mm0, %%mm0          \n\t"
                        "movd %%mm0, (%4, %%ebp)        \n\t"
                        "addl $4, %%ebp                 \n\t"
                        " jnc 1b                        \n\t"

                        "popl %%ebp                     \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
                        : "%ebx"
                );
        }
        else if(filterSize==8)
        {
                int counter= -2*dstW;
                filter-= counter*4;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "movq w02, %%mm6                \n\t"
                        "pushl %%ebp                    \n\t" // we use 7 regs here ...
                        "movl %%eax, %%ebp              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        "movzwl (%2, %%ebp), %%eax      \n\t"
                        "movzwl 2(%2, %%ebp), %%ebx     \n\t"
                        "movq (%1, %%ebp, 8), %%mm1     \n\t"
                        "movq 16(%1, %%ebp, 8), %%mm3   \n\t"
                        "movd (%3, %%eax), %%mm0        \n\t"
                        "movd (%3, %%ebx), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm0         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm0           \n\t"
                        "pmaddwd %%mm2, %%mm3           \n\t"

                        // second 4 taps of each pixel, accumulated into mm0/mm3
                        "movq 8(%1, %%ebp, 8), %%mm1    \n\t"
                        "movq 24(%1, %%ebp, 8), %%mm5   \n\t"
                        "movd 4(%3, %%eax), %%mm4       \n\t"
                        "movd 4(%3, %%ebx), %%mm2       \n\t"
                        "punpcklbw %%mm7, %%mm4         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm4           \n\t"
                        "pmaddwd %%mm2, %%mm5           \n\t"
                        "paddd %%mm4, %%mm0             \n\t"
                        "paddd %%mm5, %%mm3             \n\t"

                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm3                \n\t"
                        "packssdw %%mm3, %%mm0          \n\t"
                        "pmaddwd %%mm6, %%mm0           \n\t"
                        "packssdw %%mm0, %%mm0          \n\t"
                        "movd %%mm0, (%4, %%ebp)        \n\t"
                        "addl $4, %%ebp                 \n\t"
                        " jnc 1b                        \n\t"

                        "popl %%ebp                     \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
                        : "%ebx"
                );
        }
        else
        {
                // generic filter size: inner loop (label 2) walks 4 taps per
                // step for two pixels at once, outer loop (label 1) per pixel pair
                int counter= -2*dstW;
//              filter-= counter*filterSize/2;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "movq w02, %%mm6                \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        "movl %2, %%ecx                 \n\t"
                        "movzwl (%%ecx, %0), %%eax      \n\t"
                        "movzwl 2(%%ecx, %0), %%ebx     \n\t"
                        "movl %5, %%ecx                 \n\t"
                        "pxor %%mm4, %%mm4              \n\t"
                        "pxor %%mm5, %%mm5              \n\t"
                        "2:                             \n\t"
                        "movq (%1), %%mm1               \n\t"
                        "movq (%1, %6), %%mm3           \n\t"
                        "movd (%%ecx, %%eax), %%mm0     \n\t"
                        "movd (%%ecx, %%ebx), %%mm2     \n\t"
                        "punpcklbw %%mm7, %%mm0         \n\t"
                        "punpcklbw %%mm7, %%mm2         \n\t"
                        "pmaddwd %%mm1, %%mm0           \n\t"
                        "pmaddwd %%mm2, %%mm3           \n\t"
                        "paddd %%mm3, %%mm5             \n\t"
                        "paddd %%mm0, %%mm4             \n\t"
                        "addl $8, %1                    \n\t"
                        "addl $4, %%ecx                 \n\t"
                        "cmpl %4, %%ecx                 \n\t"
                        " jb 2b                         \n\t"
                        "addl %6, %1                    \n\t"
                        "psrad $8, %%mm4                \n\t"
                        "psrad $8, %%mm5                \n\t"
                        "packssdw %%mm5, %%mm4          \n\t"
                        "pmaddwd %%mm6, %%mm4           \n\t"
                        "packssdw %%mm4, %%mm4          \n\t"
                        "movl %3, %%eax                 \n\t"
                        "movd %%mm4, (%%eax, %0)        \n\t"
                        "addl $4, %0                    \n\t"
                        " jnc 1b                        \n\t"

                        : "+r" (counter), "+r" (filter)
                        : "m" (filterPos), "m" (dst), "m"(src+filterSize),
                          "m" (src), "r" (filterSize*2)
                        : "%ebx", "%eax", "%ecx"
                );
        }
#else
        // portable C reference implementation
        int i;
        for(i=0; i<dstW; i++)
        {
                int j;
                int srcPos= filterPos[i];
                int val=0;
//              printf("filterPos: %d\n", filterPos[i]);
                for(j=0; j<filterSize; j++)
                {
//                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
                }
//              filter += hFilterSize;
                dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//              dst[i] = val>>7;
        }
#endif
}
      // *** horizontal scale Y line to temp buffer
      // Scales one 8-bit luma line of srcW pixels to dstWidth output samples
      // in dst, in 16-bit fixed point (output value == luma * 128, i.e. 7
      // fractional bits). xInc is the 16.16 fixed-point source step.
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc)
{
#ifdef HAVE_MMX
        // use the new MMX scaler if the MMX2 one cannot be used (it is faster than the x86 asm one)
    if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
#else
    if(sws_flags != SWS_FAST_BILINEAR)
#endif
    {
        // precomputed-filter path (bilinear/bicubic, exact)
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                // MMX2 path: calls runtime-generated code at funnyYCode; the
                // prologue spreads xInc into mm2/mm6 as per-lane phase steps
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "pxor %%mm2, %%mm2              \n\t" // 2*xalpha
                        "movd %5, %%mm6                 \n\t" // xInc&0xFFFF
                        "punpcklwd %%mm6, %%mm6         \n\t"
                        "punpcklwd %%mm6, %%mm6         \n\t"
                        "movq %%mm6, %%mm2              \n\t"
                        "psllq $16, %%mm2               \n\t"
                        "paddw %%mm6, %%mm2             \n\t"
                        "psllq $16, %%mm2               \n\t"
                        "paddw %%mm6, %%mm2             \n\t"
                        "psllq $16, %%mm2               \n\t" //0,t,2t,3t               t=xInc&0xFFFF
                        "movq %%mm2, temp0              \n\t"
                        "movd %4, %%mm6                 \n\t" //(xInc*4)&0xFFFF
                        "punpcklwd %%mm6, %%mm6         \n\t"
                        "punpcklwd %%mm6, %%mm6         \n\t"
                        "xorl %%eax, %%eax              \n\t" // i
                        "movl %0, %%esi                 \n\t" // src
                        "movl %1, %%edi                 \n\t" // buf1
                        "movl %3, %%edx                 \n\t" // (xInc*4)>>16
                        "xorl %%ecx, %%ecx              \n\t"
                        "xorl %%ebx, %%ebx              \n\t"
                        "movw %4, %%bx                  \n\t" // (xInc*4)&0xFFFF

#define FUNNY_Y_CODE \
                        PREFETCH" 1024(%%esi)           \n\t"\
                        PREFETCH" 1056(%%esi)           \n\t"\
                        PREFETCH" 1088(%%esi)           \n\t"\
                        "call funnyYCode                \n\t"\
                        "movq temp0, %%mm2              \n\t"\
                        "xorl %%ecx, %%ecx              \n\t"

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                        :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                        "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
                );
                // tail fix-up: output pixels whose source position would read at or
                // past srcW-1 are filled with the last source pixel (times 128)
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
        //NO MMX just normal asm ...
        // 16.16 stepping via addw/adcl carry chain, two pixels per iteration;
        // result per pixel: (src[xx]<<16 + (src[xx+1]-src[xx])*xalpha) >> 9
        asm volatile(
                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
                ".balign 16                     \n\t"
                "1:                             \n\t"
                "movzbl  (%0, %%ebx), %%edi     \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, (%%edi, %%eax, 2)   \n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry

                "movzbl (%0, %%ebx), %%edi      \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, 2(%%edi, %%eax, 2)  \n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry


                "addl $2, %%eax                 \n\t"
                "cmpl %2, %%eax                 \n\t"
                " jb 1b                         \n\t"


                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 cant be used
#endif
#else
        // portable C fallback: straightforward 16.16 fixed-point bilinear
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;  // 7-bit blend factor
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
                xpos+=xInc;
        }
#endif
    }
}
1711
1712 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth,
1713                                 uint8_t *src1, uint8_t *src2, int srcW, int xInc)
1714 {
1715 #ifdef HAVE_MMX
1716         // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
1717     if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
1718 #else
1719     if(sws_flags != SWS_FAST_BILINEAR)
1720 #endif
1721     {
1722         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1723         RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1724     }
1725     else // Fast Bilinear upscale / crap downscale
1726     {
1727 #ifdef ARCH_X86
1728 #ifdef HAVE_MMX2
1729         int i;
1730         if(canMMX2BeUsed)
1731         {
1732                 asm volatile(
1733                 "pxor %%mm7, %%mm7              \n\t"
1734                 "pxor %%mm2, %%mm2              \n\t" // 2*xalpha
1735                 "movd %5, %%mm6                 \n\t" // xInc&0xFFFF
1736                 "punpcklwd %%mm6, %%mm6         \n\t"
1737                 "punpcklwd %%mm6, %%mm6         \n\t"
1738                 "movq %%mm6, %%mm2              \n\t"
1739                 "psllq $16, %%mm2               \n\t"
1740                 "paddw %%mm6, %%mm2             \n\t"
1741                 "psllq $16, %%mm2               \n\t"
1742                 "paddw %%mm6, %%mm2             \n\t"
1743                 "psllq $16, %%mm2               \n\t" //0,t,2t,3t               t=xInc&0xFFFF
1744                 "movq %%mm2, temp0              \n\t"
1745                 "movd %4, %%mm6                 \n\t" //(xInc*4)&0xFFFF
1746                 "punpcklwd %%mm6, %%mm6         \n\t"
1747                 "punpcklwd %%mm6, %%mm6         \n\t"
1748                 "xorl %%eax, %%eax              \n\t" // i
1749                 "movl %0, %%esi                 \n\t" // src
1750                 "movl %1, %%edi                 \n\t" // buf1
1751                 "movl %3, %%edx                 \n\t" // (xInc*4)>>16
1752                 "xorl %%ecx, %%ecx              \n\t"
1753                 "xorl %%ebx, %%ebx              \n\t"
1754                 "movw %4, %%bx                  \n\t" // (xInc*4)&0xFFFF
1755
1756 #define FUNNYUVCODE \
1757                         PREFETCH" 1024(%%esi)           \n\t"\
1758                         PREFETCH" 1056(%%esi)           \n\t"\
1759                         PREFETCH" 1088(%%esi)           \n\t"\
1760                         "call funnyUVCode               \n\t"\
1761                         "movq temp0, %%mm2              \n\t"\
1762                         "xorl %%ecx, %%ecx              \n\t"
1763
1764 FUNNYUVCODE
1765 FUNNYUVCODE
1766 FUNNYUVCODE
1767 FUNNYUVCODE
1768
1769 FUNNYUVCODE
1770 FUNNYUVCODE
1771 FUNNYUVCODE
1772 FUNNYUVCODE
1773                 "xorl %%eax, %%eax              \n\t" // i
1774                 "movl %6, %%esi                 \n\t" // src
1775                 "movl %1, %%edi                 \n\t" // buf1
1776                 "addl $4096, %%edi              \n\t"
1777
1778 FUNNYUVCODE
1779 FUNNYUVCODE
1780 FUNNYUVCODE
1781 FUNNYUVCODE
1782
1783 FUNNYUVCODE
1784 FUNNYUVCODE
1785 FUNNYUVCODE
1786 FUNNYUVCODE
1787
1788                 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
1789                   "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
1790                 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
1791         );
1792                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
1793                 {
1794 //                      printf("%d %d %d\n", dstWidth, i, srcW);
1795                         dst[i] = src1[srcW-1]*128;
1796                         dst[i+2048] = src2[srcW-1]*128;
1797                 }
1798         }
1799         else
1800         {
1801 #endif
1802         asm volatile(
1803                 "xorl %%eax, %%eax              \n\t" // i
1804                 "xorl %%ebx, %%ebx              \n\t" // xx
1805                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
1806                 ".balign 16                     \n\t"
1807                 "1:                             \n\t"
1808                 "movl %0, %%esi                 \n\t"
1809                 "movzbl  (%%esi, %%ebx), %%edi  \n\t" //src[xx]
1810                 "movzbl 1(%%esi, %%ebx), %%esi  \n\t" //src[xx+1]
1811                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
1812                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
1813                 "shll $16, %%edi                \n\t"
1814                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1815                 "movl %1, %%edi                 \n\t"
1816                 "shrl $9, %%esi                 \n\t"
1817                 "movw %%si, (%%edi, %%eax, 2)   \n\t"
1818
1819                 "movzbl  (%5, %%ebx), %%edi     \n\t" //src[xx]
1820                 "movzbl 1(%5, %%ebx), %%esi     \n\t" //src[xx+1]
1821                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
1822                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
1823                 "shll $16, %%edi                \n\t"
1824                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1825                 "movl %1, %%edi                 \n\t"
1826                 "shrl $9, %%esi                 \n\t"
1827                 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
1828
1829                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
1830                 "adcl %3, %%ebx                 \n\t" //xx+= xInc>>8 + carry
1831                 "addl $1, %%eax                 \n\t"
1832                 "cmpl %2, %%eax                 \n\t"
1833                 " jb 1b                         \n\t"
1834
1835                 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
1836                 "r" (src2)
1837                 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
1838                 );
1839 #ifdef HAVE_MMX2
1840         } //if MMX2 cant be used
1841 #endif
1842 #else
1843         int i;
1844         unsigned int xpos=0;
1845         for(i=0;i<dstWidth;i++)
1846         {
1847                 register unsigned int xx=xpos>>16;
1848                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
1849                 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1850                 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1851 /* slower
1852           dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
1853           dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
1854 */
1855                 xpos+=xInc;
1856         }
1857 #endif
1858    }
1859 }
1860
// Build one direction's scaling filter: fixed-point tap coefficients in
// dstFilter (dstW groups of *filterSize int16 each) and, in filterPos, the
// source index of the first tap for every output pixel.
//
//   dstFilter   - out: dstW * (*filterSize) coefficients, normalized so each
//                 pixel's taps sum to `one`
//   filterPos   - out: per-output-pixel source start position, clamped to
//                 [0, srcW - *filterSize] by the border-fixing pass below
//   filterSize  - in/out: taps per output pixel, rounded up to filterAlign
//   xInc        - 16.16 fixed-point step (source pixels per destination pixel)
//   srcW, dstW  - source/destination extent (width here, height for the
//                 vertical filters set up by the caller)
//   filterAlign - tap alignment (4 on the MMX path, 1 otherwise)
//   one         - fixed-point "1.0" the taps are scaled to sum to
//
// NOTE(review): `filter` is a fixed 8000-entry stack array but is indexed up
// to dstW*(*filterSize); large downscale ratios could exceed that -- callers
// seem to stay within the ASSERTed bounds, but confirm before reusing.
1861 static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
1862                                       int srcW, int dstW, int filterAlign, int one)
1863 {
1864         int i;
1865         double filter[8000];
1866 #ifdef HAVE_MMX
1867         asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
1868 #endif
1869
// xInc within +-10 of 0x10000 (1.0 in 16.16) is treated as a 1:1 copy.
1870         if(ABS(xInc - 0x10000) <10) // unscaled
1871         {
1872                 int i;
1873                 *filterSize= (1 +(filterAlign-1)) & (~(filterAlign-1)); // 1 or 4 normaly
1874                 for(i=0; i<dstW*(*filterSize); i++) filter[i]=0;
1875
// Identity filter: a single unit tap per pixel, positioned at the pixel itself.
1876                 for(i=0; i<dstW; i++)
1877                 {
1878                         filter[i*(*filterSize)]=1;
1879                         filterPos[i]=i;
1880                 }
1881
1882         }
1883         else if(xInc <= (1<<16) || sws_flags==SWS_FAST_BILINEAR) // upscale
1884         {
1885                 int i;
1886                 int xDstInSrc;
// Upscaling needs a fixed small support: 4 taps for bicubic, 2 for bilinear.
1887                 if(sws_flags==SWS_BICUBIC) *filterSize= 4;
1888                 else                       *filterSize= 2;
1889 //              printf("%d %d %d\n", filterSize, srcW, dstW);
1890                 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
1891
// xDstInSrc tracks the center of destination pixel i in 16.16 source coords,
// starting half a step in so pixel centers (not edges) are aligned.
1892                 xDstInSrc= xInc/2 - 0x8000;
1893                 for(i=0; i<dstW; i++)
1894                 {
// First tap index: filter window centered on xDstInSrc.
1895                         int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1;
1896                         int j;
1897
1898                         filterPos[i]= xx;
1899                         if(sws_flags == SWS_BICUBIC)
1900                         {
// d = fractional distance (in source pixels) from the window to the sample point.
1901                                 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
1902                                 double y1,y2,y3,y4;
// A = -0.75: the cubic interpolation "sharpness" parameter.
1903                                 double A= -0.75;
1904                                         // Equation is from VirtualDub
1905                                 y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
1906                                 y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
1907                                 y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
1908                                 y4 = (                  +           A*d*d -       A*d*d*d);
1909
1910 //                              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1911                                 filter[i*(*filterSize) + 0]= y1;
1912                                 filter[i*(*filterSize) + 1]= y2;
1913                                 filter[i*(*filterSize) + 2]= y3;
1914                                 filter[i*(*filterSize) + 3]= y4;
1915 //                              printf("%1.3f %d, %d, %d, %d\n",d , y1, y2, y3, y4);
1916                         }
1917                         else
1918                         {
// Bilinear: triangle kernel 1-|d|, clipped to zero outside the support.
1919                                 for(j=0; j<*filterSize; j++)
1920                                 {
1921                                         double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
1922                                         double coeff= 1.0 - d;
1923                                         if(coeff<0) coeff=0;
1924         //                              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1925                                         filter[i*(*filterSize) + j]= coeff;
1926                                         xx++;
1927                                 }
1928                         }
1929                         xDstInSrc+= xInc;
1930                 }
1931         }
1932         else // downscale
1933         {
1934                 int xDstInSrc;
// Downscale support grows with the ratio so the kernel covers the whole
// source footprint of one destination pixel (anti-aliasing).
1935                 if(sws_flags==SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
1936                 else                       *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
1937 //              printf("%d %d %d\n", *filterSize, srcW, dstW);
1938                 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
1939
1940                 xDstInSrc= xInc/2 - 0x8000;
1941                 for(i=0; i<dstW; i++)
1942                 {
// Center the (possibly even-sized) window on the sample point.
1943                         int xx= (int)((double)xDstInSrc/(double)(1<<16) - ((*filterSize)-1)*0.5 + 0.5);
1944                         int j;
1945
1946                         filterPos[i]= xx;
1947                         for(j=0; j<*filterSize; j++)
1948                         {
// Distance is measured in destination-pixel units (divide by xInc), so the
// kernel is stretched to match the downscale ratio.
1949                                 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
1950                                 double coeff;
1951                                 if(sws_flags == SWS_BICUBIC)
1952                                 {
1953                                         double A= -0.75;
1954 //                                      d*=2;
1955                                         // Equation is from VirtualDub
// Two-piece cubic kernel: inner piece for |d|<1, outer for 1<=|d|<2.
1956                                         if(d<1.0)
1957                                                 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
1958                                         else if(d<2.0)
1959                                                 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
1960                                         else
1961                                                 coeff=0.0;
1962                                 }
1963                                 else
1964                                 {
1965                                         coeff= 1.0 - d;
1966                                         if(coeff<0) coeff=0;
1967                                 }
1968 //                              if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1969                                 filter[i*(*filterSize) + j]= coeff;
1970                                 xx++;
1971                         }
1972                         xDstInSrc+= xInc;
1973                 }
1974         }
1975
// Clamp filterPos into [0, srcW-*filterSize]: taps that would read outside the
// source are folded onto the nearest in-range tap so the total weight (and the
// normalization below) is preserved.
1976         //fix borders
1977         for(i=0; i<dstW; i++)
1978         {
1979                 int j;
1980                 if(filterPos[i] < 0)
1981                 {
1982                         // Move filter coeffs left to compensate for filterPos
1983                         for(j=1; j<*filterSize; j++)
1984                         {
1985                                 int left= MAX(j + filterPos[i], 0);
1986                                 filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j];
1987                                 filter[i*(*filterSize) + j]=0;
1988                         }
1989                         filterPos[i]= 0;
1990                 }
1991
1992                 if(filterPos[i] + (*filterSize) > srcW)
1993                 {
1994                         int shift= filterPos[i] + (*filterSize) - srcW;
1995                         // Move filter coeffs right to compensate for filterPos
1996                         for(j=(*filterSize)-2; j>=0; j--)
1997                         {
1998                                 int right= MIN(j + shift, (*filterSize)-1);
1999                                 filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
2000                                 filter[i*(*filterSize) +j]=0;
2001                         }
// NOTE(review): becomes negative if *filterSize > srcW (extreme downscale of a
// tiny image) -- presumably never hit by callers; verify if limits change.
2002                         filterPos[i]= srcW - (*filterSize);
2003                 }
2004         }
2005
2006         //FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end
2007         // and skip these than later
2008
// Convert to fixed point: scale each pixel's taps so they sum to `one`.
// NOTE(review): no guard against sum==0 (all-zero taps would divide by zero).
2009         //Normalize
2010         for(i=0; i<dstW; i++)
2011         {
2012                 int j;
2013                 double sum=0;
2014                 double scale= one;
2015                 for(j=0; j<*filterSize; j++)
2016                 {
2017                         sum+= filter[i*(*filterSize) + j];
2018                 }
2019                 scale/= sum;
2020                 for(j=0; j<*filterSize; j++)
2021                 {
2022                         dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
2023                 }
2024         }
2025 }
2026
2027 #ifdef HAVE_MMX2
// Generate, at runtime, a specialized MMX2 horizontal fast-bilinear scaling
// routine into the `funnyCode` buffer (executed later via the FUNNY*CODE
// macros). A template asm fragment below is copied once per 4 output pixels,
// and each copy's two PSHUFW immediate bytes are patched so the shuffle picks
// exactly the source pixels needed for that group -- i.e. the horizontal step
// pattern is compiled into the code instead of being computed per pixel.
//
//   dstW      - number of output pixels the generated routine produces
//   xInc      - 16.16 fixed-point source step per output pixel
//   funnyCode - out: buffer receiving the generated code, terminated with RET
//
// NOTE(review): funnyCode must be executable memory and large enough for
// fragmentLength*dstW/32 + 1 bytes -- both are the caller's responsibility.
2028 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
2029 {
2030         uint8_t *fragment;
2031         int imm8OfPShufW1;
2032         int imm8OfPShufW2;
2033         int fragmentLength;
2034
2035         int xpos, i;
2036
2037         // create an optimized horizontal scaling routine
2038
2039         //code fragment
2040
// This asm block never executes the template body (it jumps straight to 9:);
// it only computes, via labels 0/1/2/9, the fragment's start address, its
// length, and the byte offsets of the two PSHUFW immediates to patch
// (the "decl" steps back from the label to the immediate byte itself).
2041         asm volatile(
2042                 "jmp 9f                         \n\t"
2043         // Begin
2044                 "0:                             \n\t"
2045                 "movq (%%esi), %%mm0            \n\t" //FIXME Alignment
2046                 "movq %%mm0, %%mm1              \n\t"
2047                 "psrlq $8, %%mm0                \n\t"
2048                 "punpcklbw %%mm7, %%mm1 \n\t"
2049                 "movq %%mm2, %%mm3              \n\t"
2050                 "punpcklbw %%mm7, %%mm0 \n\t"
2051                 "addw %%bx, %%cx                \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
2052                 "pshufw $0xFF, %%mm1, %%mm1     \n\t"
2053                 "1:                             \n\t"
2054                 "adcl %%edx, %%esi              \n\t" //xx+= (4*lumXInc)>>16 + carry
2055                 "pshufw $0xFF, %%mm0, %%mm0     \n\t"
2056                 "2:                             \n\t"
2057                 "psrlw $9, %%mm3                \n\t"
2058                 "psubw %%mm1, %%mm0             \n\t"
2059                 "pmullw %%mm3, %%mm0            \n\t"
2060                 "paddw %%mm6, %%mm2             \n\t" // 2*alpha += xpos&0xFFFF
2061                 "psllw $7, %%mm1                \n\t"
2062                 "paddw %%mm1, %%mm0             \n\t"
2063
2064                 "movq %%mm0, (%%edi, %%eax)     \n\t"
2065
2066                 "addl $8, %%eax                 \n\t"
2067         // End
2068                 "9:                             \n\t"
2069 //              "int $3\n\t"
2070                 "leal 0b, %0                    \n\t"
2071                 "leal 1b, %1                    \n\t"
2072                 "leal 2b, %2                    \n\t"
2073                 "decl %1                        \n\t"
2074                 "decl %2                        \n\t"
2075                 "subl %0, %1                    \n\t"
2076                 "subl %0, %2                    \n\t"
2077                 "leal 9b, %3                    \n\t"
2078                 "subl %0, %3                    \n\t"
2079                 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
2080                 "=r" (fragmentLength)
2081         );
2082
2083         xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
2084
// Each loop iteration covers 8 output pixels; one template copy handles 4,
// so a new fragment is emitted every 4th iteration ((i&3)==0 guards the
// copy, giving one fragment per 16 output pixels of the i-indexed stream).
2085         for(i=0; i<dstW/8; i++)
2086         {
2087                 int xx=xpos>>16;
2088
2089                 if((i&3) == 0)
2090                 {
// a..d: source-pixel offsets (relative to xx) of the 4 pixels this fragment
// produces; they become the 2-bit fields of the PSHUFW immediate.
2091                         int a=0;
2092                         int b=((xpos+xInc)>>16) - xx;
2093                         int c=((xpos+xInc*2)>>16) - xx;
2094                         int d=((xpos+xInc*3)>>16) - xx;
2095
2096                         memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
2097
// Patch both PSHUFW immediates with the packed 4x2-bit selector.
2098                         funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
2099                         funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
2100                                 a | (b<<2) | (c<<4) | (d<<6);
2101
// 0x6E rewrites the fragment's opening movq opcode byte to the 4-byte movd
// form, so only 4 source bytes are read when the group fits in them.
2102                         // if we dont need to read 8 bytes than dont :), reduces the chance of
2103                         // crossing a cache line
2104                         if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
2105
// Terminate the generated routine after the last fragment written so far.
2106                         funnyCode[fragmentLength*(i+4)/4]= RET;
2107                 }
2108                 xpos+=xInc;
2109         }
2110 /*
2111         xpos= 0; //chrXInc/2 - 0x10000; // difference between centers of chrom samples
2112         for(i=0; i<dstUVw/8; i++)
2113         {
2114                 int xx=xpos>>16;
2115
2116                 if((i&3) == 0)
2117                 {
2118                         int a=0;
2119                         int b=((xpos+chrXInc)>>16) - xx;
2120                         int c=((xpos+chrXInc*2)>>16) - xx;
2121                         int d=((xpos+chrXInc*3)>>16) - xx;
2122
2123                         memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);
2124
2125                         funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
2126                         funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
2127                                 a | (b<<2) | (c<<4) | (d<<6);
2128
2129                         // if we dont need to read 8 bytes than dont :), reduces the chance of
2130                         // crossing a cache line
2131                         if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;
2132
2133                         funnyUVCode[fragmentLength*(i+4)/4]= RET;
2134                 }
2135                 xpos+=chrXInc;
2136         }
2137 */
2138 //      funnyCode[0]= RET;
2139 }
2140 #endif // HAVE_MMX2
2141
2142 static void RENAME(SwScale_YV12slice)(unsigned char* srcptr[],int stride[], int srcSliceY ,
2143                              int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
2144                              int srcW, int srcH, int dstW, int dstH){
2145
2146
2147 unsigned int lumXInc= (srcW << 16) / dstW;
2148 unsigned int lumYInc= (srcH << 16) / dstH;
2149 unsigned int chrXInc;
2150 unsigned int chrYInc;
2151
2152 static int dstY;
2153
2154 // used to detect a size change
2155 static int oldDstW= -1;
2156 static int oldSrcW= -1;
2157 static int oldDstH= -1;
2158 static int oldSrcH= -1;
2159 static int oldFlags=-1;
2160
2161 static int lastInLumBuf;
2162 static int lastInChrBuf;
2163
2164 int chrDstW, chrDstH;
2165
2166 static int lumBufIndex=0;
2167 static int chrBufIndex=0;
2168
2169 static int firstTime=1;
2170
2171 const int widthAlign= dstbpp==12 ? 16 : 8;
2172 const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
2173 const int over= dstbpp==12 ?      (((dstW+15)&(~15))) - dststride
2174                                 : (((dstW+7)&(~7)))*bytespp - dststride;
2175 if(dststride%widthAlign !=0 )
2176 {
2177         if(firstTime)
2178                 fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n"
2179                                 "SwScaler:          ->cannot do aligned memory acesses anymore\n",
2180                                 widthAlign);
2181 }
2182
2183 if(over>0)
2184 {
2185         if(firstTime)
2186                 fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n"
2187                                 "SwScaler:          and dststride is not large enough to handle %d extra bytes\n"
2188                                 "SwScaler:          ->using unoptimized C version for last line(s)\n",
2189                                 over);
2190 }
2191
2192
2193
2194 //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH);
2195 //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH);
2196
2197 #ifdef HAVE_MMX2
2198 canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
2199 if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR)
2200 {
2201         if(firstTime)
2202                 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
2203 }
2204 #else
2205 canMMX2BeUsed=0; // should be 0 anyway but ...
2206 #endif
2207
2208 if(firstTime)
2209 {
2210 #if defined (DITHER1XBPP) && defined (HAVE_MMX)
2211         char *dither= " dithered";
2212 #else
2213         char *dither= "";
2214 #endif
2215         if(sws_flags==SWS_FAST_BILINEAR)
2216                 fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler ");
2217         else if(sws_flags==SWS_BILINEAR)
2218                 fprintf(stderr, "\nSwScaler: BILINEAR scaler ");
2219         else if(sws_flags==SWS_BICUBIC)
2220                 fprintf(stderr, "\nSwScaler: BICUBIC scaler ");
2221         else
2222                 fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
2223
2224         if(dstbpp==15)
2225                 fprintf(stderr, "with%s BGR15 output ", dither);
2226         else if(dstbpp==16)
2227                 fprintf(stderr, "with%s BGR16 output ", dither);
2228         else if(dstbpp==24)
2229                 fprintf(stderr, "with BGR24 output ");
2230         else if(dstbpp==32)
2231                 fprintf(stderr, "with BGR32 output ");
2232         else if(dstbpp==12)
2233                 fprintf(stderr, "with YV12 output ");
2234         else
2235                 fprintf(stderr, "without output ");
2236
2237 #ifdef HAVE_MMX2
2238                 fprintf(stderr, "using MMX2\n");
2239 #elif defined (HAVE_3DNOW)
2240                 fprintf(stderr, "using 3DNOW\n");
2241 #elif defined (HAVE_MMX)
2242                 fprintf(stderr, "using MMX\n");
2243 #elif defined (ARCH_X86)
2244                 fprintf(stderr, "using X86 ASM\n");
2245 #else
2246                 fprintf(stderr, "using C\n");
2247 #endif
2248 }
2249
2250
2251 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
2252 // n-2 is the last chrominance sample available
2253 // this is not perfect, but noone shuld notice the difference, the more correct variant
2254 // would be like the vertical one, but that would require some special code for the
2255 // first and last pixel
2256 if(sws_flags==SWS_FAST_BILINEAR)
2257 {
2258         if(canMMX2BeUsed)       lumXInc+= 20;
2259 #ifndef HAVE_MMX //we dont use the x86asm scaler if mmx is available
2260         else                    lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
2261 #endif
2262 }
2263
2264 if(fullUVIpol && !(dstbpp==12))         chrXInc= lumXInc>>1, chrDstW= dstW;
2265 else                                    chrXInc= lumXInc,    chrDstW= (dstW+1)>>1;
2266
2267 if(dstbpp==12)  chrYInc= lumYInc,    chrDstH= (dstH+1)>>1;
2268 else            chrYInc= lumYInc>>1, chrDstH= dstH;
2269
2270   // force calculation of the horizontal interpolation of the first line
2271
2272   if(srcSliceY ==0){
2273 //      printf("dstW %d, srcw %d, mmx2 %d\n", dstW, srcW, canMMX2BeUsed);
2274         lumBufIndex=0;
2275         chrBufIndex=0;
2276         dstY=0;
2277
2278         //precalculate horizontal scaler filter coefficients
2279         if(oldDstW!=dstW || oldSrcW!=srcW || oldFlags!=sws_flags)
2280         {
2281 #ifdef HAVE_MMX
2282                 const int filterAlign=4;
2283 #else
2284                 const int filterAlign=1;
2285 #endif
2286                 oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags;
2287
2288                 RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc,
2289                                 srcW   , dstW   , filterAlign, 1<<14);
2290                 RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
2291                                 (srcW+1)>>1, chrDstW, filterAlign, 1<<14);
2292
2293 #ifdef HAVE_MMX2
2294 // cant downscale !!!
2295                 if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR)
2296                 {
2297                         initMMX2HScaler(dstW   , lumXInc, funnyYCode);
2298                         initMMX2HScaler(chrDstW, chrXInc, funnyUVCode);
2299                 }
2300 #endif
2301         } // Init Horizontal stuff
2302
2303         if(oldDstH!=dstH || oldSrcH!=srcH || oldFlags!=sws_flags)
2304         {
2305                 int i;
2306                 oldDstH= dstH; oldSrcH= srcH; oldFlags= sws_flags; //FIXME swsflags conflict with x check
2307
2308                 // deallocate pixbufs
2309                 for(i=0; i<vLumBufSize; i++) free(lumPixBuf[i]);
2310                 for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]);
2311
2312                 RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc,
2313                                 srcH   , dstH,    1, (1<<12)-4);
2314                 RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc,
2315                                 (srcH+1)>>1, chrDstH, 1, (1<<12)-4);
2316
2317                 // Calculate Buffer Sizes so that they wont run out while handling these damn slices
2318                 vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize;
2319                 for(i=0; i<dstH; i++)
2320                 {
2321                         int chrI= i*chrDstH / dstH;
2322                         int nextSlice= MAX(vLumFilterPos[i   ] + vLumFilterSize - 1,
2323                                          ((vChrFilterPos[chrI] + vChrFilterSize - 1)<<1));
2324                         nextSlice&= ~1; // Slices start at even boundaries
2325                         if(vLumFilterPos[i   ] + vLumBufSize < nextSlice)
2326                                 vLumBufSize= nextSlice - vLumFilterPos[i   ];
2327                         if(vChrFilterPos[chrI] + vChrBufSize < (nextSlice>>1))
2328                                 vChrBufSize= (nextSlice>>1) - vChrFilterPos[chrI];
2329                 }
2330
2331                 // allocate pixbufs (we use dynamic allocation because otherwise we would need to
2332                 // allocate several megabytes to handle all possible cases)
2333                 for(i=0; i<vLumBufSize; i++)
2334                         lumPixBuf[i]= lumPixBuf[i+vLumBufSize]= (uint16_t*)memalign(8, 4000);
2335                 for(i=0; i<vChrBufSize; i++)
2336                         chrPixBuf[i]= chrPixBuf[i+vChrBufSize]= (uint16_t*)memalign(8, 8000);
2337
2338                 //try to avoid drawing green stuff between the right end and the stride end
2339                 for(i=0; i<vLumBufSize; i++) memset(lumPixBuf[i], 0, 4000);
2340                 for(i=0; i<vChrBufSize; i++) memset(chrPixBuf[i], 64, 8000);
2341
2342                 ASSERT(chrDstH<=dstH)
2343                 ASSERT(vLumFilterSize*dstH*4<16000)
2344                 ASSERT(vChrFilterSize*chrDstH*4<16000)
2345 #ifdef HAVE_MMX
2346                 // pack filter data for mmx code
2347                 for(i=0; i<vLumFilterSize*dstH; i++)
2348                         lumMmxFilter[4*i]=lumMmxFilter[4*i+1]=lumMmxFilter[4*i+2]=lumMmxFilter[4*i+3]=
2349                                 vLumFilter[i];
2350                 for(i=0; i<vChrFilterSize*chrDstH; i++)
2351                         chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]=
2352                                 vChrFilter[i];
2353 #endif
2354         }
2355
2356         if(firstTime && verbose)
2357         {
2358 #ifdef HAVE_MMX2
2359                 int mmx2=1;
2360 #else
2361                 int mmx2=0;
2362 #endif
2363 #ifdef HAVE_MMX
2364                 int mmx=1;
2365 #else
2366                 int mmx=0;
2367 #endif
2368
2369 #ifdef HAVE_MMX
2370                 if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR)
2371                         printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
2372                 else
2373                 {
2374                         if(hLumFilterSize==4)
2375                                 printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
2376                         else if(hLumFilterSize==8)
2377                                 printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
2378                         else
2379                                 printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
2380
2381                         if(hChrFilterSize==4)
2382                                 printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
2383                         else if(hChrFilterSize==8)
2384                                 printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
2385                         else
2386                                 printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
2387                 }
2388 #elif defined (ARCH_X86)
2389                 printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
2390 #else
2391                 if(sws_flags==SWS_FAST_BILINEAR)
2392                         printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
2393                 else
2394                         printf("SwScaler: using C scaler for horizontal scaling\n");
2395 #endif
2396
2397                 if(dstbpp==12)
2398                 {
2399                         if(vLumFilterSize==1)
2400                                 printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
2401                         else
2402                                 printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
2403                 }
2404                 else
2405                 {
2406                         if(vLumFilterSize==1 && vChrFilterSize==2)
2407                                 printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
2408                                        "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C");
2409                         else if(vLumFilterSize==2 && vChrFilterSize==2)
2410                                 printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
2411                         else
2412                                 printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
2413                 }
2414
2415                 if(dstbpp==24)
2416                         printf("SwScaler: using %s YV12->BGR24 Converter\n",
2417                                 mmx2 ? "MMX2" : (mmx ? "MMX" : "C"));
2418                 else
2419                         printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp);
2420
2421                 printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
2422         }
2423
2424         lastInLumBuf= -1;
2425         lastInChrBuf= -1;
2426   } // if(firstLine)
2427
2428         for(;dstY < dstH; dstY++){
2429                 unsigned char *dest =dstptr[0]+dststride*dstY;
2430                 unsigned char *uDest=dstptr[1]+(dststride>>1)*(dstY>>1);
2431                 unsigned char *vDest=dstptr[2]+(dststride>>1)*(dstY>>1);
2432                 const int chrDstY= dstbpp==12 ? (dstY>>1) : dstY;
2433
2434                 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2435                 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2436                 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2437                 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2438
2439                 if(sws_flags == SWS_FAST_BILINEAR)
2440                 {
2441                         //handle holes
2442                         if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2443                         if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2444                 }
2445
2446                 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2447                 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2448
2449                 // Do we have enough lines in this slice to output the dstY line
2450                 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
2451                 {
2452                         //Do horizontal scaling
2453                         while(lastInLumBuf < lastLumSrcY)
2454                         {
2455                                 uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
2456                                 lumBufIndex++;
2457                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2458                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2459                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2460 //                              printf("%d %d\n", lumBufIndex, vLumBufSize);
2461                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
2462                                 lastInLumBuf++;
2463                         }
2464                         while(lastInChrBuf < lastChrSrcY)
2465                         {
2466                                 uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
2467                                 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
2468                                 chrBufIndex++;
2469                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2470                                 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2471                                 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2472                                 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
2473                                 lastInChrBuf++;
2474                         }
2475                         //wrap buf index around to stay inside the ring buffer
2476                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2477                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2478                 }
2479                 else // not enough lines left in this slice -> load the rest in the buffer
2480                 {
2481 /*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2482                         firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2483                         lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2484                         vChrBufSize, vLumBufSize);
2485 */
2486                         //Do horizontal scaling
2487                         while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2488                         {
2489                                 uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
2490                                 lumBufIndex++;
2491                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2492                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2493                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2494                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
2495                                 lastInLumBuf++;
2496                         }
2497                         while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
2498                         {
2499                                 uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
2500                                 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
2501                                 chrBufIndex++;
2502                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2503                                 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2504                                 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2505                                 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
2506                                 lastInChrBuf++;
2507                         }
2508                         //wrap buf index around to stay inside the ring buffer
2509                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2510                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2511                         break; // we can't output a dstY line, so let's try with the next slice
2512                 }
2513
2514 #ifdef HAVE_MMX
2515                 b5Dither= dither8[dstY&1];
2516                 g6Dither= dither4[dstY&1];
2517                 g5Dither= dither8[dstY&1];
2518                 r5Dither= dither8[(dstY+1)&1];
2519 #endif
2520             if(dstY < dstH-2 || over<=0)
2521             {
2522                 if(dstbpp==12) //YV12
2523                 {
2524                         if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2525                         if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2526                         {
2527                                 int16_t *lumBuf = lumPixBuf[0];
2528                                 int16_t *chrBuf= chrPixBuf[0];
2529                                 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
2530                         }
2531                         else //General YV12
2532                         {
2533                                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2534                                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2535                                 RENAME(yuv2yuvX)(
2536                                         vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
2537                                         vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2538                                         dest, uDest, vDest, dstW,
2539                                         lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
2540                         }
2541                 }
2542                 else
2543                 {
2544                         int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2545                         int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2546
2547                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2548                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2549                         if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2550                         {
2551                                 int chrAlpha= vChrFilter[2*dstY+1];
2552
2553                                 RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2554                                                  dest, dstW, chrAlpha, dstbpp);
2555                         }
2556                         else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2557                         {
2558                                 int lumAlpha= vLumFilter[2*dstY+1];
2559                                 int chrAlpha= vChrFilter[2*dstY+1];
2560
2561                                 RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2562                                                  dest, dstW, lumAlpha, chrAlpha, dstbpp);
2563                         }
2564                         else //General RGB
2565                         {
2566                                 RENAME(yuv2rgbX)(
2567                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2568                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2569                                         dest, dstW, dstbpp,
2570                                         lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
2571                         }
2572                 }
2573             }
2574             else // hmm, looks like we can't use MMX here without overwriting this array's tail
2575             {
2576                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2577                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2578                 if(dstbpp==12) //YV12
2579                 {
2580                         if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2581                         yuv2yuvXinC(
2582                                 vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
2583                                 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2584                                 dest, uDest, vDest, dstW);
2585                 }
2586                 else
2587                 {
2588                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2589                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2590                         yuv2rgbXinC(
2591                                 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2592                                 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2593                                 dest, dstW, dstbpp);
2594                 }
2595             }
2596         }
2597
2598 #ifdef HAVE_MMX
2599         __asm __volatile(SFENCE:::"memory");
2600         __asm __volatile(EMMS:::"memory");
2601 #endif
2602         firstTime=0;
2603 }