libswscale/swscale_template.c
1 /*
2     Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 #include "asmalign.h"
20
21 #undef REAL_MOVNTQ
22 #undef MOVNTQ
23 #undef PAVGB
24 #undef PREFETCH
25 #undef PREFETCHW
26 #undef EMMS
27 #undef SFENCE
28
29 #ifdef HAVE_3DNOW
30 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
31 #define EMMS     "femms"
32 #else
33 #define EMMS     "emms"
34 #endif
35
36 #ifdef HAVE_3DNOW
37 #define PREFETCH  "prefetch"
38 #define PREFETCHW "prefetchw"
39 #elif defined ( HAVE_MMX2 )
40 #define PREFETCH "prefetchnta"
41 #define PREFETCHW "prefetcht0"
42 #else
43 #define PREFETCH "/nop"
44 #define PREFETCHW "/nop"
45 #endif
46
47 #ifdef HAVE_MMX2
48 #define SFENCE "sfence"
49 #else
50 #define SFENCE "/nop"
51 #endif
52
53 #ifdef HAVE_MMX2
54 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
55 #elif defined (HAVE_3DNOW)
56 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
57 #endif
58
59 #ifdef HAVE_MMX2
60 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
61 #else
62 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
63 #endif
64 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
65
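/* MOVNTQ resolves to a non-temporal store ("movntq") when MMX2 is available,
   which bypasses the cache for the write-only destination rows, and falls back
   to a plain "movq" otherwise; SFENCE then orders the streaming stores before
   the final EMMS/FEMMS.  A rough intrinsics equivalent of the same idea
   (illustrative only, not part of this file; assumes an MMX2/SSE-capable
   build and <xmmintrin.h>):

       #include <xmmintrin.h>

       static void store_row(__m64 *dst, const __m64 *src, int n)
       {
           int i;
           for(i=0; i<n; i++)
               _mm_stream_pi(dst + i, src[i]); // movntq: cache-bypassing store
           _mm_sfence();                       // order the streaming stores
           _mm_empty();                        // emms: leave the FPU/MMX state clean
       }
*/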
66 #ifdef HAVE_ALTIVEC
67 #include "swscale_altivec_template.c"
68 #endif
69
70 #define YSCALEYUV2YV12X(x, offset) \
71                         "xor %%"REG_a", %%"REG_a"       \n\t"\
72                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
73                         "movq %%mm3, %%mm4              \n\t"\
74                         "lea " offset "(%0), %%"REG_d"  \n\t"\
75                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
76                         ASMALIGN16 /* FIXME Unroll? */\
77                         "1:                             \n\t"\
78                         "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
79                         "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
80                         "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
81                         "add $16, %%"REG_d"             \n\t"\
82                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
83                         "test %%"REG_S", %%"REG_S"      \n\t"\
84                         "pmulhw %%mm0, %%mm2            \n\t"\
85                         "pmulhw %%mm0, %%mm5            \n\t"\
86                         "paddw %%mm2, %%mm3             \n\t"\
87                         "paddw %%mm5, %%mm4             \n\t"\
88                         " jnz 1b                        \n\t"\
89                         "psraw $3, %%mm3                \n\t"\
90                         "psraw $3, %%mm4                \n\t"\
91                         "packuswb %%mm4, %%mm3          \n\t"\
92                         MOVNTQ(%%mm3, (%1, %%REGa))\
93                         "add $8, %%"REG_a"              \n\t"\
94                         "cmp %2, %%"REG_a"              \n\t"\
95                         "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
96                         "movq %%mm3, %%mm4              \n\t"\
97                         "lea " offset "(%0), %%"REG_d"  \n\t"\
98                         "mov (%%"REG_d"), %%"REG_S"     \n\t"\
99                         "jb 1b                          \n\t"
100
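/* What YSCALEYUV2YV12X computes, in plain C: for each output pixel it walks a
   NULL-terminated list of (source row, coefficient) pairs, accumulates
   (src*coeff)>>16 (pmulhw) on top of a rounding constant, shifts the sum right
   by 3 and stores with unsigned saturation (packuswb).  Illustrative sketch
   only -- FilterEntry is a hypothetical layout, the real filter data lives
   packed inside SwsContext:

       typedef struct { const int16_t *src; int16_t coeff; } FilterEntry;

       static void vscale_plane_c(const FilterEntry *filter, uint8_t *dst,
                                  int width, int rounder)
       {
           int i;
           for(i=0; i<width; i++){
               const FilterEntry *f;
               int acc= rounder;                            // VROUNDER_OFFSET
               for(f=filter; f->src; f++)
                   acc += (f->src[i] * f->coeff) >> 16;     // pmulhw + paddw
               acc >>= 3;                                   // psraw $3
               dst[i]= acc < 0 ? 0 : acc > 255 ? 255 : acc; // packuswb saturation
           }
       }
*/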
101 #define YSCALEYUV2YV121 \
102                         "mov %2, %%"REG_a"              \n\t"\
103                         ASMALIGN16 /* FIXME Unroll? */\
104                         "1:                             \n\t"\
105                         "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
106                         "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
107                         "psraw $7, %%mm0                \n\t"\
108                         "psraw $7, %%mm1                \n\t"\
109                         "packuswb %%mm1, %%mm0          \n\t"\
110                         MOVNTQ(%%mm0, (%1, %%REGa))\
111                         "add $8, %%"REG_a"              \n\t"\
112                         "jnc 1b                         \n\t"
113
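/* YSCALEYUV2YV121 is the trivial 1:1 vertical path: take the 16-bit
   intermediate samples, shift right by 7 and store with unsigned saturation.
   The C fallback in yuv2yuv1() further down does the same thing; per sample
   (illustrative):

       int v  = src[i] >> 7;                      // psraw $7
       dst[i] = v < 0 ? 0 : v > 255 ? 255 : v;    // packuswb saturation
*/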
114 /*
115                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
116                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
117                            "r" (dest), "m" (dstW),
118                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
119                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
120 */
121 #define YSCALEYUV2PACKEDX \
122                 "xor %%"REG_a", %%"REG_a"       \n\t"\
123                 ASMALIGN16\
124                 "nop                            \n\t"\
125                 "1:                             \n\t"\
126                 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
127                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
128                 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
129                 "movq %%mm3, %%mm4              \n\t"\
130                 ASMALIGN16\
131                 "2:                             \n\t"\
132                 "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
133                 "movq (%%"REG_S", %%"REG_a"), %%mm2     \n\t" /* UsrcData */\
134                 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
135                 "add $16, %%"REG_d"             \n\t"\
136                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
137                 "pmulhw %%mm0, %%mm2            \n\t"\
138                 "pmulhw %%mm0, %%mm5            \n\t"\
139                 "paddw %%mm2, %%mm3             \n\t"\
140                 "paddw %%mm5, %%mm4             \n\t"\
141                 "test %%"REG_S", %%"REG_S"      \n\t"\
142                 " jnz 2b                        \n\t"\
143 \
144                 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
145                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
146                 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
147                 "movq %%mm1, %%mm7              \n\t"\
148                 ASMALIGN16\
149                 "2:                             \n\t"\
150                 "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
151                 "movq (%%"REG_S", %%"REG_a", 2), %%mm2  \n\t" /* Y1srcData */\
152                 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
153                 "add $16, %%"REG_d"             \n\t"\
154                 "mov (%%"REG_d"), %%"REG_S"     \n\t"\
155                 "pmulhw %%mm0, %%mm2            \n\t"\
156                 "pmulhw %%mm0, %%mm5            \n\t"\
157                 "paddw %%mm2, %%mm1             \n\t"\
158                 "paddw %%mm5, %%mm7             \n\t"\
159                 "test %%"REG_S", %%"REG_S"      \n\t"\
160                 " jnz 2b                        \n\t"\
161
162
163 #define YSCALEYUV2RGBX \
164                 YSCALEYUV2PACKEDX\
165                 "psubw "U_OFFSET"(%0), %%mm3    \n\t" /* (U-128)8*/\
166                 "psubw "V_OFFSET"(%0), %%mm4    \n\t" /* (V-128)8*/\
167                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
168                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
169                 "pmulhw "UG_COEFF"(%0), %%mm3   \n\t"\
170                 "pmulhw "VG_COEFF"(%0), %%mm4   \n\t"\
171         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
172                 "pmulhw "UB_COEFF"(%0), %%mm2   \n\t"\
173                 "pmulhw "VR_COEFF"(%0), %%mm5   \n\t"\
174                 "psubw "Y_OFFSET"(%0), %%mm1    \n\t" /* 8(Y-16)*/\
175                 "psubw "Y_OFFSET"(%0), %%mm7    \n\t" /* 8(Y-16)*/\
176                 "pmulhw "Y_COEFF"(%0), %%mm1    \n\t"\
177                 "pmulhw "Y_COEFF"(%0), %%mm7    \n\t"\
178         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
179                 "paddw %%mm3, %%mm4             \n\t"\
180                 "movq %%mm2, %%mm0              \n\t"\
181                 "movq %%mm5, %%mm6              \n\t"\
182                 "movq %%mm4, %%mm3              \n\t"\
183                 "punpcklwd %%mm2, %%mm2         \n\t"\
184                 "punpcklwd %%mm5, %%mm5         \n\t"\
185                 "punpcklwd %%mm4, %%mm4         \n\t"\
186                 "paddw %%mm1, %%mm2             \n\t"\
187                 "paddw %%mm1, %%mm5             \n\t"\
188                 "paddw %%mm1, %%mm4             \n\t"\
189                 "punpckhwd %%mm0, %%mm0         \n\t"\
190                 "punpckhwd %%mm6, %%mm6         \n\t"\
191                 "punpckhwd %%mm3, %%mm3         \n\t"\
192                 "paddw %%mm7, %%mm0             \n\t"\
193                 "paddw %%mm7, %%mm6             \n\t"\
194                 "paddw %%mm7, %%mm3             \n\t"\
195                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
196                 "packuswb %%mm0, %%mm2          \n\t"\
197                 "packuswb %%mm6, %%mm5          \n\t"\
198                 "packuswb %%mm3, %%mm4          \n\t"\
199                 "pxor %%mm7, %%mm7              \n\t"
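/* After the vertical filtering done by YSCALEYUV2PACKEDX, YSCALEYUV2RGBX turns
   the Y/U/V words into B, G and R using the per-context fixed-point tables
   (Y_OFFSET/Y_COEFF and the UB/UG/VG/VR coefficients); every multiply is a
   pmulhw, i.e. (a*b)>>16, and the final packuswb clips each channel to 0..255.
   Rough per-pixel C sketch (illustrative; the exact coefficient scaling is set
   up elsewhere in SwsContext):

       int y = ((Y - y_offset) * y_coeff) >> 16;
       int u = U - u_offset;                 // roughly U-128 in the working scale
       int v = V - v_offset;
       int r = y + ((v * vr_coeff) >> 16);
       int g = y + ((u * ug_coeff) >> 16) + ((v * vg_coeff) >> 16);
       int b = y + ((u * ub_coeff) >> 16);
       // r, g and b are then saturated to 0..255 by the packuswb step
*/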
200 #if 0
201 #define FULL_YSCALEYUV2RGB \
202                 "pxor %%mm7, %%mm7              \n\t"\
203                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
204                 "punpcklwd %%mm6, %%mm6         \n\t"\
205                 "punpcklwd %%mm6, %%mm6         \n\t"\
206                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
207                 "punpcklwd %%mm5, %%mm5         \n\t"\
208                 "punpcklwd %%mm5, %%mm5         \n\t"\
209                 "xor %%"REG_a", %%"REG_a"               \n\t"\
210                 ASMALIGN16\
211                 "1:                             \n\t"\
212                 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
213                 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
214                 "movq (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
215                 "movq (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
216                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
217                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
218                 "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
219                 "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
220                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
221                 "movq 4096(%2, %%"REG_a",2), %%mm4      \n\t" /* uvbuf0[eax+2048]*/\
222                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
223                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
224                 "movq 4096(%3, %%"REG_a",2), %%mm0      \n\t" /* uvbuf1[eax+2048]*/\
225                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
226                 "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
227                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
228                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
229                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
230 \
231 \
232                 "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
233                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
234                 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
235                 "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
236                 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
237                 "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
238                 "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
239 \
240 \
241                 "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
242                 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
243                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
244                 "paddw %%mm1, %%mm3             \n\t" /* B*/\
245                 "paddw %%mm1, %%mm0             \n\t" /* R*/\
246                 "packuswb %%mm3, %%mm3          \n\t"\
247 \
248                 "packuswb %%mm0, %%mm0          \n\t"\
249                 "paddw %%mm4, %%mm2             \n\t"\
250                 "paddw %%mm2, %%mm1             \n\t" /* G*/\
251 \
252                 "packuswb %%mm1, %%mm1          \n\t"
253 #endif
254
255 #define REAL_YSCALEYUV2PACKED(index, c) \
256                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
257                 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
258                 "psraw $3, %%mm0                \n\t"\
259                 "psraw $3, %%mm1                \n\t"\
260                 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
261                 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
262                 "xor "#index", "#index"         \n\t"\
263                 ASMALIGN16\
264                 "1:                             \n\t"\
265                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
266                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
267                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
268                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
269                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
270                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
271                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
272                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
273                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
274                 "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
275                 "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
276                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
277                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
278                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
279                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
280                 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
281                 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
282                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
283                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
284                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
285                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
286                 "psraw $7, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
287                 "psraw $7, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
288                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
289                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
290                 
291 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
292                 
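/* REAL_YSCALEYUV2PACKED (and the RGB variant below) interpolate vertically
   between two adjacent source rows.  Instead of computing a*alpha +
   b*(1-alpha) with two multiplies, the code computes b + (a-b)*alpha with a
   single pmulhw per value.  Approximate C for one chroma sample (illustrative;
   the pre-shifts differ between the packed and the RGB variants):

       int diff   = uvbuf0[i] - uvbuf1[i];       // psubw
       int blend  = (diff * uvalpha) >> 16;      // pmulhw
       int result = (uvbuf1[i] >> 7) + blend;    // psraw $7 + paddw
*/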
293 #define REAL_YSCALEYUV2RGB(index, c) \
294                 "xor "#index", "#index" \n\t"\
295                 ASMALIGN16\
296                 "1:                             \n\t"\
297                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
298                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
299                 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
300                 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
301                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
302                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
303                 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
304                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
305                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
306                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
307                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
308                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
309                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
310                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
311                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
312                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
313                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
314                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
315                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
316         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
317                 "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
318                 "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
319                 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
320                 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
321                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
322                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
323                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
324                 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
325                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
326                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
327                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
328                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
329                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
330                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
331                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
332                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
333                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
334                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
335         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
336                 "paddw %%mm3, %%mm4             \n\t"\
337                 "movq %%mm2, %%mm0              \n\t"\
338                 "movq %%mm5, %%mm6              \n\t"\
339                 "movq %%mm4, %%mm3              \n\t"\
340                 "punpcklwd %%mm2, %%mm2         \n\t"\
341                 "punpcklwd %%mm5, %%mm5         \n\t"\
342                 "punpcklwd %%mm4, %%mm4         \n\t"\
343                 "paddw %%mm1, %%mm2             \n\t"\
344                 "paddw %%mm1, %%mm5             \n\t"\
345                 "paddw %%mm1, %%mm4             \n\t"\
346                 "punpckhwd %%mm0, %%mm0         \n\t"\
347                 "punpckhwd %%mm6, %%mm6         \n\t"\
348                 "punpckhwd %%mm3, %%mm3         \n\t"\
349                 "paddw %%mm7, %%mm0             \n\t"\
350                 "paddw %%mm7, %%mm6             \n\t"\
351                 "paddw %%mm7, %%mm3             \n\t"\
352                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
353                 "packuswb %%mm0, %%mm2          \n\t"\
354                 "packuswb %%mm6, %%mm5          \n\t"\
355                 "packuswb %%mm3, %%mm4          \n\t"\
356                 "pxor %%mm7, %%mm7              \n\t"
357 #define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
358                 
359 #define REAL_YSCALEYUV2PACKED1(index, c) \
360                 "xor "#index", "#index"         \n\t"\
361                 ASMALIGN16\
362                 "1:                             \n\t"\
363                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
364                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
365                 "psraw $7, %%mm3                \n\t" \
366                 "psraw $7, %%mm4                \n\t" \
367                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
368                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
369                 "psraw $7, %%mm1                \n\t" \
370                 "psraw $7, %%mm7                \n\t" \
371                 
372 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
373                 
374 #define REAL_YSCALEYUV2RGB1(index, c) \
375                 "xor "#index", "#index" \n\t"\
376                 ASMALIGN16\
377                 "1:                             \n\t"\
378                 "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
379                 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
380                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
381                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
382                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
383                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
384                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
385                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
386                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
387                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
388         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
389                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
390                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
391                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
392                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
393                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
394                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
395                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
396                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
397                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
398                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
399         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
400                 "paddw %%mm3, %%mm4             \n\t"\
401                 "movq %%mm2, %%mm0              \n\t"\
402                 "movq %%mm5, %%mm6              \n\t"\
403                 "movq %%mm4, %%mm3              \n\t"\
404                 "punpcklwd %%mm2, %%mm2         \n\t"\
405                 "punpcklwd %%mm5, %%mm5         \n\t"\
406                 "punpcklwd %%mm4, %%mm4         \n\t"\
407                 "paddw %%mm1, %%mm2             \n\t"\
408                 "paddw %%mm1, %%mm5             \n\t"\
409                 "paddw %%mm1, %%mm4             \n\t"\
410                 "punpckhwd %%mm0, %%mm0         \n\t"\
411                 "punpckhwd %%mm6, %%mm6         \n\t"\
412                 "punpckhwd %%mm3, %%mm3         \n\t"\
413                 "paddw %%mm7, %%mm0             \n\t"\
414                 "paddw %%mm7, %%mm6             \n\t"\
415                 "paddw %%mm7, %%mm3             \n\t"\
416                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
417                 "packuswb %%mm0, %%mm2          \n\t"\
418                 "packuswb %%mm6, %%mm5          \n\t"\
419                 "packuswb %%mm3, %%mm4          \n\t"\
420                 "pxor %%mm7, %%mm7              \n\t"
421 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
422
423 #define REAL_YSCALEYUV2PACKED1b(index, c) \
424                 "xor "#index", "#index"         \n\t"\
425                 ASMALIGN16\
426                 "1:                             \n\t"\
427                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
428                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
429                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
430                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
431                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
432                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
433                 "psrlw $8, %%mm3                \n\t" \
434                 "psrlw $8, %%mm4                \n\t" \
435                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
436                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
437                 "psraw $7, %%mm1                \n\t" \
438                 "psraw $7, %%mm7                \n\t" 
439 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
440                 
441 // do vertical chrominance interpolation
442 #define REAL_YSCALEYUV2RGB1b(index, c) \
443                 "xor "#index", "#index"         \n\t"\
444                 ASMALIGN16\
445                 "1:                             \n\t"\
446                 "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
447                 "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
448                 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
449                 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
450                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
451                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
452                 "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
453                 "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
454                 "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
455                 "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
456                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
457                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
458                 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
459                 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
460         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
461                 "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
462                 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
463                 "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
464                 "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
465                 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
466                 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
467                 "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
468                 "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
469                 "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
470                 "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
471         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
472                 "paddw %%mm3, %%mm4             \n\t"\
473                 "movq %%mm2, %%mm0              \n\t"\
474                 "movq %%mm5, %%mm6              \n\t"\
475                 "movq %%mm4, %%mm3              \n\t"\
476                 "punpcklwd %%mm2, %%mm2         \n\t"\
477                 "punpcklwd %%mm5, %%mm5         \n\t"\
478                 "punpcklwd %%mm4, %%mm4         \n\t"\
479                 "paddw %%mm1, %%mm2             \n\t"\
480                 "paddw %%mm1, %%mm5             \n\t"\
481                 "paddw %%mm1, %%mm4             \n\t"\
482                 "punpckhwd %%mm0, %%mm0         \n\t"\
483                 "punpckhwd %%mm6, %%mm6         \n\t"\
484                 "punpckhwd %%mm3, %%mm3         \n\t"\
485                 "paddw %%mm7, %%mm0             \n\t"\
486                 "paddw %%mm7, %%mm6             \n\t"\
487                 "paddw %%mm7, %%mm3             \n\t"\
488                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
489                 "packuswb %%mm0, %%mm2          \n\t"\
490                 "packuswb %%mm6, %%mm5          \n\t"\
491                 "packuswb %%mm3, %%mm4          \n\t"\
492                 "pxor %%mm7, %%mm7              \n\t"
493 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
494
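/* The "...1" variants above read a single chroma row (no vertical blend),
   while the "...1b" variants are used when both chroma rows carry equal
   weight, so the interpolation collapses to a plain average.  In C terms
   (illustrative; the asm folds the >>1 into its usual rescaling shifts):

       int u = (uvbuf0[i] + uvbuf1[i]) >> 1;
       int v = (uvbuf0[i + 2048] + uvbuf1[i + 2048]) >> 1;
*/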
495 #define REAL_WRITEBGR32(dst, dstw, index) \
496                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
497                         "movq %%mm2, %%mm1              \n\t" /* B */\
498                         "movq %%mm5, %%mm6              \n\t" /* R */\
499                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
500                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
501                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
502                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
503                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
504                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
505                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
506                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
507                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
508                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
509 \
510                         MOVNTQ(%%mm0, (dst, index, 4))\
511                         MOVNTQ(%%mm2, 8(dst, index, 4))\
512                         MOVNTQ(%%mm1, 16(dst, index, 4))\
513                         MOVNTQ(%%mm3, 24(dst, index, 4))\
514 \
515                         "add $8, "#index"               \n\t"\
516                         "cmp "#dstw", "#index"          \n\t"\
517                         " jb 1b                         \n\t"
518 #define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
519
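/* WRITEBGR32 interleaves the packed B, G and R bytes (mm2/mm4/mm5) into
   4-byte pixels; the punpck cascade yields B G R 0 in memory order for every
   pixel.  Equivalent C for one 8-pixel iteration (illustrative):

       for(i=0; i<8; i++){
           dst[4*i + 0] = b[i];
           dst[4*i + 1] = g[i];
           dst[4*i + 2] = r[i];
           dst[4*i + 3] = 0;
       }
*/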
520 #define REAL_WRITEBGR16(dst, dstw, index) \
521                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
522                         "pand "MANGLE(bFC)", %%mm4      \n\t" /* G */\
523                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
524                         "psrlq $3, %%mm2                \n\t"\
525 \
526                         "movq %%mm2, %%mm1              \n\t"\
527                         "movq %%mm4, %%mm3              \n\t"\
528 \
529                         "punpcklbw %%mm7, %%mm3         \n\t"\
530                         "punpcklbw %%mm5, %%mm2         \n\t"\
531                         "punpckhbw %%mm7, %%mm4         \n\t"\
532                         "punpckhbw %%mm5, %%mm1         \n\t"\
533 \
534                         "psllq $3, %%mm3                \n\t"\
535                         "psllq $3, %%mm4                \n\t"\
536 \
537                         "por %%mm3, %%mm2               \n\t"\
538                         "por %%mm4, %%mm1               \n\t"\
539 \
540                         MOVNTQ(%%mm2, (dst, index, 2))\
541                         MOVNTQ(%%mm1, 8(dst, index, 2))\
542 \
543                         "add $8, "#index"               \n\t"\
544                         "cmp "#dstw", "#index"          \n\t"\
545                         " jb 1b                         \n\t"
546 #define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
547
548 #define REAL_WRITEBGR15(dst, dstw, index) \
549                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
550                         "pand "MANGLE(bF8)", %%mm4      \n\t" /* G */\
551                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
552                         "psrlq $3, %%mm2                \n\t"\
553                         "psrlq $1, %%mm5                \n\t"\
554 \
555                         "movq %%mm2, %%mm1              \n\t"\
556                         "movq %%mm4, %%mm3              \n\t"\
557 \
558                         "punpcklbw %%mm7, %%mm3         \n\t"\
559                         "punpcklbw %%mm5, %%mm2         \n\t"\
560                         "punpckhbw %%mm7, %%mm4         \n\t"\
561                         "punpckhbw %%mm5, %%mm1         \n\t"\
562 \
563                         "psllq $2, %%mm3                \n\t"\
564                         "psllq $2, %%mm4                \n\t"\
565 \
566                         "por %%mm3, %%mm2               \n\t"\
567                         "por %%mm4, %%mm1               \n\t"\
568 \
569                         MOVNTQ(%%mm2, (dst, index, 2))\
570                         MOVNTQ(%%mm1, 8(dst, index, 2))\
571 \
572                         "add $8, "#index"               \n\t"\
573                         "cmp "#dstw", "#index"          \n\t"\
574                         " jb 1b                         \n\t"
575 #define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
576
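/* WRITEBGR16 / WRITEBGR15 reduce the 8-bit channels to 5-6-5 / 5-5-5 bits and
   pack them into one 16-bit word per pixel.  The asm does it bytewise with
   pand/psrlq/psllq/por (plus optional ordered dither added by the callers);
   per pixel the result is simply (illustrative):

       uint16_t px16 = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);  // 5-6-5
       uint16_t px15 = (b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10);  // 5-5-5
*/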
577 #define WRITEBGR24OLD(dst, dstw, index) \
578                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
579                         "movq %%mm2, %%mm1              \n\t" /* B */\
580                         "movq %%mm5, %%mm6              \n\t" /* R */\
581                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
582                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
583                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
584                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
585                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
586                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
587                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
588                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
589                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
590                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
591 \
592                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
593                         "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
594                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
595                         "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
596                         "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
597                         "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
598                         "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
599                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
600 \
601                         "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
602                         "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
603                         "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
604                         "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
605                         "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
606                         "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
607                         "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
608                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
609                         "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
610                         "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
611                         "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
612                         "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
613                         "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
614 \
615                         "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
616                         "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
617                         "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
618                         "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
619                         "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
620                         "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
621                         "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
622                         "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
623 \
624                         MOVNTQ(%%mm0, (dst))\
625                         MOVNTQ(%%mm2, 8(dst))\
626                         MOVNTQ(%%mm3, 16(dst))\
627                         "add $24, "#dst"                \n\t"\
628 \
629                         "add $8, "#index"               \n\t"\
630                         "cmp "#dstw", "#index"          \n\t"\
631                         " jb 1b                         \n\t"
632
633 #define WRITEBGR24MMX(dst, dstw, index) \
634                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
635                         "movq %%mm2, %%mm1              \n\t" /* B */\
636                         "movq %%mm5, %%mm6              \n\t" /* R */\
637                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
638                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
639                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
640                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
641                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
642                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
643                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
644                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
645                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
646                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
647 \
648                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
649                         "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
650                         "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
651                         "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
652 \
653                         "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
654                         "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
655                         "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
656                         "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
657 \
658                         "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
659                         "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
660                         "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
661                         "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
662 \
663                         "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
664                         "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
665                         "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
666                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
667                         MOVNTQ(%%mm0, (dst))\
668 \
669                         "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
670                         "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
671                         "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
672                         "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
673                         MOVNTQ(%%mm6, 8(dst))\
674 \
675                         "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
676                         "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
677                         "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
678                         MOVNTQ(%%mm5, 16(dst))\
679 \
680                         "add $24, "#dst"                \n\t"\
681 \
682                         "add $8, "#index"                       \n\t"\
683                         "cmp "#dstw", "#index"                  \n\t"\
684                         " jb 1b                         \n\t"
685
686 #define WRITEBGR24MMX2(dst, dstw, index) \
687                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
688                         "movq "MANGLE(M24A)", %%mm0     \n\t"\
689                         "movq "MANGLE(M24C)", %%mm7     \n\t"\
690                         "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
691                         "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
692                         "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
693 \
694                         "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
695                         "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
696                         "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
697 \
698                         "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
699                         "por %%mm1, %%mm6               \n\t"\
700                         "por %%mm3, %%mm6               \n\t"\
701                         MOVNTQ(%%mm6, (dst))\
702 \
703                         "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
704                         "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
705                         "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
706                         "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
707 \
708                         "pand "MANGLE(M24B)", %%mm1     \n\t" /* B5       B4        B3    */\
709                         "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
710                         "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
711 \
712                         "por %%mm1, %%mm3               \n\t" /* B5    G4 B4     G3 B3    */\
713                         "por %%mm3, %%mm6               \n\t"\
714                         MOVNTQ(%%mm6, 8(dst))\
715 \
716                         "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
717                         "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
718                         "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
719 \
720                         "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
721                         "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
722                         "pand "MANGLE(M24B)", %%mm6     \n\t" /* R7       R6        R5    */\
723 \
724                         "por %%mm1, %%mm3               \n\t"\
725                         "por %%mm3, %%mm6               \n\t"\
726                         MOVNTQ(%%mm6, 16(dst))\
727 \
728                         "add $24, "#dst"                \n\t"\
729 \
730                         "add $8, "#index"               \n\t"\
731                         "cmp "#dstw", "#index"          \n\t"\
732                         " jb 1b                         \n\t"
733
734 #ifdef HAVE_MMX2
735 #undef WRITEBGR24
736 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
737 #else
738 #undef WRITEBGR24
739 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
740 #endif
741
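/* All three WRITEBGR24 variants produce the same output: 3 bytes per pixel,
   B G R in memory order, so each 8-pixel loop iteration emits 24 contiguous
   bytes.  The MMX version shuffles with shifts and ors, the MMX2 version uses
   pshufw; in C the result is simply (illustrative):

       for(i=0; i<8; i++){
           dst[3*i + 0] = b[i];
           dst[3*i + 1] = g[i];
           dst[3*i + 2] = r[i];
       }
*/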
742 #define REAL_WRITEYUY2(dst, dstw, index) \
743                         "packuswb %%mm3, %%mm3          \n\t"\
744                         "packuswb %%mm4, %%mm4          \n\t"\
745                         "packuswb %%mm7, %%mm1          \n\t"\
746                         "punpcklbw %%mm4, %%mm3         \n\t"\
747                         "movq %%mm1, %%mm7              \n\t"\
748                         "punpcklbw %%mm3, %%mm1         \n\t"\
749                         "punpckhbw %%mm3, %%mm7         \n\t"\
750 \
751                         MOVNTQ(%%mm1, (dst, index, 2))\
752                         MOVNTQ(%%mm7, 8(dst, index, 2))\
753 \
754                         "add $8, "#index"               \n\t"\
755                         "cmp "#dstw", "#index"          \n\t"\
756                         " jb 1b                         \n\t"
757 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
758
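/* WRITEYUY2 interleaves the packed luma bytes with the U and V bytes into the
   YUY2 (YUYV) ordering: Y0 U0 Y1 V0 for every pair of pixels.  Equivalent C
   for one 8-pixel iteration (illustrative):

       for(i=0; i<4; i++){
           dst[4*i + 0] = y[2*i + 0];
           dst[4*i + 1] = u[i];
           dst[4*i + 2] = y[2*i + 1];
           dst[4*i + 3] = v[i];
       }
*/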
759
760 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
761                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
762                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
763 {
764 #ifdef HAVE_MMX
765         if(uDest != NULL)
766         {
767                 asm volatile(
768                                 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
769                                 :: "r" (&c->redDither),
770                                 "r" (uDest), "p" (chrDstW)
771                                 : "%"REG_a, "%"REG_d, "%"REG_S
772                         );
773
774                 asm volatile(
775                                 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
776                                 :: "r" (&c->redDither),
777                                 "r" (vDest), "p" (chrDstW)
778                                 : "%"REG_a, "%"REG_d, "%"REG_S
779                         );
780         }
781
782         asm volatile(
783                         YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
784                         :: "r" (&c->redDither),
785                            "r" (dest), "p" (dstW)
786                         : "%"REG_a, "%"REG_d, "%"REG_S
787                 );
788 #else
789 #ifdef HAVE_ALTIVEC
790 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
791                       chrFilter, chrSrc, chrFilterSize,
792                       dest, uDest, vDest, dstW, chrDstW);
793 #else //HAVE_ALTIVEC
794 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
795             chrFilter, chrSrc, chrFilterSize,
796             dest, uDest, vDest, dstW, chrDstW);
797 #endif //!HAVE_ALTIVEC
798 #endif
799 }
800
801 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
802                                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
803                                      uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
804 {
805 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
806              chrFilter, chrSrc, chrFilterSize,
807              dest, uDest, dstW, chrDstW, dstFormat);
808 }
809
810 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
811                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
812 {
813 #ifdef HAVE_MMX
814         if(uDest != NULL)
815         {
816                 asm volatile(
817                                 YSCALEYUV2YV121
818                                 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
819                                 "g" (-chrDstW)
820                                 : "%"REG_a
821                         );
822
823                 asm volatile(
824                                 YSCALEYUV2YV121
825                                 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
826                                 "g" (-chrDstW)
827                                 : "%"REG_a
828                         );
829         }
830
831         asm volatile(
832                 YSCALEYUV2YV121
833                 :: "r" (lumSrc + dstW), "r" (dest + dstW),
834                 "g" (-dstW)
835                 : "%"REG_a
836         );
837 #else
838         int i;
839         for(i=0; i<dstW; i++)
840         {
841                 int val= lumSrc[i]>>7;
842                 
843                 if(val&256){
844                         if(val<0) val=0;
845                         else      val=255;
846                 }
847
848                 dest[i]= val;
849         }
850
851         if(uDest != NULL)
852                 for(i=0; i<chrDstW; i++)
853                 {
854                         int u=chrSrc[i]>>7;
855                         int v=chrSrc[i + 2048]>>7;
856
857                         if((u|v)&256){
858                                 if(u<0)         u=0;
859                                 else if (u>255) u=255;
860                                 if(v<0)         v=0;
861                                 else if (v>255) v=255;
862                         }
863
864                         uDest[i]= u;
865                         vDest[i]= v;
866                 }
867 #endif
868 }
869
870
871 /**
872  * vertically scale YV12 to RGB
873  */
874 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
875                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
876                             uint8_t *dest, long dstW, long dstY)
877 {
878         long dummy=0;
879         switch(c->dstFormat)
880         {
881 #ifdef HAVE_MMX
882         case IMGFMT_BGR32:
883                 {
884                         asm volatile(
885                                 YSCALEYUV2RGBX
886                                 WRITEBGR32(%4, %5, %%REGa)
887
888                         :: "r" (&c->redDither), 
889                            "m" (dummy), "m" (dummy), "m" (dummy),
890                            "r" (dest), "m" (dstW)
891                         : "%"REG_a, "%"REG_d, "%"REG_S
892                         );
893                 }
894                 break;
895         case IMGFMT_BGR24:
896                 {
897                         asm volatile(
898                                 YSCALEYUV2RGBX
899                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
900                                 "add %4, %%"REG_b"                      \n\t"
901                                 WRITEBGR24(%%REGb, %5, %%REGa)
902
903                         :: "r" (&c->redDither), 
904                            "m" (dummy), "m" (dummy), "m" (dummy),
905                            "r" (dest), "m" (dstW)
906                         : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
907                         );
908                 }
909                 break;
910         case IMGFMT_BGR15:
911                 {
912                         asm volatile(
913                                 YSCALEYUV2RGBX
914                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
915 #ifdef DITHER1XBPP
916                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
917                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
918                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
919 #endif
920
921                                 WRITEBGR15(%4, %5, %%REGa)
922
923                         :: "r" (&c->redDither), 
924                            "m" (dummy), "m" (dummy), "m" (dummy),
925                            "r" (dest), "m" (dstW)
926                         : "%"REG_a, "%"REG_d, "%"REG_S
927                         );
928                 }
929                 break;
930         case IMGFMT_BGR16:
931                 {
932                         asm volatile(
933                                 YSCALEYUV2RGBX
934                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
935 #ifdef DITHER1XBPP
936                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
937                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
938                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
939 #endif
940
941                                 WRITEBGR16(%4, %5, %%REGa)
942
943                         :: "r" (&c->redDither), 
944                            "m" (dummy), "m" (dummy), "m" (dummy),
945                            "r" (dest), "m" (dstW)
946                         : "%"REG_a, "%"REG_d, "%"REG_S
947                         );
948                 }
949                 break;
950         case IMGFMT_YUY2:
951                 {
952                         asm volatile(
953                                 YSCALEYUV2PACKEDX
954                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
955
956                                 "psraw $3, %%mm3                \n\t"
957                                 "psraw $3, %%mm4                \n\t"
958                                 "psraw $3, %%mm1                \n\t"
959                                 "psraw $3, %%mm7                \n\t"
960                                 WRITEYUY2(%4, %5, %%REGa)
961
962                         :: "r" (&c->redDither), 
963                            "m" (dummy), "m" (dummy), "m" (dummy),
964                            "r" (dest), "m" (dstW)
965                         : "%"REG_a, "%"REG_d, "%"REG_S
966                         );
967                 }
968                 break;
969 #endif
970         default:
971 #ifdef HAVE_ALTIVEC
972                 /* The following list of supported dstFormat values should
973                    match what's found in the body of altivec_yuv2packedX() */
974                 if(c->dstFormat==IMGFMT_ABGR  || c->dstFormat==IMGFMT_BGRA  ||
975                    c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
976                    c->dstFormat==IMGFMT_RGBA  || c->dstFormat==IMGFMT_ARGB)
977                         altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
978                                     chrFilter, chrSrc, chrFilterSize,
979                                     dest, dstW, dstY);
980                 else
981 #endif
982                         yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
983                                     chrFilter, chrSrc, chrFilterSize,
984                                     dest, dstW, dstY);
985                 break;
986         }
987 }
988
989 /**
990  * vertically scale YV12 to RGB using bilinear interpolation between two source rows
991  */
992 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
993                             uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
994 {
995         int yalpha1=yalpha^4095;
996         int uvalpha1=uvalpha^4095;
997         int i;
998
999 #if 0 //isn't used
1000         if(flags&SWS_FULL_CHR_H_INT)
1001         {
1002                 switch(dstFormat)
1003                 {
1004 #ifdef HAVE_MMX
1005                 case IMGFMT_BGR32:
1006                         asm volatile(
1007
1008
1009 FULL_YSCALEYUV2RGB
1010                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
1011                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
1012
1013                         "movq %%mm3, %%mm1              \n\t"
1014                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
1015                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1016
1017                         MOVNTQ(%%mm3, (%4, %%REGa, 4))
1018                         MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1019
1020                         "add $4, %%"REG_a"              \n\t"
1021                         "cmp %5, %%"REG_a"              \n\t"
1022                         " jb 1b                         \n\t"
1023
1024
1025                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1026                         "m" (yalpha1), "m" (uvalpha1)
1027                         : "%"REG_a
1028                         );
1029                         break;
1030                 case IMGFMT_BGR24:
1031                         asm volatile(
1032
1033 FULL_YSCALEYUV2RGB
1034
1035                                                                 // lsb ... msb
1036                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
1037                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
1038
1039                         "movq %%mm3, %%mm1              \n\t"
1040                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
1041                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
1042
1043                         "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
1044                         "psrlq $8, %%mm3                \n\t" // GR0BGR00
1045                         "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1046                         "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1047                         "por %%mm2, %%mm3               \n\t" // BGRBGR00
1048                         "movq %%mm1, %%mm2              \n\t"
1049                         "psllq $48, %%mm1               \n\t" // 000000BG
1050                         "por %%mm1, %%mm3               \n\t" // BGRBGRBG
1051
1052                         "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
1053                         "psrld $16, %%mm2               \n\t" // R000R000
1054                         "psrlq $24, %%mm1               \n\t" // 0BGR0000
1055                         "por %%mm2, %%mm1               \n\t" // RBGRR000
1056
1057                         "mov %4, %%"REG_b"              \n\t"
1058                         "add %%"REG_a", %%"REG_b"       \n\t"
1059
1060 #ifdef HAVE_MMX2
1061                         //FIXME Alignment
1062                         "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1063                         "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1064 #else
1065                         "movd %%mm3, (%%"REG_b", %%"REG_a", 2)  \n\t"
1066                         "psrlq $32, %%mm3               \n\t"
1067                         "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1068                         "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1069 #endif
1070                         "add $4, %%"REG_a"              \n\t"
1071                         "cmp %5, %%"REG_a"              \n\t"
1072                         " jb 1b                         \n\t"
1073
1074                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1075                         "m" (yalpha1), "m" (uvalpha1)
1076                         : "%"REG_a, "%"REG_b
1077                         );
1078                         break;
1079                 case IMGFMT_BGR15:
1080                         asm volatile(
1081
1082 FULL_YSCALEYUV2RGB
1083 #ifdef DITHER1XBPP
1084                         "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1085                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1086                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1087 #endif
1088                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1089                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1090                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1091
1092                         "psrlw $3, %%mm3                \n\t"
1093                         "psllw $2, %%mm1                \n\t"
1094                         "psllw $7, %%mm0                \n\t"
1095                         "pand "MANGLE(g15Mask)", %%mm1  \n\t"
1096                         "pand "MANGLE(r15Mask)", %%mm0  \n\t"
1097
1098                         "por %%mm3, %%mm1               \n\t"
1099                         "por %%mm1, %%mm0               \n\t"
1100
1101                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1102
1103                         "add $4, %%"REG_a"              \n\t"
1104                         "cmp %5, %%"REG_a"              \n\t"
1105                         " jb 1b                         \n\t"
1106
1107                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1108                         "m" (yalpha1), "m" (uvalpha1)
1109                         : "%"REG_a
1110                         );
1111                         break;
1112                 case IMGFMT_BGR16:
1113                         asm volatile(
1114
1115 FULL_YSCALEYUV2RGB
1116 #ifdef DITHER1XBPP
1117                         "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1118                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1119                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1120 #endif
1121                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
1122                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
1123                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
1124
1125                         "psrlw $3, %%mm3                \n\t"
1126                         "psllw $3, %%mm1                \n\t"
1127                         "psllw $8, %%mm0                \n\t"
1128                         "pand "MANGLE(g16Mask)", %%mm1  \n\t"
1129                         "pand "MANGLE(r16Mask)", %%mm0  \n\t"
1130
1131                         "por %%mm3, %%mm1               \n\t"
1132                         "por %%mm1, %%mm0               \n\t"
1133
1134                         MOVNTQ(%%mm0, (%4, %%REGa, 2))
1135
1136                         "add $4, %%"REG_a"              \n\t"
1137                         "cmp %5, %%"REG_a"              \n\t"
1138                         " jb 1b                         \n\t"
1139
1140                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1141                         "m" (yalpha1), "m" (uvalpha1)
1142                         : "%"REG_a
1143                         );
1144                         break;
1145 #endif
1146                 case IMGFMT_RGB32:
1147 #ifndef HAVE_MMX
1148                 case IMGFMT_BGR32:
1149 #endif
1150                 if(dstFormat==IMGFMT_BGR32)
1151                 {
1152                         int i;
1153 #ifdef WORDS_BIGENDIAN
1154                         dest++;
1155 #endif
1156                         for(i=0;i<dstW;i++){
1157                                 // vertical linear interpolation && yuv2rgb in a single step:
1158                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1159                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1160                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1161                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1162                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1163                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1164                                 dest+= 4;
1165                         }
1166                 }
1167                 else if(dstFormat==IMGFMT_BGR24)
1168                 {
1169                         int i;
1170                         for(i=0;i<dstW;i++){
1171                                 // vertical linear interpolation && yuv2rgb in a single step:
1172                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1173                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1174                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1175                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1176                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1177                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1178                                 dest+= 3;
1179                         }
1180                 }
1181                 else if(dstFormat==IMGFMT_BGR16)
1182                 {
1183                         int i;
1184                         for(i=0;i<dstW;i++){
1185                                 // vertical linear interpolation && yuv2rgb in a single step:
1186                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1187                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1188                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1189
1190                                 ((uint16_t*)dest)[i] =
1191                                         clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1192                                         clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1193                                         clip_table16r[(Y + yuvtab_3343[V]) >>13];
1194                         }
1195                 }
1196                 else if(dstFormat==IMGFMT_BGR15)
1197                 {
1198                         int i;
1199                         for(i=0;i<dstW;i++){
1200                                 // vertical linear interpolation && yuv2rgb in a single step:
1201                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1202                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1203                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1204
1205                                 ((uint16_t*)dest)[i] =
1206                                         clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1207                                         clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1208                                         clip_table15r[(Y + yuvtab_3343[V]) >>13];
1209                         }
1210                 }
1211         }//FULL_UV_IPOL
1212         else
1213         {
1214 #endif // if 0
1215 #ifdef HAVE_MMX
1216         switch(c->dstFormat)
1217         {
1218 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
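//The asm blocks below first stash REG_b at ESP_OFFSET(%5) and push REG_BP so
//that both registers can be reused as pointers inside the block; they are
//restored again right after the WRITE* macro.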
1219         case IMGFMT_BGR32:
1220                         asm volatile(
1221                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1222                                 "mov %4, %%"REG_b"                      \n\t"
1223                                 "push %%"REG_BP"                        \n\t"
1224                                 YSCALEYUV2RGB(%%REGBP, %5)
1225                                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1226                                 "pop %%"REG_BP"                         \n\t"
1227                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1228
1229                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1230                         "a" (&c->redDither)
1231                         );
1232                         return;
1233         case IMGFMT_BGR24:
1234                         asm volatile(
1235                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1236                                 "mov %4, %%"REG_b"                      \n\t"
1237                                 "push %%"REG_BP"                        \n\t"
1238                                 YSCALEYUV2RGB(%%REGBP, %5)
1239                                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1240                                 "pop %%"REG_BP"                         \n\t"
1241                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1242                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1243                         "a" (&c->redDither)
1244                         );
1245                         return;
1246         case IMGFMT_BGR15:
1247                         asm volatile(
1248                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1249                                 "mov %4, %%"REG_b"                      \n\t"
1250                                 "push %%"REG_BP"                        \n\t"
1251                                 YSCALEYUV2RGB(%%REGBP, %5)
1252                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1253 #ifdef DITHER1XBPP
1254                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1255                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1256                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1257 #endif
1258
1259                                 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1260                                 "pop %%"REG_BP"                         \n\t"
1261                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1262
1263                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1264                         "a" (&c->redDither)
1265                         );
1266                         return;
1267         case IMGFMT_BGR16:
1268                         asm volatile(
1269                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1270                                 "mov %4, %%"REG_b"                      \n\t"
1271                                 "push %%"REG_BP"                        \n\t"
1272                                 YSCALEYUV2RGB(%%REGBP, %5)
1273                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1274 #ifdef DITHER1XBPP
1275                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1276                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1277                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1278 #endif
1279
1280                                 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1281                                 "pop %%"REG_BP"                         \n\t"
1282                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1283                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1284                         "a" (&c->redDither)
1285                         );
1286                         return;
1287         case IMGFMT_YUY2:
1288                         asm volatile(
1289                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1290                                 "mov %4, %%"REG_b"                      \n\t"
1291                                 "push %%"REG_BP"                        \n\t"
1292                                 YSCALEYUV2PACKED(%%REGBP, %5)
1293                                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1294                                 "pop %%"REG_BP"                         \n\t"
1295                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1296                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1297                         "a" (&c->redDither)
1298                         );
1299                         return;
1300         default: break;
1301         }
1302 #endif //HAVE_MMX
1303 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1304 }
1305
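/* Editor's note: a minimal plain-C sketch (kept under #if 0 like the disabled
   reference code above) of the per-sample blend that yuv2packed2() performs:
   yalpha (0..4095) weights the second source line, yalpha1 = yalpha^4095 its
   complement, and the >>19 matches the arithmetic of the C path.  Chroma is
   blended the same way with uvalpha/uvalpha1, the V samples living at offset
   2048 in the chroma buffers.  The function name is illustrative only, not
   part of the template. */
#if 0
static int yuv2packed2_blend_example(uint16_t *buf0, uint16_t *buf1, int yalpha, int i)
{
        int yalpha1= yalpha^4095;                        // complementary weight of the first line
        return (buf0[i]*yalpha1 + buf1[i]*yalpha)>>19;   // blended sample, before the RGB conversion
}
#endif
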
1306 /**
1307  * YV12 to RGB without scaling or interpolating
1308  */
1309 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1310                             uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1311 {
1312         const int yalpha1=0;
1313         int i;
1314         
1315         uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1316         const int yalpha= 4096; //FIXME ...
1317
1318         if(flags&SWS_FULL_CHR_H_INT)
1319         {
1320                 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1321                 return;
1322         }
1323
1324 #ifdef HAVE_MMX
1325         if( uvalpha < 2048 ) // note this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster
1326         {
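                /* uvalpha < 2048 means the output line lies closer to uvbuf0, so the
                   YSCALEYUV2*1 variants read chroma from uvbuf0 only; the *1b variants
                   in the else branch below average uvbuf0 and uvbuf1.  This is exact
                   only for uvalpha == 0 resp. 2048, hence the half-pixel note above. */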
1327                 switch(dstFormat)
1328                 {
1329                 case IMGFMT_BGR32:
1330                         asm volatile(
1331                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1332                                 "mov %4, %%"REG_b"                      \n\t"
1333                                 "push %%"REG_BP"                        \n\t"
1334                                 YSCALEYUV2RGB1(%%REGBP, %5)
1335                                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1336                                 "pop %%"REG_BP"                         \n\t"
1337                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1338
1339                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1340                         "a" (&c->redDither)
1341                         );
1342                         return;
1343                 case IMGFMT_BGR24:
1344                         asm volatile(
1345                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1346                                 "mov %4, %%"REG_b"                      \n\t"
1347                                 "push %%"REG_BP"                        \n\t"
1348                                 YSCALEYUV2RGB1(%%REGBP, %5)
1349                                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1350                                 "pop %%"REG_BP"                         \n\t"
1351                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1352
1353                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1354                         "a" (&c->redDither)
1355                         );
1356                         return;
1357                 case IMGFMT_BGR15:
1358                         asm volatile(
1359                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1360                                 "mov %4, %%"REG_b"                      \n\t"
1361                                 "push %%"REG_BP"                        \n\t"
1362                                 YSCALEYUV2RGB1(%%REGBP, %5)
1363                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1364 #ifdef DITHER1XBPP
1365                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1366                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1367                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1368 #endif
1369                                 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1370                                 "pop %%"REG_BP"                         \n\t"
1371                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1372
1373                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1374                         "a" (&c->redDither)
1375                         );
1376                         return;
1377                 case IMGFMT_BGR16:
1378                         asm volatile(
1379                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1380                                 "mov %4, %%"REG_b"                      \n\t"
1381                                 "push %%"REG_BP"                        \n\t"
1382                                 YSCALEYUV2RGB1(%%REGBP, %5)
1383                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1384 #ifdef DITHER1XBPP
1385                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1386                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1387                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1388 #endif
1389
1390                                 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1391                                 "pop %%"REG_BP"                         \n\t"
1392                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1393
1394                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1395                         "a" (&c->redDither)
1396                         );
1397                         return;
1398                 case IMGFMT_YUY2:
1399                         asm volatile(
1400                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1401                                 "mov %4, %%"REG_b"                      \n\t"
1402                                 "push %%"REG_BP"                        \n\t"
1403                                 YSCALEYUV2PACKED1(%%REGBP, %5)
1404                                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1405                                 "pop %%"REG_BP"                         \n\t"
1406                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1407
1408                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1409                         "a" (&c->redDither)
1410                         );
1411                         return;
1412                 }
1413         }
1414         else
1415         {
1416                 switch(dstFormat)
1417                 {
1418                 case IMGFMT_BGR32:
1419                         asm volatile(
1420                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1421                                 "mov %4, %%"REG_b"                      \n\t"
1422                                 "push %%"REG_BP"                        \n\t"
1423                                 YSCALEYUV2RGB1b(%%REGBP, %5)
1424                                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1425                                 "pop %%"REG_BP"                         \n\t"
1426                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1427
1428                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1429                         "a" (&c->redDither)
1430                         );
1431                         return;
1432                 case IMGFMT_BGR24:
1433                         asm volatile(
1434                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1435                                 "mov %4, %%"REG_b"                      \n\t"
1436                                 "push %%"REG_BP"                        \n\t"
1437                                 YSCALEYUV2RGB1b(%%REGBP, %5)
1438                                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1439                                 "pop %%"REG_BP"                         \n\t"
1440                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1441
1442                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1443                         "a" (&c->redDither)
1444                         );
1445                         return;
1446                 case IMGFMT_BGR15:
1447                         asm volatile(
1448                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1449                                 "mov %4, %%"REG_b"                      \n\t"
1450                                 "push %%"REG_BP"                        \n\t"
1451                                 YSCALEYUV2RGB1b(%%REGBP, %5)
1452                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1453 #ifdef DITHER1XBPP
1454                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1455                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1456                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1457 #endif
1458                                 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1459                                 "pop %%"REG_BP"                         \n\t"
1460                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1461
1462                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1463                         "a" (&c->redDither)
1464                         );
1465                         return;
1466                 case IMGFMT_BGR16:
1467                         asm volatile(
1468                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1469                                 "mov %4, %%"REG_b"                      \n\t"
1470                                 "push %%"REG_BP"                        \n\t"
1471                                 YSCALEYUV2RGB1b(%%REGBP, %5)
1472                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1473 #ifdef DITHER1XBPP
1474                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1475                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1476                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1477 #endif
1478
1479                                 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1480                                 "pop %%"REG_BP"                         \n\t"
1481                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1482
1483                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1484                         "a" (&c->redDither)
1485                         );
1486                         return;
1487                 case IMGFMT_YUY2:
1488                         asm volatile(
1489                                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1490                                 "mov %4, %%"REG_b"                      \n\t"
1491                                 "push %%"REG_BP"                        \n\t"
1492                                 YSCALEYUV2PACKED1b(%%REGBP, %5)
1493                                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1494                                 "pop %%"REG_BP"                         \n\t"
1495                                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1496
1497                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1498                         "a" (&c->redDither)
1499                         );
1500                         return;
1501                 }
1502         }
1503 #endif
1504         if( uvalpha < 2048 )
1505         {
1506                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1507         }else{
1508                 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1509         }
1510 }
1511
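/* Editor's note: a minimal plain-C sketch of what the unscaled path above
   reduces to with its fixed weights (yalpha=4096, yalpha1=0), derived from the
   same >>19 arithmetic as yuv2packed2(): luma comes straight from buf0, and
   chroma comes either from uvbuf0 alone (the uvalpha < 2048 fast path) or from
   the average of the two chroma buffers.  Kept under #if 0; the function name
   is illustrative only. */
#if 0
static void yuv2packed1_sample_example(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       int uvalpha, int i, int *Y, int *U, int *V)
{
        *Y= buf0[i]>>7;                                  // (buf0[i]*4096)>>19
        if(uvalpha < 2048){                              // nearest chroma line only
                *U= uvbuf0[i       ]>>7;
                *V= uvbuf0[i + 2048]>>7;
        }else{                                           // average of both chroma lines
                *U= (uvbuf0[i       ] + uvbuf1[i       ])>>8;
                *V= (uvbuf0[i + 2048] + uvbuf1[i + 2048])>>8;
        }
}
#endif
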
1512 //FIXME yuy2* can read up to 7 samples too many
1513
1514 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1515 {
1516 #ifdef HAVE_MMX
1517         asm volatile(
1518                 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1519                 "mov %0, %%"REG_a"              \n\t"
1520                 "1:                             \n\t"
1521                 "movq (%1, %%"REG_a",2), %%mm0  \n\t"
1522                 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1523                 "pand %%mm2, %%mm0              \n\t"
1524                 "pand %%mm2, %%mm1              \n\t"
1525                 "packuswb %%mm1, %%mm0          \n\t"
1526                 "movq %%mm0, (%2, %%"REG_a")    \n\t"
1527                 "add $8, %%"REG_a"              \n\t"
1528                 " js 1b                         \n\t"
1529                 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1530                 : "%"REG_a
1531         );
1532 #else
1533         int i;
1534         for(i=0; i<width; i++)
1535                 dst[i]= src[2*i];
1536 #endif
1537 }
1538
1539 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1540 {
1541 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1542         asm volatile(
1543                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1544                 "mov %0, %%"REG_a"              \n\t"
1545                 "1:                             \n\t"
1546                 "movq (%1, %%"REG_a",4), %%mm0  \n\t"
1547                 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1548                 "movq (%2, %%"REG_a",4), %%mm2  \n\t"
1549                 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1550                 PAVGB(%%mm2, %%mm0)
1551                 PAVGB(%%mm3, %%mm1)
1552                 "psrlw $8, %%mm0                \n\t"
1553                 "psrlw $8, %%mm1                \n\t"
1554                 "packuswb %%mm1, %%mm0          \n\t"
1555                 "movq %%mm0, %%mm1              \n\t"
1556                 "psrlw $8, %%mm0                \n\t"
1557                 "pand %%mm4, %%mm1              \n\t"
1558                 "packuswb %%mm0, %%mm0          \n\t"
1559                 "packuswb %%mm1, %%mm1          \n\t"
1560                 "movd %%mm0, (%4, %%"REG_a")    \n\t"
1561                 "movd %%mm1, (%3, %%"REG_a")    \n\t"
1562                 "add $4, %%"REG_a"              \n\t"
1563                 " js 1b                         \n\t"
1564                 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1565                 : "%"REG_a
1566         );
1567 #else
1568         int i;
1569         for(i=0; i<width; i++)
1570         {
1571                 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1572                 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1573         }
1574 #endif
1575 }
1576
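/* Packed 4:2:2 byte order, for reference: YUY2 stores Y0 U Y1 V per pixel pair
   while UYVY stores U Y0 V Y1, which is why the yuy2* routines above read luma
   at even byte offsets and chroma at odd ones, and the uyvy* routines below do
   the opposite. */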
1577 // this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1578 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1579 {
1580 #ifdef HAVE_MMX
1581         asm volatile(
1582                 "mov %0, %%"REG_a"              \n\t"
1583                 "1:                             \n\t"
1584                 "movq (%1, %%"REG_a",2), %%mm0  \n\t"
1585                 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1586                 "psrlw $8, %%mm0                \n\t"
1587                 "psrlw $8, %%mm1                \n\t"
1588                 "packuswb %%mm1, %%mm0          \n\t"
1589                 "movq %%mm0, (%2, %%"REG_a")    \n\t"
1590                 "add $8, %%"REG_a"              \n\t"
1591                 " js 1b                         \n\t"
1592                 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1593                 : "%"REG_a
1594         );
1595 #else
1596         int i;
1597         for(i=0; i<width; i++)
1598                 dst[i]= src[2*i+1];
1599 #endif
1600 }
1601
1602 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1603 {
1604 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1605         asm volatile(
1606                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1607                 "mov %0, %%"REG_a"              \n\t"
1608                 "1:                             \n\t"
1609                 "movq (%1, %%"REG_a",4), %%mm0  \n\t"
1610                 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1611                 "movq (%2, %%"REG_a",4), %%mm2  \n\t"
1612                 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1613                 PAVGB(%%mm2, %%mm0)
1614                 PAVGB(%%mm3, %%mm1)
1615                 "pand %%mm4, %%mm0              \n\t"
1616                 "pand %%mm4, %%mm1              \n\t"
1617                 "packuswb %%mm1, %%mm0          \n\t"
1618                 "movq %%mm0, %%mm1              \n\t"
1619                 "psrlw $8, %%mm0                \n\t"
1620                 "pand %%mm4, %%mm1              \n\t"
1621                 "packuswb %%mm0, %%mm0          \n\t"
1622                 "packuswb %%mm1, %%mm1          \n\t"
1623                 "movd %%mm0, (%4, %%"REG_a")    \n\t"
1624                 "movd %%mm1, (%3, %%"REG_a")    \n\t"
1625                 "add $4, %%"REG_a"              \n\t"
1626                 " js 1b                         \n\t"
1627                 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1628                 : "%"REG_a
1629         );
1630 #else
1631         int i;
1632         for(i=0; i<width; i++)
1633         {
1634                 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1635                 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
1636         }
1637 #endif
1638 }
1639
1640 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1641 {
1642         int i;
1643         for(i=0; i<width; i++)
1644         {
1645                 int b=  ((uint32_t*)src)[i]&0xFF;
1646                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1647                 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1648
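                /* (33<<(RGB2YUV_SHIFT-1))>>RGB2YUV_SHIFT contributes 16.5, i.e. the +16
                   luma offset plus 0.5 for round-to-nearest; the other *ToY routines
                   use the same constant. */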
1649                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1650         }
1651 }
1652
1653 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1654 {
1655         int i;
1656         for(i=0; i<width; i++)
1657         {
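                /* Sum a 2x2 block channel-wise without unpacking: l accumulates the blue
                   and red bytes (mask 0xFF00FF), h the green bytes (mask 0x00FF00).  Four
                   8-bit values fit in 10 bits, hence the &0x3FF below, and the final
                   >>(RGB2YUV_SHIFT+2) divides the sums back down by 4. */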
1658                 const int a= ((uint32_t*)src1)[2*i+0];
1659                 const int e= ((uint32_t*)src1)[2*i+1];
1660                 const int c= ((uint32_t*)src2)[2*i+0];
1661                 const int d= ((uint32_t*)src2)[2*i+1];
1662                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1663                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1664                 const int b=  l&0x3FF;
1665                 const int g=  h>>8;
1666                 const int r=  l>>16;
1667
1668                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1669                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1670         }
1671 }
1672
1673 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1674 {
1675 #ifdef HAVE_MMX
1676         asm volatile(
1677                 "mov %2, %%"REG_a"              \n\t"
1678                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1679                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1680                 "pxor %%mm7, %%mm7              \n\t"
1681                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1682                 ASMALIGN16
1683                 "1:                             \n\t"
1684                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
1685                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
1686                 "movd 3(%0, %%"REG_b"), %%mm1   \n\t"
1687                 "punpcklbw %%mm7, %%mm0         \n\t"
1688                 "punpcklbw %%mm7, %%mm1         \n\t"
1689                 "movd 6(%0, %%"REG_b"), %%mm2   \n\t"
1690                 "movd 9(%0, %%"REG_b"), %%mm3   \n\t"
1691                 "punpcklbw %%mm7, %%mm2         \n\t"
1692                 "punpcklbw %%mm7, %%mm3         \n\t"
1693                 "pmaddwd %%mm6, %%mm0           \n\t"
1694                 "pmaddwd %%mm6, %%mm1           \n\t"
1695                 "pmaddwd %%mm6, %%mm2           \n\t"
1696                 "pmaddwd %%mm6, %%mm3           \n\t"
1697 #ifndef FAST_BGR2YV12
1698                 "psrad $8, %%mm0                \n\t"
1699                 "psrad $8, %%mm1                \n\t"
1700                 "psrad $8, %%mm2                \n\t"
1701                 "psrad $8, %%mm3                \n\t"
1702 #endif
1703                 "packssdw %%mm1, %%mm0          \n\t"
1704                 "packssdw %%mm3, %%mm2          \n\t"
1705                 "pmaddwd %%mm5, %%mm0           \n\t"
1706                 "pmaddwd %%mm5, %%mm2           \n\t"
1707                 "packssdw %%mm2, %%mm0          \n\t"
1708                 "psraw $7, %%mm0                \n\t"
1709
1710                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
1711                 "movd 15(%0, %%"REG_b"), %%mm1  \n\t"
1712                 "punpcklbw %%mm7, %%mm4         \n\t"
1713                 "punpcklbw %%mm7, %%mm1         \n\t"
1714                 "movd 18(%0, %%"REG_b"), %%mm2  \n\t"
1715                 "movd 21(%0, %%"REG_b"), %%mm3  \n\t"
1716                 "punpcklbw %%mm7, %%mm2         \n\t"
1717                 "punpcklbw %%mm7, %%mm3         \n\t"
1718                 "pmaddwd %%mm6, %%mm4           \n\t"
1719                 "pmaddwd %%mm6, %%mm1           \n\t"
1720                 "pmaddwd %%mm6, %%mm2           \n\t"
1721                 "pmaddwd %%mm6, %%mm3           \n\t"
1722 #ifndef FAST_BGR2YV12
1723                 "psrad $8, %%mm4                \n\t"
1724                 "psrad $8, %%mm1                \n\t"
1725                 "psrad $8, %%mm2                \n\t"
1726                 "psrad $8, %%mm3                \n\t"
1727 #endif
1728                 "packssdw %%mm1, %%mm4          \n\t"
1729                 "packssdw %%mm3, %%mm2          \n\t"
1730                 "pmaddwd %%mm5, %%mm4           \n\t"
1731                 "pmaddwd %%mm5, %%mm2           \n\t"
1732                 "add $24, %%"REG_b"             \n\t"
1733                 "packssdw %%mm2, %%mm4          \n\t"
1734                 "psraw $7, %%mm4                \n\t"
1735
1736                 "packuswb %%mm4, %%mm0          \n\t"
1737                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1738
1739                 "movq %%mm0, (%1, %%"REG_a")    \n\t"
1740                 "add $8, %%"REG_a"              \n\t"
1741                 " js 1b                         \n\t"
1742                 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1743                 : "%"REG_a, "%"REG_b
1744         );
1745 #else
1746         int i;
1747         for(i=0; i<width; i++)
1748         {
1749                 int b= src[i*3+0];
1750                 int g= src[i*3+1];
1751                 int r= src[i*3+2];
1752
1753                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1754         }
1755 #endif
1756 }
1757
1758 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1759 {
1760 #ifdef HAVE_MMX
1761         asm volatile(
1762                 "mov %4, %%"REG_a"              \n\t"
1763                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1764                 "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1765                 "pxor %%mm7, %%mm7              \n\t"
1766                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"       \n\t"
1767                 "add %%"REG_b", %%"REG_b"       \n\t"
1768                 ASMALIGN16
1769                 "1:                             \n\t"
1770                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
1771                 PREFETCH" 64(%1, %%"REG_b")     \n\t"
1772 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1773                 "movq (%0, %%"REG_b"), %%mm0    \n\t"
1774                 "movq (%1, %%"REG_b"), %%mm1    \n\t"
1775                 "movq 6(%0, %%"REG_b"), %%mm2   \n\t"
1776                 "movq 6(%1, %%"REG_b"), %%mm3   \n\t"
1777                 PAVGB(%%mm1, %%mm0)
1778                 PAVGB(%%mm3, %%mm2)
1779                 "movq %%mm0, %%mm1              \n\t"
1780                 "movq %%mm2, %%mm3              \n\t"
1781                 "psrlq $24, %%mm0               \n\t"
1782                 "psrlq $24, %%mm2               \n\t"
1783                 PAVGB(%%mm1, %%mm0)
1784                 PAVGB(%%mm3, %%mm2)
1785                 "punpcklbw %%mm7, %%mm0         \n\t"
1786                 "punpcklbw %%mm7, %%mm2         \n\t"
1787 #else
1788                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
1789                 "movd (%1, %%"REG_b"), %%mm1    \n\t"
1790                 "movd 3(%0, %%"REG_b"), %%mm2   \n\t"
1791                 "movd 3(%1, %%"REG_b"), %%mm3   \n\t"
1792                 "punpcklbw %%mm7, %%mm0         \n\t"
1793                 "punpcklbw %%mm7, %%mm1         \n\t"
1794                 "punpcklbw %%mm7, %%mm2         \n\t"
1795                 "punpcklbw %%mm7, %%mm3         \n\t"
1796                 "paddw %%mm1, %%mm0             \n\t"
1797                 "paddw %%mm3, %%mm2             \n\t"
1798                 "paddw %%mm2, %%mm0             \n\t"
1799                 "movd 6(%0, %%"REG_b"), %%mm4   \n\t"
1800                 "movd 6(%1, %%"REG_b"), %%mm1   \n\t"
1801                 "movd 9(%0, %%"REG_b"), %%mm2   \n\t"
1802                 "movd 9(%1, %%"REG_b"), %%mm3   \n\t"
1803                 "punpcklbw %%mm7, %%mm4         \n\t"
1804                 "punpcklbw %%mm7, %%mm1         \n\t"
1805                 "punpcklbw %%mm7, %%mm2         \n\t"
1806                 "punpcklbw %%mm7, %%mm3         \n\t"
1807                 "paddw %%mm1, %%mm4             \n\t"
1808                 "paddw %%mm3, %%mm2             \n\t"
1809                 "paddw %%mm4, %%mm2             \n\t"
1810                 "psrlw $2, %%mm0                \n\t"
1811                 "psrlw $2, %%mm2                \n\t"
1812 #endif
1813                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1814                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1815                 
1816                 "pmaddwd %%mm0, %%mm1           \n\t"
1817                 "pmaddwd %%mm2, %%mm3           \n\t"
1818                 "pmaddwd %%mm6, %%mm0           \n\t"
1819                 "pmaddwd %%mm6, %%mm2           \n\t"
1820 #ifndef FAST_BGR2YV12
1821                 "psrad $8, %%mm0                \n\t"
1822                 "psrad $8, %%mm1                \n\t"
1823                 "psrad $8, %%mm2                \n\t"
1824                 "psrad $8, %%mm3                \n\t"
1825 #endif
1826                 "packssdw %%mm2, %%mm0          \n\t"
1827                 "packssdw %%mm3, %%mm1          \n\t"
1828                 "pmaddwd %%mm5, %%mm0           \n\t"
1829                 "pmaddwd %%mm5, %%mm1           \n\t"
1830                 "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1831                 "psraw $7, %%mm0                \n\t"
1832
1833 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1834                 "movq 12(%0, %%"REG_b"), %%mm4  \n\t"
1835                 "movq 12(%1, %%"REG_b"), %%mm1  \n\t"
1836                 "movq 18(%0, %%"REG_b"), %%mm2  \n\t"
1837                 "movq 18(%1, %%"REG_b"), %%mm3  \n\t"
1838                 PAVGB(%%mm1, %%mm4)
1839                 PAVGB(%%mm3, %%mm2)
1840                 "movq %%mm4, %%mm1              \n\t"
1841                 "movq %%mm2, %%mm3              \n\t"
1842                 "psrlq $24, %%mm4               \n\t"
1843                 "psrlq $24, %%mm2               \n\t"
1844                 PAVGB(%%mm1, %%mm4)
1845                 PAVGB(%%mm3, %%mm2)
1846                 "punpcklbw %%mm7, %%mm4         \n\t"
1847                 "punpcklbw %%mm7, %%mm2         \n\t"
1848 #else
1849                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
1850                 "movd 12(%1, %%"REG_b"), %%mm1  \n\t"
1851                 "movd 15(%0, %%"REG_b"), %%mm2  \n\t"
1852                 "movd 15(%1, %%"REG_b"), %%mm3  \n\t"
1853                 "punpcklbw %%mm7, %%mm4         \n\t"
1854                 "punpcklbw %%mm7, %%mm1         \n\t"
1855                 "punpcklbw %%mm7, %%mm2         \n\t"
1856                 "punpcklbw %%mm7, %%mm3         \n\t"
1857                 "paddw %%mm1, %%mm4             \n\t"
1858                 "paddw %%mm3, %%mm2             \n\t"
1859                 "paddw %%mm2, %%mm4             \n\t"
1860                 "movd 18(%0, %%"REG_b"), %%mm5  \n\t"
1861                 "movd 18(%1, %%"REG_b"), %%mm1  \n\t"
1862                 "movd 21(%0, %%"REG_b"), %%mm2  \n\t"
1863                 "movd 21(%1, %%"REG_b"), %%mm3  \n\t"
1864                 "punpcklbw %%mm7, %%mm5         \n\t"
1865                 "punpcklbw %%mm7, %%mm1         \n\t"
1866                 "punpcklbw %%mm7, %%mm2         \n\t"
1867                 "punpcklbw %%mm7, %%mm3         \n\t"
1868                 "paddw %%mm1, %%mm5             \n\t"
1869                 "paddw %%mm3, %%mm2             \n\t"
1870                 "paddw %%mm5, %%mm2             \n\t"
1871                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1872                 "psrlw $2, %%mm4                \n\t"
1873                 "psrlw $2, %%mm2                \n\t"
1874 #endif
1875                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1876                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1877                 
1878                 "pmaddwd %%mm4, %%mm1           \n\t"
1879                 "pmaddwd %%mm2, %%mm3           \n\t"
1880                 "pmaddwd %%mm6, %%mm4           \n\t"
1881                 "pmaddwd %%mm6, %%mm2           \n\t"
1882 #ifndef FAST_BGR2YV12
1883                 "psrad $8, %%mm4                \n\t"
1884                 "psrad $8, %%mm1                \n\t"
1885                 "psrad $8, %%mm2                \n\t"
1886                 "psrad $8, %%mm3                \n\t"
1887 #endif
1888                 "packssdw %%mm2, %%mm4          \n\t"
1889                 "packssdw %%mm3, %%mm1          \n\t"
1890                 "pmaddwd %%mm5, %%mm4           \n\t"
1891                 "pmaddwd %%mm5, %%mm1           \n\t"
1892                 "add $24, %%"REG_b"             \n\t"
1893                 "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1894                 "psraw $7, %%mm4                \n\t"
1895                 
1896                 "movq %%mm0, %%mm1              \n\t"
1897                 "punpckldq %%mm4, %%mm0         \n\t"
1898                 "punpckhdq %%mm4, %%mm1         \n\t"
1899                 "packsswb %%mm1, %%mm0          \n\t"
1900                 "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
1901
1902                 "movd %%mm0, (%2, %%"REG_a")    \n\t"
1903                 "punpckhdq %%mm0, %%mm0         \n\t"
1904                 "movd %%mm0, (%3, %%"REG_a")    \n\t"
1905                 "add $4, %%"REG_a"              \n\t"
1906                 " js 1b                         \n\t"
1907                 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1908                 : "%"REG_a, "%"REG_b
1909         );
1910 #else
1911         int i;
1912         for(i=0; i<width; i++)
1913         {
1914                 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1915                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1916                 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1917
1918                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1919                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1920         }
1921 #endif
1922 }
1923
1924 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1925 {
1926         int i;
1927         for(i=0; i<width; i++)
1928         {
1929                 int d= ((uint16_t*)src)[i];
1930                 int b= d&0x1F;
1931                 int g= (d>>5)&0x3F;
1932                 int r= (d>>11)&0x1F;
1933
1934                 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1935         }
1936 }
1937
1938 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1939 {
1940         int i;
1941         for(i=0; i<width; i++)
1942         {
1943                 int d0= ((uint32_t*)src1)[i];
1944                 int d1= ((uint32_t*)src2)[i];
1945                 
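                /* d0/d1 each hold two RGB565 pixels; dl/dh pull the colour fields apart so
                   the four pixels (two per word, two rows) can be summed per channel without
                   the fields overflowing into each other.  After the recombination below,
                   b, r and g are 4-pixel sums of 5/6-bit fields; the doubled R/B
                   coefficients and the reduced shift compensate for the field widths and
                   the 4x sum. */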
1946                 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1947                 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1948
1949                 int dh2= (dh>>11) + (dh<<21);
1950                 int d= dh2 + dl;
1951
1952                 int b= d&0x7F;
1953                 int r= (d>>11)&0x7F;
1954                 int g= d>>21;
1955                 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1956                 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1957         }
1958 }
1959
1960 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1961 {
1962         int i;
1963         for(i=0; i<width; i++)
1964         {
1965                 int d= ((uint16_t*)src)[i];
1966                 int b= d&0x1F;
1967                 int g= (d>>5)&0x1F;
1968                 int r= (d>>10)&0x1F;
1969
1970                 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1971         }
1972 }
1973
1974 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1975 {
1976         int i;
1977         for(i=0; i<width; i++)
1978         {
1979                 int d0= ((uint32_t*)src1)[i];
1980                 int d1= ((uint32_t*)src2)[i];
1981                 
1982                 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1983                 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1984
1985                 int dh2= (dh>>11) + (dh<<21);
1986                 int d= dh2 + dl;
1987
1988                 int b= d&0x7F;
1989                 int r= (d>>10)&0x7F;
1990                 int g= d>>21;
1991                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1992                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1993         }
1994 }
1995
1996
1997 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1998 {
1999         int i;
2000         for(i=0; i<width; i++)
2001         {
2002                 int r=  ((uint32_t*)src)[i]&0xFF;
2003                 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2004                 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2005
2006                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2007         }
2008 }
2009
2010 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2011 {
2012         int i;
2013         for(i=0; i<width; i++)
2014         {
2015                 const int a= ((uint32_t*)src1)[2*i+0];
2016                 const int e= ((uint32_t*)src1)[2*i+1];
2017                 const int c= ((uint32_t*)src2)[2*i+0];
2018                 const int d= ((uint32_t*)src2)[2*i+1];
2019                 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
2020                 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
2021                 const int r=  l&0x3FF;
2022                 const int g=  h>>8;
2023                 const int b=  l>>16;
2024
2025                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2026                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2027         }
2028 }
2029
2030 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2031 {
2032         int i;
2033         for(i=0; i<width; i++)
2034         {
2035                 int r= src[i*3+0];
2036                 int g= src[i*3+1];
2037                 int b= src[i*3+2];
2038
2039                 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2040         }
2041 }
2042
2043 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2044 {
2045         int i;
2046         for(i=0; i<width; i++)
2047         {
2048                 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2049                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2050                 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2051
2052                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2053                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2054         }
2055 }
2056
2057
2058 // Bilinear / Bicubic scaling
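/* Editor's note: a minimal plain-C sketch of the horizontal FIR filtering that
   the MMX paths of hScale() below implement - each output sample is the dot
   product of filterSize source pixels starting at filterPos[i] with the
   matching filter coefficients.  The >>7 and the 0..32767 clamp mirror the
   scalar reference path used elsewhere in swscale and are assumptions here;
   the function name is illustrative only.  Kept under #if 0. */
#if 0
static void hScale_reference_example(int16_t *dst, int dstW, uint8_t *src,
                                     int16_t *filter, int16_t *filterPos, int filterSize)
{
        int i, j;
        for(i=0; i<dstW; i++)
        {
                int srcPos= filterPos[i];
                int val= 0;
                for(j=0; j<filterSize; j++)
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
                val >>= 7;                               // back to the 15-bit intermediate scale
                dst[i]= val < 0 ? 0 : (val > (1<<15)-1 ? (1<<15)-1 : val);
        }
}
#endif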
2059 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2060                                   int16_t *filter, int16_t *filterPos, long filterSize)
2061 {
2062 #ifdef HAVE_MMX
2063         assert(filterSize % 4 == 0 && filterSize>0);
2064         if(filterSize==4) // always true for upscaling, sometimes for downscaling too
2065         {
2066                 long counter= -2*dstW;
2067                 filter-= counter*2;
2068                 filterPos-= counter/2;
2069                 dst-= counter/2;
2070                 asm volatile(
2071                         "pxor %%mm7, %%mm7              \n\t"
2072                         "movq "MANGLE(w02)", %%mm6      \n\t"
2073                         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2074                         "mov %%"REG_a", %%"REG_BP"      \n\t"
2075                         ASMALIGN16
2076                         "1:                             \n\t"
2077                         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2078                         "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2079                         "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2080                         "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2081                         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2082                         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2083                         "punpcklbw %%mm7, %%mm0         \n\t"
2084                         "punpcklbw %%mm7, %%mm2         \n\t"
2085                         "pmaddwd %%mm1, %%mm0           \n\t"
2086                         "pmaddwd %%mm2, %%mm3           \n\t"
2087                         "psrad $8, %%mm0                \n\t"
2088                         "psrad $8, %%mm3                \n\t"
2089                         "packssdw %%mm3, %%mm0          \n\t"
2090                         "pmaddwd %%mm6, %%mm0           \n\t"
2091                         "packssdw %%mm0, %%mm0          \n\t"
2092                         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2093                         "add $4, %%"REG_BP"             \n\t"
2094                         " jnc 1b                        \n\t"
2095
2096                         "pop %%"REG_BP"                 \n\t"
2097                         : "+a" (counter)
2098                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2099                         : "%"REG_b
2100                 );
2101         }
2102         else if(filterSize==8)
2103         {
2104                 long counter= -2*dstW;
2105                 filter-= counter*4;
2106                 filterPos-= counter/2;
2107                 dst-= counter/2;
2108                 asm volatile(
2109                         "pxor %%mm7, %%mm7              \n\t"
2110                         "movq "MANGLE(w02)", %%mm6      \n\t"
2111                         "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2112                         "mov %%"REG_a", %%"REG_BP"      \n\t"
2113                         ASMALIGN16
2114                         "1:                             \n\t"
2115                         "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2116                         "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2117                         "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2118                         "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2119                         "movd (%3, %%"REG_a"), %%mm0    \n\t"
2120                         "movd (%3, %%"REG_b"), %%mm2    \n\t"
2121                         "punpcklbw %%mm7, %%mm0         \n\t"
2122                         "punpcklbw %%mm7, %%mm2         \n\t"
2123                         "pmaddwd %%mm1, %%mm0           \n\t"
2124                         "pmaddwd %%mm2, %%mm3           \n\t"
2125
2126                         "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2127                         "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2128                         "movd 4(%3, %%"REG_a"), %%mm4   \n\t"
2129                         "movd 4(%3, %%"REG_b"), %%mm2   \n\t"
2130                         "punpcklbw %%mm7, %%mm4         \n\t"
2131                         "punpcklbw %%mm7, %%mm2         \n\t"
2132                         "pmaddwd %%mm1, %%mm4           \n\t"
2133                         "pmaddwd %%mm2, %%mm5           \n\t"
2134                         "paddd %%mm4, %%mm0             \n\t"
2135                         "paddd %%mm5, %%mm3             \n\t"
2136                                                 
2137                         "psrad $8, %%mm0                \n\t"
2138                         "psrad $8, %%mm3                \n\t"
2139                         "packssdw %%mm3, %%mm0          \n\t"
2140                         "pmaddwd %%mm6, %%mm0           \n\t"
2141                         "packssdw %%mm0, %%mm0          \n\t"
2142                         "movd %%mm0, (%4, %%"REG_BP")   \n\t"
2143                         "add $4, %%"REG_BP"             \n\t"
2144                         " jnc 1b                        \n\t"
2145
2146                         "pop %%"REG_BP"                 \n\t"
2147                         : "+a" (counter)
2148                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2149                         : "%"REG_b
2150                 );
2151         }
2152         else
2153         {
2154                 uint8_t *offset = src+filterSize;
2155                 long counter= -2*dstW;
2156 //              filter-= counter*filterSize/2;
2157                 filterPos-= counter/2;
2158                 dst-= counter/2;
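                /* Arbitrary filterSize path: the outer loop (label 1:) again walks a
                   negative byte index (%0) over two outputs per pass, while the inner
                   loop (label 2:) accumulates 4 coefficients at a time for both outputs
                   in %%mm4/%%mm5; %1 steps through the coefficient row of the first
                   output and (%1,%6) reads the row of the second one, filterSize*2 bytes
                   further on.  The inner loop ends once %%REG_c has advanced filterSize
                   bytes past src (the "offset" bound passed in %4). */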
2159                 asm volatile(
2160                         "pxor %%mm7, %%mm7              \n\t"
2161                         "movq "MANGLE(w02)", %%mm6      \n\t"
2162                         ASMALIGN16
2163                         "1:                             \n\t"
2164                         "mov %2, %%"REG_c"              \n\t"
2165                         "movzwl (%%"REG_c", %0), %%eax  \n\t"
2166                         "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2167                         "mov %5, %%"REG_c"              \n\t"
2168                         "pxor %%mm4, %%mm4              \n\t"
2169                         "pxor %%mm5, %%mm5              \n\t"
2170                         "2:                             \n\t"
2171                         "movq (%1), %%mm1               \n\t"
2172                         "movq (%1, %6), %%mm3           \n\t"
2173                         "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2174                         "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2175                         "punpcklbw %%mm7, %%mm0         \n\t"
2176                         "punpcklbw %%mm7, %%mm2         \n\t"
2177                         "pmaddwd %%mm1, %%mm0           \n\t"
2178                         "pmaddwd %%mm2, %%mm3           \n\t"
2179                         "paddd %%mm3, %%mm5             \n\t"
2180                         "paddd %%mm0, %%mm4             \n\t"
2181                         "add $8, %1                     \n\t"
2182                         "add $4, %%"REG_c"              \n\t"
2183                         "cmp %4, %%"REG_c"              \n\t"
2184                         " jb 2b                         \n\t"
2185                         "add %6, %1                     \n\t"
2186                         "psrad $8, %%mm4                \n\t"
2187                         "psrad $8, %%mm5                \n\t"
2188                         "packssdw %%mm5, %%mm4          \n\t"
2189                         "pmaddwd %%mm6, %%mm4           \n\t"
2190                         "packssdw %%mm4, %%mm4          \n\t"
2191                         "mov %3, %%"REG_a"              \n\t"
2192                         "movd %%mm4, (%%"REG_a", %0)    \n\t"
2193                         "add $4, %0                     \n\t"
2194                         " jnc 1b                        \n\t"
2195
2196                         : "+r" (counter), "+r" (filter)
2197                         : "m" (filterPos), "m" (dst), "m"(offset),
2198                           "m" (src), "r" (filterSize*2)
2199                         : "%"REG_b, "%"REG_a, "%"REG_c
2200                 );
2201         }
2202 #else
2203 #ifdef HAVE_ALTIVEC
2204         hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2205 #else
2206         int i;
2207         for(i=0; i<dstW; i++)
2208         {
2209                 int j;
2210                 int srcPos= filterPos[i];
2211                 int val=0;
2212 //              printf("filterPos: %d\n", filterPos[i]);
2213                 for(j=0; j<filterSize; j++)
2214                 {
2215 //                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2216                         val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2217                 }
2218 //              filter += hFilterSize;
2219                 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2220 //              dst[i] = val>>7;
2221         }
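        /* Note: all paths in this function store the horizontally scaled line as the
           source pixel scaled by 128 (the horizontal filter coefficients appear to be
           normalised to a sum of about 1<<14 and the accumulator is shifted down by 7),
           which is why the C fallback above clamps to 0..(1<<15)-1. */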
2222 #endif
2223 #endif
2224 }
2225       // *** horizontal scale Y line to temp buffer
2226 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2227                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2228                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
2229                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2230                                    int32_t *mmx2FilterPos)
2231 {
2232     if(srcFormat==IMGFMT_YUY2)
2233     {
2234         RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2235         src= formatConvBuffer;
2236     }
2237     else if(srcFormat==IMGFMT_UYVY)
2238     {
2239         RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2240         src= formatConvBuffer;
2241     }
2242     else if(srcFormat==IMGFMT_BGR32)
2243     {
2244         RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2245         src= formatConvBuffer;
2246     }
2247     else if(srcFormat==IMGFMT_BGR24)
2248     {
2249         RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2250         src= formatConvBuffer;
2251     }
2252     else if(srcFormat==IMGFMT_BGR16)
2253     {
2254         RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2255         src= formatConvBuffer;
2256     }
2257     else if(srcFormat==IMGFMT_BGR15)
2258     {
2259         RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2260         src= formatConvBuffer;
2261     }
2262     else if(srcFormat==IMGFMT_RGB32)
2263     {
2264         RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2265         src= formatConvBuffer;
2266     }
2267     else if(srcFormat==IMGFMT_RGB24)
2268     {
2269         RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2270         src= formatConvBuffer;
2271     }
2272
2273 #ifdef HAVE_MMX
2274         // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2275     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2276 #else
2277     if(!(flags&SWS_FAST_BILINEAR))
2278 #endif
2279     {
2280         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2281     }
2282     else // Fast Bilinear upscale / crap downscale
2283     {
2284 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2285 #ifdef HAVE_MMX2
2286         int i;
2287         if(canMMX2BeUsed)
2288         {
2289                 asm volatile(
2290                         "pxor %%mm7, %%mm7              \n\t"
2291                         "mov %0, %%"REG_c"              \n\t"
2292                         "mov %1, %%"REG_D"              \n\t"
2293                         "mov %2, %%"REG_d"              \n\t"
2294                         "mov %3, %%"REG_b"              \n\t"
2295                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2296                         PREFETCH" (%%"REG_c")           \n\t"
2297                         PREFETCH" 32(%%"REG_c")         \n\t"
2298                         PREFETCH" 64(%%"REG_c")         \n\t"
2299
2300 #ifdef ARCH_X86_64
2301
2302 #define FUNNY_Y_CODE \
2303                         "movl (%%"REG_b"), %%esi        \n\t"\
2304                         "call *%4                       \n\t"\
2305                         "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2306                         "add %%"REG_S", %%"REG_c"       \n\t"\
2307                         "add %%"REG_a", %%"REG_D"       \n\t"\
2308                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2309
2310 #else
2311
2312 #define FUNNY_Y_CODE \
2313                         "movl (%%"REG_b"), %%esi        \n\t"\
2314                         "call *%4                       \n\t"\
2315                         "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2316                         "add %%"REG_a", %%"REG_D"       \n\t"\
2317                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2318
2319 #endif
2320
2321 FUNNY_Y_CODE
2322 FUNNY_Y_CODE
2323 FUNNY_Y_CODE
2324 FUNNY_Y_CODE
2325 FUNNY_Y_CODE
2326 FUNNY_Y_CODE
2327 FUNNY_Y_CODE
2328 FUNNY_Y_CODE
2329
2330                         :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2331                         "m" (funnyYCode)
2332                         : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2333                 );
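                /* The generated MMX2 code reads src[xx] and src[xx+1]; fix up the last few
                   output pixels whose source position already falls on the final input
                   sample by replicating src[srcW-1] (times 128, the fixed-point scale of
                   the temp buffer). */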
2334                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2335         }
2336         else
2337         {
2338 #endif
2339         long xInc_shr16 = xInc >> 16;
2340         uint16_t xInc_mask = xInc & 0xffff;
2341         //NO MMX just normal asm ...
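        /* Plain x86 version of the C fallback further below: linear interpolation in
           16.16 fixed point, two output pixels per loop pass.  The low 16 bits of the
           source position live in %ecx; "addw" adds the fractional part of xInc and the
           following "adc" adds xInc>>16 plus the carry to the integer source index kept
           in REG_b. */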
2342         asm volatile(
2343                 "xor %%"REG_a", %%"REG_a"       \n\t" // i
2344                 "xor %%"REG_b", %%"REG_b"       \n\t" // xx
2345                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2346                 ASMALIGN16
2347                 "1:                             \n\t"
2348                 "movzbl  (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2349                 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2350                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2351                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2352                 "shll $16, %%edi                \n\t"
2353                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2354                 "mov %1, %%"REG_D"              \n\t"
2355                 "shrl $9, %%esi                 \n\t"
2356                 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2357                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2358                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2359
2360                 "movzbl (%0, %%"REG_b"), %%edi  \n\t" //src[xx]
2361                 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2362                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2363                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2364                 "shll $16, %%edi                \n\t"
2365                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2366                 "mov %1, %%"REG_D"              \n\t"
2367                 "shrl $9, %%esi                 \n\t"
2368                 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2369                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2370                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2371
2372
2373                 "add $2, %%"REG_a"              \n\t"
2374                 "cmp %2, %%"REG_a"              \n\t"
2375                 " jb 1b                         \n\t"
2376
2377
2378                 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2379                 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2380                 );
2381 #ifdef HAVE_MMX2
2382         } //if MMX2 can't be used
2383 #endif
2384 #else
2385         int i;
2386         unsigned int xpos=0;
2387         for(i=0;i<dstWidth;i++)
2388         {
2389                 register unsigned int xx=xpos>>16;
2390                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2391                 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2392                 xpos+=xInc;
2393         }
2394 #endif
2395     }
2396 }
2397
2398 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2399                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2400                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2401                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2402                                    int32_t *mmx2FilterPos)
2403 {
2404     if(srcFormat==IMGFMT_YUY2)
2405     {
2406         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2407         src1= formatConvBuffer;
2408         src2= formatConvBuffer+2048;
2409     }
2410     else if(srcFormat==IMGFMT_UYVY)
2411     {
2412         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2413         src1= formatConvBuffer;
2414         src2= formatConvBuffer+2048;
2415     }
2416     else if(srcFormat==IMGFMT_BGR32)
2417     {
2418         RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2419         src1= formatConvBuffer;
2420         src2= formatConvBuffer+2048;
2421     }
2422     else if(srcFormat==IMGFMT_BGR24)
2423     {
2424         RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2425         src1= formatConvBuffer;
2426         src2= formatConvBuffer+2048;
2427     }
2428     else if(srcFormat==IMGFMT_BGR16)
2429     {
2430         RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2431         src1= formatConvBuffer;
2432         src2= formatConvBuffer+2048;
2433     }
2434     else if(srcFormat==IMGFMT_BGR15)
2435     {
2436         RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2437         src1= formatConvBuffer;
2438         src2= formatConvBuffer+2048;
2439     }
2440     else if(srcFormat==IMGFMT_RGB32)
2441     {
2442         RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2443         src1= formatConvBuffer;
2444         src2= formatConvBuffer+2048;
2445     }
2446     else if(srcFormat==IMGFMT_RGB24)
2447     {
2448         RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2449         src1= formatConvBuffer;
2450         src2= formatConvBuffer+2048;
2451     }
2452     else if(isGray(srcFormat))
2453     {
2454         return;
2455     }
2456
2457 #ifdef HAVE_MMX
2458         // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2459     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2460 #else
2461     if(!(flags&SWS_FAST_BILINEAR))
2462 #endif
2463     {
2464         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2465         RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2466     }
2467     else // Fast Bilinear upscale / crap downscale
2468     {
2469 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2470 #ifdef HAVE_MMX2
2471         int i;
2472         if(canMMX2BeUsed)
2473         {
2474                 asm volatile(
2475                         "pxor %%mm7, %%mm7              \n\t"
2476                         "mov %0, %%"REG_c"              \n\t"
2477                         "mov %1, %%"REG_D"              \n\t"
2478                         "mov %2, %%"REG_d"              \n\t"
2479                         "mov %3, %%"REG_b"              \n\t"
2480                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2481                         PREFETCH" (%%"REG_c")           \n\t"
2482                         PREFETCH" 32(%%"REG_c")         \n\t"
2483                         PREFETCH" 64(%%"REG_c")         \n\t"
2484
2485 #ifdef ARCH_X86_64
2486
2487 #define FUNNY_UV_CODE \
2488                         "movl (%%"REG_b"), %%esi        \n\t"\
2489                         "call *%4                       \n\t"\
2490                         "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2491                         "add %%"REG_S", %%"REG_c"       \n\t"\
2492                         "add %%"REG_a", %%"REG_D"       \n\t"\
2493                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2494
2495 #else
2496
2497 #define FUNNY_UV_CODE \
2498                         "movl (%%"REG_b"), %%esi        \n\t"\
2499                         "call *%4                       \n\t"\
2500                         "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2501                         "add %%"REG_a", %%"REG_D"       \n\t"\
2502                         "xor %%"REG_a", %%"REG_a"       \n\t"\
2503
2504 #endif
2505
2506 FUNNY_UV_CODE
2507 FUNNY_UV_CODE
2508 FUNNY_UV_CODE
2509 FUNNY_UV_CODE
2510                         "xor %%"REG_a", %%"REG_a"       \n\t" // i
2511                         "mov %5, %%"REG_c"              \n\t" // src
2512                         "mov %1, %%"REG_D"              \n\t" // buf1
2513                         "add $4096, %%"REG_D"           \n\t"
2514                         PREFETCH" (%%"REG_c")           \n\t"
2515                         PREFETCH" 32(%%"REG_c")         \n\t"
2516                         PREFETCH" 64(%%"REG_c")         \n\t"
2517
2518 FUNNY_UV_CODE
2519 FUNNY_UV_CODE
2520 FUNNY_UV_CODE
2521 FUNNY_UV_CODE
2522
2523                         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2524                         "m" (funnyUVCode), "m" (src2)
2525                         : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2526                 );
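                /* Same edge fix-up as in hyscale(), applied to both chroma planes (the
                   second plane is stored 2048 int16_t further into dst). */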
2527                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2528                 {
2529 //                      printf("%d %d %d\n", dstWidth, i, srcW);
2530                         dst[i] = src1[srcW-1]*128;
2531                         dst[i+2048] = src2[srcW-1]*128;
2532                 }
2533         }
2534         else
2535         {
2536 #endif
2537         long xInc_shr16 = (long) (xInc >> 16);
2538         uint16_t xInc_mask = xInc & 0xffff; 
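        /* Non-MMX2 x86 path: the same 16.16 fixed-point bilinear loop as in hyscale(),
           but one pixel per pass and for both chroma planes at once - the second
           plane's result is written 4096 bytes (2048 int16_t) further on. */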
2539         asm volatile(
2540                 "xor %%"REG_a", %%"REG_a"       \n\t" // i
2541                 "xor %%"REG_b", %%"REG_b"               \n\t" // xx
2542                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2543                 ASMALIGN16
2544                 "1:                             \n\t"
2545                 "mov %0, %%"REG_S"              \n\t"
2546                 "movzbl  (%%"REG_S", %%"REG_b"), %%edi  \n\t" //src[xx]
2547                 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi  \n\t" //src[xx+1]
2548                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2549                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2550                 "shll $16, %%edi                \n\t"
2551                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2552                 "mov %1, %%"REG_D"              \n\t"
2553                 "shrl $9, %%esi                 \n\t"
2554                 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2555
2556                 "movzbl  (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2557                 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2558                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2559                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2560                 "shll $16, %%edi                \n\t"
2561                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2562                 "mov %1, %%"REG_D"              \n\t"
2563                 "shrl $9, %%esi                 \n\t"
2564                 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2565
2566                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2567                 "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>16 + carry
2568                 "add $1, %%"REG_a"              \n\t"
2569                 "cmp %2, %%"REG_a"              \n\t"
2570                 " jb 1b                         \n\t"
2571
2572 /* GCC 3.3 makes MPlayer crash on IA-32 machines when the "g" operand is used here,
2573    but that operand is needed to support GCC 4.0 */
2574 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2575                 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2576 #else
2577                 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2578 #endif
2579                 "r" (src2)
2580                 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2581                 );
2582 #ifdef HAVE_MMX2
2583         } //if MMX2 can't be used
2584 #endif
2585 #else
2586         int i;
2587         unsigned int xpos=0;
2588         for(i=0;i<dstWidth;i++)
2589         {
2590                 register unsigned int xx=xpos>>16;
2591                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2592                 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2593                 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
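                /* xalpha is 7 bits here, so (xalpha^127) == 127-xalpha: the two weights
                   sum to 127 rather than 128, a cheaper approximation of the exact form
                   shown in the "slower" variant below. */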
2594 /* slower
2595           dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2596           dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2597 */
2598                 xpos+=xInc;
2599         }
2600 #endif
2601    }
2602 }
2603
2604 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2605              int srcSliceH, uint8_t* dst[], int dstStride[]){
2606
2607         /* load a few things into local vars to make the code more readable and faster */
2608         const int srcW= c->srcW;
2609         const int dstW= c->dstW;
2610         const int dstH= c->dstH;
2611         const int chrDstW= c->chrDstW;
2612         const int chrSrcW= c->chrSrcW;
2613         const int lumXInc= c->lumXInc;
2614         const int chrXInc= c->chrXInc;
2615         const int dstFormat= c->dstFormat;
2616         const int srcFormat= c->srcFormat;
2617         const int flags= c->flags;
2618         const int canMMX2BeUsed= c->canMMX2BeUsed;
2619         int16_t *vLumFilterPos= c->vLumFilterPos;
2620         int16_t *vChrFilterPos= c->vChrFilterPos;
2621         int16_t *hLumFilterPos= c->hLumFilterPos;
2622         int16_t *hChrFilterPos= c->hChrFilterPos;
2623         int16_t *vLumFilter= c->vLumFilter;
2624         int16_t *vChrFilter= c->vChrFilter;
2625         int16_t *hLumFilter= c->hLumFilter;
2626         int16_t *hChrFilter= c->hChrFilter;
2627         int32_t *lumMmxFilter= c->lumMmxFilter;
2628         int32_t *chrMmxFilter= c->chrMmxFilter;
2629         const int vLumFilterSize= c->vLumFilterSize;
2630         const int vChrFilterSize= c->vChrFilterSize;
2631         const int hLumFilterSize= c->hLumFilterSize;
2632         const int hChrFilterSize= c->hChrFilterSize;
2633         int16_t **lumPixBuf= c->lumPixBuf;
2634         int16_t **chrPixBuf= c->chrPixBuf;
2635         const int vLumBufSize= c->vLumBufSize;
2636         const int vChrBufSize= c->vChrBufSize;
2637         uint8_t *funnyYCode= c->funnyYCode;
2638         uint8_t *funnyUVCode= c->funnyUVCode;
2639         uint8_t *formatConvBuffer= c->formatConvBuffer;
2640         const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2641         const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
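        /* -((-x) >> n) above is a shift-based division that rounds up, e.g.
           srcSliceH==5 with chrSrcVSubSample==1 gives -((-5)>>1) == 3 == ceil(5/2). */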
2642         int lastDstY;
2643
2644         /* vars which will change and which we need to store back in the context */
2645         int dstY= c->dstY;
2646         int lumBufIndex= c->lumBufIndex;
2647         int chrBufIndex= c->chrBufIndex;
2648         int lastInLumBuf= c->lastInLumBuf;
2649         int lastInChrBuf= c->lastInChrBuf;
2650         
2651         if(isPacked(c->srcFormat)){
2652                 src[0]=
2653                 src[1]=
2654                 src[2]= src[0];
2655                 srcStride[0]=
2656                 srcStride[1]=
2657                 srcStride[2]= srcStride[0];
2658         }
2659         srcStride[1]<<= c->vChrDrop;
2660         srcStride[2]<<= c->vChrDrop;
2661
2662 //      printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2663 //              (int)dst[0], (int)dst[1], (int)dst[2]);
2664
2665 #if 0 //self test FIXME move to a vfilter or something
2666 {
2667 static volatile int i=0;
2668 i++;
2669 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2670         selfTest(src, srcStride, c->srcW, c->srcH);
2671 i--;
2672 }
2673 #endif
2674
2675 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2676 //dstStride[0],dstStride[1],dstStride[2]);
2677
2678         if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2679         {
2680                 static int firstTime=1; //FIXME move this into the context perhaps
2681                 if(flags & SWS_PRINT_INFO && firstTime)
2682                 {
2683                         MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2684                                         "SwScaler:          ->cannot do aligned memory accesses anymore\n");
2685                         firstTime=0;
2686                 }
2687         }
2688
2689         /* Note: the user might start scaling the picture in the middle, so this will not get executed.
2690            This is not really intended but currently works, so people might do it. */
2691         if(srcSliceY ==0){
2692                 lumBufIndex=0;
2693                 chrBufIndex=0;
2694                 dstY=0; 
2695                 lastInLumBuf= -1;
2696                 lastInChrBuf= -1;
2697         }
2698
2699         lastDstY= dstY;
2700
2701         for(;dstY < dstH; dstY++){
2702                 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2703                 const int chrDstY= dstY>>c->chrDstVSubSample;
2704                 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2705                 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2706
2707                 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2708                 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2709                 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2710                 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
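                /* lumPixBuf/chrPixBuf are ring buffers holding the last vLumBufSize /
                   vChrBufSize horizontally scaled input lines; firstLumSrcY..lastLumSrcY
                   (and the chroma equivalents) are the input lines the vertical filter
                   needs to produce output line dstY / chrDstY. */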
2711
2712 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2713 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
2714                 //handle holes (FAST_BILINEAR & weird filters)
2715                 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2716                 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2717 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2718                 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2719                 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2720
2721                 // Do we have enough lines in this slice to output the dstY line?
2722                 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2723                 {
2724                         //Do horizontal scaling
2725                         while(lastInLumBuf < lastLumSrcY)
2726                         {
2727                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2728                                 lumBufIndex++;
2729 //                              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
2730                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2731                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2732                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2733 //                              printf("%d %d\n", lumBufIndex, vLumBufSize);
2734                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2735                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2736                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2737                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2738                                 lastInLumBuf++;
2739                         }
2740                         while(lastInChrBuf < lastChrSrcY)
2741                         {
2742                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2743                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2744                                 chrBufIndex++;
2745                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2746                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2747                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2748                                 //FIXME replace parameters through context struct (some at least)
2749
2750                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2751                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2752                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2753                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2754                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2755                                 lastInChrBuf++;
2756                         }
2757                         //wrap buf index around to stay inside the ring buffer
2758                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2759                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2760                 }
2761                 else // not enough lines left in this slice -> load the rest in the buffer
2762                 {
2763 /*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2764                         firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2765                         lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2766                         vChrBufSize, vLumBufSize);*/
2767
2768                         //Do horizontal scaling
2769                         while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2770                         {
2771                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2772                                 lumBufIndex++;
2773                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2774                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2775                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2776                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2777                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2778                                                 funnyYCode, c->srcFormat, formatConvBuffer, 
2779                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2780                                 lastInLumBuf++;
2781                         }
2782                         while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2783                         {
2784                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2785                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2786                                 chrBufIndex++;
2787                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2788                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2789                                 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2790
2791                                 if(!(isGray(srcFormat) || isGray(dstFormat)))
2792                                         RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2793                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2794                                                 funnyUVCode, c->srcFormat, formatConvBuffer, 
2795                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2796                                 lastInChrBuf++;
2797                         }
2798                         //wrap buf index around to stay inside the ring buffer
2799                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2800                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2801                         break; //we can't output a dstY line so let's try with the next slice
2802                 }
2803
2804 #ifdef HAVE_MMX
2805                 b5Dither= dither8[dstY&1];
2806                 g6Dither= dither4[dstY&1];
2807                 g5Dither= dither8[dstY&1];
2808                 r5Dither= dither8[(dstY+1)&1];
2809 #endif
2810             if(dstY < dstH-2)
2811             {
2812                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2813                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
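                /* The pix buffer arrays hold 2*vLumBufSize / 2*vChrBufSize entries whose
                   second half mirrors the first, so lumSrcPtr/chrSrcPtr can address
                   vLumFilterSize / vChrFilterSize consecutive line pointers without any
                   explicit wrapping (see the ASSERTs in the non-MMX branch below). */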
2814 #ifdef HAVE_MMX
2815                 int i;
2816                 for(i=0; i<vLumFilterSize; i++)
2817                 {
2818                         lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2819                         lumMmxFilter[4*i+2]= 
2820                         lumMmxFilter[4*i+3]= 
2821                                 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2822                 }
2823                 for(i=0; i<vChrFilterSize; i++)
2824                 {
2825                         chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2826                         chrMmxFilter[4*i+2]= 
2827                         chrMmxFilter[4*i+3]= 
2828                                 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2829                 }
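                /* Pack the vertical filter for the MMX code: each tap i occupies 16 bytes;
                   [4*i+0] holds the (32-bit) pointer to the horizontally scaled line,
                   [4*i+1] is left as is, and [4*i+2]/[4*i+3] hold the 16-bit coefficient
                   replicated into both halves of a dword (value*0x10001), matching the
                   16-byte stride the vertical scaling asm walks. */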
2830 #endif
2831                 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2832                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2833                         if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2834                         RENAME(yuv2nv12X)(c,
2835                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2836                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2837                                 dest, uDest, dstW, chrDstW, dstFormat);
2838                 }
2839                 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2840                 {
2841                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2842                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2843                         if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2844                         {
2845                                 int16_t *lumBuf = lumPixBuf[0];
2846                                 int16_t *chrBuf= chrPixBuf[0];
2847                                 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2848                         }
2849                         else //General YV12
2850                         {
2851                                 RENAME(yuv2yuvX)(c,
2852                                         vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2853                                         vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2854                                         dest, uDest, vDest, dstW, chrDstW);
2855                         }
2856                 }
2857                 else
2858                 {
2859                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2860                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2861                         if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2862                         {
2863                                 int chrAlpha= vChrFilter[2*dstY+1];
2864                                 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2865                                                  dest, dstW, chrAlpha, dstFormat, flags, dstY);
2866                         }
2867                         else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2868                         {
2869                                 int lumAlpha= vLumFilter[2*dstY+1];
2870                                 int chrAlpha= vChrFilter[2*dstY+1];
2871                                 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2872                                                  dest, dstW, lumAlpha, chrAlpha, dstY);
2873                         }
2874                         else //General RGB
2875                         {
2876                                 RENAME(yuv2packedX)(c,
2877                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2878                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2879                                         dest, dstW, dstY);
2880                         }
2881                 }
2882             }
2883             else // hmm looks like we can't use MMX here without overwriting this array's tail
2884             {
2885                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2886                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2887                 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2888                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2889                         if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2890                         yuv2nv12XinC(
2891                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2892                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2893                                 dest, uDest, dstW, chrDstW, dstFormat);
2894                 }
2895                 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2896                 {
2897                         const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2898                         if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2899                         yuv2yuvXinC(
2900                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2901                                 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2902                                 dest, uDest, vDest, dstW, chrDstW);
2903                 }
2904                 else
2905                 {
2906                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2907                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2908                         yuv2packedXinC(c, 
2909                                 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2910                                 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2911                                 dest, dstW, dstY);
2912                 }
2913             }
2914         }
2915
2916 #ifdef HAVE_MMX
2917         __asm __volatile(SFENCE:::"memory");
2918         __asm __volatile(EMMS:::"memory");
2919 #endif
2920         /* store changed local vars back in the context */
2921         c->dstY= dstY;
2922         c->lumBufIndex= lumBufIndex;
2923         c->chrBufIndex= chrBufIndex;
2924         c->lastInLumBuf= lastInLumBuf;
2925         c->lastInChrBuf= lastInChrBuf;
2926
2927         return dstY - lastDstY;
2928 }