/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* CPU-dispatch string macros used by the inline asm below: PREFETCH/PREFETCHW,
   SFENCE, PAVGB and MOVNTQ expand to the best instruction available for the
   selected target (3DNow!, MMX2, plain MMX).
   NOTE(review): the matching #if/#else/#endif lines of each conditional are
   not visible in this extract, so only some branches appear here — verify
   against the full file before editing. */
30 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
37 #define PREFETCH "prefetch"
38 #define PREFETCHW "prefetchw"
39 #elif defined ( HAVE_MMX2 )
40 #define PREFETCH "prefetchnta"
41 #define PREFETCHW "prefetcht0"
43 #define PREFETCH "/nop"
44 #define PREFETCHW "/nop"
48 #define SFENCE "sfence"
54 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
55 #elif defined (HAVE_3DNOW)
56 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
62 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
64 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
67 #include "swscale_altivec_template.c"
/* Vertical multi-tap scaler for one planar output row.  Walks a
   NULL-terminated list of (srcPtr, filterCoeff) pairs starting at
   `offset`(%0), multiply-accumulating 16-bit samples (pmulhw+paddw) into a
   rounded accumulator (VROUNDER_OFFSET), then shifts by 3, packs to bytes
   and streams the result to %1 with MOVNTQ.  `x` is a byte offset into each
   source row (0 = U, 4096 = V for the chroma planes).
   NOTE(review): the inner/outer loop labels and the backward jumps sit on
   lines missing from this extract. */
70 #define YSCALEYUV2YV12X(x, offset) \
71 "xor %%"REG_a", %%"REG_a" \n\t"\
72 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
73 "movq %%mm3, %%mm4 \n\t"\
74 "lea " offset "(%0), %%"REG_d" \n\t"\
75 "mov (%%"REG_d"), %%"REG_S" \n\t"\
76 ASMALIGN16 /* FIXME Unroll? */\
78 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
79 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
80 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
81 "add $16, %%"REG_d" \n\t"\
82 "mov (%%"REG_d"), %%"REG_S" \n\t"\
83 "test %%"REG_S", %%"REG_S" \n\t"\
84 "pmulhw %%mm0, %%mm2 \n\t"\
85 "pmulhw %%mm0, %%mm5 \n\t"\
86 "paddw %%mm2, %%mm3 \n\t"\
87 "paddw %%mm5, %%mm4 \n\t"\
89 "psraw $3, %%mm3 \n\t"\
90 "psraw $3, %%mm4 \n\t"\
91 "packuswb %%mm4, %%mm3 \n\t"\
92 MOVNTQ(%%mm3, (%1, %%REGa))\
93 "add $8, %%"REG_a" \n\t"\
94 "cmp %2, %%"REG_a" \n\t"\
95 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
96 "movq %%mm3, %%mm4 \n\t"\
97 "lea " offset "(%0), %%"REG_d" \n\t"\
98 "mov (%%"REG_d"), %%"REG_S" \n\t"\
/* Unfiltered 1:1 vertical pass: shift the 16-bit samples down to 8 bits \
   (psraw $7), pack and stream to %1.  The orphaned operand/clobber list \
   fragment below (content lines "115"-"119") appears to belong to a \
   commented-out example whose comment markers fall on lines missing from \
   this extract — do not treat it as live code without checking the full \
   file. */\
101 #define YSCALEYUV2YV121 \
102 "mov %2, %%"REG_a" \n\t"\
103 ASMALIGN16 /* FIXME Unroll? */\
105 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
106 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
107 "psraw $7, %%mm0 \n\t"\
108 "psraw $7, %%mm1 \n\t"\
109 "packuswb %%mm1, %%mm0 \n\t"\
110 MOVNTQ(%%mm0, (%1, %%REGa))\
111 "add $8, %%"REG_a" \n\t"\
115 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
116 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
117 "r" (dest), "m" (dstW),
118 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
119 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* Vertical multi-tap filter feeding the packed-pixel writers: first runs the
   chroma filter list (U at offset 0, V at +4096 bytes in each source row)
   accumulating into mm3/mm4, then the luma filter list over two groups of
   four Y samples into mm1/mm7.  Filter lists are NULL-terminated arrays of
   (srcPtr, coeff) pairs inside the context (%0).
   NOTE(review): loop labels and the conditional back-jumps after the `test`
   instructions are on lines missing from this extract. */
121 #define YSCALEYUV2PACKEDX \
122 "xor %%"REG_a", %%"REG_a" \n\t"\
126 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
127 "mov (%%"REG_d"), %%"REG_S" \n\t"\
128 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
129 "movq %%mm3, %%mm4 \n\t"\
132 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
133 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
134 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
135 "add $16, %%"REG_d" \n\t"\
136 "mov (%%"REG_d"), %%"REG_S" \n\t"\
137 "pmulhw %%mm0, %%mm2 \n\t"\
138 "pmulhw %%mm0, %%mm5 \n\t"\
139 "paddw %%mm2, %%mm3 \n\t"\
140 "paddw %%mm5, %%mm4 \n\t"\
141 "test %%"REG_S", %%"REG_S" \n\t"\
144 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
145 "mov (%%"REG_d"), %%"REG_S" \n\t"\
146 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
147 "movq %%mm1, %%mm7 \n\t"\
150 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
151 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
152 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
153 "add $16, %%"REG_d" \n\t"\
154 "mov (%%"REG_d"), %%"REG_S" \n\t"\
155 "pmulhw %%mm0, %%mm2 \n\t"\
156 "pmulhw %%mm0, %%mm5 \n\t"\
157 "paddw %%mm2, %%mm1 \n\t"\
158 "paddw %%mm5, %%mm7 \n\t"\
159 "test %%"REG_S", %%"REG_S" \n\t"\
/* YUV -> RGB conversion of the filtered values left by YSCALEYUV2PACKEDX: \
   luma in mm1/mm7, chroma in mm3/mm4.  Subtracts the U/V/Y offsets, scales \
   with the per-context coefficient tables (pmulhw), interleaves the low and \
   high halves (punpcklwd/punpckhwd) and packs to bytes.  On exit: \
   B in mm2, G in mm4, R in mm5, mm7 = 0 — the layout the WRITEBGR* macros \
   expect. */\
163 #define YSCALEYUV2RGBX \
165 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
166 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
167 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
168 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
169 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
170 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
171 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
172 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
173 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
174 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
175 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
176 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
177 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
178 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
179 "paddw %%mm3, %%mm4 \n\t"\
180 "movq %%mm2, %%mm0 \n\t"\
181 "movq %%mm5, %%mm6 \n\t"\
182 "movq %%mm4, %%mm3 \n\t"\
183 "punpcklwd %%mm2, %%mm2 \n\t"\
184 "punpcklwd %%mm5, %%mm5 \n\t"\
185 "punpcklwd %%mm4, %%mm4 \n\t"\
186 "paddw %%mm1, %%mm2 \n\t"\
187 "paddw %%mm1, %%mm5 \n\t"\
188 "paddw %%mm1, %%mm4 \n\t"\
189 "punpckhwd %%mm0, %%mm0 \n\t"\
190 "punpckhwd %%mm6, %%mm6 \n\t"\
191 "punpckhwd %%mm3, %%mm3 \n\t"\
192 "paddw %%mm7, %%mm0 \n\t"\
193 "paddw %%mm7, %%mm6 \n\t"\
194 "paddw %%mm7, %%mm3 \n\t"\
195 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
196 "packuswb %%mm0, %%mm2 \n\t"\
197 "packuswb %%mm6, %%mm5 \n\t"\
198 "packuswb %%mm3, %%mm4 \n\t"\
199 "pxor %%mm7, %%mm7 \n\t"
/* Full-chroma-resolution bilinear path: vertically interpolates between
   buf0/buf1 (luma weight yalpha1 in %6) and uvbuf0/uvbuf1 (chroma weight
   uvalpha1 in %7, second chroma plane at +4096 bytes), then converts to RGB
   using the MANGLEd global coefficient constants (w80/w400, yCoeff, ub/ug/
   vr/vgCoeff).  Leaves packed B in mm3, R in mm0, G in mm1 for the caller
   to interleave.
   NOTE(review): loop label and a few interleaved lines are missing from
   this extract. */
201 #define FULL_YSCALEYUV2RGB \
202 "pxor %%mm7, %%mm7 \n\t"\
203 "movd %6, %%mm6 \n\t" /*yalpha1*/\
204 "punpcklwd %%mm6, %%mm6 \n\t"\
205 "punpcklwd %%mm6, %%mm6 \n\t"\
206 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
207 "punpcklwd %%mm5, %%mm5 \n\t"\
208 "punpcklwd %%mm5, %%mm5 \n\t"\
209 "xor %%"REG_a", %%"REG_a" \n\t"\
212 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
213 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
214 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
215 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
216 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
217 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
218 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
219 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
220 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
221 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
222 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
223 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
224 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
225 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
226 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
227 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
228 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
229 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
232 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
233 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
234 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
235 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
236 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
237 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
238 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
241 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
242 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
243 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
244 "paddw %%mm1, %%mm3 \n\t" /* B*/\
245 "paddw %%mm1, %%mm0 \n\t" /* R*/\
246 "packuswb %%mm3, %%mm3 \n\t"\
248 "packuswb %%mm0, %%mm0 \n\t"\
249 "paddw %%mm4, %%mm2 \n\t"\
250 "paddw %%mm2, %%mm1 \n\t" /* G*/\
252 "packuswb %%mm1, %%mm1 \n\t"
/* Two-row bilinear blend for packed-YUV output (no RGB conversion).
   Pre-shifts both filter weights right by 3 in the context block (#c), then
   per iteration blends uvbuf0/uvbuf1 into mm3/mm4 and buf0/buf1 into
   mm1/mm7, using psraw $7 so the results are already in 8-bit range for
   WRITEYUY2.  The loop label following "xor index,index" is on a line
   missing from this extract. */
255 #define REAL_YSCALEYUV2PACKED(index, c) \
256 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
257 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
258 "psraw $3, %%mm0 \n\t"\
259 "psraw $3, %%mm1 \n\t"\
260 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
261 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
262 "xor "#index", "#index" \n\t"\
265 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
266 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
267 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
268 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
269 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
270 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
271 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
272 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
273 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
274 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
275 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
276 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
277 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
278 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
279 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
280 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
281 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
282 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
283 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
284 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
285 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
286 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
287 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
288 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
289 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
291 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* Two-row bilinear blend followed by YUV->RGB using the per-context (#c)
   coefficient tables.  Same interpolation as REAL_YSCALEYUV2PACKED but with
   psraw $4 headroom, then the same subtract/pmulhw/interleave/pack sequence
   as YSCALEYUV2RGBX.  On exit: B in mm2, G in mm4, R in mm5, mm7 = 0.
   The loop label after "xor index,index" is on a missing line. */
293 #define REAL_YSCALEYUV2RGB(index, c) \
294 "xor "#index", "#index" \n\t"\
297 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
298 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
299 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
300 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
301 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
302 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
303 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
304 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
305 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
306 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
307 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
308 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
309 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
310 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
311 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
312 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
313 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
314 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
315 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
316 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
317 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
318 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
319 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
320 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
321 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
322 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
323 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
324 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
325 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
326 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
327 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
328 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
329 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
330 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
331 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
332 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
333 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
334 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
335 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
336 "paddw %%mm3, %%mm4 \n\t"\
337 "movq %%mm2, %%mm0 \n\t"\
338 "movq %%mm5, %%mm6 \n\t"\
339 "movq %%mm4, %%mm3 \n\t"\
340 "punpcklwd %%mm2, %%mm2 \n\t"\
341 "punpcklwd %%mm5, %%mm5 \n\t"\
342 "punpcklwd %%mm4, %%mm4 \n\t"\
343 "paddw %%mm1, %%mm2 \n\t"\
344 "paddw %%mm1, %%mm5 \n\t"\
345 "paddw %%mm1, %%mm4 \n\t"\
346 "punpckhwd %%mm0, %%mm0 \n\t"\
347 "punpckhwd %%mm6, %%mm6 \n\t"\
348 "punpckhwd %%mm3, %%mm3 \n\t"\
349 "paddw %%mm7, %%mm0 \n\t"\
350 "paddw %%mm7, %%mm6 \n\t"\
351 "paddw %%mm7, %%mm3 \n\t"\
352 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
353 "packuswb %%mm0, %%mm2 \n\t"\
354 "packuswb %%mm6, %%mm5 \n\t"\
355 "packuswb %%mm3, %%mm4 \n\t"\
356 "pxor %%mm7, %%mm7 \n\t"
357 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/* Single-row fast path for packed-YUV output (no vertical blending):
   reads only buf0/uvbuf0 and shifts straight down to 8-bit range
   (psraw $7).  Chroma lands in mm3/mm4, luma in mm1/mm7. */
359 #define REAL_YSCALEYUV2PACKED1(index, c) \
360 "xor "#index", "#index" \n\t"\
363 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
364 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
365 "psraw $7, %%mm3 \n\t" \
366 "psraw $7, %%mm4 \n\t" \
367 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
368 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
369 "psraw $7, %%mm1 \n\t" \
370 "psraw $7, %%mm7 \n\t" \
372 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* Single-row fast path + YUV->RGB: like REAL_YSCALEYUV2RGB but reads only
   buf0/uvbuf0 (no second-row blend), using psraw $4 headroom before the
   coefficient multiplies.  On exit: B in mm2, G in mm4, R in mm5, mm7 = 0. */
374 #define REAL_YSCALEYUV2RGB1(index, c) \
375 "xor "#index", "#index" \n\t"\
378 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
379 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
380 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
381 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
382 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
383 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
384 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
385 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
386 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
387 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
388 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
389 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
390 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
391 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
392 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
393 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
394 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
395 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
396 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
397 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
398 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
399 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
400 "paddw %%mm3, %%mm4 \n\t"\
401 "movq %%mm2, %%mm0 \n\t"\
402 "movq %%mm5, %%mm6 \n\t"\
403 "movq %%mm4, %%mm3 \n\t"\
404 "punpcklwd %%mm2, %%mm2 \n\t"\
405 "punpcklwd %%mm5, %%mm5 \n\t"\
406 "punpcklwd %%mm4, %%mm4 \n\t"\
407 "paddw %%mm1, %%mm2 \n\t"\
408 "paddw %%mm1, %%mm5 \n\t"\
409 "paddw %%mm1, %%mm4 \n\t"\
410 "punpckhwd %%mm0, %%mm0 \n\t"\
411 "punpckhwd %%mm6, %%mm6 \n\t"\
412 "punpckhwd %%mm3, %%mm3 \n\t"\
413 "paddw %%mm7, %%mm0 \n\t"\
414 "paddw %%mm7, %%mm6 \n\t"\
415 "paddw %%mm7, %%mm3 \n\t"\
416 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
417 "packuswb %%mm0, %%mm2 \n\t"\
418 "packuswb %%mm6, %%mm5 \n\t"\
419 "packuswb %%mm3, %%mm4 \n\t"\
420 "pxor %%mm7, %%mm7 \n\t"
421 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* Packed-YUV fast path that averages the two chroma rows: sums
   uvbuf0+uvbuf1 and shifts by 8 (paddw + psrlw $8), while luma still comes
   from buf0 only (psraw $7).  Chroma in mm3/mm4, luma in mm1/mm7. */
423 #define REAL_YSCALEYUV2PACKED1b(index, c) \
424 "xor "#index", "#index" \n\t"\
427 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
428 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
429 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
430 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
431 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
432 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
433 "psrlw $8, %%mm3 \n\t" \
434 "psrlw $8, %%mm4 \n\t" \
435 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
436 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
437 "psraw $7, %%mm1 \n\t" \
438 "psraw $7, %%mm7 \n\t"
439 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
/* RGB fast path with vertical chroma averaging: like YSCALEYUV2RGB1 but the
   chroma value is (uvbuf0+uvbuf1)>>5 (see the in-line FIXME about possible
   overflow of the 16-bit sum).  On exit: B in mm2, G in mm4, R in mm5,
   mm7 = 0. */
441 // do vertical chrominance interpolation
442 #define REAL_YSCALEYUV2RGB1b(index, c) \
443 "xor "#index", "#index" \n\t"\
446 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
447 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
448 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
449 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
450 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
451 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
452 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
453 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
454 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
455 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
456 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
457 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
458 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
459 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
460 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
461 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
462 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
463 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
464 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
465 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
466 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
467 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
468 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
469 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
470 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
471 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
472 "paddw %%mm3, %%mm4 \n\t"\
473 "movq %%mm2, %%mm0 \n\t"\
474 "movq %%mm5, %%mm6 \n\t"\
475 "movq %%mm4, %%mm3 \n\t"\
476 "punpcklwd %%mm2, %%mm2 \n\t"\
477 "punpcklwd %%mm5, %%mm5 \n\t"\
478 "punpcklwd %%mm4, %%mm4 \n\t"\
479 "paddw %%mm1, %%mm2 \n\t"\
480 "paddw %%mm1, %%mm5 \n\t"\
481 "paddw %%mm1, %%mm4 \n\t"\
482 "punpckhwd %%mm0, %%mm0 \n\t"\
483 "punpckhwd %%mm6, %%mm6 \n\t"\
484 "punpckhwd %%mm3, %%mm3 \n\t"\
485 "paddw %%mm7, %%mm0 \n\t"\
486 "paddw %%mm7, %%mm6 \n\t"\
487 "paddw %%mm7, %%mm3 \n\t"\
488 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
489 "packuswb %%mm0, %%mm2 \n\t"\
490 "packuswb %%mm6, %%mm5 \n\t"\
491 "packuswb %%mm3, %%mm4 \n\t"\
492 "pxor %%mm7, %%mm7 \n\t"
493 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* 32bpp writer: interleaves the B/G/R bytes left in mm2/mm4/mm5 (mm7 = 0)
   into four 0RGB dwords via punpcklbw/punpckhbw + punpcklwd/punpckhwd and
   streams 32 bytes, then advances index by 8 pixels.  The conditional
   back-jump after the cmp is on a line missing from this extract. */
495 #define REAL_WRITEBGR32(dst, dstw, index) \
496 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
497 "movq %%mm2, %%mm1 \n\t" /* B */\
498 "movq %%mm5, %%mm6 \n\t" /* R */\
499 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
500 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
501 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
502 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
503 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
504 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
505 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
506 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
507 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
508 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
510 MOVNTQ(%%mm0, (dst, index, 4))\
511 MOVNTQ(%%mm2, 8(dst, index, 4))\
512 MOVNTQ(%%mm1, 16(dst, index, 4))\
513 MOVNTQ(%%mm3, 24(dst, index, 4))\
515 "add $8, "#index" \n\t"\
516 "cmp "#dstw", "#index" \n\t"\
518 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/* RGB565 writer: masks B/R to 5 bits and G to 6 (bF8/bFC), shifts the
   channels into place, ORs them into 16bpp words and streams 16 bytes
   (8 pixels).  Some shift lines and the loop jump are on lines missing
   from this extract. */
520 #define REAL_WRITEBGR16(dst, dstw, index) \
521 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
522 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
523 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
524 "psrlq $3, %%mm2 \n\t"\
526 "movq %%mm2, %%mm1 \n\t"\
527 "movq %%mm4, %%mm3 \n\t"\
529 "punpcklbw %%mm7, %%mm3 \n\t"\
530 "punpcklbw %%mm5, %%mm2 \n\t"\
531 "punpckhbw %%mm7, %%mm4 \n\t"\
532 "punpckhbw %%mm5, %%mm1 \n\t"\
534 "psllq $3, %%mm3 \n\t"\
535 "psllq $3, %%mm4 \n\t"\
537 "por %%mm3, %%mm2 \n\t"\
538 "por %%mm4, %%mm1 \n\t"\
540 MOVNTQ(%%mm2, (dst, index, 2))\
541 MOVNTQ(%%mm1, 8(dst, index, 2))\
543 "add $8, "#index" \n\t"\
544 "cmp "#dstw", "#index" \n\t"\
546 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
/* RGB555 writer: same structure as WRITEBGR16 but all three channels are
   masked to 5 bits (bF8) and R gets an extra psrlq $1 to drop into the
   15-bit layout. */
548 #define REAL_WRITEBGR15(dst, dstw, index) \
549 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
550 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
551 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
552 "psrlq $3, %%mm2 \n\t"\
553 "psrlq $1, %%mm5 \n\t"\
555 "movq %%mm2, %%mm1 \n\t"\
556 "movq %%mm4, %%mm3 \n\t"\
558 "punpcklbw %%mm7, %%mm3 \n\t"\
559 "punpcklbw %%mm5, %%mm2 \n\t"\
560 "punpckhbw %%mm7, %%mm4 \n\t"\
561 "punpckhbw %%mm5, %%mm1 \n\t"\
563 "psllq $2, %%mm3 \n\t"\
564 "psllq $2, %%mm4 \n\t"\
566 "por %%mm3, %%mm2 \n\t"\
567 "por %%mm4, %%mm1 \n\t"\
569 MOVNTQ(%%mm2, (dst, index, 2))\
570 MOVNTQ(%%mm1, 8(dst, index, 2))\
572 "add $8, "#index" \n\t"\
573 "cmp "#dstw", "#index" \n\t"\
575 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
/* Legacy 24bpp writer: expands to 0RGB dwords like WRITEBGR32, then
   squeezes the four dwords into three 8-byte groups by shift/mask/or with
   the MANGLEd bitmask constants.  `dst` is advanced by 24 bytes per 8
   pixels.  Kept for reference; WRITEBGR24 dispatches to the MMX/MMX2
   versions below instead. */
577 #define WRITEBGR24OLD(dst, dstw, index) \
578 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
579 "movq %%mm2, %%mm1 \n\t" /* B */\
580 "movq %%mm5, %%mm6 \n\t" /* R */\
581 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
582 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
583 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
584 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
585 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
586 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
587 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
588 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
589 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
590 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
592 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
593 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
594 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
595 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
596 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
597 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
598 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
599 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
601 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
602 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
603 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
604 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
605 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
606 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
607 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
608 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
609 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
610 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
611 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
612 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
613 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
615 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
616 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
617 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
618 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
619 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
620 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
621 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
622 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
624 MOVNTQ(%%mm0, (dst))\
625 MOVNTQ(%%mm2, 8(dst))\
626 MOVNTQ(%%mm3, 16(dst))\
627 "add $24, "#dst" \n\t"\
629 "add $8, "#index" \n\t"\
630 "cmp "#dstw", "#index" \n\t"\
/* 24bpp writer, plain-MMX flavour: builds 0RGBRGB0 qwords with \
   psllq/punpckhdq, then shifts and ORs adjacent qwords together so three \
   MOVNTQs emit 24 packed bytes per 8 pixels. */\
633 #define WRITEBGR24MMX(dst, dstw, index) \
634 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
635 "movq %%mm2, %%mm1 \n\t" /* B */\
636 "movq %%mm5, %%mm6 \n\t" /* R */\
637 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
638 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
639 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
640 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
641 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
642 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
643 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
644 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
645 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
646 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
648 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
649 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
650 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
651 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
653 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
654 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
655 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
656 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
658 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
659 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
660 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
661 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
663 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
664 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
665 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
666 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
667 MOVNTQ(%%mm0, (dst))\
669 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
670 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
671 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
672 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
673 MOVNTQ(%%mm6, 8(dst))\
675 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
676 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
677 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
678 MOVNTQ(%%mm5, 16(dst))\
680 "add $24, "#dst" \n\t"\
682 "add $8, "#index" \n\t"\
683 "cmp "#dstw", "#index" \n\t"\
/* 24bpp writer, MMX2 flavour: uses pshufw to replicate channel bytes into \
   position, masks them with the M24A/M24B/M24C constants and ORs the three \
   output qwords directly — fewer shifts than the plain-MMX version. \
   WRITEBGR24 below selects the MMX2 or MMX variant; the #if/#else lines \
   that guard the two definitions are missing from this extract. */\
686 #define WRITEBGR24MMX2(dst, dstw, index) \
687 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
688 "movq "MANGLE(M24A)", %%mm0 \n\t"\
689 "movq "MANGLE(M24C)", %%mm7 \n\t"\
690 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
691 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
692 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
694 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
695 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
696 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
698 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
699 "por %%mm1, %%mm6 \n\t"\
700 "por %%mm3, %%mm6 \n\t"\
701 MOVNTQ(%%mm6, (dst))\
703 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
704 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
705 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
706 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
708 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
709 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
710 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
712 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
713 "por %%mm3, %%mm6 \n\t"\
714 MOVNTQ(%%mm6, 8(dst))\
716 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
717 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
718 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
720 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
721 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
722 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
724 "por %%mm1, %%mm3 \n\t"\
725 "por %%mm3, %%mm6 \n\t"\
726 MOVNTQ(%%mm6, 16(dst))\
728 "add $24, "#dst" \n\t"\
730 "add $8, "#index" \n\t"\
731 "cmp "#dstw", "#index" \n\t"\
736 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
739 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/* YUY2 writer: packs luma (mm1/mm7) and chroma (mm3/mm4) words to bytes,
   interleaves U and V, then interleaves Y with UV to produce YUYV and
   streams 16 bytes (8 pixels). */
742 #define REAL_WRITEYUY2(dst, dstw, index) \
743 "packuswb %%mm3, %%mm3 \n\t"\
744 "packuswb %%mm4, %%mm4 \n\t"\
745 "packuswb %%mm7, %%mm1 \n\t"\
746 "punpcklbw %%mm4, %%mm3 \n\t"\
747 "movq %%mm1, %%mm7 \n\t"\
748 "punpcklbw %%mm3, %%mm1 \n\t"\
749 "punpckhbw %%mm3, %%mm7 \n\t"\
751 MOVNTQ(%%mm1, (dst, index, 2))\
752 MOVNTQ(%%mm7, 8(dst, index, 2))\
754 "add $8, "#index" \n\t"\
755 "cmp "#dstw", "#index" \n\t"\
757 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* Vertical multi-tap scale to planar YV12: on the MMX path runs
   YSCALEYUV2YV12X over U (offset 0), V (offset 4096) and Y, each asm block
   reading the filter list from c->redDither-relative context memory;
   otherwise falls back to the AltiVec or plain C implementation.
   NOTE(review): the enclosing #ifdef lines, braces and asm statement
   delimiters are missing from this extract — the "p" constraint on
   chrDstW/dstW is unusual, verify against the full file. */
760 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
761 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
762 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
/* U plane */
768 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
769 :: "r" (&c->redDither),
770 "r" (uDest), "p" (chrDstW)
771 : "%"REG_a, "%"REG_d, "%"REG_S
/* V plane (second half of each chroma row, +4096 bytes) */
775 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
776 :: "r" (&c->redDither),
777 "r" (vDest), "p" (chrDstW)
778 : "%"REG_a, "%"REG_d, "%"REG_S
/* Y plane */
783 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
784 :: "r" (&c->redDither),
785 "r" (dest), "p" (dstW)
786 : "%"REG_a, "%"REG_d, "%"REG_S
/* non-MMX fallbacks: AltiVec if available, otherwise plain C */
790 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
791 chrFilter, chrSrc, chrFilterSize,
792 dest, uDest, vDest, dstW, chrDstW);
794 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
795 chrFilter, chrSrc, chrFilterSize,
796 dest, uDest, vDest, dstW, chrDstW);
797 #endif //!HAVE_ALTIVEC
/* NV12/NV21 (interleaved-chroma) output: no asm fast path exists, so this
   template always delegates to the C implementation. */
801 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
802 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
803 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
805 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
806 chrFilter, chrSrc, chrFilterSize,
807 dest, uDest, dstW, chrDstW, dstFormat);
/* Unfiltered 1:1 vertical copy to planar output.  Asm path (fragments of
   the operand lists visible below) shifts 16-bit samples down by 7; the C
   fallback does the same shift and clips to 0..255 for luma and both
   chroma planes.
   NOTE(review): the asm bodies, loop bodies and clip branches for the
   low side (val<0 / u<0 / v<0) fall on lines missing from this extract. */
810 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
811 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
818 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
825 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
833 :: "r" (lumSrc + dstW), "r" (dest + dstW),
839 for(i=0; i<dstW; i++)
841 int val= lumSrc[i]>>7;
852 for(i=0; i<chrDstW; i++)
855 int v=chrSrc[i + 2048]>>7;
859 else if (u>255) u=255;
861 else if (v>255) v=255;
/* NOTE(review): the opening of the block comment below and most of the
   function body (asm openers, dstFormat switch, braces) are missing from
   this extract.  What remains shows the dispatch: BGR32 via WRITEBGR32,
   BGR24 via WRITEBGR24 with REG_b as the destination pointer (dest +
   3*index, see FIXME), BGR15/BGR16 with optional ordered dithering
   (b5/g5|g6/r5Dither), YUY2 via WRITEYUY2, then AltiVec and C fallbacks. */
872 * vertical scale YV12 to RGB
874 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
875 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
876 uint8_t *dest, long dstW, long dstY)
886 WRITEBGR32(%4, %5, %%REGa)
888 :: "r" (&c->redDither),
889 "m" (dummy), "m" (dummy), "m" (dummy),
890 "r" (dest), "m" (dstW)
891 : "%"REG_a, "%"REG_d, "%"REG_S
/* BGR24: REG_b = dest + 3*index */
899 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
900 "add %4, %%"REG_b" \n\t"
901 WRITEBGR24(%%REGb, %5, %%REGa)
903 :: "r" (&c->redDither),
904 "m" (dummy), "m" (dummy), "m" (dummy),
905 "r" (dest), "m" (dstW)
906 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
914 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* optional 5-bit ordered dithering before the RGB555 pack */
916 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
917 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
918 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
921 WRITEBGR15(%4, %5, %%REGa)
923 :: "r" (&c->redDither),
924 "m" (dummy), "m" (dummy), "m" (dummy),
925 "r" (dest), "m" (dstW)
926 : "%"REG_a, "%"REG_d, "%"REG_S
934 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* 5/6/5-bit dithering before the RGB565 pack */
936 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
937 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
938 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
941 WRITEBGR16(%4, %5, %%REGa)
943 :: "r" (&c->redDither),
944 "m" (dummy), "m" (dummy), "m" (dummy),
945 "r" (dest), "m" (dstW)
946 : "%"REG_a, "%"REG_d, "%"REG_S
954 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* YUY2: drop the extra 3 bits of precision before packing */
956 "psraw $3, %%mm3 \n\t"
957 "psraw $3, %%mm4 \n\t"
958 "psraw $3, %%mm1 \n\t"
959 "psraw $3, %%mm7 \n\t"
960 WRITEYUY2(%4, %5, %%REGa)
962 :: "r" (&c->redDither),
963 "m" (dummy), "m" (dummy), "m" (dummy),
964 "r" (dest), "m" (dstW)
965 : "%"REG_a, "%"REG_d, "%"REG_S
972 /* The following list of supported dstFormat values should
973 match what's found in the body of altivec_yuv2packedX() */
974 if(c->dstFormat==IMGFMT_ABGR || c->dstFormat==IMGFMT_BGRA ||
975 c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
976 c->dstFormat==IMGFMT_RGBA || c->dstFormat==IMGFMT_ARGB)
977 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
978 chrFilter, chrSrc, chrFilterSize,
982 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
983 chrFilter, chrSrc, chrFilterSize,
990 * vertical bilinear scale YV12 to RGB
/*
 * yuv2packed2: blends two luma lines (buf0/buf1, weight yalpha) and two
 * chroma lines (uvbuf0/uvbuf1, weight uvalpha), then converts the result
 * to the packed output format selected at runtime (BGR32/BGR24/BGR15/
 * BGR16/YUY2).  MMX/MMX2 asm paths first, plain C fallbacks last.
 * NOTE(review): this extract elides many lines of the original file;
 * comments below describe only what the visible code shows.
 */
992 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
993 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
/* x ^ 4095 == 4095 - x for 12-bit x, so alpha and alpha1 sum to 4095 */
995 int yalpha1=yalpha^4095;
996 int uvalpha1=uvalpha^4095;
/* full horizontal chroma interpolation requested by the caller */
1000 if(flags&SWS_FULL_CHR_H_INT)
/* BGR32 path: interleave B,G then R,0 bytes/words to build BGR0 pixels */
1010 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1011 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1013 "movq %%mm3, %%mm1 \n\t"
1014 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1015 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
/* store 4 BGR0 pixels (movntq with MMX2, plain movq otherwise) */
1017 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1018 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1020 "add $4, %%"REG_a" \n\t"
1021 "cmp %5, %%"REG_a" \n\t"
1025 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1026 "m" (yalpha1), "m" (uvalpha1)
/* BGR24 path: build BGR0 pixels, then shuffle/mask them down to 24bpp */
1036 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1037 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1039 "movq %%mm3, %%mm1 \n\t"
1040 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1041 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1043 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1044 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1045 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1046 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1047 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1048 "movq %%mm1, %%mm2 \n\t"
1049 "psllq $48, %%mm1 \n\t" // 000000BG
1050 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1052 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1053 "psrld $16, %%mm2 \n\t" // R000R000
1054 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1055 "por %%mm2, %%mm1 \n\t" // RBGRR000
/* dest advances by 3 bytes/pixel: REG_b = dest + i (i in REG_a) */
1057 "mov %4, %%"REG_b" \n\t"
1058 "add %%"REG_a", %%"REG_b" \n\t"
/* MMX2 variant uses non-temporal stores ... */
1062 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1063 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
/* ... plain MMX variant stores the same 12 bytes as three movd */
1065 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1066 "psrlq $32, %%mm3 \n\t"
1067 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1068 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1070 "add $4, %%"REG_a" \n\t"
1071 "cmp %5, %%"REG_a" \n\t"
1074 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1075 "m" (yalpha1), "m" (uvalpha1)
1076 : "%"REG_a, "%"REG_b
/* BGR15 path: dither, widen to words, shift into 1:5:5:5 bit positions */
1084 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1085 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1086 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1088 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1089 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1090 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1092 "psrlw $3, %%mm3 \n\t"
1093 "psllw $2, %%mm1 \n\t"
1094 "psllw $7, %%mm0 \n\t"
1095 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1096 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1098 "por %%mm3, %%mm1 \n\t"
1099 "por %%mm1, %%mm0 \n\t"
1101 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1103 "add $4, %%"REG_a" \n\t"
1104 "cmp %5, %%"REG_a" \n\t"
1107 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1108 "m" (yalpha1), "m" (uvalpha1)
/* BGR16 path: same idea with 5:6:5 shifts/masks and 6-bit green dither */
1117 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1118 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1119 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1121 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1122 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1123 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1125 "psrlw $3, %%mm3 \n\t"
1126 "psllw $3, %%mm1 \n\t"
1127 "psllw $8, %%mm0 \n\t"
1128 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1129 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1131 "por %%mm3, %%mm1 \n\t"
1132 "por %%mm1, %%mm0 \n\t"
1134 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1136 "add $4, %%"REG_a" \n\t"
1137 "cmp %5, %%"REG_a" \n\t"
1140 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1141 "m" (yalpha1), "m" (uvalpha1)
/* C fallback for the full-chroma case (per-pixel table lookups);
   uvbuf holds U at [i] and V at [i+2048] */
1150 if(dstFormat==IMGFMT_BGR32)
1153 #ifdef WORDS_BIGENDIAN
1156 for(i=0;i<dstW;i++){
1157 // vertical linear interpolation && yuv2rgb in a single step:
1158 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1159 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1160 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1161 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1162 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1163 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1167 else if(dstFormat==IMGFMT_BGR24)
1170 for(i=0;i<dstW;i++){
1171 // vertical linear interpolation && yuv2rgb in a single step:
1172 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1173 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1174 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1175 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1176 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1177 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
/* 16/15 bit C fallbacks use precomputed per-component clip tables that
   already place each channel at its packed bit position */
1181 else if(dstFormat==IMGFMT_BGR16)
1184 for(i=0;i<dstW;i++){
1185 // vertical linear interpolation && yuv2rgb in a single step:
1186 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1187 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1188 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1190 ((uint16_t*)dest)[i] =
1191 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1192 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1193 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1196 else if(dstFormat==IMGFMT_BGR15)
1199 for(i=0;i<dstW;i++){
1200 // vertical linear interpolation && yuv2rgb in a single step:
1201 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1202 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1203 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1205 ((uint16_t*)dest)[i] =
1206 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1207 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1208 clip_table15r[(Y + yuvtab_3343[V]) >>13];
/* MMX fast path: one asm block per output format; ebx/rbx is saved and
   restored by hand because it may be the PIC register */
1216 switch(c->dstFormat)
1218 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1221 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1222 "mov %4, %%"REG_b" \n\t"
1223 "push %%"REG_BP" \n\t"
1224 YSCALEYUV2RGB(%%REGBP, %5)
1225 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1226 "pop %%"REG_BP" \n\t"
1227 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1229 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1235 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1236 "mov %4, %%"REG_b" \n\t"
1237 "push %%"REG_BP" \n\t"
1238 YSCALEYUV2RGB(%%REGBP, %5)
1239 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1240 "pop %%"REG_BP" \n\t"
1241 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1242 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1248 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1249 "mov %4, %%"REG_b" \n\t"
1250 "push %%"REG_BP" \n\t"
1251 YSCALEYUV2RGB(%%REGBP, %5)
1252 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1254 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1255 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1256 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1259 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1260 "pop %%"REG_BP" \n\t"
1261 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1263 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1269 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1270 "mov %4, %%"REG_b" \n\t"
1271 "push %%"REG_BP" \n\t"
1272 YSCALEYUV2RGB(%%REGBP, %5)
1273 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1275 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1276 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1277 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1280 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1281 "pop %%"REG_BP" \n\t"
1282 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1283 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1289 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1290 "mov %4, %%"REG_b" \n\t"
1291 "push %%"REG_BP" \n\t"
1292 YSCALEYUV2PACKED(%%REGBP, %5)
1293 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1294 "pop %%"REG_BP" \n\t"
1295 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1296 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* generic C fallback covering all remaining formats */
1303 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1307 * YV12 to RGB without scaling or interpolating
/*
 * yuv2packed1: converts a single YV12 line to the packed dstFormat with
 * no vertical luma interpolation.  If uvalpha < 2048 the nearest chroma
 * line is used directly (YSCALEYUV2*1 macros); otherwise the two chroma
 * lines are averaged (YSCALEYUV2*1b macros).
 * NOTE(review): this extract elides many lines of the original file.
 */
1309 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1310 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1312 const int yalpha1=0;
1315 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1316 const int yalpha= 4096; //FIXME ...
/* full-chroma requests are delegated to the interpolating variant with
   yalpha==0, i.e. buf0 used for both lines */
1318 if(flags&SWS_FULL_CHR_H_INT)
1320 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1325 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
/* single-chroma-line MMX paths, one per output format; ebx/rbx saved by
   hand as it may be the PIC register */
1331 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1332 "mov %4, %%"REG_b" \n\t"
1333 "push %%"REG_BP" \n\t"
1334 YSCALEYUV2RGB1(%%REGBP, %5)
1335 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1336 "pop %%"REG_BP" \n\t"
1337 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1339 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1345 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1346 "mov %4, %%"REG_b" \n\t"
1347 "push %%"REG_BP" \n\t"
1348 YSCALEYUV2RGB1(%%REGBP, %5)
1349 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1350 "pop %%"REG_BP" \n\t"
1351 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1353 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1359 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1360 "mov %4, %%"REG_b" \n\t"
1361 "push %%"REG_BP" \n\t"
1362 YSCALEYUV2RGB1(%%REGBP, %5)
1363 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1365 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1366 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1367 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1369 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1370 "pop %%"REG_BP" \n\t"
1371 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1373 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1379 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1380 "mov %4, %%"REG_b" \n\t"
1381 "push %%"REG_BP" \n\t"
1382 YSCALEYUV2RGB1(%%REGBP, %5)
1383 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1385 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1386 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1387 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1390 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1391 "pop %%"REG_BP" \n\t"
1392 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1394 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1400 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1401 "mov %4, %%"REG_b" \n\t"
1402 "push %%"REG_BP" \n\t"
1403 YSCALEYUV2PACKED1(%%REGBP, %5)
1404 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1405 "pop %%"REG_BP" \n\t"
1406 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1408 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* averaged-chroma MMX paths (the ...1b macro variants) */
1420 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1421 "mov %4, %%"REG_b" \n\t"
1422 "push %%"REG_BP" \n\t"
1423 YSCALEYUV2RGB1b(%%REGBP, %5)
1424 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1425 "pop %%"REG_BP" \n\t"
1426 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1428 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1434 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1435 "mov %4, %%"REG_b" \n\t"
1436 "push %%"REG_BP" \n\t"
1437 YSCALEYUV2RGB1b(%%REGBP, %5)
1438 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1439 "pop %%"REG_BP" \n\t"
1440 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1442 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1448 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1449 "mov %4, %%"REG_b" \n\t"
1450 "push %%"REG_BP" \n\t"
1451 YSCALEYUV2RGB1b(%%REGBP, %5)
1452 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1454 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1455 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1456 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1458 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1459 "pop %%"REG_BP" \n\t"
1460 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1462 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1468 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1469 "mov %4, %%"REG_b" \n\t"
1470 "push %%"REG_BP" \n\t"
1471 YSCALEYUV2RGB1b(%%REGBP, %5)
1472 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1474 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1475 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1476 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1479 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1480 "pop %%"REG_BP" \n\t"
1481 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1483 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1489 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1490 "mov %4, %%"REG_b" \n\t"
1491 "push %%"REG_BP" \n\t"
1492 YSCALEYUV2PACKED1b(%%REGBP, %5)
1493 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1494 "pop %%"REG_BP" \n\t"
1495 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1497 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* C fallback, again split on the nearest-vs-averaged chroma choice */
1504 if( uvalpha < 2048 )
1506 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1508 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1512 //FIXME yuy2* can read up to 7 samples too many
/*
 * yuy2ToY: extracts the luma bytes (every second byte) from a YUY2 line
 * into dst.  The asm path masks with bm01010101 (0x00FF words) and packs
 * 16 input bytes down to 8 luma bytes per iteration; a plain C loop is
 * the fallback (its body is elided in this extract).
 */
1514 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1518 "movq "MANGLE(bm01010101)", %%mm2\n\t"
/* counter runs from -width up to 0; src/dst were biased by +width */
1519 "mov %0, %%"REG_a" \n\t"
1521 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1522 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1523 "pand %%mm2, %%mm0 \n\t"
1524 "pand %%mm2, %%mm1 \n\t"
1525 "packuswb %%mm1, %%mm0 \n\t"
1526 "movq %%mm0, (%2, %%"REG_a") \n\t"
1527 "add $8, %%"REG_a" \n\t"
1529 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1534 for(i=0; i<width; i++)
/*
 * yuy2ToUV: extracts chroma from two adjacent YUY2 lines into planar
 * dstU/dstV, averaging the two lines (see the C fallback: >>1 of the
 * sum).  In YUY2 the chroma bytes sit at odd offsets (4*i+1 = U,
 * 4*i+3 = V), hence the psrlw $8 before packing in the asm path.
 */
1539 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1541 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1543 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1544 "mov %0, %%"REG_a" \n\t"
1546 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1547 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1548 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1549 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
/* drop the luma bytes, keep the (averaged) chroma in the low bytes */
1552 "psrlw $8, %%mm0 \n\t"
1553 "psrlw $8, %%mm1 \n\t"
1554 "packuswb %%mm1, %%mm0 \n\t"
1555 "movq %%mm0, %%mm1 \n\t"
/* split interleaved UVUV... into V (high bytes) and U (low bytes) */
1556 "psrlw $8, %%mm0 \n\t"
1557 "pand %%mm4, %%mm1 \n\t"
1558 "packuswb %%mm0, %%mm0 \n\t"
1559 "packuswb %%mm1, %%mm1 \n\t"
1560 "movd %%mm0, (%4, %%"REG_a") \n\t"
1561 "movd %%mm1, (%3, %%"REG_a") \n\t"
1562 "add $4, %%"REG_a" \n\t"
1564 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
/* C fallback: average the two source lines per chroma sample */
1569 for(i=0; i<width; i++)
1571 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1572 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1577 //this is almost identical to the previous function, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
/*
 * uyvyToY: extracts luma from a UYVY line (luma sits in the odd bytes,
 * so psrlw $8 instead of the mask used by yuy2ToY); 8 luma bytes per
 * asm iteration, C fallback loop below (body elided in this extract).
 */
1578 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1582 "mov %0, %%"REG_a" \n\t"
1584 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1585 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1586 "psrlw $8, %%mm0 \n\t"
1587 "psrlw $8, %%mm1 \n\t"
1588 "packuswb %%mm1, %%mm0 \n\t"
1589 "movq %%mm0, (%2, %%"REG_a") \n\t"
1590 "add $8, %%"REG_a" \n\t"
1592 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1597 for(i=0; i<width; i++)
/*
 * uyvyToUV: extracts chroma from two adjacent UYVY lines into planar
 * dstU/dstV, averaging the lines.  In UYVY the chroma bytes sit at even
 * offsets (4*i+0 = U, 4*i+2 = V), hence pand with bm01010101 where the
 * YUY2 variant shifted.
 */
1602 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1604 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1606 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1607 "mov %0, %%"REG_a" \n\t"
1609 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1610 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1611 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1612 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
/* keep the even (chroma) bytes, drop the luma */
1615 "pand %%mm4, %%mm0 \n\t"
1616 "pand %%mm4, %%mm1 \n\t"
1617 "packuswb %%mm1, %%mm0 \n\t"
1618 "movq %%mm0, %%mm1 \n\t"
/* split interleaved UVUV... into V (high bytes) and U (low bytes) */
1619 "psrlw $8, %%mm0 \n\t"
1620 "pand %%mm4, %%mm1 \n\t"
1621 "packuswb %%mm0, %%mm0 \n\t"
1622 "packuswb %%mm1, %%mm1 \n\t"
1623 "movd %%mm0, (%4, %%"REG_a") \n\t"
1624 "movd %%mm1, (%3, %%"REG_a") \n\t"
1625 "add $4, %%"REG_a" \n\t"
1627 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
/* C fallback: average the two source lines per chroma sample */
1632 for(i=0; i<width; i++)
1634 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1635 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
/*
 * bgr32ToY: converts a BGR32 line (B in the low byte of each 32-bit
 * word) to 8-bit luma.  The rounding term 33<<(RGB2YUV_SHIFT-1) equals
 * 16.5 in fixed point: the +16 luma offset plus 0.5 for rounding.
 */
1640 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1643 for(i=0; i<width; i++)
1645 int b= ((uint32_t*)src)[i]&0xFF;
1646 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1647 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1649 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * bgr32ToUV: converts 2x2 BGR32 blocks (two pixels from each of two
 * lines) to one chroma sample.  l accumulates the B and R channels of
 * all four pixels (four 8-bit values per field fit in 10 bits, hence
 * the &0x3FF extraction); h accumulates G.  The >>(RGB2YUV_SHIFT+2)
 * divides by 4 to average the four pixels; +128 is the chroma offset.
 * NOTE(review): the r/g extraction lines are elided in this extract.
 */
1653 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1656 for(i=0; i<width; i++)
1658 const int a= ((uint32_t*)src1)[2*i+0];
1659 const int e= ((uint32_t*)src1)[2*i+1];
1660 const int c= ((uint32_t*)src2)[2*i+0];
1661 const int d= ((uint32_t*)src2)[2*i+1];
1662 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1663 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1664 const int b= l&0x3FF;
1668 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1669 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * bgr24ToY: converts a packed BGR24 line to 8-bit luma.  The asm path
 * loads four 3-byte pixels at offsets 0/3/6/9 (then 12/15/18/21),
 * multiplies each against bgr2YCoeff with pmaddwd, horizontally sums
 * with w1111, and emits 8 luma bytes per iteration.  REG_b holds the
 * byte offset (3x the pixel counter in REG_a).
 */
1673 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1677 "mov %2, %%"REG_a" \n\t"
1678 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1679 "movq "MANGLE(w1111)", %%mm5 \n\t"
1680 "pxor %%mm7, %%mm7 \n\t"
1681 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1684 PREFETCH" 64(%0, %%"REG_b") \n\t"
/* first 4 pixels: widen bytes to words and apply the Y coefficients */
1685 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1686 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
1687 "punpcklbw %%mm7, %%mm0 \n\t"
1688 "punpcklbw %%mm7, %%mm1 \n\t"
1689 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
1690 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
1691 "punpcklbw %%mm7, %%mm2 \n\t"
1692 "punpcklbw %%mm7, %%mm3 \n\t"
1693 "pmaddwd %%mm6, %%mm0 \n\t"
1694 "pmaddwd %%mm6, %%mm1 \n\t"
1695 "pmaddwd %%mm6, %%mm2 \n\t"
1696 "pmaddwd %%mm6, %%mm3 \n\t"
1697 #ifndef FAST_BGR2YV12
/* extra precision step skipped in the FAST variant */
1698 "psrad $8, %%mm0 \n\t"
1699 "psrad $8, %%mm1 \n\t"
1700 "psrad $8, %%mm2 \n\t"
1701 "psrad $8, %%mm3 \n\t"
1703 "packssdw %%mm1, %%mm0 \n\t"
1704 "packssdw %%mm3, %%mm2 \n\t"
1705 "pmaddwd %%mm5, %%mm0 \n\t"
1706 "pmaddwd %%mm5, %%mm2 \n\t"
1707 "packssdw %%mm2, %%mm0 \n\t"
1708 "psraw $7, %%mm0 \n\t"
/* second 4 pixels, same computation at offsets 12..21 */
1710 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1711 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
1712 "punpcklbw %%mm7, %%mm4 \n\t"
1713 "punpcklbw %%mm7, %%mm1 \n\t"
1714 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
1715 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
1716 "punpcklbw %%mm7, %%mm2 \n\t"
1717 "punpcklbw %%mm7, %%mm3 \n\t"
1718 "pmaddwd %%mm6, %%mm4 \n\t"
1719 "pmaddwd %%mm6, %%mm1 \n\t"
1720 "pmaddwd %%mm6, %%mm2 \n\t"
1721 "pmaddwd %%mm6, %%mm3 \n\t"
1722 #ifndef FAST_BGR2YV12
1723 "psrad $8, %%mm4 \n\t"
1724 "psrad $8, %%mm1 \n\t"
1725 "psrad $8, %%mm2 \n\t"
1726 "psrad $8, %%mm3 \n\t"
1728 "packssdw %%mm1, %%mm4 \n\t"
1729 "packssdw %%mm3, %%mm2 \n\t"
1730 "pmaddwd %%mm5, %%mm4 \n\t"
1731 "pmaddwd %%mm5, %%mm2 \n\t"
1732 "add $24, %%"REG_b" \n\t"
1733 "packssdw %%mm2, %%mm4 \n\t"
1734 "psraw $7, %%mm4 \n\t"
/* pack the 8 results and add the +16 luma offset */
1736 "packuswb %%mm4, %%mm0 \n\t"
1737 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1739 "movq %%mm0, (%1, %%"REG_a") \n\t"
1740 "add $8, %%"REG_a" \n\t"
1742 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1743 : "%"REG_a, "%"REG_b
/* C fallback (b/g/r extraction lines are elided in this extract);
   33<<(RGB2YUV_SHIFT-1) == +16 luma offset plus 0.5 rounding */
1747 for(i=0; i<width; i++)
1753 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * bgr24ToUV: converts 2x2 blocks of packed BGR24 (two pixels from each
 * of src1/src2) to one planar U and one V sample.  The asm path sums
 * horizontally adjacent pixels from both lines, averages (>> 2), then
 * applies bgr2UCoeff (mm6) and bgr2VCoeff via pmaddwd; results are
 * packed and offset by bgr2UVOffset (+128).  REG_b is the byte offset
 * (6x the chroma counter: 2 pixels x 3 bytes).
 */
1758 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1762 "mov %4, %%"REG_a" \n\t"
1763 "movq "MANGLE(w1111)", %%mm5 \n\t"
1764 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1765 "pxor %%mm7, %%mm7 \n\t"
1766 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
1767 "add %%"REG_b", %%"REG_b" \n\t"
1770 PREFETCH" 64(%0, %%"REG_b") \n\t"
1771 PREFETCH" 64(%1, %%"REG_b") \n\t"
/* MMX2/3DNow variant loads quadwords; plain MMX loads dwords below */
1772 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1773 "movq (%0, %%"REG_b"), %%mm0 \n\t"
1774 "movq (%1, %%"REG_b"), %%mm1 \n\t"
1775 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
1776 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
1779 "movq %%mm0, %%mm1 \n\t"
1780 "movq %%mm2, %%mm3 \n\t"
1781 "psrlq $24, %%mm0 \n\t"
1782 "psrlq $24, %%mm2 \n\t"
1785 "punpcklbw %%mm7, %%mm0 \n\t"
1786 "punpcklbw %%mm7, %%mm2 \n\t"
/* plain MMX path: gather the first pair of pixels from both lines */
1788 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1789 "movd (%1, %%"REG_b"), %%mm1 \n\t"
1790 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
1791 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
1792 "punpcklbw %%mm7, %%mm0 \n\t"
1793 "punpcklbw %%mm7, %%mm1 \n\t"
1794 "punpcklbw %%mm7, %%mm2 \n\t"
1795 "punpcklbw %%mm7, %%mm3 \n\t"
1796 "paddw %%mm1, %%mm0 \n\t"
1797 "paddw %%mm3, %%mm2 \n\t"
1798 "paddw %%mm2, %%mm0 \n\t"
1799 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
1800 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
1801 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
1802 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
1803 "punpcklbw %%mm7, %%mm4 \n\t"
1804 "punpcklbw %%mm7, %%mm1 \n\t"
1805 "punpcklbw %%mm7, %%mm2 \n\t"
1806 "punpcklbw %%mm7, %%mm3 \n\t"
1807 "paddw %%mm1, %%mm4 \n\t"
1808 "paddw %%mm3, %%mm2 \n\t"
1809 "paddw %%mm4, %%mm2 \n\t"
/* divide the 4-pixel sums by 4 to average */
1810 "psrlw $2, %%mm0 \n\t"
1811 "psrlw $2, %%mm2 \n\t"
/* apply U (mm6) and V (bgr2VCoeff) weights to the averaged pixels */
1813 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1814 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1816 "pmaddwd %%mm0, %%mm1 \n\t"
1817 "pmaddwd %%mm2, %%mm3 \n\t"
1818 "pmaddwd %%mm6, %%mm0 \n\t"
1819 "pmaddwd %%mm6, %%mm2 \n\t"
1820 #ifndef FAST_BGR2YV12
1821 "psrad $8, %%mm0 \n\t"
1822 "psrad $8, %%mm1 \n\t"
1823 "psrad $8, %%mm2 \n\t"
1824 "psrad $8, %%mm3 \n\t"
1826 "packssdw %%mm2, %%mm0 \n\t"
1827 "packssdw %%mm3, %%mm1 \n\t"
1828 "pmaddwd %%mm5, %%mm0 \n\t"
1829 "pmaddwd %%mm5, %%mm1 \n\t"
1830 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1831 "psraw $7, %%mm0 \n\t"
/* second 2x2 block at byte offsets 12..21, same computation */
1833 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1834 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
1835 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
1836 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
1837 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
1840 "movq %%mm4, %%mm1 \n\t"
1841 "movq %%mm2, %%mm3 \n\t"
1842 "psrlq $24, %%mm4 \n\t"
1843 "psrlq $24, %%mm2 \n\t"
1846 "punpcklbw %%mm7, %%mm4 \n\t"
1847 "punpcklbw %%mm7, %%mm2 \n\t"
1849 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1850 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
1851 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
1852 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
1853 "punpcklbw %%mm7, %%mm4 \n\t"
1854 "punpcklbw %%mm7, %%mm1 \n\t"
1855 "punpcklbw %%mm7, %%mm2 \n\t"
1856 "punpcklbw %%mm7, %%mm3 \n\t"
1857 "paddw %%mm1, %%mm4 \n\t"
1858 "paddw %%mm3, %%mm2 \n\t"
1859 "paddw %%mm2, %%mm4 \n\t"
1860 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
1861 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
1862 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
1863 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
1864 "punpcklbw %%mm7, %%mm5 \n\t"
1865 "punpcklbw %%mm7, %%mm1 \n\t"
1866 "punpcklbw %%mm7, %%mm2 \n\t"
1867 "punpcklbw %%mm7, %%mm3 \n\t"
1868 "paddw %%mm1, %%mm5 \n\t"
1869 "paddw %%mm3, %%mm2 \n\t"
1870 "paddw %%mm5, %%mm2 \n\t"
/* mm5 was clobbered above, reload the w1111 constant */
1871 "movq "MANGLE(w1111)", %%mm5 \n\t"
1872 "psrlw $2, %%mm4 \n\t"
1873 "psrlw $2, %%mm2 \n\t"
1875 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1876 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1878 "pmaddwd %%mm4, %%mm1 \n\t"
1879 "pmaddwd %%mm2, %%mm3 \n\t"
1880 "pmaddwd %%mm6, %%mm4 \n\t"
1881 "pmaddwd %%mm6, %%mm2 \n\t"
1882 #ifndef FAST_BGR2YV12
1883 "psrad $8, %%mm4 \n\t"
1884 "psrad $8, %%mm1 \n\t"
1885 "psrad $8, %%mm2 \n\t"
1886 "psrad $8, %%mm3 \n\t"
1888 "packssdw %%mm2, %%mm4 \n\t"
1889 "packssdw %%mm3, %%mm1 \n\t"
1890 "pmaddwd %%mm5, %%mm4 \n\t"
1891 "pmaddwd %%mm5, %%mm1 \n\t"
1892 "add $24, %%"REG_b" \n\t"
1893 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1894 "psraw $7, %%mm4 \n\t"
/* interleave the two result pairs, split into U and V dwords, bias */
1896 "movq %%mm0, %%mm1 \n\t"
1897 "punpckldq %%mm4, %%mm0 \n\t"
1898 "punpckhdq %%mm4, %%mm1 \n\t"
1899 "packsswb %%mm1, %%mm0 \n\t"
1900 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1902 "movd %%mm0, (%2, %%"REG_a") \n\t"
1903 "punpckhdq %%mm0, %%mm0 \n\t"
1904 "movd %%mm0, (%3, %%"REG_a") \n\t"
1905 "add $4, %%"REG_a" \n\t"
1907 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1908 : "%"REG_a, "%"REG_b
/* C fallback: sum the 2x2 block per channel, >>2 averaging folded into
   the RGB2YUV_SHIFT+2; +128 chroma offset */
1912 for(i=0; i<width; i++)
1914 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1915 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1916 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1918 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1919 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * bgr16ToY: converts 5:6:5 packed pixels to 8-bit luma; red occupies
 * the top 5 bits (d>>11).  The 2* weights on R and B compensate for
 * their 5-bit (vs 6-bit green) precision, with the shift reduced by 2.
 * NOTE(review): the g/b extraction lines are elided in this extract.
 */
1924 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1927 for(i=0; i<width; i++)
1929 int d= ((uint16_t*)src)[i];
1932 int r= (d>>11)&0x1F;
1934 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/*
 * bgr16ToUV: converts pairs of 5:6:5 pixels (read as one 32-bit word
 * per line) to one chroma sample.  dl/dh sum the masked channel fields
 * of the two pixels per line; dh2 realigns the high fields so that a
 * combined value d can be sliced per channel (r from bits 11..17).
 * NOTE(review): several intermediate lines are elided in this extract.
 */
1938 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1941 for(i=0; i<width; i++)
1943 int d0= ((uint32_t*)src1)[i];
1944 int d1= ((uint32_t*)src2)[i];
1946 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1947 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1949 int dh2= (dh>>11) + (dh<<21);
1953 int r= (d>>11)&0x7F;
1955 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1956 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
/*
 * bgr15ToY: converts 1:5:5:5 packed pixels to 8-bit luma; red occupies
 * bits 10..14 (d>>10).  All channels are 5-bit here, so the shift is
 * reduced by 3 with unscaled coefficients.
 * NOTE(review): the g/b extraction lines are elided in this extract.
 */
1960 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1963 for(i=0; i<width; i++)
1965 int d= ((uint16_t*)src)[i];
1968 int r= (d>>10)&0x1F;
1970 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/*
 * bgr15ToUV: converts pairs of 1:5:5:5 pixels (read as one 32-bit word
 * per line) to one chroma sample, mirroring bgr16ToUV but with the
 * 15-bit field masks; r is taken from bits 10..16 of the combined sum.
 * NOTE(review): several intermediate lines are elided in this extract.
 */
1974 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1977 for(i=0; i<width; i++)
1979 int d0= ((uint32_t*)src1)[i];
1980 int d1= ((uint32_t*)src2)[i];
1982 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1983 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1985 int dh2= (dh>>11) + (dh<<21);
1989 int r= (d>>10)&0x7F;
1991 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1992 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
/*
 * rgb32ToY: like bgr32ToY, but with R in the low byte of each 32-bit
 * word (byte order swapped).  33<<(RGB2YUV_SHIFT-1) == +16 luma offset
 * plus 0.5 rounding.
 */
1997 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2000 for(i=0; i<width; i++)
2002 int r= ((uint32_t*)src)[i]&0xFF;
2003 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2004 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2006 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * rgb32ToUV: like bgr32ToUV, but with swapped byte order, so the low
 * field of the l sum is red (r= l&0x3FF) rather than blue.
 * NOTE(review): the g/b extraction lines are elided in this extract.
 */
2010 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2013 for(i=0; i<width; i++)
2015 const int a= ((uint32_t*)src1)[2*i+0];
2016 const int e= ((uint32_t*)src1)[2*i+1];
2017 const int c= ((uint32_t*)src2)[2*i+0];
2018 const int d= ((uint32_t*)src2)[2*i+1];
2019 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
2020 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
2021 const int r= l&0x3FF;
2025 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2026 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * rgb24ToY: packed RGB24 line to 8-bit luma (C only, no asm path).
 * NOTE(review): the r/g/b extraction lines are elided in this extract.
 */
2030 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2033 for(i=0; i<width; i++)
2039 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * rgb24ToUV: 2x2 packed RGB24 blocks to one chroma sample; mirrors
 * bgr24ToUV's C fallback with R at byte offset 0 instead of 2.  The
 * >>(RGB2YUV_SHIFT+2) folds in the divide-by-4 averaging; +128 is the
 * chroma offset.
 */
2043 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2046 for(i=0; i<width; i++)
2048 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2049 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2050 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2052 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2053 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2058 // Bilinear / Bicubic scaling
/*
 * hScale: generic horizontal FIR scaler.  For each output sample i,
 * convolves filterSize source bytes starting at filterPos[i] with
 * filter[filterSize*i..] and stores a 15-bit clamped result in dst.
 * MMX fast paths exist for filterSize 4 and 8 (two output samples per
 * iteration, pmaddwd + the w02 constant for the horizontal add); a
 * generic MMX loop, an AltiVec call, and a plain C loop handle the
 * rest.  The counter convention is counter = -2*dstW walking up to 0,
 * with dst/filterPos pre-biased accordingly.
 */
2059 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2060 int16_t *filter, int16_t *filterPos, long filterSize)
2063 assert(filterSize % 4 == 0 && filterSize>0);
2064 if(filterSize==4) // always true for upscaling, sometimes for down too
2066 long counter= -2*dstW;
2068 filterPos-= counter/2;
2071 "pxor %%mm7, %%mm7 \n\t"
2072 "movq "MANGLE(w02)", %%mm6 \n\t"
2073 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2074 "mov %%"REG_a", %%"REG_BP" \n\t"
/* load two source positions and the two 4-tap filters, multiply-acc */
2077 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2078 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2079 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2080 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2081 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2082 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2083 "punpcklbw %%mm7, %%mm0 \n\t"
2084 "punpcklbw %%mm7, %%mm2 \n\t"
2085 "pmaddwd %%mm1, %%mm0 \n\t"
2086 "pmaddwd %%mm2, %%mm3 \n\t"
2087 "psrad $8, %%mm0 \n\t"
2088 "psrad $8, %%mm3 \n\t"
2089 "packssdw %%mm3, %%mm0 \n\t"
2090 "pmaddwd %%mm6, %%mm0 \n\t"
2091 "packssdw %%mm0, %%mm0 \n\t"
2092 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2093 "add $4, %%"REG_BP" \n\t"
2096 "pop %%"REG_BP" \n\t"
2098 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* 8-tap variant: same structure with a second multiply-accumulate */
2102 else if(filterSize==8)
2104 long counter= -2*dstW;
2106 filterPos-= counter/2;
2109 "pxor %%mm7, %%mm7 \n\t"
2110 "movq "MANGLE(w02)", %%mm6 \n\t"
2111 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2112 "mov %%"REG_a", %%"REG_BP" \n\t"
2115 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2116 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2117 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2118 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2119 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2120 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2121 "punpcklbw %%mm7, %%mm0 \n\t"
2122 "punpcklbw %%mm7, %%mm2 \n\t"
2123 "pmaddwd %%mm1, %%mm0 \n\t"
2124 "pmaddwd %%mm2, %%mm3 \n\t"
2126 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2127 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2128 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2129 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2130 "punpcklbw %%mm7, %%mm4 \n\t"
2131 "punpcklbw %%mm7, %%mm2 \n\t"
2132 "pmaddwd %%mm1, %%mm4 \n\t"
2133 "pmaddwd %%mm2, %%mm5 \n\t"
2134 "paddd %%mm4, %%mm0 \n\t"
2135 "paddd %%mm5, %%mm3 \n\t"
2137 "psrad $8, %%mm0 \n\t"
2138 "psrad $8, %%mm3 \n\t"
2139 "packssdw %%mm3, %%mm0 \n\t"
2140 "pmaddwd %%mm6, %%mm0 \n\t"
2141 "packssdw %%mm0, %%mm0 \n\t"
2142 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2143 "add $4, %%"REG_BP" \n\t"
2146 "pop %%"REG_BP" \n\t"
2148 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* generic MMX path: inner loop over the filter taps, 4 at a time */
2154 uint8_t *offset = src+filterSize;
2155 long counter= -2*dstW;
2156 // filter-= counter*filterSize/2;
2157 filterPos-= counter/2;
2160 "pxor %%mm7, %%mm7 \n\t"
2161 "movq "MANGLE(w02)", %%mm6 \n\t"
2164 "mov %2, %%"REG_c" \n\t"
2165 "movzwl (%%"REG_c", %0), %%eax \n\t"
2166 "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2167 "mov %5, %%"REG_c" \n\t"
2168 "pxor %%mm4, %%mm4 \n\t"
2169 "pxor %%mm5, %%mm5 \n\t"
2171 "movq (%1), %%mm1 \n\t"
2172 "movq (%1, %6), %%mm3 \n\t"
2173 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2174 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2175 "punpcklbw %%mm7, %%mm0 \n\t"
2176 "punpcklbw %%mm7, %%mm2 \n\t"
2177 "pmaddwd %%mm1, %%mm0 \n\t"
2178 "pmaddwd %%mm2, %%mm3 \n\t"
2179 "paddd %%mm3, %%mm5 \n\t"
2180 "paddd %%mm0, %%mm4 \n\t"
2182 "add $4, %%"REG_c" \n\t"
2183 "cmp %4, %%"REG_c" \n\t"
2186 "psrad $8, %%mm4 \n\t"
2187 "psrad $8, %%mm5 \n\t"
2188 "packssdw %%mm5, %%mm4 \n\t"
2189 "pmaddwd %%mm6, %%mm4 \n\t"
2190 "packssdw %%mm4, %%mm4 \n\t"
2191 "mov %3, %%"REG_a" \n\t"
2192 "movd %%mm4, (%%"REG_a", %0) \n\t"
2196 : "+r" (counter), "+r" (filter)
2197 : "m" (filterPos), "m" (dst), "m"(offset),
2198 "m" (src), "r" (filterSize*2)
2199 : "%"REG_b, "%"REG_a, "%"REG_c
/* AltiVec implementation lives in swscale_altivec_template.c */
2204 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
/* plain C reference loop */
2207 for(i=0; i<dstW; i++)
2210 int srcPos= filterPos[i];
2212 // printf("filterPos: %d\n", filterPos[i]);
2213 for(j=0; j<filterSize; j++)
2215 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2216 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2218 // filter += hFilterSize;
/* clamp to unsigned 15 bit, the bicubic filter can overshoot */
2219 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2225 // *** horizontal scale Y line to temp buffer
// Horizontally scales one luma (Y) line of srcW 8-bit pixels from src into
// dst (dstWidth 16-bit samples). Packed-YUV/RGB inputs are first unpacked
// into an 8-bit Y line in formatConvBuffer. Scaling then goes through one of:
//   - the generic filter scaler RENAME(hScale) (non-fast-bilinear, or no MMX2),
//   - the runtime-generated MMX2 code pointed to by funnyYCode,
//   - a hand-written plain-x86 bilinear asm loop,
//   - a portable C bilinear fallback.
// xInc is a 16.16 fixed-point source step; the bilinear paths produce
// src[xx]<<7 + frac*(src[xx+1]-src[xx]) style output (7-bit blend weight).
2226 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2227 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2228 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2229 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2230 int32_t *mmx2FilterPos)
// Unpack packed/RGB sources to a plain Y line first, then scale from
// formatConvBuffer instead of the caller's buffer.
2232 if(srcFormat==IMGFMT_YUY2)
2234 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2235 src= formatConvBuffer;
2237 else if(srcFormat==IMGFMT_UYVY)
2239 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2240 src= formatConvBuffer;
2242 else if(srcFormat==IMGFMT_BGR32)
2244 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2245 src= formatConvBuffer;
2247 else if(srcFormat==IMGFMT_BGR24)
2249 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2250 src= formatConvBuffer;
2252 else if(srcFormat==IMGFMT_BGR16)
2254 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2255 src= formatConvBuffer;
2257 else if(srcFormat==IMGFMT_BGR15)
2259 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2260 src= formatConvBuffer;
2262 else if(srcFormat==IMGFMT_RGB32)
2264 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2265 src= formatConvBuffer;
2267 else if(srcFormat==IMGFMT_RGB24)
2269 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2270 src= formatConvBuffer;
2274 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2275 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2277 if(!(flags&SWS_FAST_BILINEAR))
// Exact filtered scaling through the generic horizontal scaler.
2280 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2282 else // Fast Bilinear upscale / crap downscale
2284 #if defined(ARCH_X86) || defined(ARCH_X86_64)
// MMX2 path: drive the runtime-generated scaler in funnyYCode.
// Register roles: REG_c = src, REG_D = dst, REG_d = mmx2Filter,
// REG_b = mmx2FilterPos, REG_a = i (see operand list below).
2290 "pxor %%mm7, %%mm7 \n\t"
2291 "mov %0, %%"REG_c" \n\t"
2292 "mov %1, %%"REG_D" \n\t"
2293 "mov %2, %%"REG_d" \n\t"
2294 "mov %3, %%"REG_b" \n\t"
2295 "xor %%"REG_a", %%"REG_a" \n\t" // i
2296 PREFETCH" (%%"REG_c") \n\t"
2297 PREFETCH" 32(%%"REG_c") \n\t"
2298 PREFETCH" 64(%%"REG_c") \n\t"
/* Two variants of FUNNY_Y_CODE follow; a preprocessor conditional
   (NOTE(review): its #if/#else lines sit between these fragments — confirm
   which condition, presumably PIC vs non-PIC) selects one. Both advance the
   src pointer by the next mmx2FilterPos entry and reset the inner counter. */
2302 #define FUNNY_Y_CODE \
2303 "movl (%%"REG_b"), %%esi \n\t"\
2305 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2306 "add %%"REG_S", %%"REG_c" \n\t"\
2307 "add %%"REG_a", %%"REG_D" \n\t"\
2308 "xor %%"REG_a", %%"REG_a" \n\t"\
2312 #define FUNNY_Y_CODE \
2313 "movl (%%"REG_b"), %%esi \n\t"\
2315 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2316 "add %%"REG_a", %%"REG_D" \n\t"\
2317 "xor %%"REG_a", %%"REG_a" \n\t"\
2330 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2332 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Patch up the right edge: positions that would read past srcW-1 are
// filled with the last source pixel (scaled by 128, i.e. <<7).
2334 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
// Plain x86 (no MMX2) path: split the 16.16 step into an integer part
// (added with carry from the fractional accumulator) and a 16-bit fraction.
2339 long xInc_shr16 = xInc >> 16;
2340 uint16_t xInc_mask = xInc & 0xffff;
2341 //NO MMX just normal asm ...
// REG_a = output index i (two pixels per iteration), REG_b = source
// position xx, ecx = fractional position (2*xalpha).
2343 "xor %%"REG_a", %%"REG_a" \n\t" // i
2344 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2345 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2348 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2349 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2350 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2351 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2352 "shll $16, %%edi \n\t"
2353 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2354 "mov %1, %%"REG_D" \n\t"
2355 "shrl $9, %%esi \n\t"
2356 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2357 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2358 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
// Second pixel of the unrolled-by-two loop body.
2360 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2361 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2362 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2363 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2364 "shll $16, %%edi \n\t"
2365 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2366 "mov %1, %%"REG_D" \n\t"
2367 "shrl $9, %%esi \n\t"
2368 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2369 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2370 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2373 "add $2, %%"REG_a" \n\t"
2374 "cmp %2, %%"REG_a" \n\t"
2378 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2379 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2382 } //if MMX2 can't be used
// Portable C bilinear fallback: xpos is a 16.16 fixed-point source
// position; xalpha is its top 7 fractional bits.
2386 unsigned int xpos=0;
2387 for(i=0;i<dstWidth;i++)
2389 register unsigned int xx=xpos>>16;
2390 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2391 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
// Horizontally scales one chroma line pair (U in src1, V in src2; srcW
// pixels each) into dst: U samples land at dst[0..], V samples at
// dst[2048..] (the two chroma planes are stored 2048 int16 entries apart,
// i.e. 4096 bytes, which matches the asm offsets below). Mirrors
// RENAME(hyscale): format conversion into formatConvBuffer first, then one
// of the generic filter scaler, the runtime-generated MMX2 code
// (funnyUVCode), the plain-x86 asm loop, or the C bilinear fallback.
2398 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2399 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2400 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2401 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2402 int32_t *mmx2FilterPos)
// Unpack packed/RGB sources into two 8-bit chroma lines (U at offset 0,
// V at offset 2048 of formatConvBuffer) before scaling.
2404 if(srcFormat==IMGFMT_YUY2)
2406 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2407 src1= formatConvBuffer;
2408 src2= formatConvBuffer+2048;
2410 else if(srcFormat==IMGFMT_UYVY)
2412 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2413 src1= formatConvBuffer;
2414 src2= formatConvBuffer+2048;
2416 else if(srcFormat==IMGFMT_BGR32)
2418 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2419 src1= formatConvBuffer;
2420 src2= formatConvBuffer+2048;
2422 else if(srcFormat==IMGFMT_BGR24)
2424 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2425 src1= formatConvBuffer;
2426 src2= formatConvBuffer+2048;
2428 else if(srcFormat==IMGFMT_BGR16)
2430 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2431 src1= formatConvBuffer;
2432 src2= formatConvBuffer+2048;
2434 else if(srcFormat==IMGFMT_BGR15)
2436 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2437 src1= formatConvBuffer;
2438 src2= formatConvBuffer+2048;
2440 else if(srcFormat==IMGFMT_RGB32)
2442 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2443 src1= formatConvBuffer;
2444 src2= formatConvBuffer+2048;
2446 else if(srcFormat==IMGFMT_RGB24)
2448 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2449 src1= formatConvBuffer;
2450 src2= formatConvBuffer+2048;
// Grayscale input: no chroma to scale (branch body not shown here).
2452 else if(isGray(srcFormat))
2458 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2459 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2461 if(!(flags&SWS_FAST_BILINEAR))
// Exact filtered scaling: U into dst, V into the second plane at dst+2048.
2464 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2465 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2467 else // Fast Bilinear upscale / crap downscale
2469 #if defined(ARCH_X86) || defined(ARCH_X86_64)
// MMX2 path: run funnyUVCode once per chroma plane. Register roles match
// the luma path: REG_c = source, REG_D = destination, REG_d = mmx2Filter,
// REG_b = mmx2FilterPos, REG_a = i.
2475 "pxor %%mm7, %%mm7 \n\t"
2476 "mov %0, %%"REG_c" \n\t"
2477 "mov %1, %%"REG_D" \n\t"
2478 "mov %2, %%"REG_d" \n\t"
2479 "mov %3, %%"REG_b" \n\t"
2480 "xor %%"REG_a", %%"REG_a" \n\t" // i
2481 PREFETCH" (%%"REG_c") \n\t"
2482 PREFETCH" 32(%%"REG_c") \n\t"
2483 PREFETCH" 64(%%"REG_c") \n\t"
/* Two variants of FUNNY_UV_CODE; a preprocessor conditional between them
   (NOTE(review): presumably PIC vs non-PIC — confirm) selects one, exactly
   as for FUNNY_Y_CODE in hyscale. */
2487 #define FUNNY_UV_CODE \
2488 "movl (%%"REG_b"), %%esi \n\t"\
2490 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2491 "add %%"REG_S", %%"REG_c" \n\t"\
2492 "add %%"REG_a", %%"REG_D" \n\t"\
2493 "xor %%"REG_a", %%"REG_a" \n\t"\
2497 #define FUNNY_UV_CODE \
2498 "movl (%%"REG_b"), %%esi \n\t"\
2500 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2501 "add %%"REG_a", %%"REG_D" \n\t"\
2502 "xor %%"REG_a", %%"REG_a" \n\t"\
// Second plane: reload counters, switch source to src2 (operand %5) and
// advance the destination by 4096 bytes (2048 uint16_t) to the V plane.
2510 "xor %%"REG_a", %%"REG_a" \n\t" // i
2511 "mov %5, %%"REG_c" \n\t" // src
2512 "mov %1, %%"REG_D" \n\t" // buf1
2513 "add $4096, %%"REG_D" \n\t"
2514 PREFETCH" (%%"REG_c") \n\t"
2515 PREFETCH" 32(%%"REG_c") \n\t"
2516 PREFETCH" 64(%%"REG_c") \n\t"
2523 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2524 "m" (funnyUVCode), "m" (src2)
2525 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Right-edge fixup: replicate the last source pixel (<<7) into both planes
// for positions that would read past srcW-1.
2527 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2529 // printf("%d %d %d\n", dstWidth, i, srcW);
2530 dst[i] = src1[srcW-1]*128;
2531 dst[i+2048] = src2[srcW-1]*128;
// Plain x86 (no MMX2) path: integer step + 16-bit fractional step, one
// U and one V sample per iteration.
2537 long xInc_shr16 = (long) (xInc >> 16);
2538 uint16_t xInc_mask = xInc & 0xffff;
2540 "xor %%"REG_a", %%"REG_a" \n\t" // i
2541 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2542 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2545 "mov %0, %%"REG_S" \n\t"
2546 "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
2547 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
2548 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2549 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2550 "shll $16, %%edi \n\t"
2551 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2552 "mov %1, %%"REG_D" \n\t"
2553 "shrl $9, %%esi \n\t"
2554 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
// Same interpolation for the V plane (source %5, +4096 byte destination).
2556 "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2557 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2558 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2559 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2560 "shll $16, %%edi \n\t"
2561 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2562 "mov %1, %%"REG_D" \n\t"
2563 "shrl $9, %%esi \n\t"
2564 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2566 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2567 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2568 "add $1, %%"REG_a" \n\t"
2569 "cmp %2, %%"REG_a" \n\t"
2572 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2573 which is needed to support GCC-4.0 */
2574 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2575 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2577 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2580 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2583 } //if MMX2 can't be used
// Portable C bilinear fallback for both chroma planes.
2587 unsigned int xpos=0;
2588 for(i=0;i<dstWidth;i++)
2590 register unsigned int xx=xpos>>16;
2591 register unsigned int xalpha=(xpos&0xFFFF)>>9;
/* NOTE(review): two alternative bilinear formulations follow; a
   preprocessor conditional between them presumably selects one — confirm
   which is active. The first weights with (xalpha^127), the second uses
   the same <<7 + delta*alpha form as hyscale. */
2592 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2593 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2595 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2596 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
// Main entry point for scaling one slice of the source picture.
// For each producible destination line: horizontally scale the required
// source lines into the lum/chr ring buffers (lumPixBuf/chrPixBuf, each
// logically vLumBufSize/vChrBufSize entries, addressed modulo that size),
// then vertically filter the buffered lines into the destination through
// the format-specific yuv2* output routines. Slices may arrive
// incrementally (srcSliceY/srcSliceH); progress is kept in the context
// (lumBufIndex, lastInLumBuf, ...) across calls.
// NOTE(review): returns dstY - lastDstY — apparently the number of output
// lines produced by this call; lastDstY/dstY are initialized above the
// visible portion — confirm.
2604 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2605 int srcSliceH, uint8_t* dst[], int dstStride[]){
2607 /* load a few things into local vars to make the code more readable? and faster */
2608 const int srcW= c->srcW;
2609 const int dstW= c->dstW;
2610 const int dstH= c->dstH;
2611 const int chrDstW= c->chrDstW;
2612 const int chrSrcW= c->chrSrcW;
2613 const int lumXInc= c->lumXInc;
2614 const int chrXInc= c->chrXInc;
2615 const int dstFormat= c->dstFormat;
2616 const int srcFormat= c->srcFormat;
2617 const int flags= c->flags;
2618 const int canMMX2BeUsed= c->canMMX2BeUsed;
2619 int16_t *vLumFilterPos= c->vLumFilterPos;
2620 int16_t *vChrFilterPos= c->vChrFilterPos;
2621 int16_t *hLumFilterPos= c->hLumFilterPos;
2622 int16_t *hChrFilterPos= c->hChrFilterPos;
2623 int16_t *vLumFilter= c->vLumFilter;
2624 int16_t *vChrFilter= c->vChrFilter;
2625 int16_t *hLumFilter= c->hLumFilter;
2626 int16_t *hChrFilter= c->hChrFilter;
2627 int32_t *lumMmxFilter= c->lumMmxFilter;
2628 int32_t *chrMmxFilter= c->chrMmxFilter;
2629 const int vLumFilterSize= c->vLumFilterSize;
2630 const int vChrFilterSize= c->vChrFilterSize;
2631 const int hLumFilterSize= c->hLumFilterSize;
2632 const int hChrFilterSize= c->hChrFilterSize;
2633 int16_t **lumPixBuf= c->lumPixBuf;
2634 int16_t **chrPixBuf= c->chrPixBuf;
2635 const int vLumBufSize= c->vLumBufSize;
2636 const int vChrBufSize= c->vChrBufSize;
2637 uint8_t *funnyYCode= c->funnyYCode;
2638 uint8_t *funnyUVCode= c->funnyUVCode;
2639 uint8_t *formatConvBuffer= c->formatConvBuffer;
2640 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
// -((-x)>>n) is ceiling division by 2^n, so a partially covered chroma
// line still counts as part of this slice.
2641 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2644 /* vars which will change and which we need to store back in the context */
2646 int lumBufIndex= c->lumBufIndex;
2647 int chrBufIndex= c->chrBufIndex;
2648 int lastInLumBuf= c->lastInLumBuf;
2649 int lastInChrBuf= c->lastInChrBuf;
// Packed input carries all data in plane 0: mirror its stride into the
// chroma slots (with vertical chroma dropping applied via vChrDrop).
2651 if(isPacked(c->srcFormat)){
2657 srcStride[2]= srcStride[0];
2659 srcStride[1]<<= c->vChrDrop;
2660 srcStride[2]<<= c->vChrDrop;
2662 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2663 // (int)dst[0], (int)dst[1], (int)dst[2]);
2665 #if 0 //self test FIXME move to a vfilter or something
2667 static volatile int i=0;
2669 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2670 selfTest(src, srcStride, c->srcW, c->srcH);
2675 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2676 //dstStride[0],dstStride[1],dstStride[2]);
2678 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2680 static int firstTime=1; //FIXME move this into the context perhaps
2681 if(flags & SWS_PRINT_INFO && firstTime)
2683 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2684 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2689 /* Note the user might start scaling the picture in the middle so this will not get executed
2690 this is not really intended but works currently, so ppl might do it */
// Main vertical loop: try to produce every remaining destination line.
2701 for(;dstY < dstH; dstY++){
2702 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2703 const int chrDstY= dstY>>c->chrDstVSubSample;
2704 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2705 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2707 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2708 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2709 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2710 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2712 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2713 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2714 //handle holes (FAST_BILINEAR & weird filters)
2715 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2716 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2717 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2718 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2719 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2721 // Do we have enough lines in this slice to output the dstY line
2722 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2724 //Do horizontal scaling
// Fill the luma ring buffer up to the last line this output row needs.
2725 while(lastInLumBuf < lastLumSrcY)
2727 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2729 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2730 ASSERT(lumBufIndex < 2*vLumBufSize)
2731 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2732 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2733 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2734 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2735 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2736 funnyYCode, c->srcFormat, formatConvBuffer,
2737 c->lumMmx2Filter, c->lumMmx2FilterPos);
// Same for the chroma ring buffer (skipped entirely for gray formats).
2740 while(lastInChrBuf < lastChrSrcY)
2742 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2743 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2745 ASSERT(chrBufIndex < 2*vChrBufSize)
2746 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2747 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2748 //FIXME replace parameters through context struct (some at least)
2750 if(!(isGray(srcFormat) || isGray(dstFormat)))
2751 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2752 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2753 funnyUVCode, c->srcFormat, formatConvBuffer,
2754 c->chrMmx2Filter, c->chrMmx2FilterPos);
2757 //wrap buf index around to stay inside the ring buffer
2758 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2759 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2761 else // not enough lines left in this slice -> load the rest in the buffer
2763 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2764 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2765 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2766 vChrBufSize, vLumBufSize);*/
2768 //Do horizontal scaling
// Buffer every remaining line of the slice so the next call can finish
// this output row; then break out without emitting anything.
2769 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2771 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2773 ASSERT(lumBufIndex < 2*vLumBufSize)
2774 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2775 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2776 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2777 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2778 funnyYCode, c->srcFormat, formatConvBuffer,
2779 c->lumMmx2Filter, c->lumMmx2FilterPos);
2782 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2784 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2785 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2787 ASSERT(chrBufIndex < 2*vChrBufSize)
2788 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2789 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2791 if(!(isGray(srcFormat) || isGray(dstFormat)))
2792 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2793 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2794 funnyUVCode, c->srcFormat, formatConvBuffer,
2795 c->chrMmx2Filter, c->chrMmx2FilterPos);
2798 //wrap buf index around to stay inside the ring buffer
2799 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2800 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2801 break; //we can't output a dstY line so let's try with the next slice
// Per-line dither patterns, alternating with line parity (the b5/g6/r5
// names suggest the 15/16-bit RGB output paths — TODO confirm).
2805 b5Dither= dither8[dstY&1];
2806 g6Dither= dither4[dstY&1];
2807 g5Dither= dither8[dstY&1];
2808 r5Dither= dither8[(dstY+1)&1];
// Pointers into the ring buffers at the first filter tap for this output
// line; the +v*BufSize offset keeps the index positive before wrapping.
2812 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2813 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
// Build the interleaved line-pointer/coefficient arrays consumed by the
// MMX vertical scalers: slot 0 holds the source-line pointer, slots 2 and
// 3 the 16-bit coefficient duplicated into both halves (*0x10001).
// NOTE(review): casting a pointer to int32_t truncates on 64-bit targets —
// verify this branch is only reached on 32-bit builds.
2816 for(i=0; i<vLumFilterSize; i++)
2818 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2819 lumMmxFilter[4*i+2]=
2820 lumMmxFilter[4*i+3]=
2821 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2823 for(i=0; i<vChrFilterSize; i++)
2825 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2826 chrMmxFilter[4*i+2]=
2827 chrMmxFilter[4*i+3]=
2828 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
// Vertical scaling + output, dispatched on the destination format.
2831 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2832 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2833 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2834 RENAME(yuv2nv12X)(c,
2835 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2836 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2837 dest, uDest, dstW, chrDstW, dstFormat);
2839 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2841 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2842 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2843 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2845 int16_t *lumBuf = lumPixBuf[0];
2846 int16_t *chrBuf= chrPixBuf[0];
2847 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2852 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2853 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2854 dest, uDest, vDest, dstW, chrDstW);
// Packed (RGB-like) output: fast paths for 1-tap and 2-tap filters,
// generic X-tap path otherwise.
2859 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2860 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2861 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2863 int chrAlpha= vChrFilter[2*dstY+1];
2864 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2865 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2867 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2869 int lumAlpha= vLumFilter[2*dstY+1];
2870 int chrAlpha= vChrFilter[2*dstY+1];
2871 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2872 dest, dstW, lumAlpha, chrAlpha, dstY);
2876 RENAME(yuv2packedX)(c,
2877 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2878 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2883 else // hmm looks like we can't use MMX here without overwriting this array's tail
2885 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2886 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2887 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2888 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2889 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2891 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2892 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2893 dest, uDest, dstW, chrDstW, dstFormat);
2895 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2897 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2898 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2900 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2901 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2902 dest, uDest, vDest, dstW, chrDstW);
2906 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2907 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2909 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2910 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
// Flush write-combining stores and clear MMX state before returning to
// FPU/C code.
2917 __asm __volatile(SFENCE:::"memory");
2918 __asm __volatile(EMMS:::"memory");
2920 /* store changed local vars back in the context */
2922 c->lumBufIndex= lumBufIndex;
2923 c->chrBufIndex= chrBufIndex;
2924 c->lastInLumBuf= lastInLumBuf;
2925 c->lastInChrBuf= lastInChrBuf;
2927 return dstY - lastDstY;