2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 * the C code (not assembly, mmx, ...) of this file can be used
21 * under the LGPL license too
/* CPU-specific instruction-string selection.  Each macro expands to the text
   of a real instruction on CPUs that support it, or to a harmless " # nop"
   assembler comment otherwise, so the same asm templates work everywhere.
   NOTE(review): the #ifdef/#else/#endif guards that normally separate these
   alternative definitions are not visible in this view — confirm they are
   intact in the full file. */
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif defined ( HAVE_MMX2 )
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
51 #define SFENCE "sfence"
53 #define SFENCE " # nop"
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
70 #include "swscale_altivec_template.c"
/* YSCALEYUV2YV12X(x, offset, dest, width)
   MMX vertical-scale loop body: walks the filter table at `offset` inside the
   SwsContext (%0).  For each filter tap it loads 8 source samples at byte
   offset `x`, multiplies by the 16-bit coefficient (pmulhw), and accumulates
   into mm3/mm4 (pre-seeded with the VROUNDER_OFFSET rounding constant).
   A NULL source pointer (test REG_S,REG_S) ends the tap chain; the result is
   shifted >>3, packed to unsigned bytes and streamed to dest with MOVNTQ.
   NOTE(review): the asm volatile(...) wrapper, the loop labels and the
   conditional branches consuming the test/cmp results are not visible in this
   view of the file — confirm they are present in the full source. */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
84 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
/* YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
   Higher-precision variant of YSCALEYUV2YV12X: processes two taps per
   iteration, interleaving samples from two source lines (punpck{l,h}wd) and
   using pmaddwd to keep 32-bit accumulators in mm4..mm7.  Accumulators are
   shifted >>16, packed back to words, rounded with VROUNDER_OFFSET, shifted
   >>3, packed to bytes and streamed to dest with MOVNTQ.
   NOTE(review): asm volatile wrapper / loop labels / branches are not visible
   in this view — confirm in the full source. */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
120 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\
121 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
122 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\
133 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $16, %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
/* YSCALEYUV2YV121
   Trivial 1-tap (unfiltered) vertical pass: loads 8 words per iteration,
   shifts >>7 to drop the fixed-point fraction, packs to bytes and streams to
   dest with MOVNTQ.  Index register counts up from %2 (a negative offset).
   NOTE(review): the operand/clobber list on the lines numbered 185-189 below
   does not match this macro's operands (%0/%1/%2) and appears to belong to a
   different asm statement whose body is not visible in this view — verify
   against the full file. */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
185 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
186 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
187 "r" (dest), "m" (dstW),
188 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
189 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* YSCALEYUV2PACKEDX
   Shared prologue for the packed-pixel output paths: runs the chroma filter
   chain (CHR_MMX_FILTER_OFFSET) accumulating U into mm3 and V into mm4
   (V samples live 4096 bytes past U in the chroma buffers), then the luma
   chain (LUM_MMX_FILTER_OFFSET) accumulating Y1 into mm1 and Y2 into mm7.
   Both chains terminate on a NULL source pointer (test REG_S,REG_S).
   YSCALEYUV2PACKEDX_END supplies the matching asm operand/clobber tail.
   NOTE(review): the asm( opener, loop labels and branches between these
   fragments are not visible in this view — confirm in the full source. */
191 #define YSCALEYUV2PACKEDX \
193 "xor %%"REG_a", %%"REG_a" \n\t"\
197 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
198 "mov (%%"REG_d"), %%"REG_S" \n\t"\
199 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
200 "movq %%mm3, %%mm4 \n\t"\
203 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
204 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
205 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
206 "add $16, %%"REG_d" \n\t"\
207 "mov (%%"REG_d"), %%"REG_S" \n\t"\
208 "pmulhw %%mm0, %%mm2 \n\t"\
209 "pmulhw %%mm0, %%mm5 \n\t"\
210 "paddw %%mm2, %%mm3 \n\t"\
211 "paddw %%mm5, %%mm4 \n\t"\
212 "test %%"REG_S", %%"REG_S" \n\t"\
215 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
218 "movq %%mm1, %%mm7 \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
223 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm1 \n\t"\
229 "paddw %%mm5, %%mm7 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
233 #define YSCALEYUV2PACKEDX_END\
234 :: "r" (&c->redDither), \
235 "m" (dummy), "m" (dummy), "m" (dummy),\
236 "r" (dest), "m" (dstW)\
237 : "%"REG_a, "%"REG_d, "%"REG_S\
/* YSCALEYUV2PACKEDX_ACCURATE
   High-precision version of YSCALEYUV2PACKEDX: two filter taps per
   iteration with pmaddwd into 32-bit accumulators.  The chroma result
   (U in mm4, V in mm6 after rounding) is spilled to U_TEMP/V_TEMP in the
   context because all eight MMX registers are needed for the luma chain;
   it is reloaded into mm3/mm4 at the end, with Y1 in mm1 and Y2 in mm7 —
   the register layout YSCALEYUV2RGBX expects.
   NOTE(review): asm opener / loop labels / branches are not visible in this
   view — confirm in the full source. */
240 #define YSCALEYUV2PACKEDX_ACCURATE \
242 "xor %%"REG_a", %%"REG_a" \n\t"\
246 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
247 "mov (%%"REG_d"), %%"REG_S" \n\t"\
248 "pxor %%mm4, %%mm4 \n\t"\
249 "pxor %%mm5, %%mm5 \n\t"\
250 "pxor %%mm6, %%mm6 \n\t"\
251 "pxor %%mm7, %%mm7 \n\t"\
254 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
255 "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
256 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
257 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
258 "movq %%mm0, %%mm3 \n\t"\
259 "punpcklwd %%mm1, %%mm0 \n\t"\
260 "punpckhwd %%mm1, %%mm3 \n\t"\
261 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
262 "pmaddwd %%mm1, %%mm0 \n\t"\
263 "pmaddwd %%mm1, %%mm3 \n\t"\
264 "paddd %%mm0, %%mm4 \n\t"\
265 "paddd %%mm3, %%mm5 \n\t"\
266 "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
267 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
268 "add $16, %%"REG_d" \n\t"\
269 "test %%"REG_S", %%"REG_S" \n\t"\
270 "movq %%mm2, %%mm0 \n\t"\
271 "punpcklwd %%mm3, %%mm2 \n\t"\
272 "punpckhwd %%mm3, %%mm0 \n\t"\
273 "pmaddwd %%mm1, %%mm2 \n\t"\
274 "pmaddwd %%mm1, %%mm0 \n\t"\
275 "paddd %%mm2, %%mm6 \n\t"\
276 "paddd %%mm0, %%mm7 \n\t"\
278 "psrad $16, %%mm4 \n\t"\
279 "psrad $16, %%mm5 \n\t"\
280 "psrad $16, %%mm6 \n\t"\
281 "psrad $16, %%mm7 \n\t"\
282 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
283 "packssdw %%mm5, %%mm4 \n\t"\
284 "packssdw %%mm7, %%mm6 \n\t"\
285 "paddw %%mm0, %%mm4 \n\t"\
286 "paddw %%mm0, %%mm6 \n\t"\
287 "movq %%mm4, "U_TEMP"(%0) \n\t"\
288 "movq %%mm6, "V_TEMP"(%0) \n\t"\
290 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
291 "mov (%%"REG_d"), %%"REG_S" \n\t"\
292 "pxor %%mm1, %%mm1 \n\t"\
293 "pxor %%mm5, %%mm5 \n\t"\
294 "pxor %%mm7, %%mm7 \n\t"\
295 "pxor %%mm6, %%mm6 \n\t"\
298 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
299 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
300 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
301 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
302 "movq %%mm0, %%mm3 \n\t"\
303 "punpcklwd %%mm4, %%mm0 \n\t"\
304 "punpckhwd %%mm4, %%mm3 \n\t"\
305 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
306 "pmaddwd %%mm4, %%mm0 \n\t"\
307 "pmaddwd %%mm4, %%mm3 \n\t"\
308 "paddd %%mm0, %%mm1 \n\t"\
309 "paddd %%mm3, %%mm5 \n\t"\
310 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
311 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
312 "add $16, %%"REG_d" \n\t"\
313 "test %%"REG_S", %%"REG_S" \n\t"\
314 "movq %%mm2, %%mm0 \n\t"\
315 "punpcklwd %%mm3, %%mm2 \n\t"\
316 "punpckhwd %%mm3, %%mm0 \n\t"\
317 "pmaddwd %%mm4, %%mm2 \n\t"\
318 "pmaddwd %%mm4, %%mm0 \n\t"\
319 "paddd %%mm2, %%mm7 \n\t"\
320 "paddd %%mm0, %%mm6 \n\t"\
322 "psrad $16, %%mm1 \n\t"\
323 "psrad $16, %%mm5 \n\t"\
324 "psrad $16, %%mm7 \n\t"\
325 "psrad $16, %%mm6 \n\t"\
326 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
327 "packssdw %%mm5, %%mm1 \n\t"\
328 "packssdw %%mm6, %%mm7 \n\t"\
329 "paddw %%mm0, %%mm1 \n\t"\
330 "paddw %%mm0, %%mm7 \n\t"\
331 "movq "U_TEMP"(%0), %%mm3 \n\t"\
332 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* YSCALEYUV2RGBX
   YUV->RGB conversion core.  Input register contract (set up by the
   YSCALEYUV2PACKEDX* prologues): mm1=Y1, mm7=Y2, mm3=U, mm4=V.
   Subtracts the U/V/Y offsets, multiplies by the per-context coefficients
   (UB/UG/VG/VR/Y at their context offsets), then interleaves the per-pixel
   B/G/R word results and packs them to bytes:
   output contract mm2=B, mm4=G, mm5=R (8 pixels), mm7=0 — the layout the
   WRITEBGR*/WRITEYUY2 store macros expect. */
334 #define YSCALEYUV2RGBX \
335 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
336 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
337 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
338 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
339 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
340 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
341 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
342 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
343 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
344 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
345 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
346 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
347 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
348 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
349 "paddw %%mm3, %%mm4 \n\t"\
350 "movq %%mm2, %%mm0 \n\t"\
351 "movq %%mm5, %%mm6 \n\t"\
352 "movq %%mm4, %%mm3 \n\t"\
353 "punpcklwd %%mm2, %%mm2 \n\t"\
354 "punpcklwd %%mm5, %%mm5 \n\t"\
355 "punpcklwd %%mm4, %%mm4 \n\t"\
356 "paddw %%mm1, %%mm2 \n\t"\
357 "paddw %%mm1, %%mm5 \n\t"\
358 "paddw %%mm1, %%mm4 \n\t"\
359 "punpckhwd %%mm0, %%mm0 \n\t"\
360 "punpckhwd %%mm6, %%mm6 \n\t"\
361 "punpckhwd %%mm3, %%mm3 \n\t"\
362 "paddw %%mm7, %%mm0 \n\t"\
363 "paddw %%mm7, %%mm6 \n\t"\
364 "paddw %%mm7, %%mm3 \n\t"\
365 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
366 "packuswb %%mm0, %%mm2 \n\t"\
367 "packuswb %%mm6, %%mm5 \n\t"\
368 "packuswb %%mm3, %%mm4 \n\t"\
369 "pxor %%mm7, %%mm7 \n\t"
/* FULL_YSCALEYUV2RGB
   Combined 2-line vertical interpolation + YUV->RGB for the "full chroma"
   path.  %6/%7 are yalpha1/uvalpha1 blend factors broadcast to all four
   words of mm6/mm5; it linearly blends buf0/buf1 (luma) and uvbuf0/uvbuf1
   (chroma, V at +4096), then applies the global MANGLE()d offset/coefficient
   constants (w80, w400, yCoeff, ubCoeff, ugCoeff, vrCoeff, vgCoeff) instead
   of the per-context ones.  Ends with B packed in mm3, R in mm0, G in mm1.
   NOTE(review): loop label lines between the fragments are not visible in
   this view — confirm in the full source. */
371 #define FULL_YSCALEYUV2RGB \
372 "pxor %%mm7, %%mm7 \n\t"\
373 "movd %6, %%mm6 \n\t" /*yalpha1*/\
374 "punpcklwd %%mm6, %%mm6 \n\t"\
375 "punpcklwd %%mm6, %%mm6 \n\t"\
376 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
377 "punpcklwd %%mm5, %%mm5 \n\t"\
378 "punpcklwd %%mm5, %%mm5 \n\t"\
379 "xor %%"REG_a", %%"REG_a" \n\t"\
382 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
383 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
384 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
385 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
386 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
387 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
388 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
389 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
390 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
392 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
393 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
394 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
395 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
396 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
397 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
398 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
399 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
402 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
403 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
404 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
405 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
406 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
407 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
408 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
411 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
412 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
413 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
414 "paddw %%mm1, %%mm3 \n\t" /* B*/\
415 "paddw %%mm1, %%mm0 \n\t" /* R*/\
416 "packuswb %%mm3, %%mm3 \n\t"\
418 "packuswb %%mm0, %%mm0 \n\t"\
419 "paddw %%mm4, %%mm2 \n\t"\
420 "paddw %%mm2, %%mm1 \n\t" /* G*/\
422 "packuswb %%mm1, %%mm1 \n\t"
/* REAL_YSCALEYUV2PACKED(index, c)
   2-line vertical blend producing raw (unconverted) Y/U/V for packed YUV
   output: pre-shifts the stored blend coefficients >>3, then per iteration
   blends uvbuf0/uvbuf1 into mm3 (U) / mm4 (V) and buf0/buf1 into mm1 (Y1) /
   mm7 (Y2).  (Stale ">>4" notes on the psraw $7 lines corrected to >>7.) */
425 #define REAL_YSCALEYUV2PACKED(index, c) \
426 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
427 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
428 "psraw $3, %%mm0 \n\t"\
429 "psraw $3, %%mm1 \n\t"\
430 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
431 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
432 "xor "#index", "#index" \n\t"\
435 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
436 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
437 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
439 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
440 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
441 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
442 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
443 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
444 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
445 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
446 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
447 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
448 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
449 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
450 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
451 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
452 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
456 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
457 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
458 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
461 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* REAL_YSCALEYUV2RGB(index, c)
   2-line blend + YUV->RGB: blends buf0/buf1 (luma) and uvbuf0/uvbuf1
   (chroma, V at +4096) using the coefficients stored at
   CHR/LUM_MMX_FILTER_OFFSET+8 in the context `c`, then converts exactly as
   YSCALEYUV2RGBX does.  Output contract: mm2=B, mm4=G, mm5=R, mm7=0. */
463 #define REAL_YSCALEYUV2RGB(index, c) \
464 "xor "#index", "#index" \n\t"\
467 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
468 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
469 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
470 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
471 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
472 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
473 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
474 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
475 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
476 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
477 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
478 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
479 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
480 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
481 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
482 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
483 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
484 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
485 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
486 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
487 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
488 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
489 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
490 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
491 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
492 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
493 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
494 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
495 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
496 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
497 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
498 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
499 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
500 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
501 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
502 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
503 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
504 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
505 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
506 "paddw %%mm3, %%mm4 \n\t"\
507 "movq %%mm2, %%mm0 \n\t"\
508 "movq %%mm5, %%mm6 \n\t"\
509 "movq %%mm4, %%mm3 \n\t"\
510 "punpcklwd %%mm2, %%mm2 \n\t"\
511 "punpcklwd %%mm5, %%mm5 \n\t"\
512 "punpcklwd %%mm4, %%mm4 \n\t"\
513 "paddw %%mm1, %%mm2 \n\t"\
514 "paddw %%mm1, %%mm5 \n\t"\
515 "paddw %%mm1, %%mm4 \n\t"\
516 "punpckhwd %%mm0, %%mm0 \n\t"\
517 "punpckhwd %%mm6, %%mm6 \n\t"\
518 "punpckhwd %%mm3, %%mm3 \n\t"\
519 "paddw %%mm7, %%mm0 \n\t"\
520 "paddw %%mm7, %%mm6 \n\t"\
521 "paddw %%mm7, %%mm3 \n\t"\
522 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
523 "packuswb %%mm0, %%mm2 \n\t"\
524 "packuswb %%mm6, %%mm5 \n\t"\
525 "packuswb %%mm3, %%mm4 \n\t"\
526 "pxor %%mm7, %%mm7 \n\t"
527 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/* REAL_YSCALEYUV2PACKED1(index, c)
   1-line (no vertical blend) variant for packed YUV output: reads only
   buf0/uvbuf0 and shifts everything >>7, leaving U in mm3, V in mm4,
   Y1 in mm1, Y2 in mm7. */
529 #define REAL_YSCALEYUV2PACKED1(index, c) \
530 "xor "#index", "#index" \n\t"\
533 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
534 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
535 "psraw $7, %%mm3 \n\t" \
536 "psraw $7, %%mm4 \n\t" \
537 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
538 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
539 "psraw $7, %%mm1 \n\t" \
540 "psraw $7, %%mm7 \n\t" \
542 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* REAL_YSCALEYUV2RGB1(index, c)
   1-line (no vertical blend) YUV->RGB: reads only buf0/uvbuf0, shifts >>4,
   then performs the same per-context offset/coefficient conversion as
   REAL_YSCALEYUV2RGB.  Output contract: mm2=B, mm4=G, mm5=R, mm7=0. */
544 #define REAL_YSCALEYUV2RGB1(index, c) \
545 "xor "#index", "#index" \n\t"\
548 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
549 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
550 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
551 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
552 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
553 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
554 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
555 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
556 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
557 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
558 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
559 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
560 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
561 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
562 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
563 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
564 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
565 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
566 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
567 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
568 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
569 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
570 "paddw %%mm3, %%mm4 \n\t"\
571 "movq %%mm2, %%mm0 \n\t"\
572 "movq %%mm5, %%mm6 \n\t"\
573 "movq %%mm4, %%mm3 \n\t"\
574 "punpcklwd %%mm2, %%mm2 \n\t"\
575 "punpcklwd %%mm5, %%mm5 \n\t"\
576 "punpcklwd %%mm4, %%mm4 \n\t"\
577 "paddw %%mm1, %%mm2 \n\t"\
578 "paddw %%mm1, %%mm5 \n\t"\
579 "paddw %%mm1, %%mm4 \n\t"\
580 "punpckhwd %%mm0, %%mm0 \n\t"\
581 "punpckhwd %%mm6, %%mm6 \n\t"\
582 "punpckhwd %%mm3, %%mm3 \n\t"\
583 "paddw %%mm7, %%mm0 \n\t"\
584 "paddw %%mm7, %%mm6 \n\t"\
585 "paddw %%mm7, %%mm3 \n\t"\
586 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
587 "packuswb %%mm0, %%mm2 \n\t"\
588 "packuswb %%mm6, %%mm5 \n\t"\
589 "packuswb %%mm3, %%mm4 \n\t"\
590 "pxor %%mm7, %%mm7 \n\t"
591 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* REAL_YSCALEYUV2PACKED1b(index, c)
   1-line luma with averaged chroma (uvbuf0+uvbuf1 then >>8) for packed YUV
   output.  Leaves U in mm3, V in mm4, Y1 in mm1, Y2 in mm7. */
593 #define REAL_YSCALEYUV2PACKED1b(index, c) \
594 "xor "#index", "#index" \n\t"\
597 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
598 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
599 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
600 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
601 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
602 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
603 "psrlw $8, %%mm3 \n\t" \
604 "psrlw $8, %%mm4 \n\t" \
605 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
606 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
607 "psraw $7, %%mm1 \n\t" \
608 "psraw $7, %%mm7 \n\t"
609 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
611 // do vertical chrominance interpolation
/* REAL_YSCALEYUV2RGB1b(index, c)
   1-line luma + averaged-chroma YUV->RGB: sums uvbuf0+uvbuf1 and shifts
   >>5 (average plus the >>4 scaling; carries an acknowledged overflow
   FIXME), then converts with the per-context constants exactly as
   REAL_YSCALEYUV2RGB1.  Output contract: mm2=B, mm4=G, mm5=R, mm7=0. */
612 #define REAL_YSCALEYUV2RGB1b(index, c) \
613 "xor "#index", "#index" \n\t"\
616 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
617 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
618 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
619 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
620 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
621 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
622 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
623 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
624 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
625 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
626 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
627 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
628 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
629 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
630 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
631 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
632 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
633 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
634 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
635 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
636 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
637 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
638 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
639 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
640 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
641 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
642 "paddw %%mm3, %%mm4 \n\t"\
643 "movq %%mm2, %%mm0 \n\t"\
644 "movq %%mm5, %%mm6 \n\t"\
645 "movq %%mm4, %%mm3 \n\t"\
646 "punpcklwd %%mm2, %%mm2 \n\t"\
647 "punpcklwd %%mm5, %%mm5 \n\t"\
648 "punpcklwd %%mm4, %%mm4 \n\t"\
649 "paddw %%mm1, %%mm2 \n\t"\
650 "paddw %%mm1, %%mm5 \n\t"\
651 "paddw %%mm1, %%mm4 \n\t"\
652 "punpckhwd %%mm0, %%mm0 \n\t"\
653 "punpckhwd %%mm6, %%mm6 \n\t"\
654 "punpckhwd %%mm3, %%mm3 \n\t"\
655 "paddw %%mm7, %%mm0 \n\t"\
656 "paddw %%mm7, %%mm6 \n\t"\
657 "paddw %%mm7, %%mm3 \n\t"\
658 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
659 "packuswb %%mm0, %%mm2 \n\t"\
660 "packuswb %%mm6, %%mm5 \n\t"\
661 "packuswb %%mm3, %%mm4 \n\t"\
662 "pxor %%mm7, %%mm7 \n\t"
663 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* REAL_WRITEBGR32(dst, dstw, index)
   Store stage: takes the packed mm2=B, mm4=G, mm5=R, mm7=0 register layout,
   interleaves into four 0RGB dwords per register pair, and streams 8 pixels
   (32 bytes) to dst with MOVNTQ; then advances the index and compares it
   against dstw for the (not-visible-here) loop branch. */
665 #define REAL_WRITEBGR32(dst, dstw, index) \
666 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
667 "movq %%mm2, %%mm1 \n\t" /* B */\
668 "movq %%mm5, %%mm6 \n\t" /* R */\
669 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
670 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
671 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
672 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
673 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
674 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
675 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
676 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
677 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
678 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
680 MOVNTQ(%%mm0, (dst, index, 4))\
681 MOVNTQ(%%mm2, 8(dst, index, 4))\
682 MOVNTQ(%%mm1, 16(dst, index, 4))\
683 MOVNTQ(%%mm3, 24(dst, index, 4))\
685 "add $8, "#index" \n\t"\
686 "cmp "#dstw", "#index" \n\t"\
688 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/* REAL_WRITEBGR16(dst, dstw, index)
   Store stage for RGB565: masks B/R to 5 bits (bF8) and G to 6 bits (bFC),
   shifts and ORs the fields into 16-bit pixels, and streams 8 pixels
   (16 bytes) to dst with MOVNTQ. */
690 #define REAL_WRITEBGR16(dst, dstw, index) \
691 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
692 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
693 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
694 "psrlq $3, %%mm2 \n\t"\
696 "movq %%mm2, %%mm1 \n\t"\
697 "movq %%mm4, %%mm3 \n\t"\
699 "punpcklbw %%mm7, %%mm3 \n\t"\
700 "punpcklbw %%mm5, %%mm2 \n\t"\
701 "punpckhbw %%mm7, %%mm4 \n\t"\
702 "punpckhbw %%mm5, %%mm1 \n\t"\
704 "psllq $3, %%mm3 \n\t"\
705 "psllq $3, %%mm4 \n\t"\
707 "por %%mm3, %%mm2 \n\t"\
708 "por %%mm4, %%mm1 \n\t"\
710 MOVNTQ(%%mm2, (dst, index, 2))\
711 MOVNTQ(%%mm1, 8(dst, index, 2))\
713 "add $8, "#index" \n\t"\
714 "cmp "#dstw", "#index" \n\t"\
716 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
/* REAL_WRITEBGR15(dst, dstw, index)
   Store stage for RGB555: masks all three channels to 5 bits (bF8),
   shifts/ORs into 15-bit pixels, and streams 8 pixels (16 bytes) to dst.
   Differs from WRITEBGR16 only in the G mask and the shift amounts. */
718 #define REAL_WRITEBGR15(dst, dstw, index) \
719 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
720 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
721 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
722 "psrlq $3, %%mm2 \n\t"\
723 "psrlq $1, %%mm5 \n\t"\
725 "movq %%mm2, %%mm1 \n\t"\
726 "movq %%mm4, %%mm3 \n\t"\
728 "punpcklbw %%mm7, %%mm3 \n\t"\
729 "punpcklbw %%mm5, %%mm2 \n\t"\
730 "punpckhbw %%mm7, %%mm4 \n\t"\
731 "punpckhbw %%mm5, %%mm1 \n\t"\
733 "psllq $2, %%mm3 \n\t"\
734 "psllq $2, %%mm4 \n\t"\
736 "por %%mm3, %%mm2 \n\t"\
737 "por %%mm4, %%mm1 \n\t"\
739 MOVNTQ(%%mm2, (dst, index, 2))\
740 MOVNTQ(%%mm1, 8(dst, index, 2))\
742 "add $8, "#index" \n\t"\
743 "cmp "#dstw", "#index" \n\t"\
745 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
/* WRITEBGR24OLD(dst, dstw, index)
   Legacy 24bpp store: expands to 0RGB dwords, then squeezes out the padding
   byte with shift/mask/or sequences (bm* mask constants) to emit three
   packed-RGB quadwords (24 bytes = 8 pixels).  Kept for reference; the
   WRITEBGR24MMX/MMX2 variants below supersede it. */
747 #define WRITEBGR24OLD(dst, dstw, index) \
748 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
749 "movq %%mm2, %%mm1 \n\t" /* B */\
750 "movq %%mm5, %%mm6 \n\t" /* R */\
751 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
752 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
753 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
754 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
755 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
756 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
757 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
758 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
759 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
760 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
762 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
763 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
764 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
765 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
766 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
767 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
768 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
769 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
771 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
772 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
773 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
774 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
775 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
776 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
777 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
778 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
779 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
780 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
781 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
782 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
783 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
785 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
786 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
787 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
788 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
789 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
790 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
791 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
792 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
794 MOVNTQ(%%mm0, (dst))\
795 MOVNTQ(%%mm2, 8(dst))\
796 MOVNTQ(%%mm3, 16(dst))\
797 "add $24, "#dst" \n\t"\
799 "add $8, "#index" \n\t"\
800 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24MMX(dst, dstw, index)
   Plain-MMX 24bpp store: builds 0RGBRGB0 quadwords via shift+punpckhdq,
   then splices neighbouring pixels across quadword boundaries with
   psllq/psrlq/por so the three MOVNTQ stores emit 24 tightly packed
   bytes (8 pixels).  dst is advanced by 24 each iteration. */
803 #define WRITEBGR24MMX(dst, dstw, index) \
804 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
805 "movq %%mm2, %%mm1 \n\t" /* B */\
806 "movq %%mm5, %%mm6 \n\t" /* R */\
807 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
808 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
809 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
810 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
811 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
812 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
813 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
814 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
815 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
816 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
818 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
819 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
820 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
821 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
823 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
824 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
825 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
826 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
828 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
829 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
830 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
831 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
833 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
834 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
835 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
836 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
837 MOVNTQ(%%mm0, (dst))\
839 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
840 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
841 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
842 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
843 MOVNTQ(%%mm6, 8(dst))\
845 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
846 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
847 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
848 MOVNTQ(%%mm5, 16(dst))\
850 "add $24, "#dst" \n\t"\
852 "add $8, "#index" \n\t"\
853 "cmp "#dstw", "#index" \n\t"\
/*
 * Write 8 pixels of 24-bit BGR using MMX2.
 * Input: %%mm2 = B0..B7, %%mm4 = G0..G7, %%mm5 = R0..R7 (packed bytes).
 * pshufw replicates/reorders component bytes, the M24A/M24B/M24C masks keep
 * the bytes belonging to each output quadword, and three non-temporal stores
 * emit the 24 output bytes.  dst advances by 24 bytes and the pixel index by
 * 8 per iteration (the loop-closing branch follows the cmp at the end).
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(M24A)", %%mm0 \n\t"\
    "movq "MANGLE(M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
    "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
    "add $24, "#dst" \n\t"\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
/* MMX2 build: use the pshufw-based 24-bit BGR writer. */
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
/* Plain-MMX build: use the shift/merge-based 24-bit BGR writer. */
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/*
 * Pack the 16-bit luma/chroma registers to bytes and interleave them into
 * YUY2 order, storing 16 bytes (8 pixels x 2 bytes) per iteration with two
 * non-temporal quadword writes.  dst is addressed as dst + 2*index and the
 * pixel index advances by 8 (the loop-closing branch follows the cmp).
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
/* Indirection so the macro arguments are fully expanded before REAL_WRITEYUY2 stringizes/pastes them. */
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/*
 * Vertically filter lumFilterSize luma source rows and chrFilterSize chroma
 * source rows into one planar YV12 output row (dest / uDest / vDest).
 * The MMX path uses the YSCALEYUV2YV12X macros, switching to the
 * higher-precision *_ACCURATE variants when SWS_ACCURATE_RND is set;
 * non-MMX builds fall back to the AltiVec or plain C implementation.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
    if(c->flags & SWS_ACCURATE_RND){
        YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW) /* U plane */
        YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW) /* V plane */
        YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW) /* Y plane */
        YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW) /* U plane */
        YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW) /* V plane */
        YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW) /* Y plane */
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
/*
 * Vertical filtering for NV12/NV21 output (dstFormat selects the chroma
 * byte order); thin wrapper around the C implementation yuv2nv12XinC.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
/*
 * Unscaled vertical pass: converts a single 16-bit intermediate line
 * (7 fractional bits, hence the >>7) straight to the 8-bit Y/U/V planes.
 * V samples are stored 2048 int16_t entries after U inside chrSrc (see the
 * chrSrc + 2048 operand and chrSrc[i + 2048] below); the scalar fallback
 * clips chroma into [0,255].
 */
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
    :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
    :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
    :: "r" (lumSrc + dstW), "r" (dest + dstW),
    for(i=0; i<dstW; i++)
        int val= lumSrc[i]>>7;
    for(i=0; i<chrDstW; i++)
        int v=chrSrc[i + 2048]>>7;
        else if (u>255) u=255;
        else if (v>255) v=255;
1034 * vertical scale YV12 to RGB
1036 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1037 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1038 uint8_t *dest, long dstW, long dstY)
1042 if(c->flags & SWS_ACCURATE_RND){
1043 switch(c->dstFormat){
1045 YSCALEYUV2PACKEDX_ACCURATE
1047 WRITEBGR32(%4, %5, %%REGa)
1049 YSCALEYUV2PACKEDX_END
1052 YSCALEYUV2PACKEDX_ACCURATE
1054 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1055 "add %4, %%"REG_c" \n\t"
1056 WRITEBGR24(%%REGc, %5, %%REGa)
1059 :: "r" (&c->redDither),
1060 "m" (dummy), "m" (dummy), "m" (dummy),
1061 "r" (dest), "m" (dstW)
1062 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1065 case PIX_FMT_BGR555:
1066 YSCALEYUV2PACKEDX_ACCURATE
1068 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1070 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1071 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1072 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1075 WRITEBGR15(%4, %5, %%REGa)
1076 YSCALEYUV2PACKEDX_END
1078 case PIX_FMT_BGR565:
1079 YSCALEYUV2PACKEDX_ACCURATE
1081 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1083 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1084 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1085 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1088 WRITEBGR16(%4, %5, %%REGa)
1089 YSCALEYUV2PACKEDX_END
1091 case PIX_FMT_YUYV422:
1092 YSCALEYUV2PACKEDX_ACCURATE
1093 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1095 "psraw $3, %%mm3 \n\t"
1096 "psraw $3, %%mm4 \n\t"
1097 "psraw $3, %%mm1 \n\t"
1098 "psraw $3, %%mm7 \n\t"
1099 WRITEYUY2(%4, %5, %%REGa)
1100 YSCALEYUV2PACKEDX_END
1104 switch(c->dstFormat)
1109 WRITEBGR32(%4, %5, %%REGa)
1110 YSCALEYUV2PACKEDX_END
1115 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1116 "add %4, %%"REG_c" \n\t"
1117 WRITEBGR24(%%REGc, %5, %%REGa)
1119 :: "r" (&c->redDither),
1120 "m" (dummy), "m" (dummy), "m" (dummy),
1121 "r" (dest), "m" (dstW)
1122 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1125 case PIX_FMT_BGR555:
1128 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1130 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1131 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1132 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1135 WRITEBGR15(%4, %5, %%REGa)
1136 YSCALEYUV2PACKEDX_END
1138 case PIX_FMT_BGR565:
1141 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1143 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1144 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1145 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1148 WRITEBGR16(%4, %5, %%REGa)
1149 YSCALEYUV2PACKEDX_END
1151 case PIX_FMT_YUYV422:
1153 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1155 "psraw $3, %%mm3 \n\t"
1156 "psraw $3, %%mm4 \n\t"
1157 "psraw $3, %%mm1 \n\t"
1158 "psraw $3, %%mm7 \n\t"
1159 WRITEYUY2(%4, %5, %%REGa)
1160 YSCALEYUV2PACKEDX_END
1166 /* The following list of supported dstFormat values should
1167 match what's found in the body of altivec_yuv2packedX() */
1168 if(c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1169 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1170 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1171 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1172 chrFilter, chrSrc, chrFilterSize,
1176 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1177 chrFilter, chrSrc, chrFilterSize,
1182 * vertical bilinear scale YV12 to RGB
1184 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1185 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1187 int yalpha1=yalpha^4095;
1188 int uvalpha1=uvalpha^4095;
1192 if(flags&SWS_FULL_CHR_H_INT)
1202 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1203 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1205 "movq %%mm3, %%mm1 \n\t"
1206 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1207 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1209 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1210 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1212 "add $4, %%"REG_a" \n\t"
1213 "cmp %5, %%"REG_a" \n\t"
1217 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1218 "m" (yalpha1), "m" (uvalpha1)
1228 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1229 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1231 "movq %%mm3, %%mm1 \n\t"
1232 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1233 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1235 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1236 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1237 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1238 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1239 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1240 "movq %%mm1, %%mm2 \n\t"
1241 "psllq $48, %%mm1 \n\t" // 000000BG
1242 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1244 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1245 "psrld $16, %%mm2 \n\t" // R000R000
1246 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1247 "por %%mm2, %%mm1 \n\t" // RBGRR000
1249 "mov %4, %%"REG_b" \n\t"
1250 "add %%"REG_a", %%"REG_b" \n\t"
1254 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1255 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1257 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1258 "psrlq $32, %%mm3 \n\t"
1259 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1260 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1262 "add $4, %%"REG_a" \n\t"
1263 "cmp %5, %%"REG_a" \n\t"
1266 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1267 "m" (yalpha1), "m" (uvalpha1)
1268 : "%"REG_a, "%"REG_b
1271 case PIX_FMT_BGR555:
1276 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1277 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1278 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1280 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1281 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1282 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1284 "psrlw $3, %%mm3 \n\t"
1285 "psllw $2, %%mm1 \n\t"
1286 "psllw $7, %%mm0 \n\t"
1287 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1288 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1290 "por %%mm3, %%mm1 \n\t"
1291 "por %%mm1, %%mm0 \n\t"
1293 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1295 "add $4, %%"REG_a" \n\t"
1296 "cmp %5, %%"REG_a" \n\t"
1299 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1300 "m" (yalpha1), "m" (uvalpha1)
1304 case PIX_FMT_BGR565:
1309 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1310 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1311 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1313 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1314 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1315 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1317 "psrlw $3, %%mm3 \n\t"
1318 "psllw $3, %%mm1 \n\t"
1319 "psllw $8, %%mm0 \n\t"
1320 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1321 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1323 "por %%mm3, %%mm1 \n\t"
1324 "por %%mm1, %%mm0 \n\t"
1326 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1328 "add $4, %%"REG_a" \n\t"
1329 "cmp %5, %%"REG_a" \n\t"
1332 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1333 "m" (yalpha1), "m" (uvalpha1)
1342 if(dstFormat==PIX_FMT_RGB32)
1345 #ifdef WORDS_BIGENDIAN
1348 for(i=0;i<dstW;i++){
1349 // vertical linear interpolation && yuv2rgb in a single step:
1350 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1351 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1352 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1353 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1354 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1355 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1359 else if(dstFormat==PIX_FMT_BGR24)
1362 for(i=0;i<dstW;i++){
1363 // vertical linear interpolation && yuv2rgb in a single step:
1364 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1365 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1366 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1367 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1368 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1369 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1373 else if(dstFormat==PIX_FMT_BGR565)
1376 for(i=0;i<dstW;i++){
1377 // vertical linear interpolation && yuv2rgb in a single step:
1378 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1379 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1380 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1382 ((uint16_t*)dest)[i] =
1383 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1384 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1385 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1388 else if(dstFormat==PIX_FMT_BGR555)
1391 for(i=0;i<dstW;i++){
1392 // vertical linear interpolation && yuv2rgb in a single step:
1393 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1394 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1395 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1397 ((uint16_t*)dest)[i] =
1398 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1399 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1400 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1408 switch(c->dstFormat)
1410 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1413 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1414 "mov %4, %%"REG_b" \n\t"
1415 "push %%"REG_BP" \n\t"
1416 YSCALEYUV2RGB(%%REGBP, %5)
1417 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1418 "pop %%"REG_BP" \n\t"
1419 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1421 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1427 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1428 "mov %4, %%"REG_b" \n\t"
1429 "push %%"REG_BP" \n\t"
1430 YSCALEYUV2RGB(%%REGBP, %5)
1431 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1432 "pop %%"REG_BP" \n\t"
1433 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1434 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1438 case PIX_FMT_BGR555:
1440 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1441 "mov %4, %%"REG_b" \n\t"
1442 "push %%"REG_BP" \n\t"
1443 YSCALEYUV2RGB(%%REGBP, %5)
1444 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1446 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1447 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1448 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1451 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1452 "pop %%"REG_BP" \n\t"
1453 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1455 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1459 case PIX_FMT_BGR565:
1461 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1462 "mov %4, %%"REG_b" \n\t"
1463 "push %%"REG_BP" \n\t"
1464 YSCALEYUV2RGB(%%REGBP, %5)
1465 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1467 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1468 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1469 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1472 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1473 "pop %%"REG_BP" \n\t"
1474 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1475 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1479 case PIX_FMT_YUYV422:
1481 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1482 "mov %4, %%"REG_b" \n\t"
1483 "push %%"REG_BP" \n\t"
1484 YSCALEYUV2PACKED(%%REGBP, %5)
1485 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1486 "pop %%"REG_BP" \n\t"
1487 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1488 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1495 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1499 * YV12 to RGB without scaling or interpolating
1501 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1502 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1504 const int yalpha1=0;
1507 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1508 const int yalpha= 4096; //FIXME ...
1510 if(flags&SWS_FULL_CHR_H_INT)
1512 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1517 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1523 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1524 "mov %4, %%"REG_b" \n\t"
1525 "push %%"REG_BP" \n\t"
1526 YSCALEYUV2RGB1(%%REGBP, %5)
1527 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1528 "pop %%"REG_BP" \n\t"
1529 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1531 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1537 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1538 "mov %4, %%"REG_b" \n\t"
1539 "push %%"REG_BP" \n\t"
1540 YSCALEYUV2RGB1(%%REGBP, %5)
1541 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1542 "pop %%"REG_BP" \n\t"
1543 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1545 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1549 case PIX_FMT_BGR555:
1551 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1552 "mov %4, %%"REG_b" \n\t"
1553 "push %%"REG_BP" \n\t"
1554 YSCALEYUV2RGB1(%%REGBP, %5)
1555 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1557 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1558 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1559 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1561 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1562 "pop %%"REG_BP" \n\t"
1563 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1565 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1569 case PIX_FMT_BGR565:
1571 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1572 "mov %4, %%"REG_b" \n\t"
1573 "push %%"REG_BP" \n\t"
1574 YSCALEYUV2RGB1(%%REGBP, %5)
1575 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1577 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1578 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1579 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1582 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1583 "pop %%"REG_BP" \n\t"
1584 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1586 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1590 case PIX_FMT_YUYV422:
1592 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1593 "mov %4, %%"REG_b" \n\t"
1594 "push %%"REG_BP" \n\t"
1595 YSCALEYUV2PACKED1(%%REGBP, %5)
1596 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1597 "pop %%"REG_BP" \n\t"
1598 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1600 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1612 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1613 "mov %4, %%"REG_b" \n\t"
1614 "push %%"REG_BP" \n\t"
1615 YSCALEYUV2RGB1b(%%REGBP, %5)
1616 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1617 "pop %%"REG_BP" \n\t"
1618 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1620 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1626 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1627 "mov %4, %%"REG_b" \n\t"
1628 "push %%"REG_BP" \n\t"
1629 YSCALEYUV2RGB1b(%%REGBP, %5)
1630 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1631 "pop %%"REG_BP" \n\t"
1632 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1634 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1638 case PIX_FMT_BGR555:
1640 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1641 "mov %4, %%"REG_b" \n\t"
1642 "push %%"REG_BP" \n\t"
1643 YSCALEYUV2RGB1b(%%REGBP, %5)
1644 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1646 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1647 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1648 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1650 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1651 "pop %%"REG_BP" \n\t"
1652 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1654 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1658 case PIX_FMT_BGR565:
1660 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1661 "mov %4, %%"REG_b" \n\t"
1662 "push %%"REG_BP" \n\t"
1663 YSCALEYUV2RGB1b(%%REGBP, %5)
1664 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1666 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1667 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1668 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1671 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1672 "pop %%"REG_BP" \n\t"
1673 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1675 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1679 case PIX_FMT_YUYV422:
1681 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1682 "mov %4, %%"REG_b" \n\t"
1683 "push %%"REG_BP" \n\t"
1684 YSCALEYUV2PACKED1b(%%REGBP, %5)
1685 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1686 "pop %%"REG_BP" \n\t"
1687 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1689 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1696 if( uvalpha < 2048 )
1698 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1700 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
//FIXME the yuy2* readers can read up to 7 samples too many
/*
 * Extract the luma (even) bytes of a packed YUY2 line into dst.
 * MMX path: bm01010101 keeps the low byte of each 16-bit sample pair,
 * packuswb compacts 16 input bytes to 8 luma bytes per iteration; the loop
 * runs a negative index (REG_a starts at -width) up to zero against the
 * pre-biased src/dst pointers.  A scalar loop serves non-MMX builds.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
    "movq "MANGLE(bm01010101)", %%mm2\n\t"
    "mov %0, %%"REG_a" \n\t"
    "movq (%1, %%"REG_a",2), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
    "pand %%mm2, %%mm0 \n\t"
    "pand %%mm2, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, (%2, %%"REG_a") \n\t"
    "add $8, %%"REG_a" \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    for(i=0; i<width; i++)
/*
 * Deinterleave the chroma of a YUY2 line: U (byte 1 of every 4-byte group)
 * goes to dstU, V (byte 3) to dstV, 4 output samples per MMX iteration.
 * Both source pointers must reference the same line (asserted at the end);
 * the loop uses the same negative-index scheme as yuy2ToY.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
    "movq "MANGLE(bm01010101)", %%mm4\n\t"
    "mov %0, %%"REG_a" \n\t"
    "movq (%1, %%"REG_a",4), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "psrlw $8, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "pand %%mm4, %%mm1 \n\t"
    "packuswb %%mm0, %%mm0 \n\t"
    "packuswb %%mm1, %%mm1 \n\t"
    "movd %%mm0, (%3, %%"REG_a") \n\t"
    "movd %%mm1, (%2, %%"REG_a") \n\t"
    "add $4, %%"REG_a" \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    for(i=0; i<width; i++)
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    assert(src1 == src2);
//this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
/*
 * Extract the luma (odd) bytes of a packed UYVY line into dst: psrlw $8
 * keeps the high byte of each 16-bit sample pair, packuswb compacts 16
 * input bytes to 8 luma bytes per iteration; negative-index loop as in
 * yuy2ToY, with a scalar fallback for non-MMX builds.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
    "mov %0, %%"REG_a" \n\t"
    "movq (%1, %%"REG_a",2), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "psrlw $8, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, (%2, %%"REG_a") \n\t"
    "add $8, %%"REG_a" \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    for(i=0; i<width; i++)
/*
 * Deinterleave the chroma of a UYVY line: U (byte 0 of every 4-byte group)
 * goes to dstU, V (byte 2) to dstV, 4 output samples per MMX iteration.
 * Both source pointers must reference the same line (asserted at the end).
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
    "movq "MANGLE(bm01010101)", %%mm4\n\t"
    "mov %0, %%"REG_a" \n\t"
    "movq (%1, %%"REG_a",4), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
    "pand %%mm4, %%mm0 \n\t"
    "pand %%mm4, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "pand %%mm4, %%mm1 \n\t"
    "packuswb %%mm0, %%mm0 \n\t"
    "packuswb %%mm1, %%mm1 \n\t"
    "movd %%mm0, (%3, %%"REG_a") \n\t"
    "movd %%mm1, (%2, %%"REG_a") \n\t"
    "add $4, %%"REG_a" \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    for(i=0; i<width; i++)
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    assert(src1 == src2);
/*
 * Scalar BGR32 -> luma: unpack B/G/R from each 32-bit pixel (B in the low
 * byte) and apply the RY/GY/BY weights.  The (33<<(RGB2YUV_SHIFT-1)) term
 * folds the +16 black-level offset (16<<SHIFT) and the rounding bias
 * (1<<(SHIFT-1)) into one constant: 33 = 2*16 + 1.
 */
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
    for(i=0; i<width; i++)
        int b= ((uint32_t*)src)[i]&0xFF;
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
        int r= (((uint32_t*)src)[i]>>16)&0xFF;
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * Scalar BGR32 -> chroma, horizontally averaging each pair of pixels:
 * l sums the B and R byte fields of both pixels in parallel, h sums the G
 * field; b is taken from the low (summed, up to 9-bit) field of l.  The
 * >>(RGB2YUV_SHIFT+1) halves the two-pixel sum and +128 re-centers the
 * signed chroma.  src1 must equal src2 (asserted).
 */
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
    assert(src1 == src2);
    for(i=0; i<width; i++)
        const int a= ((uint32_t*)src1)[2*i+0];
        const int e= ((uint32_t*)src1)[2*i+1];
        const int l= (a&0xFF00FF) + (e&0xFF00FF);
        const int h= (a&0x00FF00) + (e&0x00FF00);
        const int b= l&0x3FF;
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1858 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1862 "mov %2, %%"REG_a" \n\t"
1863 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1864 "movq "MANGLE(w1111)", %%mm5 \n\t"
1865 "pxor %%mm7, %%mm7 \n\t"
1866 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
1869 PREFETCH" 64(%0, %%"REG_d") \n\t"
1870 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1871 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1872 "punpcklbw %%mm7, %%mm0 \n\t"
1873 "punpcklbw %%mm7, %%mm1 \n\t"
1874 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1875 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1876 "punpcklbw %%mm7, %%mm2 \n\t"
1877 "punpcklbw %%mm7, %%mm3 \n\t"
1878 "pmaddwd %%mm6, %%mm0 \n\t"
1879 "pmaddwd %%mm6, %%mm1 \n\t"
1880 "pmaddwd %%mm6, %%mm2 \n\t"
1881 "pmaddwd %%mm6, %%mm3 \n\t"
1882 #ifndef FAST_BGR2YV12
1883 "psrad $8, %%mm0 \n\t"
1884 "psrad $8, %%mm1 \n\t"
1885 "psrad $8, %%mm2 \n\t"
1886 "psrad $8, %%mm3 \n\t"
1888 "packssdw %%mm1, %%mm0 \n\t"
1889 "packssdw %%mm3, %%mm2 \n\t"
1890 "pmaddwd %%mm5, %%mm0 \n\t"
1891 "pmaddwd %%mm5, %%mm2 \n\t"
1892 "packssdw %%mm2, %%mm0 \n\t"
1893 "psraw $7, %%mm0 \n\t"
1895 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1896 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1897 "punpcklbw %%mm7, %%mm4 \n\t"
1898 "punpcklbw %%mm7, %%mm1 \n\t"
1899 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1900 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1901 "punpcklbw %%mm7, %%mm2 \n\t"
1902 "punpcklbw %%mm7, %%mm3 \n\t"
1903 "pmaddwd %%mm6, %%mm4 \n\t"
1904 "pmaddwd %%mm6, %%mm1 \n\t"
1905 "pmaddwd %%mm6, %%mm2 \n\t"
1906 "pmaddwd %%mm6, %%mm3 \n\t"
1907 #ifndef FAST_BGR2YV12
1908 "psrad $8, %%mm4 \n\t"
1909 "psrad $8, %%mm1 \n\t"
1910 "psrad $8, %%mm2 \n\t"
1911 "psrad $8, %%mm3 \n\t"
1913 "packssdw %%mm1, %%mm4 \n\t"
1914 "packssdw %%mm3, %%mm2 \n\t"
1915 "pmaddwd %%mm5, %%mm4 \n\t"
1916 "pmaddwd %%mm5, %%mm2 \n\t"
1917 "add $24, %%"REG_d" \n\t"
1918 "packssdw %%mm2, %%mm4 \n\t"
1919 "psraw $7, %%mm4 \n\t"
1921 "packuswb %%mm4, %%mm0 \n\t"
1922 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1924 "movq %%mm0, (%1, %%"REG_a") \n\t"
1925 "add $8, %%"REG_a" \n\t"
1927 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1928 : "%"REG_a, "%"REG_d
1932 for(i=0; i<width; i++)
1938 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1943 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1947 "mov %3, %%"REG_a" \n\t"
1948 "movq "MANGLE(w1111)", %%mm5 \n\t"
1949 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1950 "pxor %%mm7, %%mm7 \n\t"
1951 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1952 "add %%"REG_d", %%"REG_d" \n\t"
1955 PREFETCH" 64(%0, %%"REG_d") \n\t"
1956 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1957 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1958 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1959 "movq %%mm0, %%mm1 \n\t"
1960 "movq %%mm2, %%mm3 \n\t"
1961 "psrlq $24, %%mm0 \n\t"
1962 "psrlq $24, %%mm2 \n\t"
1965 "punpcklbw %%mm7, %%mm0 \n\t"
1966 "punpcklbw %%mm7, %%mm2 \n\t"
1968 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1969 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1970 "punpcklbw %%mm7, %%mm0 \n\t"
1971 "punpcklbw %%mm7, %%mm2 \n\t"
1972 "paddw %%mm2, %%mm0 \n\t"
1973 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1974 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1975 "punpcklbw %%mm7, %%mm4 \n\t"
1976 "punpcklbw %%mm7, %%mm2 \n\t"
1977 "paddw %%mm4, %%mm2 \n\t"
1978 "psrlw $1, %%mm0 \n\t"
1979 "psrlw $1, %%mm2 \n\t"
1981 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1982 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1984 "pmaddwd %%mm0, %%mm1 \n\t"
1985 "pmaddwd %%mm2, %%mm3 \n\t"
1986 "pmaddwd %%mm6, %%mm0 \n\t"
1987 "pmaddwd %%mm6, %%mm2 \n\t"
1988 #ifndef FAST_BGR2YV12
1989 "psrad $8, %%mm0 \n\t"
1990 "psrad $8, %%mm1 \n\t"
1991 "psrad $8, %%mm2 \n\t"
1992 "psrad $8, %%mm3 \n\t"
1994 "packssdw %%mm2, %%mm0 \n\t"
1995 "packssdw %%mm3, %%mm1 \n\t"
1996 "pmaddwd %%mm5, %%mm0 \n\t"
1997 "pmaddwd %%mm5, %%mm1 \n\t"
1998 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1999 "psraw $7, %%mm0 \n\t"
2001 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2002 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2003 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2004 "movq %%mm4, %%mm1 \n\t"
2005 "movq %%mm2, %%mm3 \n\t"
2006 "psrlq $24, %%mm4 \n\t"
2007 "psrlq $24, %%mm2 \n\t"
2010 "punpcklbw %%mm7, %%mm4 \n\t"
2011 "punpcklbw %%mm7, %%mm2 \n\t"
2013 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2014 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2015 "punpcklbw %%mm7, %%mm4 \n\t"
2016 "punpcklbw %%mm7, %%mm2 \n\t"
2017 "paddw %%mm2, %%mm4 \n\t"
2018 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2019 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2020 "punpcklbw %%mm7, %%mm5 \n\t"
2021 "punpcklbw %%mm7, %%mm2 \n\t"
2022 "paddw %%mm5, %%mm2 \n\t"
2023 "movq "MANGLE(w1111)", %%mm5 \n\t"
2024 "psrlw $2, %%mm4 \n\t"
2025 "psrlw $2, %%mm2 \n\t"
2027 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2028 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2030 "pmaddwd %%mm4, %%mm1 \n\t"
2031 "pmaddwd %%mm2, %%mm3 \n\t"
2032 "pmaddwd %%mm6, %%mm4 \n\t"
2033 "pmaddwd %%mm6, %%mm2 \n\t"
2034 #ifndef FAST_BGR2YV12
2035 "psrad $8, %%mm4 \n\t"
2036 "psrad $8, %%mm1 \n\t"
2037 "psrad $8, %%mm2 \n\t"
2038 "psrad $8, %%mm3 \n\t"
2040 "packssdw %%mm2, %%mm4 \n\t"
2041 "packssdw %%mm3, %%mm1 \n\t"
2042 "pmaddwd %%mm5, %%mm4 \n\t"
2043 "pmaddwd %%mm5, %%mm1 \n\t"
2044 "add $24, %%"REG_d" \n\t"
2045 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2046 "psraw $7, %%mm4 \n\t"
2048 "movq %%mm0, %%mm1 \n\t"
2049 "punpckldq %%mm4, %%mm0 \n\t"
2050 "punpckhdq %%mm4, %%mm1 \n\t"
2051 "packsswb %%mm1, %%mm0 \n\t"
2052 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2054 "movd %%mm0, (%1, %%"REG_a") \n\t"
2055 "punpckhdq %%mm0, %%mm0 \n\t"
2056 "movd %%mm0, (%2, %%"REG_a") \n\t"
2057 "add $4, %%"REG_a" \n\t"
2059 : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2060 : "%"REG_a, "%"REG_d
2064 for(i=0; i<width; i++)
2066 int b= src1[6*i + 0] + src1[6*i + 3];
2067 int g= src1[6*i + 1] + src1[6*i + 4];
2068 int r= src1[6*i + 2] + src1[6*i + 5];
2070 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2071 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2074 assert(src1 == src2);
/*
 * Scalar 16-bit 565 (B in the low 5 bits, R in the top 5) -> luma.
 * R and B are doubled to compensate for their 5-bit range versus green's
 * 6 bits; >>(RGB2YUV_SHIFT-2) rescales the fields toward 8-bit range and
 * +16 adds the black-level offset.
 */
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
    for(i=0; i<width; i++)
        int d= ((uint16_t*)src)[i];
        int r= (d>>11)&0x1F;
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/*
 * 16-bit 565 -> chroma, averaging two adjacent pixels per output sample.
 * Two packed pixels are read as one 32-bit word; the dl/dh masks sum the
 * matching bit-fields of both pixels in parallel (summed fields can exceed
 * 5/6 bits, hence the wider 0x7F extraction masks), then the result is
 * rescaled by >>(RGB2YUV_SHIFT+1-2) and re-centered at 128.
 */
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
    for(i=0; i<width; i++)
        int d0= ((uint32_t*)src1)[i];
        int dl= (d0&0x07E0F81F);
        int dh= ((d0>>5)&0x07C0F83F);
        int dh2= (dh>>11) + (dh<<21);
        int r= (d>>11)&0x7F;
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
/*
 * Scalar 15-bit 555 (B low, R in bits 10..14) -> luma: all components are
 * 5 bits wide so the weights are applied uniformly; >>(RGB2YUV_SHIFT-3)
 * rescales and +16 adds the black-level offset.
 */
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
    for(i=0; i<width; i++)
        int d= ((uint16_t*)src)[i];
        int r= (d>>10)&0x1F;
        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/*
 * 15-bit 555 -> chroma, averaging two adjacent pixels per output sample
 * with the same parallel field-summing trick as bgr16ToUV (555-specific
 * dl/dh masks); rescaled by >>(RGB2YUV_SHIFT+1-3) and re-centered at 128.
 */
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
    for(i=0; i<width; i++)
        int d0= ((uint32_t*)src1)[i];
        int dl= (d0&0x03E07C1F);
        int dh= ((d0>>5)&0x03E0F81F);
        int dh2= (dh>>11) + (dh<<21);
        int r= (d>>10)&0x7F;
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
/*
 * Scalar RGB32 -> luma; identical to bgr32ToY except the byte order is
 * reversed (R in the low byte, B in bits 16..23).  The 33<<(SHIFT-1)
 * constant again folds the +16 offset and rounding bias (33 = 2*16 + 1).
 */
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
    for(i=0; i<width; i++)
        int r= ((uint32_t*)src)[i]&0xFF;
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
        int b= (((uint32_t*)src)[i]>>16)&0xFF;
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * Scalar RGB32 -> chroma, averaging each pair of pixels exactly like
 * bgr32ToUV but with R taken from the low summed field (RGB byte order);
 * halved via >>(RGB2YUV_SHIFT+1) and re-centered at 128.
 */
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
    for(i=0; i<width; i++)
        const int a= ((uint32_t*)src1)[2*i+0];
        const int e= ((uint32_t*)src1)[2*i+1];
        const int l= (a&0xFF00FF) + (e&0xFF00FF);
        const int h= (a&0x00FF00) + (e&0x00FF00);
        const int r= l&0x3FF;
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
/*
 * Scalar RGB24 -> luma (R first in memory); uses the same RY/GY/BY
 * weighting and combined offset/rounding constant (33 = 2*16 + 1) as the
 * other packed-RGB luma converters in this file.
 */
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
    for(i=0; i<width; i++)
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * Scalar RGB24 -> chroma, summing each pair of adjacent pixels (hence the
 * 6*i strides) before weighting; the two-pixel sum is halved via
 * >>(RGB2YUV_SHIFT+1) and re-centered at 128.
 */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
    for(i=0; i<width; i++)
        int r= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int b= src1[6*i + 2] + src1[6*i + 5];
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
/*
 * Scalar 16-bit 565 with R in the low bits -> luma (mirror of bgr16ToY);
 * R and B are doubled to balance their 5-bit range against 6-bit green.
 */
static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
    for(i=0; i<width; i++)
        int d= ((uint16_t*)src)[i];
        int b= (d>>11)&0x1F;
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/*
 * 16-bit 565 (R low) -> chroma, two pixels averaged per output sample with
 * the parallel field-summing dl/dh trick; B comes from the top summed
 * field.  src1 must equal src2 (asserted).
 */
static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
    assert(src1 == src2);
    for(i=0; i<width; i++)
        int d0= ((uint32_t*)src1)[i];
        int dl= (d0&0x07E0F81F);
        int dh= ((d0>>5)&0x07C0F83F);
        int dh2= (dh>>11) + (dh<<21);
        int b= (d>>11)&0x7F;
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
/* Convert 16-bit 5:5:5 pixels to 8-bit luma.  b comes from bits 10-14;
 * the r/g extraction lines (orig 2252-2253) are not visible in this
 * chunk.  >>(RGB2YUV_SHIFT-3) scales the 5-bit channels by 8 up to
 * 8-bit range before the +16 offset. */
2246 static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2249     for(i=0; i<width; i++)
2251         int d= ((uint16_t*)src)[i];
2254         int b= (d>>10)&0x1F;
2256         dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/* Convert 16-bit 5:5:5 pixels to 8-bit chroma, 2:1 subsampled.  Same
 * two-pixels-at-once masking trick as rgb16ToUV, with 5:5:5 masks.
 * NOTE(review): the lines combining dl and dh2 into d and extracting
 * g/b (orig 2270-2274, 2276) are not visible in this chunk. */
2260 static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2263     assert(src1 == src2);   /* packed input: both planes alias */
2264     for(i=0; i<width; i++)
2266         int d0= ((uint32_t*)src1)[i];   /* two 16-bit pixels */
2268         int dl= (d0&0x03E07C1F);
2269         int dh= ((d0>>5)&0x03E0F81F);
2271         int dh2= (dh>>11) + (dh<<21);
2275         int r= (d>>10)&0x7F;   /* 7 bits: sum of two 5-bit fields */
2277         dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2278         dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2282 // Bilinear / Bicubic scaling
/* Horizontally scale one 8-bit source line into a 16-bit destination
 * line by applying, for every output sample i, a filterSize-tap FIR
 * filter (16-bit coefficients) starting at src[filterPos[i]].
 *
 * Three specialized inline-MMX paths exist (filterSize==4, filterSize==8,
 * and a generic loop), plus an AltiVec call and a plain C fallback that
 * clips the result into 15 bits.  Which path is compiled is selected by
 * preprocessor conditionals whose directives are NOT visible in this
 * chunk of the dump — many interior lines (asm block openers, #if/#else,
 * loop labels) are missing here, so the asm below is only a partial view.
 *
 * counter is the negated byte count (-2*dstW) so the loops can count up
 * toward zero; filterPos is rebased by counter/2 so it can be indexed
 * with the same negative counter register. */
2283 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2284 int16_t *filter, int16_t *filterPos, long filterSize)
2287 assert(filterSize % 4 == 0 && filterSize>0);
2288 if(filterSize==4) // always true for upscaling, sometimes for downscaling too
2290 long counter= -2*dstW;
2292 filterPos-= counter/2;
2296 "push %%"REG_b" \n\t"
2298 "pxor %%mm7, %%mm7 \n\t"
2299 "movq "MANGLE(w02)", %%mm6 \n\t"
2300 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2301 "mov %%"REG_a", %%"REG_BP" \n\t"
2304 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2305 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2306 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2307 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2308 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2309 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2310 "punpcklbw %%mm7, %%mm0 \n\t"
2311 "punpcklbw %%mm7, %%mm2 \n\t"
2312 "pmaddwd %%mm1, %%mm0 \n\t"
2313 "pmaddwd %%mm2, %%mm3 \n\t"
2314 "psrad $8, %%mm0 \n\t"
2315 "psrad $8, %%mm3 \n\t"
2316 "packssdw %%mm3, %%mm0 \n\t"
2317 "pmaddwd %%mm6, %%mm0 \n\t"
2318 "packssdw %%mm0, %%mm0 \n\t"
2319 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2320 "add $4, %%"REG_BP" \n\t"
2323 "pop %%"REG_BP" \n\t"
2325 "pop %%"REG_b" \n\t"
2328 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* ---- filterSize==8: same scheme, two pmaddwd pairs accumulated ---- */
2334 else if(filterSize==8)
2336 long counter= -2*dstW;
2338 filterPos-= counter/2;
2342 "push %%"REG_b" \n\t"
2344 "pxor %%mm7, %%mm7 \n\t"
2345 "movq "MANGLE(w02)", %%mm6 \n\t"
2346 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2347 "mov %%"REG_a", %%"REG_BP" \n\t"
2350 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2351 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2352 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2353 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2354 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2355 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2356 "punpcklbw %%mm7, %%mm0 \n\t"
2357 "punpcklbw %%mm7, %%mm2 \n\t"
2358 "pmaddwd %%mm1, %%mm0 \n\t"
2359 "pmaddwd %%mm2, %%mm3 \n\t"
2361 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2362 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2363 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2364 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2365 "punpcklbw %%mm7, %%mm4 \n\t"
2366 "punpcklbw %%mm7, %%mm2 \n\t"
2367 "pmaddwd %%mm1, %%mm4 \n\t"
2368 "pmaddwd %%mm2, %%mm5 \n\t"
2369 "paddd %%mm4, %%mm0 \n\t"
2370 "paddd %%mm5, %%mm3 \n\t"
2372 "psrad $8, %%mm0 \n\t"
2373 "psrad $8, %%mm3 \n\t"
2374 "packssdw %%mm3, %%mm0 \n\t"
2375 "pmaddwd %%mm6, %%mm0 \n\t"
2376 "packssdw %%mm0, %%mm0 \n\t"
2377 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2378 "add $4, %%"REG_BP" \n\t"
2381 "pop %%"REG_BP" \n\t"
2383 "pop %%"REG_b" \n\t"
2386 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* ---- generic filterSize: inner loop over the taps, accumulating in
 *      mm4/mm5 until REG_c reaches the precomputed end pointer ---- */
2394 uint8_t *offset = src+filterSize;
2395 long counter= -2*dstW;
2396 // filter-= counter*filterSize/2;
2397 filterPos-= counter/2;
2400 "pxor %%mm7, %%mm7 \n\t"
2401 "movq "MANGLE(w02)", %%mm6 \n\t"
2404 "mov %2, %%"REG_c" \n\t"
2405 "movzwl (%%"REG_c", %0), %%eax \n\t"
2406 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2407 "mov %5, %%"REG_c" \n\t"
2408 "pxor %%mm4, %%mm4 \n\t"
2409 "pxor %%mm5, %%mm5 \n\t"
2411 "movq (%1), %%mm1 \n\t"
2412 "movq (%1, %6), %%mm3 \n\t"
2413 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2414 "movd (%%"REG_c", %%"REG_d"), %%mm2\n\t"
2415 "punpcklbw %%mm7, %%mm0 \n\t"
2416 "punpcklbw %%mm7, %%mm2 \n\t"
2417 "pmaddwd %%mm1, %%mm0 \n\t"
2418 "pmaddwd %%mm2, %%mm3 \n\t"
2419 "paddd %%mm3, %%mm5 \n\t"
2420 "paddd %%mm0, %%mm4 \n\t"
2422 "add $4, %%"REG_c" \n\t"
2423 "cmp %4, %%"REG_c" \n\t"
2426 "psrad $8, %%mm4 \n\t"
2427 "psrad $8, %%mm5 \n\t"
2428 "packssdw %%mm5, %%mm4 \n\t"
2429 "pmaddwd %%mm6, %%mm4 \n\t"
2430 "packssdw %%mm4, %%mm4 \n\t"
2431 "mov %3, %%"REG_a" \n\t"
2432 "movd %%mm4, (%%"REG_a", %0) \n\t"
2436 : "+r" (counter), "+r" (filter)
2437 : "m" (filterPos), "m" (dst), "m"(offset),
2438 "m" (src), "r" (filterSize*2)
2439 : "%"REG_a, "%"REG_c, "%"REG_d
/* ---- AltiVec path (built when the AltiVec template is included) ---- */
2444 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
/* ---- portable C fallback ---- */
2447 for(i=0; i<dstW; i++)
2450 int srcPos= filterPos[i];
2452 // printf("filterPos: %d\n", filterPos[i]);
2453 for(j=0; j<filterSize; j++)
2455 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2456 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2458 // filter += hFilterSize;
2459 dst[i] = clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2465 // *** horizontal scale Y line to temp buffer
/* Horizontally scale one luma (Y) line into the 16-bit temp buffer dst.
 *
 * Step 1: if the source is not already a plain 8-bit gray/luma line,
 * run the matching format->Y converter into formatConvBuffer and scale
 * from there instead.
 * Step 2: pick a scaling path:
 *   - RENAME(hScale) (filter-based MMX/C scaler) when SWS_FAST_BILINEAR
 *     is off or MMX2 cannot be used;
 *   - the MMX2 "funny code" path (runtime-generated code reached through
 *     funnyYCode, driven by mmx2Filter/mmx2FilterPos);
 *   - a non-MMX x86 asm bilinear loop (16.16 fixed-point increment split
 *     into xInc_shr16 + xInc_mask, carry via adc);
 *   - a C bilinear fallback producing 7.9-style fixed point:
 *     (src[xx]<<7) + (src[xx+1]-src[xx])*xalpha.
 *
 * NOTE(review): many interior lines (#if/#endif directives, asm block
 * openers/closers, loop bodies) are not visible in this chunk; the asm
 * shown is a partial view only. */
2466 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2467 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2468 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2469 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2470 int32_t *mmx2FilterPos)
2472 if(srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2474 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2475 src= formatConvBuffer;
2477 else if(srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2479 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2480 src= formatConvBuffer;
2482 else if(srcFormat==PIX_FMT_RGB32)
2484 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2485 src= formatConvBuffer;
2487 else if(srcFormat==PIX_FMT_BGR24)
2489 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2490 src= formatConvBuffer;
2492 else if(srcFormat==PIX_FMT_BGR565)
2494 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2495 src= formatConvBuffer;
2497 else if(srcFormat==PIX_FMT_BGR555)
2499 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2500 src= formatConvBuffer;
2502 else if(srcFormat==PIX_FMT_BGR32)
2504 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2505 src= formatConvBuffer;
2507 else if(srcFormat==PIX_FMT_RGB24)
2509 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2510 src= formatConvBuffer;
2512 else if(srcFormat==PIX_FMT_RGB565)
2514 RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2515 src= formatConvBuffer;
2517 else if(srcFormat==PIX_FMT_RGB555)
2519 RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2520 src= formatConvBuffer;
2524 // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
2525 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2527 if(!(flags&SWS_FAST_BILINEAR))
2530 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2532 else // Fast Bilinear upscale / crap downscale
2534 #if defined(ARCH_X86)
2538 uint64_t ebxsave __attribute__((aligned(8)));
2544 "mov %%"REG_b", %5 \n\t"
2546 "pxor %%mm7, %%mm7 \n\t"
2547 "mov %0, %%"REG_c" \n\t"
2548 "mov %1, %%"REG_D" \n\t"
2549 "mov %2, %%"REG_d" \n\t"
2550 "mov %3, %%"REG_b" \n\t"
2551 "xor %%"REG_a", %%"REG_a" \n\t" // i
2552 PREFETCH" (%%"REG_c") \n\t"
2553 PREFETCH" 32(%%"REG_c") \n\t"
2554 PREFETCH" 64(%%"REG_c") \n\t"
/* Two variants of FUNNY_Y_CODE follow — selected by a preprocessor
 * conditional whose directives are not visible in this chunk. */
2558 #define FUNNY_Y_CODE \
2559 "movl (%%"REG_b"), %%esi \n\t"\
2561 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2562 "add %%"REG_S", %%"REG_c" \n\t"\
2563 "add %%"REG_a", %%"REG_D" \n\t"\
2564 "xor %%"REG_a", %%"REG_a" \n\t"\
2568 #define FUNNY_Y_CODE \
2569 "movl (%%"REG_b"), %%esi \n\t"\
2571 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2572 "add %%"REG_a", %%"REG_D" \n\t"\
2573 "xor %%"REG_a", %%"REG_a" \n\t"\
2587 "mov %5, %%"REG_b" \n\t"
2589 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2594 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* Fill the right edge: replicate the last source pixel (<<7 matches the
 * C fallback's fixed-point scale: x*128). */
2599 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2604 long xInc_shr16 = xInc >> 16;
2605 uint16_t xInc_mask = xInc & 0xffff;
2606 //NO MMX just normal asm ... (2x unrolled bilinear loop)
2608 "xor %%"REG_a", %%"REG_a" \n\t" // i
2609 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2610 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2613 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2614 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2615 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2616 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2617 "shll $16, %%edi \n\t"
2618 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2619 "mov %1, %%"REG_D" \n\t"
2620 "shrl $9, %%esi \n\t"
2621 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2622 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2623 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2625 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2626 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2627 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2628 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2629 "shll $16, %%edi \n\t"
2630 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2631 "mov %1, %%"REG_D" \n\t"
2632 "shrl $9, %%esi \n\t"
2633 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2634 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2635 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2638 "add $2, %%"REG_a" \n\t"
2639 "cmp %2, %%"REG_a" \n\t"
2643 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2644 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2647 } //if MMX2 can't be used
/* portable C bilinear fallback: 16.16 fixed-point position, 7-bit alpha */
2651 unsigned int xpos=0;
2652 for(i=0;i<dstWidth;i++)
2654 register unsigned int xx=xpos>>16;
2655 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2656 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
/* Horizontally scale one chroma line pair (U and V) into the 16-bit
 * temp buffer: U goes to dst[0..], V to dst[2048..] (the +2048 / +4096
 * element/byte offsets below).  Mirrors hyscale: first convert packed
 * input formats into formatConvBuffer (U at +0, V at +2048), then pick
 * the filter-based hScale path, the MMX2 "funny code" path, a non-MMX
 * x86 asm bilinear loop, or the C bilinear fallback.
 *
 * NOTE(review): many interior lines (#if directives, asm block
 * openers/closers) are not visible in this chunk; in particular the two
 * C fallback formulas at the end come from different branches of a
 * conditional whose directives are missing here. */
2663 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2664 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2665 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2666 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2667 int32_t *mmx2FilterPos)
2669 if(srcFormat==PIX_FMT_YUYV422)
2671 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2672 src1= formatConvBuffer;
2673 src2= formatConvBuffer+2048;
2675 else if(srcFormat==PIX_FMT_UYVY422)
2677 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2678 src1= formatConvBuffer;
2679 src2= formatConvBuffer+2048;
2681 else if(srcFormat==PIX_FMT_RGB32)
2683 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2684 src1= formatConvBuffer;
2685 src2= formatConvBuffer+2048;
2687 else if(srcFormat==PIX_FMT_BGR24)
2689 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2690 src1= formatConvBuffer;
2691 src2= formatConvBuffer+2048;
2693 else if(srcFormat==PIX_FMT_BGR565)
2695 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2696 src1= formatConvBuffer;
2697 src2= formatConvBuffer+2048;
2699 else if(srcFormat==PIX_FMT_BGR555)
2701 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2702 src1= formatConvBuffer;
2703 src2= formatConvBuffer+2048;
2705 else if(srcFormat==PIX_FMT_BGR32)
2707 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2708 src1= formatConvBuffer;
2709 src2= formatConvBuffer+2048;
2711 else if(srcFormat==PIX_FMT_RGB24)
2713 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2714 src1= formatConvBuffer;
2715 src2= formatConvBuffer+2048;
2717 else if(srcFormat==PIX_FMT_RGB565)
2719 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2720 src1= formatConvBuffer;
2721 src2= formatConvBuffer+2048;
2723 else if(srcFormat==PIX_FMT_RGB555)
2725 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2726 src1= formatConvBuffer;
2727 src2= formatConvBuffer+2048;
2729 else if(isGray(srcFormat))
2735 // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
2736 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2738 if(!(flags&SWS_FAST_BILINEAR))
2741 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2742 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2744 else // Fast Bilinear upscale / crap downscale
2746 #if defined(ARCH_X86)
2750 uint64_t ebxsave __attribute__((aligned(8)));
2756 "mov %%"REG_b", %6 \n\t"
2758 "pxor %%mm7, %%mm7 \n\t"
2759 "mov %0, %%"REG_c" \n\t"
2760 "mov %1, %%"REG_D" \n\t"
2761 "mov %2, %%"REG_d" \n\t"
2762 "mov %3, %%"REG_b" \n\t"
2763 "xor %%"REG_a", %%"REG_a" \n\t" // i
2764 PREFETCH" (%%"REG_c") \n\t"
2765 PREFETCH" 32(%%"REG_c") \n\t"
2766 PREFETCH" 64(%%"REG_c") \n\t"
/* Two variants of FUNNY_UV_CODE — selected by a preprocessor
 * conditional whose directives are not visible in this chunk. */
2770 #define FUNNY_UV_CODE \
2771 "movl (%%"REG_b"), %%esi \n\t"\
2773 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2774 "add %%"REG_S", %%"REG_c" \n\t"\
2775 "add %%"REG_a", %%"REG_D" \n\t"\
2776 "xor %%"REG_a", %%"REG_a" \n\t"\
2780 #define FUNNY_UV_CODE \
2781 "movl (%%"REG_b"), %%esi \n\t"\
2783 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2784 "add %%"REG_a", %%"REG_D" \n\t"\
2785 "xor %%"REG_a", %%"REG_a" \n\t"\
/* second pass for the V plane: buf1 + 4096 bytes == dst + 2048 elements */
2793 "xor %%"REG_a", %%"REG_a" \n\t" // i
2794 "mov %5, %%"REG_c" \n\t" // src
2795 "mov %1, %%"REG_D" \n\t" // buf1
2796 "add $4096, %%"REG_D" \n\t"
2797 PREFETCH" (%%"REG_c") \n\t"
2798 PREFETCH" 32(%%"REG_c") \n\t"
2799 PREFETCH" 64(%%"REG_c") \n\t"
2807 "mov %6, %%"REG_b" \n\t"
2809 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2810 "m" (funnyUVCode), "m" (src2)
2814 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* Fill the right edge of both planes with the last source pixel. */
2819 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2821 // printf("%d %d %d\n", dstWidth, i, srcW);
2822 dst[i] = src1[srcW-1]*128;
2823 dst[i+2048] = src2[srcW-1]*128;
2829 long xInc_shr16 = (long) (xInc >> 16);
2830 uint16_t xInc_mask = xInc & 0xffff;
2832 "xor %%"REG_a", %%"REG_a" \n\t" // i
2833 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2834 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2837 "mov %0, %%"REG_S" \n\t"
2838 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2839 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2840 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2841 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2842 "shll $16, %%edi \n\t"
2843 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2844 "mov %1, %%"REG_D" \n\t"
2845 "shrl $9, %%esi \n\t"
2846 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2848 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2849 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2850 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2851 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2852 "shll $16, %%edi \n\t"
2853 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2854 "mov %1, %%"REG_D" \n\t"
2855 "shrl $9, %%esi \n\t"
2856 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2858 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2859 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2860 "add $1, %%"REG_a" \n\t"
2861 "cmp %2, %%"REG_a" \n\t"
2864 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2865 which is needed to support GCC-4.0 */
2866 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2867 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2869 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2872 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2875 } //if MMX2 can't be used
/* C bilinear fallback; the two dst formulas below belong to different
 * branches of a conditional whose directives are missing in this chunk. */
2879 unsigned int xpos=0;
2880 for(i=0;i<dstWidth;i++)
2882 register unsigned int xx=xpos>>16;
2883 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2884 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2885 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2887 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2888 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2896 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2897 int srcSliceH, uint8_t* dst[], int dstStride[]){
2899 /* load a few things into local vars to make the code more readable? and faster */
2900 const int srcW= c->srcW;
2901 const int dstW= c->dstW;
2902 const int dstH= c->dstH;
2903 const int chrDstW= c->chrDstW;
2904 const int chrSrcW= c->chrSrcW;
2905 const int lumXInc= c->lumXInc;
2906 const int chrXInc= c->chrXInc;
2907 const int dstFormat= c->dstFormat;
2908 const int srcFormat= c->srcFormat;
2909 const int flags= c->flags;
2910 const int canMMX2BeUsed= c->canMMX2BeUsed;
2911 int16_t *vLumFilterPos= c->vLumFilterPos;
2912 int16_t *vChrFilterPos= c->vChrFilterPos;
2913 int16_t *hLumFilterPos= c->hLumFilterPos;
2914 int16_t *hChrFilterPos= c->hChrFilterPos;
2915 int16_t *vLumFilter= c->vLumFilter;
2916 int16_t *vChrFilter= c->vChrFilter;
2917 int16_t *hLumFilter= c->hLumFilter;
2918 int16_t *hChrFilter= c->hChrFilter;
2919 int32_t *lumMmxFilter= c->lumMmxFilter;
2920 int32_t *chrMmxFilter= c->chrMmxFilter;
2921 const int vLumFilterSize= c->vLumFilterSize;
2922 const int vChrFilterSize= c->vChrFilterSize;
2923 const int hLumFilterSize= c->hLumFilterSize;
2924 const int hChrFilterSize= c->hChrFilterSize;
2925 int16_t **lumPixBuf= c->lumPixBuf;
2926 int16_t **chrPixBuf= c->chrPixBuf;
2927 const int vLumBufSize= c->vLumBufSize;
2928 const int vChrBufSize= c->vChrBufSize;
2929 uint8_t *funnyYCode= c->funnyYCode;
2930 uint8_t *funnyUVCode= c->funnyUVCode;
2931 uint8_t *formatConvBuffer= c->formatConvBuffer;
2932 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2933 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2936 /* vars which will change and which we need to store back in the context */
2938 int lumBufIndex= c->lumBufIndex;
2939 int chrBufIndex= c->chrBufIndex;
2940 int lastInLumBuf= c->lastInLumBuf;
2941 int lastInChrBuf= c->lastInChrBuf;
2943 if(isPacked(c->srcFormat)){
2949 srcStride[2]= srcStride[0];
2951 srcStride[1]<<= c->vChrDrop;
2952 srcStride[2]<<= c->vChrDrop;
2954 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2955 // (int)dst[0], (int)dst[1], (int)dst[2]);
2957 #if 0 //self test FIXME move to a vfilter or something
2959 static volatile int i=0;
2961 if(srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2962 selfTest(src, srcStride, c->srcW, c->srcH);
2967 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2968 //dstStride[0],dstStride[1],dstStride[2]);
2970 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2972 static int firstTime=1; //FIXME move this into the context perhaps
2973 if(flags & SWS_PRINT_INFO && firstTime)
2975 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2976 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2981 /* Note the user might start scaling the picture in the middle so this will not get executed
2982 this is not really intended but works currently, so ppl might do it */
2993 for(;dstY < dstH; dstY++){
2994 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2995 const int chrDstY= dstY>>c->chrDstVSubSample;
2996 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2997 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2999 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3000 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3001 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3002 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3004 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3005 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3006 //handle holes (FAST_BILINEAR & weird filters)
3007 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3008 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3009 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3010 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
3011 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
3013 // Do we have enough lines in this slice to output the dstY line
3014 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3016 //Do horizontal scaling
3017 while(lastInLumBuf < lastLumSrcY)
3019 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3021 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3022 ASSERT(lumBufIndex < 2*vLumBufSize)
3023 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3024 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3025 // printf("%d %d\n", lumBufIndex, vLumBufSize);
3026 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3027 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3028 funnyYCode, c->srcFormat, formatConvBuffer,
3029 c->lumMmx2Filter, c->lumMmx2FilterPos);
3032 while(lastInChrBuf < lastChrSrcY)
3034 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3035 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3037 ASSERT(chrBufIndex < 2*vChrBufSize)
3038 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
3039 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3040 //FIXME replace parameters through context struct (some at least)
3042 if(!(isGray(srcFormat) || isGray(dstFormat)))
3043 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3044 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3045 funnyUVCode, c->srcFormat, formatConvBuffer,
3046 c->chrMmx2Filter, c->chrMmx2FilterPos);
3049 //wrap buf index around to stay inside the ring buffer
3050 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
3051 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
3053 else // not enough lines left in this slice -> load the rest in the buffer
3055 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3056 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3057 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3058 vChrBufSize, vLumBufSize);*/
3060 //Do horizontal scaling
3061 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3063 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3065 ASSERT(lumBufIndex < 2*vLumBufSize)
3066 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3067 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3068 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3069 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3070 funnyYCode, c->srcFormat, formatConvBuffer,
3071 c->lumMmx2Filter, c->lumMmx2FilterPos);
3074 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3076 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3077 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3079 ASSERT(chrBufIndex < 2*vChrBufSize)
3080 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
3081 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3083 if(!(isGray(srcFormat) || isGray(dstFormat)))
3084 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3085 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3086 funnyUVCode, c->srcFormat, formatConvBuffer,
3087 c->chrMmx2Filter, c->chrMmx2FilterPos);
3090 //wrap buf index around to stay inside the ring buffer
3091 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
3092 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
3093 break; //we can't output a dstY line so let's try with the next slice
3097 b5Dither= dither8[dstY&1];
3098 g6Dither= dither4[dstY&1];
3099 g5Dither= dither8[dstY&1];
3100 r5Dither= dither8[(dstY+1)&1];
3104 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3105 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3108 if(flags & SWS_ACCURATE_RND){
3109 for(i=0; i<vLumFilterSize; i+=2){
3110 lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i ];
3111 lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3112 lumMmxFilter[2*i+2]=
3113 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ]
3114 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3116 for(i=0; i<vChrFilterSize; i+=2){
3117 chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i ];
3118 chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3119 chrMmxFilter[2*i+2]=
3120 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ]
3121 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3124 for(i=0; i<vLumFilterSize; i++)
3126 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3127 lumMmxFilter[4*i+2]=
3128 lumMmxFilter[4*i+3]=
3129 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3131 for(i=0; i<vChrFilterSize; i++)
3133 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3134 chrMmxFilter[4*i+2]=
3135 chrMmxFilter[4*i+3]=
3136 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3140 if(dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3141 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3142 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3143 RENAME(yuv2nv12X)(c,
3144 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3145 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3146 dest, uDest, dstW, chrDstW, dstFormat);
3148 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3150 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3151 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3152 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
3154 int16_t *lumBuf = lumPixBuf[0];
3155 int16_t *chrBuf= chrPixBuf[0];
3156 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3161 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3162 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3163 dest, uDest, vDest, dstW, chrDstW);
3168 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3169 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3170 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
3172 int chrAlpha= vChrFilter[2*dstY+1];
3173 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3174 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3176 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
3178 int lumAlpha= vLumFilter[2*dstY+1];
3179 int chrAlpha= vChrFilter[2*dstY+1];
3181 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3183 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3184 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3185 dest, dstW, lumAlpha, chrAlpha, dstY);
3189 RENAME(yuv2packedX)(c,
3190 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3191 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3196 else // hmm looks like we can't use MMX here without overwriting this array's tail
3198 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3199 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3200 if(dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3201 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3202 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3204 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3205 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3206 dest, uDest, dstW, chrDstW, dstFormat);
3208 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3210 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3211 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3213 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3214 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3215 dest, uDest, vDest, dstW, chrDstW);
3219 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3220 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3222 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3223 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3230 __asm __volatile(SFENCE:::"memory");
3231 __asm __volatile(EMMS:::"memory");
3233 /* store changed local vars back in the context */
3235 c->lumBufIndex= lumBufIndex;
3236 c->chrBufIndex= chrBufIndex;
3237 c->lastInLumBuf= lastInLumBuf;
3238 c->lastInChrBuf= lastInChrBuf;
3240 return dstY - lastDstY;