2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
/* CPU-capability dispatch: pick prefetch, store-fence, byte-average and
   non-temporal-store mnemonics depending on the target (3DNow / MMX2 / plain
   MMX), falling back to no-ops where the instruction is unavailable.
   NOTE(review): the surrounding #if / #else / #endif lines are missing from
   this extraction (the embedded original line numbers skip), so the
   conditional structure shown here is incomplete — do not edit the dispatch
   logic without consulting the full file. */
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif defined (HAVE_MMX2)
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
51 #define SFENCE "sfence"
53 #define SFENCE " # nop"
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
70 #include "swscale_altivec_template.c"
/* YSCALEYUV2YV12X(x, offset, dest, width): MMX vertical scaling loop.
   Walks the filter list at "offset"(%0) (pairs of {src pointer, 16-bit
   coefficient}, 16 bytes apart, NULL-terminated — see the "test REG_S"),
   accumulates pmulhw(coeff, src) into mm3/mm4 seeded with the rounder at
   VROUNDER_OFFSET, then >>3, packs to unsigned bytes and stores 8 output
   pixels per iteration with MOVNTQ. Clobbers REG_a/REG_d/REG_S and mm0-mm5.
   NOTE(review): the asm volatile( framing and the inner loop labels/branches
   are missing from this extraction (embedded line numbers skip 74, 81, 92,
   103) — this body is incomplete as shown; consult the full file. */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
75     "xor                          %%"REG_a", %%"REG_a"  \n\t"\
76     "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
77     "movq                             %%mm3, %%mm4      \n\t"\
78     "lea                 " offset "(%0), %%"REG_d"      \n\t"\
79     "mov                    (%%"REG_d"), %%"REG_S"      \n\t"\
80     ASMALIGN(4) /* FIXME Unroll? */\
82     "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
83     "movq   " x "(%%"REG_S", %%"REG_a", 2), %%mm2       \n\t" /* srcData */\
84     "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5       \n\t" /* srcData */\
85     "add                                $16, %%"REG_d"  \n\t"\
86     "mov                    (%%"REG_d"), %%"REG_S"      \n\t"\
87     "test                         %%"REG_S", %%"REG_S"  \n\t"\
88     "pmulhw                           %%mm0, %%mm2      \n\t"\
89     "pmulhw                           %%mm0, %%mm5      \n\t"\
90     "paddw                            %%mm2, %%mm3      \n\t"\
91     "paddw                            %%mm5, %%mm4      \n\t"\
93     "psraw                               $3, %%mm3      \n\t"\
94     "psraw                               $3, %%mm4      \n\t"\
95     "packuswb                         %%mm4, %%mm3      \n\t"\
96     MOVNTQ(%%mm3, (%1, %%REGa))\
97     "add                                 $8, %%"REG_a"  \n\t"\
98     "cmp                                 %2, %%"REG_a"  \n\t"\
99     "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
100     "movq                            %%mm3, %%mm4      \n\t"\
101     "lea                " offset "(%0), %%"REG_d"      \n\t"\
102     "mov                   (%%"REG_d"), %%"REG_S"      \n\t"\
104     :: "r" (&c->redDither),\
105     "r" (dest), "g" (width)\
106     : "%"REG_a, "%"REG_d, "%"REG_S\
/* YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width): higher-precision variant
   of YSCALEYUV2YV12X — processes the filter list two taps at a time,
   interleaving two source rows with punpck{l,h}wd and using pmaddwd to get
   32-bit accumulators (mm4-mm7), then >>16, packs down, adds the rounder,
   >>3 and stores 8 bytes per iteration. Clobbers REG_a/REG_d/REG_S, mm0-mm7.
   NOTE(review): asm volatile( framing and loop labels/branches are missing
   from this extraction (embedded line numbers skip) — body incomplete as
   shown. */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111     "lea                 " offset "(%0), %%"REG_d"  \n\t"\
112     "xor                      %%"REG_a", %%"REG_a"  \n\t"\
113     "pxor                         %%mm4, %%mm4      \n\t"\
114     "pxor                         %%mm5, %%mm5      \n\t"\
115     "pxor                         %%mm6, %%mm6      \n\t"\
116     "pxor                         %%mm7, %%mm7      \n\t"\
117     "mov                    (%%"REG_d"), %%"REG_S"  \n\t"\
120     "movq   " x "(%%"REG_S", %%"REG_a", 2), %%mm0   \n\t" /* srcData */\
121     "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2   \n\t" /* srcData */\
122     "mov        4(%%"REG_d"), %%"REG_S"             \n\t"\
123     "movq   " x "(%%"REG_S", %%"REG_a", 2), %%mm1   \n\t" /* srcData */\
124     "movq                         %%mm0, %%mm3      \n\t"\
125     "punpcklwd                    %%mm1, %%mm0      \n\t"\
126     "punpckhwd                    %%mm1, %%mm3      \n\t"\
127     "movq                 8(%%"REG_d"), %%mm1       \n\t" /* filterCoeff */\
128     "pmaddwd                      %%mm1, %%mm0      \n\t"\
129     "pmaddwd                      %%mm1, %%mm3      \n\t"\
130     "paddd                        %%mm0, %%mm4      \n\t"\
131     "paddd                        %%mm3, %%mm5      \n\t"\
132     "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3   \n\t" /* srcData */\
133     "mov       16(%%"REG_d"), %%"REG_S"             \n\t"\
134     "add                           $16, %%"REG_d"   \n\t"\
135     "test                    %%"REG_S", %%"REG_S"   \n\t"\
136     "movq                         %%mm2, %%mm0      \n\t"\
137     "punpcklwd                    %%mm3, %%mm2      \n\t"\
138     "punpckhwd                    %%mm3, %%mm0      \n\t"\
139     "pmaddwd                      %%mm1, %%mm2      \n\t"\
140     "pmaddwd                      %%mm1, %%mm0      \n\t"\
141     "paddd                        %%mm2, %%mm6      \n\t"\
142     "paddd                        %%mm0, %%mm7      \n\t"\
144     "psrad                          $16, %%mm4      \n\t"\
145     "psrad                          $16, %%mm5      \n\t"\
146     "psrad                          $16, %%mm6      \n\t"\
147     "psrad                          $16, %%mm7      \n\t"\
148     "movq         "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
149     "packssdw                     %%mm5, %%mm4      \n\t"\
150     "packssdw                     %%mm7, %%mm6      \n\t"\
151     "paddw                        %%mm0, %%mm4      \n\t"\
152     "paddw                        %%mm0, %%mm6      \n\t"\
153     "psraw                           $3, %%mm4      \n\t"\
154     "psraw                           $3, %%mm6      \n\t"\
155     "packuswb                     %%mm6, %%mm4      \n\t"\
156     MOVNTQ(%%mm4, (%1, %%REGa))\
157     "add                             $8, %%"REG_a"  \n\t"\
158     "cmp                             %2, %%"REG_a"  \n\t"\
159     "lea                 " offset "(%0), %%"REG_d"  \n\t"\
160     "pxor                         %%mm4, %%mm4      \n\t"\
161     "pxor                         %%mm5, %%mm5      \n\t"\
162     "pxor                         %%mm6, %%mm6      \n\t"\
163     "pxor                         %%mm7, %%mm7      \n\t"\
164     "mov                    (%%"REG_d"), %%"REG_S"  \n\t"\
166     :: "r" (&c->redDither),\
167     "r" (dest), "g" (width)\
168     : "%"REG_a, "%"REG_d, "%"REG_S\
/* YSCALEYUV2YV121: trivial 1-tap "copy" output — read 16-bit samples, >>7,
   pack to unsigned bytes, store 8 per iteration.
   YSCALEYUV2YV121_ACCURATE: same, but first adds a rounding constant built
   in mm7 (pcmpeqw/psrlw/psllw => 0x0040 per word) before the >>7.
   NOTE(review): loop labels/branches and asm framing are missing from this
   extraction (embedded line numbers skip) — bodies incomplete as shown. */
171 #define YSCALEYUV2YV121 \
172     "mov %2, %%"REG_a"                    \n\t"\
173     ASMALIGN(4) /* FIXME Unroll? */\
175     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
176     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
177     "psraw                 $7, %%mm0      \n\t"\
178     "psraw                 $7, %%mm1      \n\t"\
179     "packuswb           %%mm1, %%mm0      \n\t"\
180     MOVNTQ(%%mm0, (%1, %%REGa))\
181     "add                   $8, %%"REG_a"  \n\t"\
184 #define YSCALEYUV2YV121_ACCURATE \
185     "mov %2, %%"REG_a"                    \n\t"\
186     "pcmpeqw %%mm7, %%mm7                 \n\t"\
187     "psrlw                 $15, %%mm7     \n\t"\
188     "psllw                  $6, %%mm7     \n\t"\
189     ASMALIGN(4) /* FIXME Unroll? */\
191     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
192     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
193     "paddw              %%mm7, %%mm0      \n\t"\
194     "paddw              %%mm7, %%mm1      \n\t"\
195     "psraw                 $7, %%mm0      \n\t"\
196     "psraw                 $7, %%mm1      \n\t"\
197     "packuswb           %%mm1, %%mm0      \n\t"\
198     MOVNTQ(%%mm0, (%1, %%REGa))\
199     "add                   $8, %%"REG_a"  \n\t"\
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* YSCALEYUV2PACKEDX: prologue for the packed-output vertical scalers.
   First loop: walk the chroma filter list at CHR_MMX_FILTER_OFFSET,
   accumulating U into mm3 and V into mm4 (V read at +VOF from the same
   pointer), seeded with the rounder. Second loop: walk the luma filter list
   at LUM_MMX_FILTER_OFFSET, accumulating Y1 into mm1 and Y2 into mm7.
   YSCALEYUV2PACKEDX_END supplies the closing operand/clobber lists.
   NOTE(review): the asm volatile( opening, loop labels and conditional
   branches are missing from this extraction (embedded line numbers skip) —
   incomplete as shown. */
209 #define YSCALEYUV2PACKEDX \
211     "xor                   %%"REG_a", %%"REG_a"     \n\t"\
215     "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
216     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
217     "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
218     "movq                      %%mm3, %%mm4         \n\t"\
221     "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
222     "movq  (%%"REG_S", %%"REG_a"), %%mm2            \n\t" /* UsrcData */\
223     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224     "add                         $16, %%"REG_d"     \n\t"\
225     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
226     "pmulhw                    %%mm0, %%mm2         \n\t"\
227     "pmulhw                    %%mm0, %%mm5         \n\t"\
228     "paddw                     %%mm2, %%mm3         \n\t"\
229     "paddw                     %%mm5, %%mm4         \n\t"\
230     "test                  %%"REG_S", %%"REG_S"     \n\t"\
233     "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
234     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
235     "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
236     "movq                      %%mm1, %%mm7         \n\t"\
239     "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
240     "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
241     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
242     "add                         $16, %%"REG_d"            \n\t"\
243     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
244     "pmulhw                    %%mm0, %%mm2         \n\t"\
245     "pmulhw                    %%mm0, %%mm5         \n\t"\
246     "paddw                     %%mm2, %%mm1         \n\t"\
247     "paddw                     %%mm5, %%mm7         \n\t"\
248     "test                  %%"REG_S", %%"REG_S"     \n\t"\
251 #define YSCALEYUV2PACKEDX_END \
252     :: "r" (&c->redDither), \
253     "m" (dummy), "m" (dummy), "m" (dummy),\
254     "r" (dest), "m" (dstW) \
255     : "%"REG_a, "%"REG_d, "%"REG_S \
/* YSCALEYUV2PACKEDX_ACCURATE: 32-bit-accumulator version of
   YSCALEYUV2PACKEDX. Chroma pass: two filter taps at a time via
   punpck{l,h}wd + pmaddwd into mm4-mm7, then >>16, pack, add rounder and
   spill U/V to the context scratch slots U_TEMP/V_TEMP. Luma pass: same
   two-tap scheme for Y1/Y2 into mm1/mm5/mm7/mm6, then reload U/V into
   mm3/mm4 so the register layout matches what YSCALEYUV2RGBX expects.
   NOTE(review): asm framing, loop labels and branches are missing from this
   extraction (embedded line numbers skip) — incomplete as shown. */
258 #define YSCALEYUV2PACKEDX_ACCURATE \
260     "xor %%"REG_a", %%"REG_a"                       \n\t"\
264     "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
265     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
266     "pxor                      %%mm4, %%mm4         \n\t"\
267     "pxor                      %%mm5, %%mm5         \n\t"\
268     "pxor                      %%mm6, %%mm6         \n\t"\
269     "pxor                      %%mm7, %%mm7         \n\t"\
272     "movq  (%%"REG_S", %%"REG_a"), %%mm0            \n\t" /* UsrcData */\
273     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
274     "mov 4(%%"REG_d"), %%"REG_S"                    \n\t"\
275     "movq (%%"REG_S", %%"REG_a"), %%mm1             \n\t" /* UsrcData */\
276     "movq                      %%mm0, %%mm3         \n\t"\
277     "punpcklwd                 %%mm1, %%mm0         \n\t"\
278     "punpckhwd                 %%mm1, %%mm3         \n\t"\
279     "movq 8(%%"REG_d"), %%mm1                       \n\t" /* filterCoeff */\
280     "pmaddwd                   %%mm1, %%mm0         \n\t"\
281     "pmaddwd                   %%mm1, %%mm3         \n\t"\
282     "paddd                     %%mm0, %%mm4         \n\t"\
283     "paddd                     %%mm3, %%mm5         \n\t"\
284     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
285     "mov 16(%%"REG_d"), %%"REG_S"                   \n\t"\
286     "add                         $16, %%"REG_d"     \n\t"\
287     "test                  %%"REG_S", %%"REG_S"     \n\t"\
288     "movq                      %%mm2, %%mm0         \n\t"\
289     "punpcklwd                 %%mm3, %%mm2         \n\t"\
290     "punpckhwd                 %%mm3, %%mm0         \n\t"\
291     "pmaddwd                   %%mm1, %%mm2         \n\t"\
292     "pmaddwd                   %%mm1, %%mm0         \n\t"\
293     "paddd                     %%mm2, %%mm6         \n\t"\
294     "paddd                     %%mm0, %%mm7         \n\t"\
296     "psrad                       $16, %%mm4         \n\t"\
297     "psrad                       $16, %%mm5         \n\t"\
298     "psrad                       $16, %%mm6         \n\t"\
299     "psrad                       $16, %%mm7         \n\t"\
300     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
301     "packssdw                  %%mm5, %%mm4         \n\t"\
302     "packssdw                  %%mm7, %%mm6         \n\t"\
303     "paddw                     %%mm0, %%mm4         \n\t"\
304     "paddw                     %%mm0, %%mm6         \n\t"\
305     "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
306     "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
308     "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
309     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
310     "pxor                      %%mm1, %%mm1         \n\t"\
311     "pxor                      %%mm5, %%mm5         \n\t"\
312     "pxor                      %%mm7, %%mm7         \n\t"\
313     "pxor                      %%mm6, %%mm6         \n\t"\
316     "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
317     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
318     "mov 4(%%"REG_d"), %%"REG_S"                    \n\t"\
319     "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
320     "movq                      %%mm0, %%mm3         \n\t"\
321     "punpcklwd                 %%mm4, %%mm0         \n\t"\
322     "punpckhwd                 %%mm4, %%mm3         \n\t"\
323     "movq 8(%%"REG_d"), %%mm4                       \n\t" /* filterCoeff */\
324     "pmaddwd                   %%mm4, %%mm0         \n\t"\
325     "pmaddwd                   %%mm4, %%mm3         \n\t"\
326     "paddd                     %%mm0, %%mm1         \n\t"\
327     "paddd                     %%mm3, %%mm5         \n\t"\
328     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
329     "mov 16(%%"REG_d"), %%"REG_S"                   \n\t"\
330     "add                         $16, %%"REG_d"     \n\t"\
331     "test                  %%"REG_S", %%"REG_S"     \n\t"\
332     "movq                      %%mm2, %%mm0         \n\t"\
333     "punpcklwd                 %%mm3, %%mm2         \n\t"\
334     "punpckhwd                 %%mm3, %%mm0         \n\t"\
335     "pmaddwd                   %%mm4, %%mm2         \n\t"\
336     "pmaddwd                   %%mm4, %%mm0         \n\t"\
337     "paddd                     %%mm2, %%mm7         \n\t"\
338     "paddd                     %%mm0, %%mm6         \n\t"\
340     "psrad                       $16, %%mm1         \n\t"\
341     "psrad                       $16, %%mm5         \n\t"\
342     "psrad                       $16, %%mm7         \n\t"\
343     "psrad                       $16, %%mm6         \n\t"\
344     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
345     "packssdw                  %%mm5, %%mm1         \n\t"\
346     "packssdw                  %%mm6, %%mm7         \n\t"\
347     "paddw                     %%mm0, %%mm1         \n\t"\
348     "paddw                     %%mm0, %%mm7         \n\t"\
349     "movq               "U_TEMP"(%0), %%mm3         \n\t"\
350     "movq               "V_TEMP"(%0), %%mm4         \n\t"\
/* YSCALEYUV2RGBX: YUV -> RGB matrix stage shared by the packed-output
   scalers. Entry register contract (established by YSCALEYUV2PACKEDX*):
   mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V. Subtracts the U/V/Y offsets from
   the context, multiplies by the per-context coefficients (pmulhw), sums
   the two green contributions, then interleaves/adds to form B/G/R for
   both 4-pixel halves and packs them to bytes. Exit: mm2=B, mm4=G, mm5=R
   (packed 8 pixels), mm0/mm3/mm6 scratch halves, mm7 zeroed for the WRITE*
   macros. */
352 #define YSCALEYUV2RGBX \
353     "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
354     "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
355     "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
356     "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
357     "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
358     "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
359 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360     "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
361     "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
362     "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
363     "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
364     "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
365     "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
366 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367     "paddw           %%mm3, %%mm4       \n\t"\
368     "movq            %%mm2, %%mm0       \n\t"\
369     "movq            %%mm5, %%mm6       \n\t"\
370     "movq            %%mm4, %%mm3       \n\t"\
371     "punpcklwd       %%mm2, %%mm2       \n\t"\
372     "punpcklwd       %%mm5, %%mm5       \n\t"\
373     "punpcklwd       %%mm4, %%mm4       \n\t"\
374     "paddw           %%mm1, %%mm2       \n\t"\
375     "paddw           %%mm1, %%mm5       \n\t"\
376     "paddw           %%mm1, %%mm4       \n\t"\
377     "punpckhwd       %%mm0, %%mm0       \n\t"\
378     "punpckhwd       %%mm6, %%mm6       \n\t"\
379     "punpckhwd       %%mm3, %%mm3       \n\t"\
380     "paddw           %%mm7, %%mm0       \n\t"\
381     "paddw           %%mm7, %%mm6       \n\t"\
382     "paddw           %%mm7, %%mm3       \n\t"\
383 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
384     "packuswb        %%mm0, %%mm2       \n\t"\
385     "packuswb        %%mm6, %%mm5       \n\t"\
386     "packuswb        %%mm3, %%mm4       \n\t"\
387     "pxor            %%mm7, %%mm7      \n\t"
/* FULL_YSCALEYUV2RGB: 2-tap vertical interpolation (buf0/buf1 weighted by
   yalpha1, uvbuf0/uvbuf1 by uvalpha1, via the pmulhw-on-difference trick)
   followed by YUV->RGB using the global MANGLE()d coefficient tables
   (w80/w400/yCoeff/ubCoeff/...). Produces packed B in mm3, R in mm0,
   G in mm1 (4 pixels each, duplicated by packuswb).
   NOTE(review): asm framing, loop labels and some lines are missing from
   this extraction (embedded line numbers skip 398-399, 418-419, 427-428,
   435, 439) — incomplete as shown. */
389 #define FULL_YSCALEYUV2RGB \
390     "pxor                 %%mm7, %%mm7  \n\t"\
391     "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
392     "punpcklwd            %%mm6, %%mm6  \n\t"\
393     "punpcklwd            %%mm6, %%mm6  \n\t"\
394     "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
395     "punpcklwd            %%mm5, %%mm5  \n\t"\
396     "punpcklwd            %%mm5, %%mm5  \n\t"\
397     "xor              %%"REG_a", %%"REG_a"  \n\t"\
400     "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
401     "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
402     "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
403     "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
404     "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
405     "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
406     "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407     "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
408     "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
409     "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4   \n\t" /* uvbuf0[eax+2048]*/\
410     "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
411     "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
412     "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0   \n\t" /* uvbuf1[eax+2048]*/\
413     "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
414     "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
415     "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
416     "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
417     "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
420     "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
421     "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
422     "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
423     "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
424     "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
425     "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
426     "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
429     "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
430     "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
431     "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
432     "paddw                %%mm1, %%mm3  \n\t" /* B*/\
433     "paddw                %%mm1, %%mm0  \n\t" /* R*/\
434     "packuswb             %%mm3, %%mm3  \n\t"\
436     "packuswb             %%mm0, %%mm0  \n\t"\
437     "paddw                %%mm4, %%mm2  \n\t"\
438     "paddw                %%mm2, %%mm1  \n\t" /* G*/\
440     "packuswb             %%mm1, %%mm1 \n\t"
/* REAL_YSCALEYUV2PACKED(index, c): 2-tap vertical interpolation producing
   raw (unconverted) YUV for packed-YUV output. Pre-scales the stored luma
   and chroma filter coefficients by >>3 in the context, then per iteration
   blends uvbuf0/uvbuf1 (U in mm3, V in mm4, V read at +VOF) and buf0/buf1
   (Y in mm1/mm7) with the pmulhw-on-difference scheme.
   NOTE(review): loop labels and branch lines are missing from this
   extraction (embedded line numbers skip 451-452, 478) — incomplete as
   shown. */
443 #define REAL_YSCALEYUV2PACKED(index, c) \
444     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
445     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
446     "psraw                $3, %%mm0                           \n\t"\
447     "psraw                $3, %%mm1                           \n\t"\
448     "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
449     "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
450     "xor            "#index", "#index"                        \n\t"\
453     "movq     (%2, "#index"), %%mm2              \n\t" /* uvbuf0[eax]*/\
454     "movq     (%3, "#index"), %%mm3              \n\t" /* uvbuf1[eax]*/\
455     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5          \n\t" /* uvbuf0[eax+2048]*/\
456     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4          \n\t" /* uvbuf1[eax+2048]*/\
457     "psubw             %%mm3, %%mm2              \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
458     "psubw             %%mm4, %%mm5              \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
459     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
460     "pmulhw            %%mm0, %%mm2              \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
461     "pmulhw            %%mm0, %%mm5              \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
462     "psraw                $7, %%mm3              \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
463     "psraw                $7, %%mm4              \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
464     "paddw             %%mm2, %%mm3              \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
465     "paddw             %%mm5, %%mm4              \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
466     "movq  (%0, "#index", 2), %%mm0              \n\t" /*buf0[eax]*/\
467     "movq  (%1, "#index", 2), %%mm1              \n\t" /*buf1[eax]*/\
468     "movq 8(%0, "#index", 2), %%mm6              \n\t" /*buf0[eax]*/\
469     "movq 8(%1, "#index", 2), %%mm7              \n\t" /*buf1[eax]*/\
470     "psubw             %%mm1, %%mm0              \n\t" /* buf0[eax] - buf1[eax]*/\
471     "psubw             %%mm7, %%mm6              \n\t" /* buf0[eax] - buf1[eax]*/\
472     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
473     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
474     "psraw                $7, %%mm1              \n\t" /* buf0[eax] - buf1[eax] >>4*/\
475     "psraw                $7, %%mm7              \n\t" /* buf0[eax] - buf1[eax] >>4*/\
476     "paddw             %%mm0, %%mm1              \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
477     "paddw             %%mm6, %%mm7              \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
479 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
/* REAL_YSCALEYUV2RGB(index, c): 2-tap vertical interpolation of luma and
   chroma (pmulhw-on-difference blend of buf0/buf1, uvbuf0/uvbuf1) followed
   inline by the same YUV->RGB matrix sequence as YSCALEYUV2RGBX, but with
   coefficients addressed relative to the context pointer "#c".
   Exit registers match YSCALEYUV2RGBX: mm2=B, mm4=G, mm5=R packed, mm7=0.
   NOTE(review): loop label lines are missing from this extraction (embedded
   line numbers skip 483-484) — incomplete as shown. */
481 #define REAL_YSCALEYUV2RGB(index, c) \
482     "xor            "#index", "#index"  \n\t"\
485     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
486     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
487     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
488     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
489     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
490     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
491     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
492     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
493     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
494     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
495     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
496     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
497     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
498     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
499     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
500     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
501     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
502     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
503     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
504 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
505     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
506     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
507     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
508     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
509     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
510     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
511     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
512     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
513     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
514     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
515     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
516     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
517     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
518     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
519     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
520     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
521     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
522     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
523 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
524     "paddw             %%mm3, %%mm4     \n\t"\
525     "movq              %%mm2, %%mm0     \n\t"\
526     "movq              %%mm5, %%mm6     \n\t"\
527     "movq              %%mm4, %%mm3     \n\t"\
528     "punpcklwd         %%mm2, %%mm2     \n\t"\
529     "punpcklwd         %%mm5, %%mm5     \n\t"\
530     "punpcklwd         %%mm4, %%mm4     \n\t"\
531     "paddw             %%mm1, %%mm2     \n\t"\
532     "paddw             %%mm1, %%mm5     \n\t"\
533     "paddw             %%mm1, %%mm4     \n\t"\
534     "punpckhwd         %%mm0, %%mm0     \n\t"\
535     "punpckhwd         %%mm6, %%mm6     \n\t"\
536     "punpckhwd         %%mm3, %%mm3     \n\t"\
537     "paddw             %%mm7, %%mm0     \n\t"\
538     "paddw             %%mm7, %%mm6     \n\t"\
539     "paddw             %%mm7, %%mm3     \n\t"\
540 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
541     "packuswb          %%mm0, %%mm2     \n\t"\
542     "packuswb          %%mm6, %%mm5     \n\t"\
543     "packuswb          %%mm3, %%mm4     \n\t"\
544     "pxor              %%mm7, %%mm7     \n\t"
545 #define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
/* REAL_YSCALEYUV2PACKED1(index, c): single-source (no vertical blend)
   packed-YUV load — reads uvbuf0 (U mm3, V mm4 at +VOF) and buf0
   (Y mm1/mm7) and rescales each by >>7 to output range.
   NOTE(review): loop label lines are missing from this extraction (embedded
   line numbers skip 549-550) — incomplete as shown. */
547 #define REAL_YSCALEYUV2PACKED1(index, c) \
548     "xor            "#index", "#index"  \n\t"\
551     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
552     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
553     "psraw                $7, %%mm3     \n\t" \
554     "psraw                $7, %%mm4     \n\t" \
555     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
556     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
557     "psraw                $7, %%mm1     \n\t" \
558     "psraw                $7, %%mm7     \n\t" \
560 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
/* REAL_YSCALEYUV2RGB1(index, c): single-source variant of YSCALEYUV2RGB —
   no vertical interpolation, just >>4 on uvbuf0/buf0 samples, then the
   shared YUV->RGB matrix sequence. Exit: mm2=B, mm4=G, mm5=R packed, mm7=0.
   NOTE(review): loop label lines are missing from this extraction (embedded
   line numbers skip 564-565) — incomplete as shown. */
562 #define REAL_YSCALEYUV2RGB1(index, c) \
563     "xor            "#index", "#index"  \n\t"\
566     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
567     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
568     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
569     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
570     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
571     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
572     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
573     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
574     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
575     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
576 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
577     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
578     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
579     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
580     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
581     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
582     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
583     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
584     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
585     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
586     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
587 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
588     "paddw             %%mm3, %%mm4     \n\t"\
589     "movq              %%mm2, %%mm0     \n\t"\
590     "movq              %%mm5, %%mm6     \n\t"\
591     "movq              %%mm4, %%mm3     \n\t"\
592     "punpcklwd         %%mm2, %%mm2     \n\t"\
593     "punpcklwd         %%mm5, %%mm5     \n\t"\
594     "punpcklwd         %%mm4, %%mm4     \n\t"\
595     "paddw             %%mm1, %%mm2     \n\t"\
596     "paddw             %%mm1, %%mm5     \n\t"\
597     "paddw             %%mm1, %%mm4     \n\t"\
598     "punpckhwd         %%mm0, %%mm0     \n\t"\
599     "punpckhwd         %%mm6, %%mm6     \n\t"\
600     "punpckhwd         %%mm3, %%mm3     \n\t"\
601     "paddw             %%mm7, %%mm0     \n\t"\
602     "paddw             %%mm7, %%mm6     \n\t"\
603     "paddw             %%mm7, %%mm3     \n\t"\
604 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
605     "packuswb          %%mm0, %%mm2     \n\t"\
606     "packuswb          %%mm6, %%mm5     \n\t"\
607     "packuswb          %%mm3, %%mm4     \n\t"\
608     "pxor              %%mm7, %%mm7     \n\t"
609 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
/* REAL_YSCALEYUV2PACKED1b(index, c): packed-YUV load with fixed 50/50
   chroma blend — averages uvbuf0 and uvbuf1 (paddw then >>8) into mm3/mm4,
   loads luma from buf0 only (>>7 into mm1/mm7).
   NOTE(review): loop label lines are missing from this extraction (embedded
   line numbers skip 613-614) — incomplete as shown. */
611 #define REAL_YSCALEYUV2PACKED1b(index, c) \
612     "xor "#index", "#index"             \n\t"\
615     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
616     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
617     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
618     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
619     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
620     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
621     "psrlw                $8, %%mm3     \n\t" \
622     "psrlw                $8, %%mm4     \n\t" \
623     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
624     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
625     "psraw                $7, %%mm1     \n\t" \
626     "psraw                $7, %%mm7     \n\t"
627 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
/* REAL_YSCALEYUV2RGB1b(index, c): like YSCALEYUV2RGB1 but with fixed 50/50
   vertical chroma interpolation (paddw of uvbuf0+uvbuf1 then >>5 — the
   inline FIXME notes the sum may overflow 16-bit), followed by the shared
   YUV->RGB matrix sequence. Exit: mm2=B, mm4=G, mm5=R packed, mm7=0.
   NOTE(review): loop label lines are missing from this extraction (embedded
   line numbers skip 632-633) — incomplete as shown. */
629 // do vertical chrominance interpolation
630 #define REAL_YSCALEYUV2RGB1b(index, c) \
631     "xor            "#index", "#index"  \n\t"\
634     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
635     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
636     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
637     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
638     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
639     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
640     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
641     "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
642     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
643     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
644     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
645     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
646     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
647     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
648 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
649     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
650     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
651     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
652     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
653     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
654     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
655     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
656     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
657     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
658     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
659 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
660     "paddw             %%mm3, %%mm4     \n\t"\
661     "movq              %%mm2, %%mm0     \n\t"\
662     "movq              %%mm5, %%mm6     \n\t"\
663     "movq              %%mm4, %%mm3     \n\t"\
664     "punpcklwd         %%mm2, %%mm2     \n\t"\
665     "punpcklwd         %%mm5, %%mm5     \n\t"\
666     "punpcklwd         %%mm4, %%mm4     \n\t"\
667     "paddw             %%mm1, %%mm2     \n\t"\
668     "paddw             %%mm1, %%mm5     \n\t"\
669     "paddw             %%mm1, %%mm4     \n\t"\
670     "punpckhwd         %%mm0, %%mm0     \n\t"\
671     "punpckhwd         %%mm6, %%mm6     \n\t"\
672     "punpckhwd         %%mm3, %%mm3     \n\t"\
673     "paddw             %%mm7, %%mm0     \n\t"\
674     "paddw             %%mm7, %%mm6     \n\t"\
675     "paddw             %%mm7, %%mm3     \n\t"\
676 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
677     "packuswb          %%mm0, %%mm2     \n\t"\
678     "packuswb          %%mm6, %%mm5     \n\t"\
679     "packuswb          %%mm3, %%mm4     \n\t"\
680     "pxor              %%mm7, %%mm7     \n\t"
681 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
/* REAL_WRITEBGR32(dst, dstw, index): interleave the packed B (mm2),
   G (mm4), R (mm5) bytes (mm7 must be 0) into eight 0RGB dwords and store
   32 bytes via four MOVNTQs.
   NOTE(review): the loop-back branch after the cmp is missing from this
   extraction (embedded line numbers skip 705) — incomplete as shown. */
683 #define REAL_WRITEBGR32(dst, dstw, index) \
684 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
685     "movq      %%mm2, %%mm1     \n\t" /* B */\
686     "movq      %%mm5, %%mm6     \n\t" /* R */\
687     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
688     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
689     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
690     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
691     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
692     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
693     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
694     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
695     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
696     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
698     MOVNTQ(%%mm0,   (dst, index, 4))\
699     MOVNTQ(%%mm2,  8(dst, index, 4))\
700     MOVNTQ(%%mm1, 16(dst, index, 4))\
701     MOVNTQ(%%mm3, 24(dst, index, 4))\
703     "add      $8, "#index"      \n\t"\
704     "cmp "#dstw", "#index"      \n\t"\
706 #define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
/* REAL_WRITERGB16(dst, dstw, index): pack B (mm2), G (mm4), R (mm5) into
   RGB565 — mask to 5/6/5 significant bits, shift into position and OR the
   fields together, storing 8 pixels (16 bytes) per iteration.
   NOTE(review): some shift lines and the loop-back branch are missing from
   this extraction (embedded line numbers skip 713, 721, 724, 733) —
   incomplete as shown. */
708 #define REAL_WRITERGB16(dst, dstw, index) \
709     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
710     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
711     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
712     "psrlq           $3, %%mm2  \n\t"\
714     "movq         %%mm2, %%mm1  \n\t"\
715     "movq         %%mm4, %%mm3  \n\t"\
717     "punpcklbw    %%mm7, %%mm3  \n\t"\
718     "punpcklbw    %%mm5, %%mm2  \n\t"\
719     "punpckhbw    %%mm7, %%mm4  \n\t"\
720     "punpckhbw    %%mm5, %%mm1  \n\t"\
722     "psllq           $3, %%mm3  \n\t"\
723     "psllq           $3, %%mm4  \n\t"\
725     "por          %%mm3, %%mm2  \n\t"\
726     "por          %%mm4, %%mm1  \n\t"\
728     MOVNTQ(%%mm2,  (dst, index, 2))\
729     MOVNTQ(%%mm1, 8(dst, index, 2))\
731     "add             $8, "#index"   \n\t"\
732     "cmp        "#dstw", "#index"   \n\t"\
734 #define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
/* REAL_WRITERGB15(dst, dstw, index): same scheme as WRITERGB16 but for
   RGB555 — all three channels masked to 5 bits (bF8), R pre-shifted right
   by 1, green shifted by 2 instead of 3.
   NOTE(review): some shift lines and the loop-back branch are missing from
   this extraction (embedded line numbers skip 742, 750, 753, 762) —
   incomplete as shown. */
736 #define REAL_WRITERGB15(dst, dstw, index) \
737     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
738     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
739     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
740     "psrlq           $3, %%mm2  \n\t"\
741     "psrlq           $1, %%mm5  \n\t"\
743     "movq         %%mm2, %%mm1  \n\t"\
744     "movq         %%mm4, %%mm3  \n\t"\
746     "punpcklbw    %%mm7, %%mm3  \n\t"\
747     "punpcklbw    %%mm5, %%mm2  \n\t"\
748     "punpckhbw    %%mm7, %%mm4  \n\t"\
749     "punpckhbw    %%mm5, %%mm1  \n\t"\
751     "psllq           $2, %%mm3  \n\t"\
752     "psllq           $2, %%mm4  \n\t"\
754     "por          %%mm3, %%mm2  \n\t"\
755     "por          %%mm4, %%mm1  \n\t"\
757     MOVNTQ(%%mm2,  (dst, index, 2))\
758     MOVNTQ(%%mm1, 8(dst, index, 2))\
760     "add             $8, "#index"   \n\t"\
761     "cmp        "#dstw", "#index"   \n\t"\
763 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
/* WRITEBGR24OLD(dst, dstw, index): legacy 24-bit packer — expands B/G/R to
   four 0RGB dwords like WRITEBGR32, then squeezes out the pad bytes with a
   long mask/shift/or sequence (bm* masks) into three 8-byte stores.
   Superseded by WRITEBGR24MMX/WRITEBGR24MMX2 below.
   NOTE(review): the loop-back branch after the cmp is missing from this
   extraction (embedded line numbers skip 819) — incomplete as shown. */
765 #define WRITEBGR24OLD(dst, dstw, index) \
766 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
767     "movq      %%mm2, %%mm1             \n\t" /* B */\
768     "movq      %%mm5, %%mm6             \n\t" /* R */\
769     "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
770     "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
771     "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
772     "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
773     "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
774     "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
775     "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
776     "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
777     "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
778     "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
780     "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
781     "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
782     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
783     "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
784     "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
785     "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
786     "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
787     "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
789     "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
790     "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
791     "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
792     "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
793     "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
794     "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
795     "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
796     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
797     "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
798     "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
799     "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
800     "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
801     "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
803     "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
804     "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
805     "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
806     "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
807     "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
808     "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
809     "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
810     "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
812     MOVNTQ(%%mm0,   (dst))\
813     MOVNTQ(%%mm2,  8(dst))\
814     MOVNTQ(%%mm3, 16(dst))\
815     "add         $24, "#dst"            \n\t"\
817     "add          $8, "#index"          \n\t"\
818     "cmp     "#dstw", "#index"          \n\t"\
/* WRITEBGR24MMX(dst, dstw, index): plain-MMX 24-bit packer — builds four
   0RGB dwords, converts each to 0RGBRGB0 with psllq/punpckhdq, then shifts
   and ORs neighbours to emit three contiguous 8-byte groups (24 bytes =
   8 pixels) per iteration.
   NOTE(review): the loop-back branch after the cmp is missing from this
   extraction (embedded line numbers skip 872) — incomplete as shown. */
821 #define WRITEBGR24MMX(dst, dstw, index) \
822 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
823     "movq      %%mm2, %%mm1     \n\t" /* B */\
824     "movq      %%mm5, %%mm6     \n\t" /* R */\
825     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
826     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
827     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
828     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
829     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
830     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
831     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
832     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
833     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
834     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
836     "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
837     "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
838     "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
839     "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
841     "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
842     "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
843     "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
844     "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
846     "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
847     "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
848     "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
849     "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
851     "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
852     "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
853     "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
854     "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
855     MOVNTQ(%%mm0, (dst))\
857     "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
858     "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
859     "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
860     "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
861     MOVNTQ(%%mm6, 8(dst))\
863     "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
864     "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
865     "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
866     MOVNTQ(%%mm5, 16(dst))\
868     "add         $24, "#dst"    \n\t"\
870     "add          $8, "#index"  \n\t"\
871     "cmp     "#dstw", "#index"  \n\t"\
/* WRITEBGR24MMX2(dst, dstw, index): MMX2 24-bit packer — uses pshufw to
   replicate channel bytes and the ff_M24A/B/C masks to lay B/G/R straight
   into BGR24 byte positions, three 8-byte stores per 8 pixels. Fewer ops
   than the plain-MMX version. The trailing WRITEBGR24 defines select this
   or the MMX variant.
   NOTE(review): the loop-back branch and the #ifdef HAVE_MMX2/#else/#endif
   lines around the WRITEBGR24 dispatch are missing from this extraction
   (embedded line numbers skip 920-923, 925-926, 928) — incomplete as
   shown; as printed the two #define WRITEBGR24 lines would conflict. */
874 #define WRITEBGR24MMX2(dst, dstw, index) \
875 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
876     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
877     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
878     "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
879     "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
880     "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
882     "pand %%mm0, %%mm1          \n\t" /*    B2        B1       B0 */\
883     "pand %%mm0, %%mm3          \n\t" /*    G2        G1       G0 */\
884     "pand %%mm7, %%mm6          \n\t" /*       R1        R0       */\
886     "psllq $8, %%mm3            \n\t" /* G2        G1       G0    */\
887     "por %%mm1, %%mm6           \n\t"\
888     "por %%mm3, %%mm6           \n\t"\
889     MOVNTQ(%%mm6, (dst))\
891     "psrlq $8, %%mm4            \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
892     "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
893     "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
894     "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
896     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3 */\
897     "pand %%mm7, %%mm3          \n\t" /*       G4        G3       */\
898     "pand %%mm0, %%mm6          \n\t" /*    R4        R3       R2 */\
900     "por %%mm1, %%mm3           \n\t" /* B5    G4 B4     G3 B3    */\
901     "por %%mm3, %%mm6           \n\t"\
902     MOVNTQ(%%mm6, 8(dst))\
904     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
905     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
906     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
908     "pand %%mm7, %%mm1          \n\t" /*       B7        B6       */\
909     "pand %%mm0, %%mm3          \n\t" /*    G7        G6       G5 */\
910     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5 */\
912     "por %%mm1, %%mm3           \n\t"\
913     "por %%mm3, %%mm6           \n\t"\
914     MOVNTQ(%%mm6, 16(dst))\
916     "add $24, "#dst"            \n\t"\
918     "add $8, "#index"           \n\t"\
919     "cmp "#dstw", "#index"      \n\t"\
924 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
927 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
/* Interleave the Y samples in %%mm3/%%mm4 with the U/V samples in
 * %%mm1/%%mm7 into packed YUYV and store 16 bytes per iteration.
 * WRITEYUY2 is the usual one-level macro indirection so that its
 * arguments are expanded before stringification. */
930 #define REAL_WRITEYUY2(dst, dstw, index) \
931 "packuswb %%mm3, %%mm3 \n\t"\
932 "packuswb %%mm4, %%mm4 \n\t"\
933 "packuswb %%mm7, %%mm1 \n\t"\
934 "punpcklbw %%mm4, %%mm3 \n\t"\
935 "movq %%mm1, %%mm7 \n\t"\
936 "punpcklbw %%mm3, %%mm1 \n\t"\
937 "punpckhbw %%mm3, %%mm7 \n\t"\
939 MOVNTQ(%%mm1, (dst, index, 2))\
940 MOVNTQ(%%mm7, 8(dst, index, 2))\
942 "add $8, "#index" \n\t"\
943 "cmp "#dstw", "#index" \n\t"\
945 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* Vertical multi-tap scaling of the 16-bit intermediate lines into planar
 * YV12 output (separate Y, U and V planes).  Uses the MMX asm paths
 * (accurate-rounding variant when SWS_ACCURATE_RND is set), falling back
 * to AltiVec or the plain C implementation.
 * NOTE(review): extracted listing -- the #ifdef HAVE_MMX / HAVE_ALTIVEC
 * framing and the null checks on uDest/vDest appear elided. */
948 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
949 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
950 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
953 if (c->flags & SWS_ACCURATE_RND){
955 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
956 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
959 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
962 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
963 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
966 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
970 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
971 chrFilter, chrSrc, chrFilterSize,
972 dest, uDest, vDest, dstW, chrDstW);
974 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
975 chrFilter, chrSrc, chrFilterSize,
976 dest, uDest, vDest, dstW, chrDstW);
977 #endif //!HAVE_ALTIVEC
978 #endif /* HAVE_MMX */
/* Vertical scaling to semi-planar NV12/NV21 (interleaved chroma).  No SIMD
 * path exists for this layout -- always delegates to the C implementation. */
981 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
982 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
983 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
985 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
986 chrFilter, chrSrc, chrFilterSize,
987 dest, uDest, dstW, chrDstW, dstFormat);
/* Unscaled (single-tap) vertical pass: round the 16-bit intermediate lines
 * down to 8-bit Y/U/V planes ((v+64)>>7 with clipping in the C fallback).
 * p selects how many planes are written: 3 when a chroma destination is
 * present, 1 (luma only) otherwise.
 * NOTE(review): extracted listing -- the asm bodies, loop framing and the
 * clipping branches ("if (val<0) ..." etc.) appear elided. */
990 static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
991 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
994 long p= uDest ? 3 : 1;
995 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
996 uint8_t *dst[3]= {dest, uDest, vDest};
997 long counter[3] = {dstW, chrDstW, chrDstW};
999 if (c->flags & SWS_ACCURATE_RND){
1002 YSCALEYUV2YV121_ACCURATE
1003 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1012 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1021 for (i=0; i<dstW; i++)
1023 int val= (lumSrc[i]+64)>>7;
1034 for (i=0; i<chrDstW; i++)
1036 int u=(chrSrc[i ]+64)>>7;
1037 int v=(chrSrc[i + VOFW]+64)>>7;
1041 else if (u>255) u=255;
1043 else if (v>255) v=255;
1054 * vertical scale YV12 to RGB
/* Vertical multi-tap scale of YV12 intermediates to a packed destination
 * (RGB32/BGR24/RGB555/RGB565/YUYV), dispatching per c->dstFormat.  The
 * SWS_ACCURATE_RND branch uses the higher-precision asm variants; AltiVec
 * and plain C fallbacks follow.  BGR24 needs an extra lea/add because the
 * destination advances by 3 bytes per pixel.
 * NOTE(review): extracted listing -- case labels, asm-statement framing and
 * several #if/#else lines appear elided. */
1056 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1057 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1058 uint8_t *dest, long dstW, long dstY)
1062 if (c->flags & SWS_ACCURATE_RND){
1063 switch(c->dstFormat){
1065 YSCALEYUV2PACKEDX_ACCURATE
1067 WRITEBGR32(%4, %5, %%REGa)
1069 YSCALEYUV2PACKEDX_END
1072 YSCALEYUV2PACKEDX_ACCURATE
1074 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1075 "add %4, %%"REG_c" \n\t"
1076 WRITEBGR24(%%REGc, %5, %%REGa)
1079 :: "r" (&c->redDither),
1080 "m" (dummy), "m" (dummy), "m" (dummy),
1081 "r" (dest), "m" (dstW)
1082 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1085 case PIX_FMT_RGB555:
1086 YSCALEYUV2PACKEDX_ACCURATE
1088 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1090 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1091 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1092 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1095 WRITERGB15(%4, %5, %%REGa)
1096 YSCALEYUV2PACKEDX_END
1098 case PIX_FMT_RGB565:
1099 YSCALEYUV2PACKEDX_ACCURATE
1101 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1103 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1104 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1105 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1108 WRITERGB16(%4, %5, %%REGa)
1109 YSCALEYUV2PACKEDX_END
1111 case PIX_FMT_YUYV422:
1112 YSCALEYUV2PACKEDX_ACCURATE
1113 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1115 "psraw $3, %%mm3 \n\t"
1116 "psraw $3, %%mm4 \n\t"
1117 "psraw $3, %%mm1 \n\t"
1118 "psraw $3, %%mm7 \n\t"
1119 WRITEYUY2(%4, %5, %%REGa)
1120 YSCALEYUV2PACKEDX_END
1124 switch(c->dstFormat)
1129 WRITEBGR32(%4, %5, %%REGa)
1130 YSCALEYUV2PACKEDX_END
1135 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1136 "add %4, %%"REG_c" \n\t"
1137 WRITEBGR24(%%REGc, %5, %%REGa)
1139 :: "r" (&c->redDither),
1140 "m" (dummy), "m" (dummy), "m" (dummy),
1141 "r" (dest), "m" (dstW)
1142 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1145 case PIX_FMT_RGB555:
1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1150 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1151 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1152 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1155 WRITERGB15(%4, %5, %%REGa)
1156 YSCALEYUV2PACKEDX_END
1158 case PIX_FMT_RGB565:
1161 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1163 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1164 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1165 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1168 WRITERGB16(%4, %5, %%REGa)
1169 YSCALEYUV2PACKEDX_END
1171 case PIX_FMT_YUYV422:
1173 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1175 "psraw $3, %%mm3 \n\t"
1176 "psraw $3, %%mm4 \n\t"
1177 "psraw $3, %%mm1 \n\t"
1178 "psraw $3, %%mm7 \n\t"
1179 WRITEYUY2(%4, %5, %%REGa)
1180 YSCALEYUV2PACKEDX_END
1184 #endif /* HAVE_MMX */
1186 /* The following list of supported dstFormat values should
1187 match what's found in the body of altivec_yuv2packedX() */
1188 if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1189 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1190 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1191 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1192 chrFilter, chrSrc, chrFilterSize,
1196 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1197 chrFilter, chrSrc, chrFilterSize,
1202 * vertical bilinear scale YV12 to RGB
/* Vertical bilinear (two-tap) scale of YV12 intermediates to packed output:
 * blends buf0/buf1 (luma) and uvbuf0/uvbuf1 (chroma) with weights
 * yalpha/uvalpha (yalpha1 = 4095 - yalpha via XOR), then converts to the
 * destination pixel format with MMX asm; C fallbacks per format follow.
 * The REG_b save/restore around the packed1/packed2 asm preserves the PIC
 * register on x86.
 * NOTE(review): extracted listing -- asm-statement openers, case labels and
 * #if framing appear elided throughout. */
1204 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1205 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1207 int yalpha1=yalpha^4095;
1208 int uvalpha1=uvalpha^4095;
1212 if (flags&SWS_FULL_CHR_H_INT)
1222 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1223 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1225 "movq %%mm3, %%mm1 \n\t"
1226 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1227 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1229 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1230 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1232 "add $4, %%"REG_a" \n\t"
1233 "cmp %5, %%"REG_a" \n\t"
1236 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1237 "m" (yalpha1), "m" (uvalpha1)
1247 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1248 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1250 "movq %%mm3, %%mm1 \n\t"
1251 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1252 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1254 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1255 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1256 "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1257 "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1258 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1259 "movq %%mm1, %%mm2 \n\t"
1260 "psllq $48, %%mm1 \n\t" // 000000BG
1261 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1263 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1264 "psrld $16, %%mm2 \n\t" // R000R000
1265 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1266 "por %%mm2, %%mm1 \n\t" // RBGRR000
1268 "mov %4, %%"REG_b" \n\t"
1269 "add %%"REG_a", %%"REG_b" \n\t"
1273 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1274 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1276 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1277 "psrlq $32, %%mm3 \n\t"
1278 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1279 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1281 "add $4, %%"REG_a" \n\t"
1282 "cmp %5, %%"REG_a" \n\t"
1285 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1286 "m" (yalpha1), "m" (uvalpha1)
1287 : "%"REG_a, "%"REG_b
1290 case PIX_FMT_BGR555:
1295 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1296 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1297 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1299 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1300 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1301 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1303 "psrlw $3, %%mm3 \n\t"
1304 "psllw $2, %%mm1 \n\t"
1305 "psllw $7, %%mm0 \n\t"
1306 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1307 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1309 "por %%mm3, %%mm1 \n\t"
1310 "por %%mm1, %%mm0 \n\t"
1312 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1314 "add $4, %%"REG_a" \n\t"
1315 "cmp %5, %%"REG_a" \n\t"
1318 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1319 "m" (yalpha1), "m" (uvalpha1)
1323 case PIX_FMT_BGR565:
1328 "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
1329 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1330 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1332 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1333 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1334 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1336 "psrlw $3, %%mm3 \n\t"
1337 "psllw $3, %%mm1 \n\t"
1338 "psllw $8, %%mm0 \n\t"
1339 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1340 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1342 "por %%mm3, %%mm1 \n\t"
1343 "por %%mm1, %%mm0 \n\t"
1345 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1347 "add $4, %%"REG_a" \n\t"
1348 "cmp %5, %%"REG_a" \n\t"
1351 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1352 "m" (yalpha1), "m" (uvalpha1)
1356 #endif /* HAVE_MMX */
1361 if (dstFormat==PIX_FMT_RGB32)
1364 #ifdef WORDS_BIGENDIAN
1367 for (i=0;i<dstW;i++){
1368 // vertical linear interpolation && yuv2rgb in a single step:
1369 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1370 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1371 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1372 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1373 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1374 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1378 else if (dstFormat==PIX_FMT_BGR24)
1381 for (i=0;i<dstW;i++){
1382 // vertical linear interpolation && yuv2rgb in a single step:
1383 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1384 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1385 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1386 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1387 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1388 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1392 else if (dstFormat==PIX_FMT_BGR565)
1395 for (i=0;i<dstW;i++){
1396 // vertical linear interpolation && yuv2rgb in a single step:
1397 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1398 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1399 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1401 ((uint16_t*)dest)[i] =
1402 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1403 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1404 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1407 else if (dstFormat==PIX_FMT_BGR555)
1410 for (i=0;i<dstW;i++){
1411 // vertical linear interpolation && yuv2rgb in a single step:
1412 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1413 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1414 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1416 ((uint16_t*)dest)[i] =
1417 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1418 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1419 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1427 switch(c->dstFormat)
1429 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1432 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1433 "mov %4, %%"REG_b" \n\t"
1434 "push %%"REG_BP" \n\t"
1435 YSCALEYUV2RGB(%%REGBP, %5)
1436 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1437 "pop %%"REG_BP" \n\t"
1438 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1440 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1446 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1447 "mov %4, %%"REG_b" \n\t"
1448 "push %%"REG_BP" \n\t"
1449 YSCALEYUV2RGB(%%REGBP, %5)
1450 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1451 "pop %%"REG_BP" \n\t"
1452 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1453 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1457 case PIX_FMT_RGB555:
1459 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1460 "mov %4, %%"REG_b" \n\t"
1461 "push %%"REG_BP" \n\t"
1462 YSCALEYUV2RGB(%%REGBP, %5)
1463 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1465 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1466 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1467 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1470 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1471 "pop %%"REG_BP" \n\t"
1472 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1474 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1478 case PIX_FMT_RGB565:
1480 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1481 "mov %4, %%"REG_b" \n\t"
1482 "push %%"REG_BP" \n\t"
1483 YSCALEYUV2RGB(%%REGBP, %5)
1484 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1486 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1487 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1488 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1491 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1492 "pop %%"REG_BP" \n\t"
1493 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1494 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1498 case PIX_FMT_YUYV422:
1500 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501 "mov %4, %%"REG_b" \n\t"
1502 "push %%"REG_BP" \n\t"
1503 YSCALEYUV2PACKED(%%REGBP, %5)
1504 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1505 "pop %%"REG_BP" \n\t"
1506 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1507 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1518 * YV12 to RGB without scaling or interpolating
/* YV12 to packed output without vertical scaling or interpolation
 * (single source line).  uvalpha < 2048 selects the faster 1-tap chroma
 * path (YSCALEYUV2RGB1); otherwise the 2-tap chroma blend (YSCALEYUV2RGB1b)
 * is used.  Falls back to yuv2packed2() for SWS_FULL_CHR_H_INT, and to the
 * C macros when MMX is unavailable.
 * NOTE(review): extracted listing -- case labels, asm framing and #if
 * lines appear elided. */
1520 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1521 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1523 const int yalpha1=0;
1526 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1527 const int yalpha= 4096; //FIXME ...
1529 if (flags&SWS_FULL_CHR_H_INT)
1531 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1536 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1542 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1543 "mov %4, %%"REG_b" \n\t"
1544 "push %%"REG_BP" \n\t"
1545 YSCALEYUV2RGB1(%%REGBP, %5)
1546 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1547 "pop %%"REG_BP" \n\t"
1548 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1550 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1556 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1557 "mov %4, %%"REG_b" \n\t"
1558 "push %%"REG_BP" \n\t"
1559 YSCALEYUV2RGB1(%%REGBP, %5)
1560 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1561 "pop %%"REG_BP" \n\t"
1562 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1564 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1568 case PIX_FMT_RGB555:
1570 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1571 "mov %4, %%"REG_b" \n\t"
1572 "push %%"REG_BP" \n\t"
1573 YSCALEYUV2RGB1(%%REGBP, %5)
1574 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1576 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1577 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1578 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1580 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1581 "pop %%"REG_BP" \n\t"
1582 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1584 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1588 case PIX_FMT_RGB565:
1590 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1591 "mov %4, %%"REG_b" \n\t"
1592 "push %%"REG_BP" \n\t"
1593 YSCALEYUV2RGB1(%%REGBP, %5)
1594 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1596 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1597 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1598 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1601 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1602 "pop %%"REG_BP" \n\t"
1603 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1605 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1609 case PIX_FMT_YUYV422:
1611 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1612 "mov %4, %%"REG_b" \n\t"
1613 "push %%"REG_BP" \n\t"
1614 YSCALEYUV2PACKED1(%%REGBP, %5)
1615 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1616 "pop %%"REG_BP" \n\t"
1617 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1619 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1631 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1632 "mov %4, %%"REG_b" \n\t"
1633 "push %%"REG_BP" \n\t"
1634 YSCALEYUV2RGB1b(%%REGBP, %5)
1635 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1636 "pop %%"REG_BP" \n\t"
1637 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1639 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1645 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1646 "mov %4, %%"REG_b" \n\t"
1647 "push %%"REG_BP" \n\t"
1648 YSCALEYUV2RGB1b(%%REGBP, %5)
1649 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1650 "pop %%"REG_BP" \n\t"
1651 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1653 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1657 case PIX_FMT_RGB555:
1659 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1660 "mov %4, %%"REG_b" \n\t"
1661 "push %%"REG_BP" \n\t"
1662 YSCALEYUV2RGB1b(%%REGBP, %5)
1663 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1665 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1666 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1667 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1669 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1670 "pop %%"REG_BP" \n\t"
1671 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1673 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1677 case PIX_FMT_RGB565:
1679 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1680 "mov %4, %%"REG_b" \n\t"
1681 "push %%"REG_BP" \n\t"
1682 YSCALEYUV2RGB1b(%%REGBP, %5)
1683 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1685 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1686 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1687 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1690 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1691 "pop %%"REG_BP" \n\t"
1692 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1694 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1698 case PIX_FMT_YUYV422:
1700 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1701 "mov %4, %%"REG_b" \n\t"
1702 "push %%"REG_BP" \n\t"
1703 YSCALEYUV2PACKED1b(%%REGBP, %5)
1704 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1705 "pop %%"REG_BP" \n\t"
1706 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1708 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1714 #endif /* HAVE_MMX */
1717 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1719 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1723 //FIXME yuy2* can read up to 7 samples too much
/* Extract the luma plane from packed YUYV: keeps the even bytes
 * (bm01010101 mask) of each 16-byte group and packs them to 8 Y bytes per
 * iteration; C fallback for non-MMX builds.
 * NOTE(review): extracted listing -- asm framing and the C loop body
 * appear elided. */
1725 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1729 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1730 "mov %0, %%"REG_a" \n\t"
1732 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1733 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1734 "pand %%mm2, %%mm0 \n\t"
1735 "pand %%mm2, %%mm1 \n\t"
1736 "packuswb %%mm1, %%mm0 \n\t"
1737 "movq %%mm0, (%2, %%"REG_a") \n\t"
1738 "add $8, %%"REG_a" \n\t"
1740 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1745 for (i=0; i<width; i++)
/* Deinterleave chroma from packed YUYV: shifts out the Y bytes, then
 * splits the remaining UVUV stream into separate U and V planes (4 samples
 * of each per iteration).  The C fallback reads bytes 1 (U) and 3 (V) of
 * each 4-byte YUYV pair; src1 and src2 are asserted identical. */
1750 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1754 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1755 "mov %0, %%"REG_a" \n\t"
1757 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1758 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1759 "psrlw $8, %%mm0 \n\t"
1760 "psrlw $8, %%mm1 \n\t"
1761 "packuswb %%mm1, %%mm0 \n\t"
1762 "movq %%mm0, %%mm1 \n\t"
1763 "psrlw $8, %%mm0 \n\t"
1764 "pand %%mm4, %%mm1 \n\t"
1765 "packuswb %%mm0, %%mm0 \n\t"
1766 "packuswb %%mm1, %%mm1 \n\t"
1767 "movd %%mm0, (%3, %%"REG_a") \n\t"
1768 "movd %%mm1, (%2, %%"REG_a") \n\t"
1769 "add $4, %%"REG_a" \n\t"
1771 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1776 for (i=0; i<width; i++)
1778 dstU[i]= src1[4*i + 1];
1779 dstV[i]= src1[4*i + 3];
1782 assert(src1 == src2);
1785 /* This is almost identical to the previous, and exists only because
1786 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extract the luma plane from packed UYVY: luma occupies the odd bytes,
 * so a right word-shift (instead of the YUYV path's mask) isolates it
 * before packing; C fallback for non-MMX builds. */
1787 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1791 "mov %0, %%"REG_a" \n\t"
1793 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1794 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1795 "psrlw $8, %%mm0 \n\t"
1796 "psrlw $8, %%mm1 \n\t"
1797 "packuswb %%mm1, %%mm0 \n\t"
1798 "movq %%mm0, (%2, %%"REG_a") \n\t"
1799 "add $8, %%"REG_a" \n\t"
1801 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1806 for (i=0; i<width; i++)
/* Deinterleave chroma from packed UYVY: masks out the Y bytes (chroma is
 * in the even bytes here), then splits the UVUV stream into separate U and
 * V planes.  The C fallback reads bytes 0 (U) and 2 (V) of each 4-byte
 * UYVY pair; src1 and src2 are asserted identical. */
1811 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1815 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1816 "mov %0, %%"REG_a" \n\t"
1818 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1819 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1820 "pand %%mm4, %%mm0 \n\t"
1821 "pand %%mm4, %%mm1 \n\t"
1822 "packuswb %%mm1, %%mm0 \n\t"
1823 "movq %%mm0, %%mm1 \n\t"
1824 "psrlw $8, %%mm0 \n\t"
1825 "pand %%mm4, %%mm1 \n\t"
1826 "packuswb %%mm0, %%mm0 \n\t"
1827 "packuswb %%mm1, %%mm1 \n\t"
1828 "movd %%mm0, (%3, %%"REG_a") \n\t"
1829 "movd %%mm1, (%2, %%"REG_a") \n\t"
1830 "add $4, %%"REG_a" \n\t"
1832 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1837 for (i=0; i<width; i++)
1839 dstU[i]= src1[4*i + 0];
1840 dstV[i]= src1[4*i + 2];
1843 assert(src1 == src2);
/* Per-pixel BGR32 -> Y: unpacks B/G/R from each 32-bit word and applies
 * the RY/GY/BY weighted sum with rounding (33<<(SHIFT-1) folds in the
 * +16 luma offset and the rounding constant). */
1846 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1849 for (i=0; i<width; i++)
1851 int b= ((uint32_t*)src)[i]&0xFF;
1852 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1853 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1855 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/* BGR32 -> U/V with 2:1 horizontal subsampling: sums two adjacent pixels'
 * channels via parallel masked adds (l holds B+R pairs, h holds G), then
 * applies the chroma matrices with an extra >>1 for the pixel pair.
 * NOTE(review): extracted listing -- the g/r extraction lines between the
 * masked sums and the dstU/dstV stores appear elided. */
1859 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1862 assert(src1 == src2);
1863 for (i=0; i<width; i++)
1865 const int a= ((uint32_t*)src1)[2*i+0];
1866 const int e= ((uint32_t*)src1)[2*i+1];
1867 const int l= (a&0xFF00FF) + (e&0xFF00FF);
1868 const int h= (a&0x00FF00) + (e&0x00FF00);
1869 const int b= l&0x3FF;
1873 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1874 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
/* BGR24 -> Y, MMX path: loads four 3-byte pixels at a time, multiplies by
 * the packed ff_bgr2YCoeff weights (pmaddwd), horizontally sums with
 * ff_w1111, and adds ff_bgr2YOffset; 8 Y bytes are produced per iteration.
 * FAST_BGR2YV12 skips the intermediate psrad precision step.  A plain C
 * loop is the non-MMX fallback.
 * NOTE(review): extracted listing -- asm framing, #else/#endif lines and
 * the C loop's b/g/r loads appear elided. */
1878 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1882 "mov %2, %%"REG_a" \n\t"
1883 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1884 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1885 "pxor %%mm7, %%mm7 \n\t"
1886 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1889 PREFETCH" 64(%0, %%"REG_d") \n\t"
1890 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1891 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1892 "punpcklbw %%mm7, %%mm0 \n\t"
1893 "punpcklbw %%mm7, %%mm1 \n\t"
1894 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1895 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1896 "punpcklbw %%mm7, %%mm2 \n\t"
1897 "punpcklbw %%mm7, %%mm3 \n\t"
1898 "pmaddwd %%mm6, %%mm0 \n\t"
1899 "pmaddwd %%mm6, %%mm1 \n\t"
1900 "pmaddwd %%mm6, %%mm2 \n\t"
1901 "pmaddwd %%mm6, %%mm3 \n\t"
1902 #ifndef FAST_BGR2YV12
1903 "psrad $8, %%mm0 \n\t"
1904 "psrad $8, %%mm1 \n\t"
1905 "psrad $8, %%mm2 \n\t"
1906 "psrad $8, %%mm3 \n\t"
1908 "packssdw %%mm1, %%mm0 \n\t"
1909 "packssdw %%mm3, %%mm2 \n\t"
1910 "pmaddwd %%mm5, %%mm0 \n\t"
1911 "pmaddwd %%mm5, %%mm2 \n\t"
1912 "packssdw %%mm2, %%mm0 \n\t"
1913 "psraw $7, %%mm0 \n\t"
1915 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1916 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1917 "punpcklbw %%mm7, %%mm4 \n\t"
1918 "punpcklbw %%mm7, %%mm1 \n\t"
1919 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1920 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1921 "punpcklbw %%mm7, %%mm2 \n\t"
1922 "punpcklbw %%mm7, %%mm3 \n\t"
1923 "pmaddwd %%mm6, %%mm4 \n\t"
1924 "pmaddwd %%mm6, %%mm1 \n\t"
1925 "pmaddwd %%mm6, %%mm2 \n\t"
1926 "pmaddwd %%mm6, %%mm3 \n\t"
1927 #ifndef FAST_BGR2YV12
1928 "psrad $8, %%mm4 \n\t"
1929 "psrad $8, %%mm1 \n\t"
1930 "psrad $8, %%mm2 \n\t"
1931 "psrad $8, %%mm3 \n\t"
1933 "packssdw %%mm1, %%mm4 \n\t"
1934 "packssdw %%mm3, %%mm2 \n\t"
1935 "pmaddwd %%mm5, %%mm4 \n\t"
1936 "pmaddwd %%mm5, %%mm2 \n\t"
1937 "add $24, %%"REG_d" \n\t"
1938 "packssdw %%mm2, %%mm4 \n\t"
1939 "psraw $7, %%mm4 \n\t"
1941 "packuswb %%mm4, %%mm0 \n\t"
1942 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1944 "movq %%mm0, (%1, %%"REG_a") \n\t"
1945 "add $8, %%"REG_a" \n\t"
1947 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1948 : "%"REG_a, "%"REG_d
1952 for (i=0; i<width; i++)
1958 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1960 #endif /* HAVE_MMX */
/* BGR24 -> U/V with 2:1 horizontal subsampling, MMX path: averages two
 * adjacent pixels, multiplies by ff_bgr2UCoeff (U, in mm6) and
 * ff_bgr2VCoeff (V) in parallel, horizontally sums with ff_w1111 and adds
 * ff_bgr2UVOffset; 4 U and 4 V bytes per iteration.  The MMX2/3DNOW
 * branches use 8-byte loads where plain MMX must use movd pairs.  C
 * fallback sums pixel pairs explicitly; src1 and src2 are asserted equal.
 * NOTE(review): extracted listing -- asm framing, #else/#endif lines and
 * part of the MMX2 load sequence appear elided. */
1963 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1967 "mov %3, %%"REG_a" \n\t"
1968 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1969 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1970 "pxor %%mm7, %%mm7 \n\t"
1971 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1972 "add %%"REG_d", %%"REG_d" \n\t"
1975 PREFETCH" 64(%0, %%"REG_d") \n\t"
1976 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1977 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1978 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1979 "movq %%mm0, %%mm1 \n\t"
1980 "movq %%mm2, %%mm3 \n\t"
1981 "psrlq $24, %%mm0 \n\t"
1982 "psrlq $24, %%mm2 \n\t"
1985 "punpcklbw %%mm7, %%mm0 \n\t"
1986 "punpcklbw %%mm7, %%mm2 \n\t"
1988 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1989 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1990 "punpcklbw %%mm7, %%mm0 \n\t"
1991 "punpcklbw %%mm7, %%mm2 \n\t"
1992 "paddw %%mm2, %%mm0 \n\t"
1993 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1994 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1995 "punpcklbw %%mm7, %%mm4 \n\t"
1996 "punpcklbw %%mm7, %%mm2 \n\t"
1997 "paddw %%mm4, %%mm2 \n\t"
1998 "psrlw $1, %%mm0 \n\t"
1999 "psrlw $1, %%mm2 \n\t"
2001 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2002 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2004 "pmaddwd %%mm0, %%mm1 \n\t"
2005 "pmaddwd %%mm2, %%mm3 \n\t"
2006 "pmaddwd %%mm6, %%mm0 \n\t"
2007 "pmaddwd %%mm6, %%mm2 \n\t"
2008 #ifndef FAST_BGR2YV12
2009 "psrad $8, %%mm0 \n\t"
2010 "psrad $8, %%mm1 \n\t"
2011 "psrad $8, %%mm2 \n\t"
2012 "psrad $8, %%mm3 \n\t"
2014 "packssdw %%mm2, %%mm0 \n\t"
2015 "packssdw %%mm3, %%mm1 \n\t"
2016 "pmaddwd %%mm5, %%mm0 \n\t"
2017 "pmaddwd %%mm5, %%mm1 \n\t"
2018 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2019 "psraw $7, %%mm0 \n\t"
2021 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2022 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2023 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2024 "movq %%mm4, %%mm1 \n\t"
2025 "movq %%mm2, %%mm3 \n\t"
2026 "psrlq $24, %%mm4 \n\t"
2027 "psrlq $24, %%mm2 \n\t"
2030 "punpcklbw %%mm7, %%mm4 \n\t"
2031 "punpcklbw %%mm7, %%mm2 \n\t"
2033 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2034 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2035 "punpcklbw %%mm7, %%mm4 \n\t"
2036 "punpcklbw %%mm7, %%mm2 \n\t"
2037 "paddw %%mm2, %%mm4 \n\t"
2038 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2039 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2040 "punpcklbw %%mm7, %%mm5 \n\t"
2041 "punpcklbw %%mm7, %%mm2 \n\t"
2042 "paddw %%mm5, %%mm2 \n\t"
2043 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2044 "psrlw $2, %%mm4 \n\t"
2045 "psrlw $2, %%mm2 \n\t"
2047 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2048 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2050 "pmaddwd %%mm4, %%mm1 \n\t"
2051 "pmaddwd %%mm2, %%mm3 \n\t"
2052 "pmaddwd %%mm6, %%mm4 \n\t"
2053 "pmaddwd %%mm6, %%mm2 \n\t"
2054 #ifndef FAST_BGR2YV12
2055 "psrad $8, %%mm4 \n\t"
2056 "psrad $8, %%mm1 \n\t"
2057 "psrad $8, %%mm2 \n\t"
2058 "psrad $8, %%mm3 \n\t"
2060 "packssdw %%mm2, %%mm4 \n\t"
2061 "packssdw %%mm3, %%mm1 \n\t"
2062 "pmaddwd %%mm5, %%mm4 \n\t"
2063 "pmaddwd %%mm5, %%mm1 \n\t"
2064 "add $24, %%"REG_d" \n\t"
2065 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2066 "psraw $7, %%mm4 \n\t"
2068 "movq %%mm0, %%mm1 \n\t"
2069 "punpckldq %%mm4, %%mm0 \n\t"
2070 "punpckhdq %%mm4, %%mm1 \n\t"
2071 "packsswb %%mm1, %%mm0 \n\t"
2072 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2074 "movd %%mm0, (%1, %%"REG_a") \n\t"
2075 "punpckhdq %%mm0, %%mm0 \n\t"
2076 "movd %%mm0, (%2, %%"REG_a") \n\t"
2077 "add $4, %%"REG_a" \n\t"
2079 : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2080 : "%"REG_a, "%"REG_d
2084 for (i=0; i<width; i++)
2086 int b= src1[6*i + 0] + src1[6*i + 3];
2087 int g= src1[6*i + 1] + src1[6*i + 4];
2088 int r= src1[6*i + 2] + src1[6*i + 5];
2090 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2091 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2093 #endif /* HAVE_MMX */
2094 assert(src1 == src2);
/* RGB565 -> Y: unpacks the 5/6/5 fields of each 16-bit pixel and applies
 * the luma matrix; the 2* weights and SHIFT-2 compensate for the narrower
 * 5-bit channels. */
2097 static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2100 for (i=0; i<width; i++)
2102 int d= ((uint16_t*)src)[i];
2105 int r= (d>>11)&0x1F;
2107 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/* RGB565 -> U/V with 2:1 subsampling: loads two 16-bit pixels as one
 * 32-bit word and sums the channel fields of both pixels in parallel via
 * the 0x07E0F81F bit-plane trick before applying the chroma matrices. */
2111 static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2115 for (i=0; i<width; i++)
2117 int d0= ((uint32_t*)src1)[i];
2119 int dl= (d0&0x07E0F81F);
2120 int dh= ((d0>>5)&0x07C0F83F);
2122 int dh2= (dh>>11) + (dh<<21);
2126 int r= (d>>11)&0x7F;
2128 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2129 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
/* RGB555 -> Y: like rgb16ToY but all three channels are 5 bits wide, so a
 * uniform SHIFT-3 scaling is used instead of per-channel doubling. */
2133 static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2136 for (i=0; i<width; i++)
2138 int d= ((uint16_t*)src)[i];
2141 int r= (d>>10)&0x1F;
2143 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/* RGB555 -> U/V with 2:1 subsampling, using the same parallel two-pixel
 * bit-plane summing as rgb16ToUV but with the 0x03E07C1F 5-5-5 masks. */
2147 static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2151 for (i=0; i<width; i++)
2153 int d0= ((uint32_t*)src1)[i];
2155 int dl= (d0&0x03E07C1F);
2156 int dh= ((d0>>5)&0x03E0F81F);
2158 int dh2= (dh>>11) + (dh<<21);
2162 int r= (d>>10)&0x7F;
2164 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2165 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
/* RGB32 -> Y: identical to bgr32ToY except the byte order is reversed
 * (R in the low byte, B in bits 16..23). */
2170 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2173 for (i=0; i<width; i++)
2175 int r= ((uint32_t*)src)[i]&0xFF;
2176 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2177 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2179 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/* RGB32 -> U/V with 2:1 subsampling: mirror of bgr32ToUV with swapped
 * channel positions (here l carries the R+B pairs with R in the low bits).
 * NOTE(review): extracted listing -- the g/b extraction lines appear
 * elided. */
2183 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2187 for (i=0; i<width; i++)
2189 const int a= ((uint32_t*)src1)[2*i+0];
2190 const int e= ((uint32_t*)src1)[2*i+1];
2191 const int l= (a&0xFF00FF) + (e&0xFF00FF);
2192 const int h= (a&0x00FF00) + (e&0x00FF00);
2193 const int r= l&0x3FF;
2197 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2198 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
/* Convert one row of packed 24-bit (3 bytes/pixel) pixels to 8-bit luma.
 * NOTE(review): the per-pixel r/g/b byte loads are elided in this extract. */
2202 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2205     for (i=0; i<width; i++)
/* 33<<(SHIFT-1) == 16.5<<SHIFT: combined +16 offset and rounding constant */
2211         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/* Convert a row of packed 24-bit pixels to chroma; each r/g/b is the sum of
 * two horizontally adjacent pixels (6 bytes per iteration), averaged by the
 * +1 in the final shift. */
2215 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2219     for (i=0; i<width; i++)
2221         int r= src1[6*i + 0] + src1[6*i + 3];
2222         int g= src1[6*i + 1] + src1[6*i + 4];
2223         int b= src1[6*i + 2] + src1[6*i + 5];
2225         dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2226         dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
/* Convert one row of packed 5:6:5 pixels (opposite channel order to rgb16ToY:
 * here the top 5 bits are B) to 8-bit luma.
 * NOTE(review): 'r'/'g' extraction lines and braces are elided in this extract. */
2230 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2233     for (i=0; i<width; i++)
2235         int d= ((uint16_t*)src)[i];
/* b = top 5 bits of the 565 word */
2238         int b= (d>>11)&0x1F;
2240         dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/* Convert a row of packed 5:6:5 (B high) pixels to chroma, one sample per
 * pixel pair. A single 32-bit load covers both pixels; the masked swap/add
 * sums the two pixels' channel fields in parallel.
 * NOTE(review): 'r'/'g' extraction lines are elided in this extract. */
2244 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
/* src2 is unused; callers must pass the same row twice */
2247     assert(src1 == src2);
2248     for (i=0; i<width; i++)
2250         int d0= ((uint32_t*)src1)[i];
2252         int dl= (d0&0x07E0F81F);
/* the rotate-by-16 brings the second pixel's fields on top of the first's */
2253         int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);
2256         int b= (d>>11)&0x3F;
2258         dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2259         dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
/* Convert one row of packed 5:5:5 (B high) pixels to 8-bit luma.
 * NOTE(review): 'r'/'g' extraction lines and braces are elided in this extract. */
2263 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2266     for (i=0; i<width; i++)
2268         int d= ((uint16_t*)src)[i];
/* b = bits 10..14 of the 555 word */
2271         int b= (d>>10)&0x1F;
2273         dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/* Convert a row of packed 5:5:5 (B high) pixels to chroma, one sample per
 * pixel pair, using the same masked parallel-add trick as bgr16ToUV with
 * 555 field masks.
 * NOTE(review): 'r'/'g' extraction lines are elided in this extract. */
2277 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
/* src2 is unused; callers must pass the same row twice */
2280     assert(src1 == src2);
2281     for (i=0; i<width; i++)
2283         int d0= ((uint32_t*)src1)[i];
2285         int dl= (d0&0x03E07C1F);
/* rotate-by-16 sums the adjacent pixel's fields into the same lanes */
2286         int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);
2289         int b= (d>>10)&0x3F;
2291         dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2292         dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
/* Convert a row of palettized (8-bit index) pixels to luma.
 * pal[] appears to hold a precomputed per-index YUV word whose low byte is
 * the luma — NOTE(review): confirm the palette layout against the caller.
 * The 'd = src[i]' load line is elided in this extract. */
2296 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2299     for (i=0; i<width; i++)
2303         dst[i]= pal[d] & 0xFF;
/* Convert a row of palettized pixels to chroma by palette lookup.
 * NOTE(review): the lines extracting U/V bytes from 'p' are elided in this
 * extract — presumably the middle bytes of the precomputed palette word;
 * verify against the full source. */
2307 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
/* src2 is unused; callers must pass the same row twice */
2310     assert(src1 == src2);
2311     for (i=0; i<width; i++)
2313         int p= pal[src1[i]];
/* Generic horizontal scaler:
 *   dst[i] = clip( sum_j src[filterPos[i] + j] * filter[i*filterSize + j] >> 7 )
 * producing 15-bit intermediate samples (see the C fallback at the bottom).
 * Three MMX inline-asm fast paths are selected by filterSize (4, 8, generic),
 * with AltiVec and plain-C fallbacks behind the (elided) preprocessor guards.
 * NOTE(review): this extract is missing many interior lines (asm volatile(
 * openers, loop labels, #if/#else lines, braces) — the asm must not be edited
 * from this view; treat the annotations below as orientation only. */
2320 // bilinear / bicubic scaling
2321 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2322 int16_t *filter, int16_t *filterPos, long filterSize)
2325 assert(filterSize % 4 == 0 && filterSize>0);
2326 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
/* counter runs from -2*dstW up to 0; filterPos is rebased so it can be
 * indexed by the (negative) counter */
2328 long counter= -2*dstW;
2330 filterPos-= counter/2;
2334 "push %%"REG_b" \n\t"
2336 "pxor %%mm7, %%mm7 \n\t"
2337 "movq "MANGLE(w02)", %%mm6 \n\t"
2338 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2339 "mov %%"REG_a", %%"REG_BP" \n\t"
/* per iteration: load two filterPos entries, two 4-tap filter rows, four
 * source bytes each; multiply-accumulate with pmaddwd and pack the two
 * results into one 32-bit store */
2342 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2343 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2344 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2345 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2346 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2347 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2348 "punpcklbw %%mm7, %%mm0 \n\t"
2349 "punpcklbw %%mm7, %%mm2 \n\t"
2350 "pmaddwd %%mm1, %%mm0 \n\t"
2351 "pmaddwd %%mm2, %%mm3 \n\t"
2352 "psrad $8, %%mm0 \n\t"
2353 "psrad $8, %%mm3 \n\t"
2354 "packssdw %%mm3, %%mm0 \n\t"
2355 "pmaddwd %%mm6, %%mm0 \n\t"
2356 "packssdw %%mm0, %%mm0 \n\t"
2357 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2358 "add $4, %%"REG_BP" \n\t"
2361 "pop %%"REG_BP" \n\t"
2363 "pop %%"REG_b" \n\t"
2366 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* 8-tap variant: same structure, two pmaddwd pairs accumulated per output */
2372 else if (filterSize==8)
2374 long counter= -2*dstW;
2376 filterPos-= counter/2;
2380 "push %%"REG_b" \n\t"
2382 "pxor %%mm7, %%mm7 \n\t"
2383 "movq "MANGLE(w02)", %%mm6 \n\t"
2384 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2385 "mov %%"REG_a", %%"REG_BP" \n\t"
2388 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2389 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2390 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2391 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2392 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2393 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2394 "punpcklbw %%mm7, %%mm0 \n\t"
2395 "punpcklbw %%mm7, %%mm2 \n\t"
2396 "pmaddwd %%mm1, %%mm0 \n\t"
2397 "pmaddwd %%mm2, %%mm3 \n\t"
2399 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2400 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2401 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2402 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2403 "punpcklbw %%mm7, %%mm4 \n\t"
2404 "punpcklbw %%mm7, %%mm2 \n\t"
2405 "pmaddwd %%mm1, %%mm4 \n\t"
2406 "pmaddwd %%mm2, %%mm5 \n\t"
2407 "paddd %%mm4, %%mm0 \n\t"
2408 "paddd %%mm5, %%mm3 \n\t"
2410 "psrad $8, %%mm0 \n\t"
2411 "psrad $8, %%mm3 \n\t"
2412 "packssdw %%mm3, %%mm0 \n\t"
2413 "pmaddwd %%mm6, %%mm0 \n\t"
2414 "packssdw %%mm0, %%mm0 \n\t"
2415 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2416 "add $4, %%"REG_BP" \n\t"
2419 "pop %%"REG_BP" \n\t"
2421 "pop %%"REG_b" \n\t"
2424 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* generic filterSize path: inner loop walks the filter in steps of 4 taps,
 * accumulating into mm4/mm5 until REG_c reaches 'offset' (src+filterSize) */
2432 uint8_t *offset = src+filterSize;
2433 long counter= -2*dstW;
2434 //filter-= counter*filterSize/2;
2435 filterPos-= counter/2;
2438 "pxor %%mm7, %%mm7 \n\t"
2439 "movq "MANGLE(w02)", %%mm6 \n\t"
2442 "mov %2, %%"REG_c" \n\t"
2443 "movzwl (%%"REG_c", %0), %%eax \n\t"
2444 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2445 "mov %5, %%"REG_c" \n\t"
2446 "pxor %%mm4, %%mm4 \n\t"
2447 "pxor %%mm5, %%mm5 \n\t"
2449 "movq (%1), %%mm1 \n\t"
2450 "movq (%1, %6), %%mm3 \n\t"
2451 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2452 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2453 "punpcklbw %%mm7, %%mm0 \n\t"
2454 "punpcklbw %%mm7, %%mm2 \n\t"
2455 "pmaddwd %%mm1, %%mm0 \n\t"
2456 "pmaddwd %%mm2, %%mm3 \n\t"
2457 "paddd %%mm3, %%mm5 \n\t"
2458 "paddd %%mm0, %%mm4 \n\t"
2460 "add $4, %%"REG_c" \n\t"
2461 "cmp %4, %%"REG_c" \n\t"
2464 "psrad $8, %%mm4 \n\t"
2465 "psrad $8, %%mm5 \n\t"
2466 "packssdw %%mm5, %%mm4 \n\t"
2467 "pmaddwd %%mm6, %%mm4 \n\t"
2468 "packssdw %%mm4, %%mm4 \n\t"
2469 "mov %3, %%"REG_a" \n\t"
2470 "movd %%mm4, (%%"REG_a", %0) \n\t"
2474 : "+r" (counter), "+r" (filter)
2475 : "m" (filterPos), "m" (dst), "m"(offset),
2476 "m" (src), "r" (filterSize*2)
2477 : "%"REG_a, "%"REG_c, "%"REG_d
/* AltiVec fallback (guard lines elided in this extract) */
2482 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
/* plain C reference implementation of the same convolution */
2485 for (i=0; i<dstW; i++)
2488 int srcPos= filterPos[i];
2490 //printf("filterPos: %d\n", filterPos[i]);
2491 for (j=0; j<filterSize; j++)
2493 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2494 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2496 //filter += hFilterSize;
2497 dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2500 #endif /* HAVE_ALTIVEC */
2501 #endif /* HAVE_MMX */
/* Horizontally scale one luma line into a 14-bit-ish uint16 temp buffer.
 * Step 1: if the source is not already planar 8-bit luma, convert the row
 * into formatConvBuffer via the matching RENAME(*ToY) helper and retarget
 * 'src' at it. Step 2: scale, choosing (in order) the generic hScale, the
 * MMX2 runtime-generated "funny" code, a plain x86 asm bilinear loop, or
 * the portable C bilinear fallback.
 * NOTE(review): many interior lines are elided in this extract (asm
 * openers, #if HAVE_MMX2 guards, loop labels, braces) — do not edit the
 * asm from this view. */
2503 // *** horizontal scale Y line to temp buffer
2504 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2505 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2506 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2507 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2508 int32_t *mmx2FilterPos, uint8_t *pal)
/* --- input-format conversion dispatch --- */
2510 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2512 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2513 src= formatConvBuffer;
2515 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2517 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2518 src= formatConvBuffer;
/* note the RGB32<->bgr32ToY / BGR32<->rgb32ToY cross-pairing: the helper
 * names follow the opposite channel-order convention */
2520 else if (srcFormat==PIX_FMT_RGB32)
2522 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2523 src= formatConvBuffer;
2525 else if (srcFormat==PIX_FMT_BGR24)
2527 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2528 src= formatConvBuffer;
2530 else if (srcFormat==PIX_FMT_BGR565)
2532 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2533 src= formatConvBuffer;
2535 else if (srcFormat==PIX_FMT_BGR555)
2537 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2538 src= formatConvBuffer;
2540 else if (srcFormat==PIX_FMT_BGR32)
2542 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2543 src= formatConvBuffer;
2545 else if (srcFormat==PIX_FMT_RGB24)
2547 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2548 src= formatConvBuffer;
2550 else if (srcFormat==PIX_FMT_RGB565)
2552 RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2553 src= formatConvBuffer;
2555 else if (srcFormat==PIX_FMT_RGB555)
2557 RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2558 src= formatConvBuffer;
2560 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2562 RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
2563 src= formatConvBuffer;
/* --- scaling dispatch --- */
2567 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2568 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2570 if (!(flags&SWS_FAST_BILINEAR))
2573 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2575 else // fast bilinear upscale / crap downscale
2577 #if defined(ARCH_X86)
/* ebx must be preserved manually: it is the PIC register on some ABIs */
2581 uint64_t ebxsave __attribute__((aligned(8)));
2587 "mov %%"REG_b", %5 \n\t"
2589 "pxor %%mm7, %%mm7 \n\t"
2590 "mov %0, %%"REG_c" \n\t"
2591 "mov %1, %%"REG_D" \n\t"
2592 "mov %2, %%"REG_d" \n\t"
2593 "mov %3, %%"REG_b" \n\t"
2594 "xor %%"REG_a", %%"REG_a" \n\t" // i
2595 PREFETCH" (%%"REG_c") \n\t"
2596 PREFETCH" 32(%%"REG_c") \n\t"
2597 PREFETCH" 64(%%"REG_c") \n\t"
/* FUNNY_Y_CODE jumps into the runtime-generated scaler (funnyYCode);
 * separate 64-bit and 32-bit variants (guard lines elided here) */
2601 #define FUNNY_Y_CODE \
2602 "movl (%%"REG_b"), %%esi \n\t"\
2604 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2605 "add %%"REG_S", %%"REG_c" \n\t"\
2606 "add %%"REG_a", %%"REG_D" \n\t"\
2607 "xor %%"REG_a", %%"REG_a" \n\t"\
2611 #define FUNNY_Y_CODE \
2612 "movl (%%"REG_b"), %%esi \n\t"\
2614 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2615 "add %%"REG_a", %%"REG_D" \n\t"\
2616 "xor %%"REG_a", %%"REG_a" \n\t"\
2618 #endif /* ARCH_X86_64 */
2630 "mov %5, %%"REG_b" \n\t"
2632 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2637 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* patch up the right edge the funny code cannot reach: replicate last pixel */
2642 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2646 #endif /* HAVE_MMX2 */
2647 long xInc_shr16 = xInc >> 16;
2648 uint16_t xInc_mask = xInc & 0xffff;
2649 //NO MMX just normal asm ...
/* 16.16 fixed-point bilinear: REG_d = integer src position, ecx = fraction;
 * two outputs per iteration (unrolled x2) */
2651 "xor %%"REG_a", %%"REG_a" \n\t" // i
2652 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2653 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2656 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2657 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2658 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2659 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2660 "shll $16, %%edi \n\t"
2661 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2662 "mov %1, %%"REG_D" \n\t"
2663 "shrl $9, %%esi \n\t"
2664 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2665 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2666 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2668 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2669 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2670 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2671 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2672 "shll $16, %%edi \n\t"
2673 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2674 "mov %1, %%"REG_D" \n\t"
2675 "shrl $9, %%esi \n\t"
2676 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2677 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2678 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2681 "add $2, %%"REG_a" \n\t"
2682 "cmp %2, %%"REG_a" \n\t"
2686 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2687 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2690 } //if MMX2 can't be used
/* portable C bilinear fallback: xpos is 16.16 fixed point, output is <<7 */
2694 unsigned int xpos=0;
2695 for (i=0;i<dstWidth;i++)
2697 register unsigned int xx=xpos>>16;
2698 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2699 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2702 #endif /* defined(ARCH_X86) */
/* Horizontally scale one chroma line pair (U into dst, V into dst+VOFW).
 * Mirrors hyscale: first convert the input rows to planar 8-bit chroma in
 * formatConvBuffer (U plane at offset 0, V plane at +VOFW) when needed, then
 * scale via hScale, MMX2 funny code, plain x86 asm bilinear, or C fallback.
 * Gray input produces no chroma (that branch's body is elided here).
 * NOTE(review): many interior lines are elided in this extract (asm openers,
 * #if guards, braces) — do not edit the asm from this view. */
2706 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2707 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2708 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2709 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2710 int32_t *mmx2FilterPos, uint8_t *pal)
/* --- input-format conversion dispatch (U -> buffer, V -> buffer+VOFW) --- */
2712 if (srcFormat==PIX_FMT_YUYV422)
2714 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2715 src1= formatConvBuffer;
2716 src2= formatConvBuffer+VOFW;
2718 else if (srcFormat==PIX_FMT_UYVY422)
2720 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2721 src1= formatConvBuffer;
2722 src2= formatConvBuffer+VOFW;
/* RGB32<->bgr32ToUV / BGR32<->rgb32ToUV cross-pairing, as in hyscale */
2724 else if (srcFormat==PIX_FMT_RGB32)
2726 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2727 src1= formatConvBuffer;
2728 src2= formatConvBuffer+VOFW;
2730 else if (srcFormat==PIX_FMT_BGR24)
2732 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2733 src1= formatConvBuffer;
2734 src2= formatConvBuffer+VOFW;
2736 else if (srcFormat==PIX_FMT_BGR565)
2738 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2739 src1= formatConvBuffer;
2740 src2= formatConvBuffer+VOFW;
2742 else if (srcFormat==PIX_FMT_BGR555)
2744 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2745 src1= formatConvBuffer;
2746 src2= formatConvBuffer+VOFW;
2748 else if (srcFormat==PIX_FMT_BGR32)
2750 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2751 src1= formatConvBuffer;
2752 src2= formatConvBuffer+VOFW;
2754 else if (srcFormat==PIX_FMT_RGB24)
2756 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2757 src1= formatConvBuffer;
2758 src2= formatConvBuffer+VOFW;
2760 else if (srcFormat==PIX_FMT_RGB565)
2762 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2763 src1= formatConvBuffer;
2764 src2= formatConvBuffer+VOFW;
2766 else if (srcFormat==PIX_FMT_RGB555)
2768 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2769 src1= formatConvBuffer;
2770 src2= formatConvBuffer+VOFW;
/* gray has no chroma; the (elided) body presumably returns early */
2772 else if (isGray(srcFormat))
2776 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2778 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2779 src1= formatConvBuffer;
2780 src2= formatConvBuffer+VOFW;
/* --- scaling dispatch: scale U and V planes independently --- */
2784 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2785 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2787 if (!(flags&SWS_FAST_BILINEAR))
2790 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2791 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2793 else // fast bilinear upscale / crap downscale
2795 #if defined(ARCH_X86)
/* ebx must be preserved manually: it is the PIC register on some ABIs */
2799 uint64_t ebxsave __attribute__((aligned(8)));
2805 "mov %%"REG_b", %6 \n\t"
2807 "pxor %%mm7, %%mm7 \n\t"
2808 "mov %0, %%"REG_c" \n\t"
2809 "mov %1, %%"REG_D" \n\t"
2810 "mov %2, %%"REG_d" \n\t"
2811 "mov %3, %%"REG_b" \n\t"
2812 "xor %%"REG_a", %%"REG_a" \n\t" // i
2813 PREFETCH" (%%"REG_c") \n\t"
2814 PREFETCH" 32(%%"REG_c") \n\t"
2815 PREFETCH" 64(%%"REG_c") \n\t"
/* FUNNY_UV_CODE: jump into the runtime-generated chroma scaler; 64-bit and
 * 32-bit variants (guard lines elided here) */
2819 #define FUNNY_UV_CODE \
2820 "movl (%%"REG_b"), %%esi \n\t"\
2822 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2823 "add %%"REG_S", %%"REG_c" \n\t"\
2824 "add %%"REG_a", %%"REG_D" \n\t"\
2825 "xor %%"REG_a", %%"REG_a" \n\t"\
2829 #define FUNNY_UV_CODE \
2830 "movl (%%"REG_b"), %%esi \n\t"\
2832 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2833 "add %%"REG_a", %%"REG_D" \n\t"\
2834 "xor %%"REG_a", %%"REG_a" \n\t"\
2836 #endif /* ARCH_X86_64 */
/* second pass: run the funny code again on src2 into dst+VOF for V */
2842 "xor %%"REG_a", %%"REG_a" \n\t" // i
2843 "mov %5, %%"REG_c" \n\t" // src
2844 "mov %1, %%"REG_D" \n\t" // buf1
2845 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2846 PREFETCH" (%%"REG_c") \n\t"
2847 PREFETCH" 32(%%"REG_c") \n\t"
2848 PREFETCH" 64(%%"REG_c") \n\t"
2856 "mov %6, %%"REG_b" \n\t"
2858 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2859 "m" (funnyUVCode), "m" (src2)
2863 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* patch up the right edge: replicate last source pixel for both planes */
2868 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2870 //printf("%d %d %d\n", dstWidth, i, srcW);
2871 dst[i] = src1[srcW-1]*128;
2872 dst[i+VOFW] = src2[srcW-1]*128;
2877 #endif /* HAVE_MMX2 */
2878 long xInc_shr16 = (long) (xInc >> 16);
2879 uint16_t xInc_mask = xInc & 0xffff;
/* plain x86 asm bilinear: one U and one V output per iteration, shared
 * 16.16 position in REG_d / fraction in ecx */
2881 "xor %%"REG_a", %%"REG_a" \n\t" // i
2882 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2883 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2886 "mov %0, %%"REG_S" \n\t"
2887 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2888 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2889 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2890 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2891 "shll $16, %%edi \n\t"
2892 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2893 "mov %1, %%"REG_D" \n\t"
2894 "shrl $9, %%esi \n\t"
2895 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2897 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2898 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2899 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2900 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2901 "shll $16, %%edi \n\t"
2902 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2903 "mov %1, %%"REG_D" \n\t"
2904 "shrl $9, %%esi \n\t"
2905 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2907 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2908 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2909 "add $1, %%"REG_a" \n\t"
2910 "cmp %2, %%"REG_a" \n\t"
2913 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2914 which is needed to support GCC 4.0. */
2915 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2916 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2918 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2921 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2924 } //if MMX2 can't be used
/* portable C bilinear fallback; two variants are visible here (one appears
 * to belong to an elided #ifdef branch) — both produce <<7-scaled samples */
2928 unsigned int xpos=0;
2929 for (i=0;i<dstWidth;i++)
2931 register unsigned int xx=xpos>>16;
2932 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2933 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2934 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2936 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2937 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2941 #endif /* defined(ARCH_X86) */
2945 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2946 int srcSliceH, uint8_t* dst[], int dstStride[]){
2948 /* load a few things into local vars to make the code more readable? and faster */
2949 const int srcW= c->srcW;
2950 const int dstW= c->dstW;
2951 const int dstH= c->dstH;
2952 const int chrDstW= c->chrDstW;
2953 const int chrSrcW= c->chrSrcW;
2954 const int lumXInc= c->lumXInc;
2955 const int chrXInc= c->chrXInc;
2956 const int dstFormat= c->dstFormat;
2957 const int srcFormat= c->srcFormat;
2958 const int flags= c->flags;
2959 const int canMMX2BeUsed= c->canMMX2BeUsed;
2960 int16_t *vLumFilterPos= c->vLumFilterPos;
2961 int16_t *vChrFilterPos= c->vChrFilterPos;
2962 int16_t *hLumFilterPos= c->hLumFilterPos;
2963 int16_t *hChrFilterPos= c->hChrFilterPos;
2964 int16_t *vLumFilter= c->vLumFilter;
2965 int16_t *vChrFilter= c->vChrFilter;
2966 int16_t *hLumFilter= c->hLumFilter;
2967 int16_t *hChrFilter= c->hChrFilter;
2968 int32_t *lumMmxFilter= c->lumMmxFilter;
2969 int32_t *chrMmxFilter= c->chrMmxFilter;
2970 const int vLumFilterSize= c->vLumFilterSize;
2971 const int vChrFilterSize= c->vChrFilterSize;
2972 const int hLumFilterSize= c->hLumFilterSize;
2973 const int hChrFilterSize= c->hChrFilterSize;
2974 int16_t **lumPixBuf= c->lumPixBuf;
2975 int16_t **chrPixBuf= c->chrPixBuf;
2976 const int vLumBufSize= c->vLumBufSize;
2977 const int vChrBufSize= c->vChrBufSize;
2978 uint8_t *funnyYCode= c->funnyYCode;
2979 uint8_t *funnyUVCode= c->funnyUVCode;
2980 uint8_t *formatConvBuffer= c->formatConvBuffer;
2981 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2982 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2986 /* vars which will change and which we need to store back in the context */
2988 int lumBufIndex= c->lumBufIndex;
2989 int chrBufIndex= c->chrBufIndex;
2990 int lastInLumBuf= c->lastInLumBuf;
2991 int lastInChrBuf= c->lastInChrBuf;
2993 if (isPacked(c->srcFormat)){
3000 srcStride[2]= srcStride[0];
3002 srcStride[1]<<= c->vChrDrop;
3003 srcStride[2]<<= c->vChrDrop;
3005 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
3006 // (int)dst[0], (int)dst[1], (int)dst[2]);
3008 #if 0 //self test FIXME move to a vfilter or something
3010 static volatile int i=0;
3012 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
3013 selfTest(src, srcStride, c->srcW, c->srcH);
3018 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
3019 //dstStride[0],dstStride[1],dstStride[2]);
3021 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3023 static int firstTime=1; //FIXME move this into the context perhaps
3024 if (flags & SWS_PRINT_INFO && firstTime)
3026 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
3027 " ->cannot do aligned memory accesses anymore\n");
3032 /* Note the user might start scaling the picture in the middle so this
3033 will not get executed. This is not really intended but works
3034 currently, so people might do it. */
3045 for (;dstY < dstH; dstY++){
3046 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3047 const int chrDstY= dstY>>c->chrDstVSubSample;
3048 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3049 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3051 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3052 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3053 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3054 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3056 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3057 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3058 //handle holes (FAST_BILINEAR & weird filters)
3059 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3060 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3061 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3062 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
3063 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
3065 // Do we have enough lines in this slice to output the dstY line
3066 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3068 //Do horizontal scaling
3069 while(lastInLumBuf < lastLumSrcY)
3071 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3073 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3074 assert(lumBufIndex < 2*vLumBufSize);
3075 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3076 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3077 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3078 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3079 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3080 funnyYCode, c->srcFormat, formatConvBuffer,
3081 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3084 while(lastInChrBuf < lastChrSrcY)
3086 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3087 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3089 assert(chrBufIndex < 2*vChrBufSize);
3090 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3091 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3092 //FIXME replace parameters through context struct (some at least)
3094 if (!(isGray(srcFormat) || isGray(dstFormat)))
3095 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3096 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3097 funnyUVCode, c->srcFormat, formatConvBuffer,
3098 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3101 //wrap buf index around to stay inside the ring buffer
3102 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3103 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3105 else // not enough lines left in this slice -> load the rest in the buffer
3107 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3108 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3109 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3110 vChrBufSize, vLumBufSize);*/
3112 //Do horizontal scaling
3113 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3115 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3117 assert(lumBufIndex < 2*vLumBufSize);
3118 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3119 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3120 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3121 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3122 funnyYCode, c->srcFormat, formatConvBuffer,
3123 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3126 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3128 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3129 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3131 assert(chrBufIndex < 2*vChrBufSize);
3132 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3133 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3135 if (!(isGray(srcFormat) || isGray(dstFormat)))
3136 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3137 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3138 funnyUVCode, c->srcFormat, formatConvBuffer,
3139 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3142 //wrap buf index around to stay inside the ring buffer
3143 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3144 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3145 break; //we can't output a dstY line so let's try with the next slice
3149 b5Dither= ff_dither8[dstY&1];
3150 g6Dither= ff_dither4[dstY&1];
3151 g5Dither= ff_dither8[dstY&1];
3152 r5Dither= ff_dither8[(dstY+1)&1];
3156 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3157 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3160 if (flags & SWS_ACCURATE_RND){
3161 for (i=0; i<vLumFilterSize; i+=2){
3162 lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i ];
3163 lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3164 lumMmxFilter[2*i+2]=
3165 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ]
3166 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3168 for (i=0; i<vChrFilterSize; i+=2){
3169 chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i ];
3170 chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3171 chrMmxFilter[2*i+2]=
3172 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ]
3173 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3176 for (i=0; i<vLumFilterSize; i++)
3178 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3179 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3180 lumMmxFilter[4*i+2]=
3181 lumMmxFilter[4*i+3]=
3182 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3184 for (i=0; i<vChrFilterSize; i++)
3186 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3187 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3188 chrMmxFilter[4*i+2]=
3189 chrMmxFilter[4*i+3]=
3190 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3194 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3195 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3196 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3197 RENAME(yuv2nv12X)(c,
3198 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3199 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3200 dest, uDest, dstW, chrDstW, dstFormat);
3202 else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3204 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3205 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3206 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3208 int16_t *lumBuf = lumPixBuf[0];
3209 int16_t *chrBuf= chrPixBuf[0];
3210 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3215 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3216 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3217 dest, uDest, vDest, dstW, chrDstW);
3222 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3223 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3224 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3226 int chrAlpha= vChrFilter[2*dstY+1];
3227 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3228 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3230 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3232 int lumAlpha= vLumFilter[2*dstY+1];
3233 int chrAlpha= vChrFilter[2*dstY+1];
3235 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3237 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3238 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3239 dest, dstW, lumAlpha, chrAlpha, dstY);
3243 RENAME(yuv2packedX)(c,
3244 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3245 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3250 else // hmm looks like we can't use MMX here without overwriting this array's tail
3252 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3253 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3254 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3255 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3256 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3258 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3259 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3260 dest, uDest, dstW, chrDstW, dstFormat);
3262 else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3264 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3265 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3267 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3268 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3269 dest, uDest, vDest, dstW, chrDstW);
3273 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3274 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3276 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3277 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3284 asm volatile(SFENCE:::"memory");
3285 asm volatile(EMMS:::"memory");
3287 /* store changed local vars back in the context */
3289 c->lumBufIndex= lumBufIndex;
3290 c->chrBufIndex= chrBufIndex;
3291 c->lastInLumBuf= lastInLumBuf;
3292 c->lastInChrBuf= lastInChrBuf;
3294 return dstY - lastDstY;