2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
30 #if COMPILE_TEMPLATE_AMD3DNOW
31 #define PREFETCH "prefetch"
32 #define PREFETCHW "prefetchw"
33 #elif COMPILE_TEMPLATE_MMX2
34 #define PREFETCH "prefetchnta"
35 #define PREFETCHW "prefetcht0"
37 #define PREFETCH " # nop"
38 #define PREFETCHW " # nop"
41 #if COMPILE_TEMPLATE_MMX2
42 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
43 #elif COMPILE_TEMPLATE_AMD3DNOW
44 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
47 #if COMPILE_TEMPLATE_MMX2
48 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
50 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
52 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
54 #if COMPILE_TEMPLATE_ALTIVEC
55 #include "ppc/swscale_altivec_template.c"
58 #define YSCALEYUV2YV12X(x, offset, dest, width) \
60 "xor %%"REG_a", %%"REG_a" \n\t"\
61 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
62 "movq %%mm3, %%mm4 \n\t"\
63 "lea " offset "(%0), %%"REG_d" \n\t"\
64 "mov (%%"REG_d"), %%"REG_S" \n\t"\
65 ASMALIGN(4) /* FIXME Unroll? */\
67 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
68 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
69 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
70 "add $16, %%"REG_d" \n\t"\
71 "mov (%%"REG_d"), %%"REG_S" \n\t"\
72 "test %%"REG_S", %%"REG_S" \n\t"\
73 "pmulhw %%mm0, %%mm2 \n\t"\
74 "pmulhw %%mm0, %%mm5 \n\t"\
75 "paddw %%mm2, %%mm3 \n\t"\
76 "paddw %%mm5, %%mm4 \n\t"\
78 "psraw $3, %%mm3 \n\t"\
79 "psraw $3, %%mm4 \n\t"\
80 "packuswb %%mm4, %%mm3 \n\t"\
81 MOVNTQ(%%mm3, (%1, %%REGa))\
82 "add $8, %%"REG_a" \n\t"\
83 "cmp %2, %%"REG_a" \n\t"\
84 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
85 "movq %%mm3, %%mm4 \n\t"\
86 "lea " offset "(%0), %%"REG_d" \n\t"\
87 "mov (%%"REG_d"), %%"REG_S" \n\t"\
89 :: "r" (&c->redDither),\
90 "r" (dest), "g" (width)\
91 : "%"REG_a, "%"REG_d, "%"REG_S\
94 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
96 "lea " offset "(%0), %%"REG_d" \n\t"\
97 "xor %%"REG_a", %%"REG_a" \n\t"\
98 "pxor %%mm4, %%mm4 \n\t"\
99 "pxor %%mm5, %%mm5 \n\t"\
100 "pxor %%mm6, %%mm6 \n\t"\
101 "pxor %%mm7, %%mm7 \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
105 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
106 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
107 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
108 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
109 "movq %%mm0, %%mm3 \n\t"\
110 "punpcklwd %%mm1, %%mm0 \n\t"\
111 "punpckhwd %%mm1, %%mm3 \n\t"\
112 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
113 "pmaddwd %%mm1, %%mm0 \n\t"\
114 "pmaddwd %%mm1, %%mm3 \n\t"\
115 "paddd %%mm0, %%mm4 \n\t"\
116 "paddd %%mm3, %%mm5 \n\t"\
117 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
118 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
119 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
120 "test %%"REG_S", %%"REG_S" \n\t"\
121 "movq %%mm2, %%mm0 \n\t"\
122 "punpcklwd %%mm3, %%mm2 \n\t"\
123 "punpckhwd %%mm3, %%mm0 \n\t"\
124 "pmaddwd %%mm1, %%mm2 \n\t"\
125 "pmaddwd %%mm1, %%mm0 \n\t"\
126 "paddd %%mm2, %%mm6 \n\t"\
127 "paddd %%mm0, %%mm7 \n\t"\
129 "psrad $16, %%mm4 \n\t"\
130 "psrad $16, %%mm5 \n\t"\
131 "psrad $16, %%mm6 \n\t"\
132 "psrad $16, %%mm7 \n\t"\
133 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
134 "packssdw %%mm5, %%mm4 \n\t"\
135 "packssdw %%mm7, %%mm6 \n\t"\
136 "paddw %%mm0, %%mm4 \n\t"\
137 "paddw %%mm0, %%mm6 \n\t"\
138 "psraw $3, %%mm4 \n\t"\
139 "psraw $3, %%mm6 \n\t"\
140 "packuswb %%mm6, %%mm4 \n\t"\
141 MOVNTQ(%%mm4, (%1, %%REGa))\
142 "add $8, %%"REG_a" \n\t"\
143 "cmp %2, %%"REG_a" \n\t"\
144 "lea " offset "(%0), %%"REG_d" \n\t"\
145 "pxor %%mm4, %%mm4 \n\t"\
146 "pxor %%mm5, %%mm5 \n\t"\
147 "pxor %%mm6, %%mm6 \n\t"\
148 "pxor %%mm7, %%mm7 \n\t"\
149 "mov (%%"REG_d"), %%"REG_S" \n\t"\
151 :: "r" (&c->redDither),\
152 "r" (dest), "g" (width)\
153 : "%"REG_a, "%"REG_d, "%"REG_S\
156 #define YSCALEYUV2YV121 \
157 "mov %2, %%"REG_a" \n\t"\
158 ASMALIGN(4) /* FIXME Unroll? */\
160 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
161 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
162 "psraw $7, %%mm0 \n\t"\
163 "psraw $7, %%mm1 \n\t"\
164 "packuswb %%mm1, %%mm0 \n\t"\
165 MOVNTQ(%%mm0, (%1, %%REGa))\
166 "add $8, %%"REG_a" \n\t"\
169 #define YSCALEYUV2YV121_ACCURATE \
170 "mov %2, %%"REG_a" \n\t"\
171 "pcmpeqw %%mm7, %%mm7 \n\t"\
172 "psrlw $15, %%mm7 \n\t"\
173 "psllw $6, %%mm7 \n\t"\
174 ASMALIGN(4) /* FIXME Unroll? */\
176 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
177 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
178 "paddsw %%mm7, %%mm0 \n\t"\
179 "paddsw %%mm7, %%mm1 \n\t"\
180 "psraw $7, %%mm0 \n\t"\
181 "psraw $7, %%mm1 \n\t"\
182 "packuswb %%mm1, %%mm0 \n\t"\
183 MOVNTQ(%%mm0, (%1, %%REGa))\
184 "add $8, %%"REG_a" \n\t"\
188 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
189 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
190 "r" (dest), "m" (dstW),
191 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
192 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
194 #define YSCALEYUV2PACKEDX_UV \
196 "xor %%"REG_a", %%"REG_a" \n\t"\
200 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
201 "mov (%%"REG_d"), %%"REG_S" \n\t"\
202 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
203 "movq %%mm3, %%mm4 \n\t"\
206 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
207 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
208 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
209 "add $16, %%"REG_d" \n\t"\
210 "mov (%%"REG_d"), %%"REG_S" \n\t"\
211 "pmulhw %%mm0, %%mm2 \n\t"\
212 "pmulhw %%mm0, %%mm5 \n\t"\
213 "paddw %%mm2, %%mm3 \n\t"\
214 "paddw %%mm5, %%mm4 \n\t"\
215 "test %%"REG_S", %%"REG_S" \n\t"\
218 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
219 "lea "offset"(%0), %%"REG_d" \n\t"\
220 "mov (%%"REG_d"), %%"REG_S" \n\t"\
221 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
222 "movq "#dst1", "#dst2" \n\t"\
225 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
226 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
227 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
228 "add $16, %%"REG_d" \n\t"\
229 "mov (%%"REG_d"), %%"REG_S" \n\t"\
230 "pmulhw "#coeff", "#src1" \n\t"\
231 "pmulhw "#coeff", "#src2" \n\t"\
232 "paddw "#src1", "#dst1" \n\t"\
233 "paddw "#src2", "#dst2" \n\t"\
234 "test %%"REG_S", %%"REG_S" \n\t"\
237 #define YSCALEYUV2PACKEDX \
238 YSCALEYUV2PACKEDX_UV \
239 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
241 #define YSCALEYUV2PACKEDX_END \
242 :: "r" (&c->redDither), \
243 "m" (dummy), "m" (dummy), "m" (dummy),\
244 "r" (dest), "m" (dstW) \
245 : "%"REG_a, "%"REG_d, "%"REG_S \
248 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
250 "xor %%"REG_a", %%"REG_a" \n\t"\
254 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
255 "mov (%%"REG_d"), %%"REG_S" \n\t"\
256 "pxor %%mm4, %%mm4 \n\t"\
257 "pxor %%mm5, %%mm5 \n\t"\
258 "pxor %%mm6, %%mm6 \n\t"\
259 "pxor %%mm7, %%mm7 \n\t"\
262 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
263 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
264 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
265 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
266 "movq %%mm0, %%mm3 \n\t"\
267 "punpcklwd %%mm1, %%mm0 \n\t"\
268 "punpckhwd %%mm1, %%mm3 \n\t"\
269 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
270 "pmaddwd %%mm1, %%mm0 \n\t"\
271 "pmaddwd %%mm1, %%mm3 \n\t"\
272 "paddd %%mm0, %%mm4 \n\t"\
273 "paddd %%mm3, %%mm5 \n\t"\
274 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
275 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
276 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
277 "test %%"REG_S", %%"REG_S" \n\t"\
278 "movq %%mm2, %%mm0 \n\t"\
279 "punpcklwd %%mm3, %%mm2 \n\t"\
280 "punpckhwd %%mm3, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm2 \n\t"\
282 "pmaddwd %%mm1, %%mm0 \n\t"\
283 "paddd %%mm2, %%mm6 \n\t"\
284 "paddd %%mm0, %%mm7 \n\t"\
286 "psrad $16, %%mm4 \n\t"\
287 "psrad $16, %%mm5 \n\t"\
288 "psrad $16, %%mm6 \n\t"\
289 "psrad $16, %%mm7 \n\t"\
290 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
291 "packssdw %%mm5, %%mm4 \n\t"\
292 "packssdw %%mm7, %%mm6 \n\t"\
293 "paddw %%mm0, %%mm4 \n\t"\
294 "paddw %%mm0, %%mm6 \n\t"\
295 "movq %%mm4, "U_TEMP"(%0) \n\t"\
296 "movq %%mm6, "V_TEMP"(%0) \n\t"\
298 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
299 "lea "offset"(%0), %%"REG_d" \n\t"\
300 "mov (%%"REG_d"), %%"REG_S" \n\t"\
301 "pxor %%mm1, %%mm1 \n\t"\
302 "pxor %%mm5, %%mm5 \n\t"\
303 "pxor %%mm7, %%mm7 \n\t"\
304 "pxor %%mm6, %%mm6 \n\t"\
307 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
308 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
309 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
310 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
311 "movq %%mm0, %%mm3 \n\t"\
312 "punpcklwd %%mm4, %%mm0 \n\t"\
313 "punpckhwd %%mm4, %%mm3 \n\t"\
314 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
315 "pmaddwd %%mm4, %%mm0 \n\t"\
316 "pmaddwd %%mm4, %%mm3 \n\t"\
317 "paddd %%mm0, %%mm1 \n\t"\
318 "paddd %%mm3, %%mm5 \n\t"\
319 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
320 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
321 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
322 "test %%"REG_S", %%"REG_S" \n\t"\
323 "movq %%mm2, %%mm0 \n\t"\
324 "punpcklwd %%mm3, %%mm2 \n\t"\
325 "punpckhwd %%mm3, %%mm0 \n\t"\
326 "pmaddwd %%mm4, %%mm2 \n\t"\
327 "pmaddwd %%mm4, %%mm0 \n\t"\
328 "paddd %%mm2, %%mm7 \n\t"\
329 "paddd %%mm0, %%mm6 \n\t"\
331 "psrad $16, %%mm1 \n\t"\
332 "psrad $16, %%mm5 \n\t"\
333 "psrad $16, %%mm7 \n\t"\
334 "psrad $16, %%mm6 \n\t"\
335 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
336 "packssdw %%mm5, %%mm1 \n\t"\
337 "packssdw %%mm6, %%mm7 \n\t"\
338 "paddw %%mm0, %%mm1 \n\t"\
339 "paddw %%mm0, %%mm7 \n\t"\
340 "movq "U_TEMP"(%0), %%mm3 \n\t"\
341 "movq "V_TEMP"(%0), %%mm4 \n\t"\
343 #define YSCALEYUV2PACKEDX_ACCURATE \
344 YSCALEYUV2PACKEDX_ACCURATE_UV \
345 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
347 #define YSCALEYUV2RGBX \
348 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
349 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
350 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
351 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
352 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
353 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
354 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
355 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
356 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
357 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
358 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
359 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
360 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
361 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
362 "paddw %%mm3, %%mm4 \n\t"\
363 "movq %%mm2, %%mm0 \n\t"\
364 "movq %%mm5, %%mm6 \n\t"\
365 "movq %%mm4, %%mm3 \n\t"\
366 "punpcklwd %%mm2, %%mm2 \n\t"\
367 "punpcklwd %%mm5, %%mm5 \n\t"\
368 "punpcklwd %%mm4, %%mm4 \n\t"\
369 "paddw %%mm1, %%mm2 \n\t"\
370 "paddw %%mm1, %%mm5 \n\t"\
371 "paddw %%mm1, %%mm4 \n\t"\
372 "punpckhwd %%mm0, %%mm0 \n\t"\
373 "punpckhwd %%mm6, %%mm6 \n\t"\
374 "punpckhwd %%mm3, %%mm3 \n\t"\
375 "paddw %%mm7, %%mm0 \n\t"\
376 "paddw %%mm7, %%mm6 \n\t"\
377 "paddw %%mm7, %%mm3 \n\t"\
378 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
379 "packuswb %%mm0, %%mm2 \n\t"\
380 "packuswb %%mm6, %%mm5 \n\t"\
381 "packuswb %%mm3, %%mm4 \n\t"\
383 #define REAL_YSCALEYUV2PACKED(index, c) \
384 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
385 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
386 "psraw $3, %%mm0 \n\t"\
387 "psraw $3, %%mm1 \n\t"\
388 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
389 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
390 "xor "#index", "#index" \n\t"\
393 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
394 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
395 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
396 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
397 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
398 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
401 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
402 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
403 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
404 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
405 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
406 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
407 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
408 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
409 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
410 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
411 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
412 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
413 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
414 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
415 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
416 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
417 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
419 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
421 #define REAL_YSCALEYUV2RGB_UV(index, c) \
422 "xor "#index", "#index" \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
430 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
431 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
432 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
433 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
434 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
435 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
436 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
437 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
438 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
439 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
440 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
441 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
442 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
443 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
444 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
446 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
447 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
448 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
449 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
450 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
451 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
452 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
456 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
457 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
458 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
460 #define REAL_YSCALEYUV2RGB_COEFF(c) \
461 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
462 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
463 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
464 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
465 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
466 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
467 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
468 "paddw %%mm3, %%mm4 \n\t"\
469 "movq %%mm2, %%mm0 \n\t"\
470 "movq %%mm5, %%mm6 \n\t"\
471 "movq %%mm4, %%mm3 \n\t"\
472 "punpcklwd %%mm2, %%mm2 \n\t"\
473 "punpcklwd %%mm5, %%mm5 \n\t"\
474 "punpcklwd %%mm4, %%mm4 \n\t"\
475 "paddw %%mm1, %%mm2 \n\t"\
476 "paddw %%mm1, %%mm5 \n\t"\
477 "paddw %%mm1, %%mm4 \n\t"\
478 "punpckhwd %%mm0, %%mm0 \n\t"\
479 "punpckhwd %%mm6, %%mm6 \n\t"\
480 "punpckhwd %%mm3, %%mm3 \n\t"\
481 "paddw %%mm7, %%mm0 \n\t"\
482 "paddw %%mm7, %%mm6 \n\t"\
483 "paddw %%mm7, %%mm3 \n\t"\
484 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
485 "packuswb %%mm0, %%mm2 \n\t"\
486 "packuswb %%mm6, %%mm5 \n\t"\
487 "packuswb %%mm3, %%mm4 \n\t"\
489 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
491 #define YSCALEYUV2RGB(index, c) \
492 REAL_YSCALEYUV2RGB_UV(index, c) \
493 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
494 REAL_YSCALEYUV2RGB_COEFF(c)
496 #define REAL_YSCALEYUV2PACKED1(index, c) \
497 "xor "#index", "#index" \n\t"\
500 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
501 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
502 "psraw $7, %%mm3 \n\t" \
503 "psraw $7, %%mm4 \n\t" \
504 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
505 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
506 "psraw $7, %%mm1 \n\t" \
507 "psraw $7, %%mm7 \n\t" \
509 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
511 #define REAL_YSCALEYUV2RGB1(index, c) \
512 "xor "#index", "#index" \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
517 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
518 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
519 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
520 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
521 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
522 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
523 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
524 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
525 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
526 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
527 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
528 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
529 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
530 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
531 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
532 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
533 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
534 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
535 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
536 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
537 "paddw %%mm3, %%mm4 \n\t"\
538 "movq %%mm2, %%mm0 \n\t"\
539 "movq %%mm5, %%mm6 \n\t"\
540 "movq %%mm4, %%mm3 \n\t"\
541 "punpcklwd %%mm2, %%mm2 \n\t"\
542 "punpcklwd %%mm5, %%mm5 \n\t"\
543 "punpcklwd %%mm4, %%mm4 \n\t"\
544 "paddw %%mm1, %%mm2 \n\t"\
545 "paddw %%mm1, %%mm5 \n\t"\
546 "paddw %%mm1, %%mm4 \n\t"\
547 "punpckhwd %%mm0, %%mm0 \n\t"\
548 "punpckhwd %%mm6, %%mm6 \n\t"\
549 "punpckhwd %%mm3, %%mm3 \n\t"\
550 "paddw %%mm7, %%mm0 \n\t"\
551 "paddw %%mm7, %%mm6 \n\t"\
552 "paddw %%mm7, %%mm3 \n\t"\
553 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
554 "packuswb %%mm0, %%mm2 \n\t"\
555 "packuswb %%mm6, %%mm5 \n\t"\
556 "packuswb %%mm3, %%mm4 \n\t"\
558 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
560 #define REAL_YSCALEYUV2PACKED1b(index, c) \
561 "xor "#index", "#index" \n\t"\
564 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
565 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
566 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
567 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
568 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
569 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
570 "psrlw $8, %%mm3 \n\t" \
571 "psrlw $8, %%mm4 \n\t" \
572 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
573 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
574 "psraw $7, %%mm1 \n\t" \
575 "psraw $7, %%mm7 \n\t"
576 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
578 // do vertical chrominance interpolation
579 #define REAL_YSCALEYUV2RGB1b(index, c) \
580 "xor "#index", "#index" \n\t"\
583 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
584 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
585 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
586 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
587 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
588 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
589 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
590 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
591 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
592 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
593 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
594 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
595 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
596 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
597 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
598 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
599 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
600 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
601 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
602 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
603 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
604 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
605 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
606 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
607 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
608 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
609 "paddw %%mm3, %%mm4 \n\t"\
610 "movq %%mm2, %%mm0 \n\t"\
611 "movq %%mm5, %%mm6 \n\t"\
612 "movq %%mm4, %%mm3 \n\t"\
613 "punpcklwd %%mm2, %%mm2 \n\t"\
614 "punpcklwd %%mm5, %%mm5 \n\t"\
615 "punpcklwd %%mm4, %%mm4 \n\t"\
616 "paddw %%mm1, %%mm2 \n\t"\
617 "paddw %%mm1, %%mm5 \n\t"\
618 "paddw %%mm1, %%mm4 \n\t"\
619 "punpckhwd %%mm0, %%mm0 \n\t"\
620 "punpckhwd %%mm6, %%mm6 \n\t"\
621 "punpckhwd %%mm3, %%mm3 \n\t"\
622 "paddw %%mm7, %%mm0 \n\t"\
623 "paddw %%mm7, %%mm6 \n\t"\
624 "paddw %%mm7, %%mm3 \n\t"\
625 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
626 "packuswb %%mm0, %%mm2 \n\t"\
627 "packuswb %%mm6, %%mm5 \n\t"\
628 "packuswb %%mm3, %%mm4 \n\t"\
630 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
632 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
633 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
634 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
635 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
636 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
637 "packuswb %%mm1, %%mm7 \n\t"
638 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
640 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
641 "movq "#b", "#q2" \n\t" /* B */\
642 "movq "#r", "#t" \n\t" /* R */\
643 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
644 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
645 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
646 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
647 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
648 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
649 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
650 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
651 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
652 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
654 MOVNTQ( q0, (dst, index, 4))\
655 MOVNTQ( b, 8(dst, index, 4))\
656 MOVNTQ( q2, 16(dst, index, 4))\
657 MOVNTQ( q3, 24(dst, index, 4))\
659 "add $8, "#index" \n\t"\
660 "cmp "#dstw", "#index" \n\t"\
662 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
664 #define REAL_WRITERGB16(dst, dstw, index) \
665 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
666 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
667 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
668 "psrlq $3, %%mm2 \n\t"\
670 "movq %%mm2, %%mm1 \n\t"\
671 "movq %%mm4, %%mm3 \n\t"\
673 "punpcklbw %%mm7, %%mm3 \n\t"\
674 "punpcklbw %%mm5, %%mm2 \n\t"\
675 "punpckhbw %%mm7, %%mm4 \n\t"\
676 "punpckhbw %%mm5, %%mm1 \n\t"\
678 "psllq $3, %%mm3 \n\t"\
679 "psllq $3, %%mm4 \n\t"\
681 "por %%mm3, %%mm2 \n\t"\
682 "por %%mm4, %%mm1 \n\t"\
684 MOVNTQ(%%mm2, (dst, index, 2))\
685 MOVNTQ(%%mm1, 8(dst, index, 2))\
687 "add $8, "#index" \n\t"\
688 "cmp "#dstw", "#index" \n\t"\
690 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
692 #define REAL_WRITERGB15(dst, dstw, index) \
693 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
694 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
695 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
696 "psrlq $3, %%mm2 \n\t"\
697 "psrlq $1, %%mm5 \n\t"\
699 "movq %%mm2, %%mm1 \n\t"\
700 "movq %%mm4, %%mm3 \n\t"\
702 "punpcklbw %%mm7, %%mm3 \n\t"\
703 "punpcklbw %%mm5, %%mm2 \n\t"\
704 "punpckhbw %%mm7, %%mm4 \n\t"\
705 "punpckhbw %%mm5, %%mm1 \n\t"\
707 "psllq $2, %%mm3 \n\t"\
708 "psllq $2, %%mm4 \n\t"\
710 "por %%mm3, %%mm2 \n\t"\
711 "por %%mm4, %%mm1 \n\t"\
713 MOVNTQ(%%mm2, (dst, index, 2))\
714 MOVNTQ(%%mm1, 8(dst, index, 2))\
716 "add $8, "#index" \n\t"\
717 "cmp "#dstw", "#index" \n\t"\
719 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
721 #define WRITEBGR24OLD(dst, dstw, index) \
722 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
723 "movq %%mm2, %%mm1 \n\t" /* B */\
724 "movq %%mm5, %%mm6 \n\t" /* R */\
725 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
726 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
727 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
728 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
729 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
730 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
731 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
732 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
733 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
734 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
736 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
737 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
738 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
739 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
740 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
741 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
742 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
743 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
745 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
746 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
747 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
748 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
749 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
750 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
751 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
752 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
753 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
754 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
755 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
756 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
757 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
759 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
760 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
761 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
762 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
763 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
764 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
765 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
766 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
768 MOVNTQ(%%mm0, (dst))\
769 MOVNTQ(%%mm2, 8(dst))\
770 MOVNTQ(%%mm3, 16(dst))\
771 "add $24, "#dst" \n\t"\
773 "add $8, "#index" \n\t"\
774 "cmp "#dstw", "#index" \n\t"\
777 #define WRITEBGR24MMX(dst, dstw, index) \
778 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
779 "movq %%mm2, %%mm1 \n\t" /* B */\
780 "movq %%mm5, %%mm6 \n\t" /* R */\
781 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
782 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
783 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
784 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
785 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
786 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
787 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
788 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
789 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
790 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
792 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
793 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
794 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
795 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
797 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
798 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
799 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
800 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
802 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
803 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
804 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
805 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
807 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
808 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
809 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
810 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
811 MOVNTQ(%%mm0, (dst))\
813 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
814 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
815 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
816 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
817 MOVNTQ(%%mm6, 8(dst))\
819 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
820 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
821 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
822 MOVNTQ(%%mm5, 16(dst))\
824 "add $24, "#dst" \n\t"\
826 "add $8, "#index" \n\t"\
827 "cmp "#dstw", "#index" \n\t"\
830 #define WRITEBGR24MMX2(dst, dstw, index) \
831 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
832 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
833 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
834 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
835 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
836 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
838 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
839 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
840 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
842 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
843 "por %%mm1, %%mm6 \n\t"\
844 "por %%mm3, %%mm6 \n\t"\
845 MOVNTQ(%%mm6, (dst))\
847 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
848 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
849 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
850 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
852 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
853 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
854 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
856 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
857 "por %%mm3, %%mm6 \n\t"\
858 MOVNTQ(%%mm6, 8(dst))\
860 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
861 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
862 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
864 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
865 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
866 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
868 "por %%mm1, %%mm3 \n\t"\
869 "por %%mm3, %%mm6 \n\t"\
870 MOVNTQ(%%mm6, 16(dst))\
872 "add $24, "#dst" \n\t"\
874 "add $8, "#index" \n\t"\
875 "cmp "#dstw", "#index" \n\t"\
878 #if COMPILE_TEMPLATE_MMX2
880 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
883 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
886 #define REAL_WRITEYUY2(dst, dstw, index) \
887 "packuswb %%mm3, %%mm3 \n\t"\
888 "packuswb %%mm4, %%mm4 \n\t"\
889 "packuswb %%mm7, %%mm1 \n\t"\
890 "punpcklbw %%mm4, %%mm3 \n\t"\
891 "movq %%mm1, %%mm7 \n\t"\
892 "punpcklbw %%mm3, %%mm1 \n\t"\
893 "punpckhbw %%mm3, %%mm7 \n\t"\
895 MOVNTQ(%%mm1, (dst, index, 2))\
896 MOVNTQ(%%mm7, 8(dst, index, 2))\
898 "add $8, "#index" \n\t"\
899 "cmp "#dstw", "#index" \n\t"\
901 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
904 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
905 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
906 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
908 #if COMPILE_TEMPLATE_MMX
909 if(!(c->flags & SWS_BITEXACT)){
910 if (c->flags & SWS_ACCURATE_RND){
912 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
913 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
915 if (CONFIG_SWSCALE_ALPHA && aDest){
916 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
919 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
922 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
923 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
925 if (CONFIG_SWSCALE_ALPHA && aDest){
926 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
929 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
934 #if COMPILE_TEMPLATE_ALTIVEC
935 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
936 chrFilter, chrSrc, chrFilterSize,
937 dest, uDest, vDest, dstW, chrDstW);
938 #else //COMPILE_TEMPLATE_ALTIVEC
939 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
940 chrFilter, chrSrc, chrFilterSize,
941 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
942 #endif //!COMPILE_TEMPLATE_ALTIVEC
945 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
946 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
947 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
949 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
950 chrFilter, chrSrc, chrFilterSize,
951 dest, uDest, dstW, chrDstW, dstFormat);
954 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
955 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
958 #if COMPILE_TEMPLATE_MMX
959 if(!(c->flags & SWS_BITEXACT)){
961 uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
962 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
963 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
965 if (c->flags & SWS_ACCURATE_RND){
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src[p]), "r" (dst[p] + counter[p]),
981 :: "r" (src[p]), "r" (dst[p] + counter[p]),
991 for (i=0; i<dstW; i++)
993 int val= (lumSrc[i]+64)>>7;
1004 for (i=0; i<chrDstW; i++)
1006 int u=(chrSrc[i ]+64)>>7;
1007 int v=(chrSrc[i + VOFW]+64)>>7;
1011 else if (u>255) u=255;
1013 else if (v>255) v=255;
1020 if (CONFIG_SWSCALE_ALPHA && aDest)
1021 for (i=0; i<dstW; i++){
1022 int val= (alpSrc[i]+64)>>7;
1023 aDest[i]= av_clip_uint8(val);
1029 * vertical scale YV12 to RGB
1031 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1032 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1033 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1035 #if COMPILE_TEMPLATE_MMX
1037 if(!(c->flags & SWS_BITEXACT)){
1038 if (c->flags & SWS_ACCURATE_RND){
1039 switch(c->dstFormat){
1041 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1042 YSCALEYUV2PACKEDX_ACCURATE
1044 "movq %%mm2, "U_TEMP"(%0) \n\t"
1045 "movq %%mm4, "V_TEMP"(%0) \n\t"
1046 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1047 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1048 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1049 "psraw $3, %%mm1 \n\t"
1050 "psraw $3, %%mm7 \n\t"
1051 "packuswb %%mm7, %%mm1 \n\t"
1052 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1054 YSCALEYUV2PACKEDX_END
1056 YSCALEYUV2PACKEDX_ACCURATE
1058 "pcmpeqd %%mm7, %%mm7 \n\t"
1059 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1061 YSCALEYUV2PACKEDX_END
1065 YSCALEYUV2PACKEDX_ACCURATE
1067 "pxor %%mm7, %%mm7 \n\t"
1068 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1069 "add %4, %%"REG_c" \n\t"
1070 WRITEBGR24(%%REGc, %5, %%REGa)
1073 :: "r" (&c->redDither),
1074 "m" (dummy), "m" (dummy), "m" (dummy),
1075 "r" (dest), "m" (dstW)
1076 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1079 case PIX_FMT_RGB555:
1080 YSCALEYUV2PACKEDX_ACCURATE
1082 "pxor %%mm7, %%mm7 \n\t"
1083 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1085 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1086 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1087 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1090 WRITERGB15(%4, %5, %%REGa)
1091 YSCALEYUV2PACKEDX_END
1093 case PIX_FMT_RGB565:
1094 YSCALEYUV2PACKEDX_ACCURATE
1096 "pxor %%mm7, %%mm7 \n\t"
1097 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1099 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1100 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1101 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1104 WRITERGB16(%4, %5, %%REGa)
1105 YSCALEYUV2PACKEDX_END
1107 case PIX_FMT_YUYV422:
1108 YSCALEYUV2PACKEDX_ACCURATE
1109 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1111 "psraw $3, %%mm3 \n\t"
1112 "psraw $3, %%mm4 \n\t"
1113 "psraw $3, %%mm1 \n\t"
1114 "psraw $3, %%mm7 \n\t"
1115 WRITEYUY2(%4, %5, %%REGa)
1116 YSCALEYUV2PACKEDX_END
1120 switch(c->dstFormat)
1123 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1126 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1127 "psraw $3, %%mm1 \n\t"
1128 "psraw $3, %%mm7 \n\t"
1129 "packuswb %%mm7, %%mm1 \n\t"
1130 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1131 YSCALEYUV2PACKEDX_END
1135 "pcmpeqd %%mm7, %%mm7 \n\t"
1136 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1137 YSCALEYUV2PACKEDX_END
1143 "pxor %%mm7, %%mm7 \n\t"
1144 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1145 "add %4, %%"REG_c" \n\t"
1146 WRITEBGR24(%%REGc, %5, %%REGa)
1148 :: "r" (&c->redDither),
1149 "m" (dummy), "m" (dummy), "m" (dummy),
1150 "r" (dest), "m" (dstW)
1151 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1154 case PIX_FMT_RGB555:
1157 "pxor %%mm7, %%mm7 \n\t"
1158 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1160 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1161 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1162 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1165 WRITERGB15(%4, %5, %%REGa)
1166 YSCALEYUV2PACKEDX_END
1168 case PIX_FMT_RGB565:
1171 "pxor %%mm7, %%mm7 \n\t"
1172 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1174 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1175 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1176 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1179 WRITERGB16(%4, %5, %%REGa)
1180 YSCALEYUV2PACKEDX_END
1182 case PIX_FMT_YUYV422:
1184 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1186 "psraw $3, %%mm3 \n\t"
1187 "psraw $3, %%mm4 \n\t"
1188 "psraw $3, %%mm1 \n\t"
1189 "psraw $3, %%mm7 \n\t"
1190 WRITEYUY2(%4, %5, %%REGa)
1191 YSCALEYUV2PACKEDX_END
1196 #endif /* COMPILE_TEMPLATE_MMX */
1197 #if COMPILE_TEMPLATE_ALTIVEC
1198 /* The following list of supported dstFormat values should
1199 match what's found in the body of ff_yuv2packedX_altivec() */
1200 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1201 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1202 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1203 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1204 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1205 chrFilter, chrSrc, chrFilterSize,
1209 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1210 chrFilter, chrSrc, chrFilterSize,
1211 alpSrc, dest, dstW, dstY);
1215 * vertical bilinear scale YV12 to RGB
1217 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1218 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1220 int yalpha1=4095- yalpha;
1221 int uvalpha1=4095-uvalpha;
1224 #if COMPILE_TEMPLATE_MMX
1225 if(!(c->flags & SWS_BITEXACT)){
1226 switch(c->dstFormat)
1228 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1230 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1233 YSCALEYUV2RGB(%%REGBP, %5)
1234 YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1235 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1236 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1237 "packuswb %%mm7, %%mm1 \n\t"
1238 WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1240 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1242 ,"r" (abuf0), "r" (abuf1)
1246 *(uint16_t **)(&c->u_temp)=abuf0;
1247 *(uint16_t **)(&c->v_temp)=abuf1;
1249 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1250 "mov %4, %%"REG_b" \n\t"
1251 "push %%"REG_BP" \n\t"
1252 YSCALEYUV2RGB(%%REGBP, %5)
1255 "mov "U_TEMP"(%5), %0 \n\t"
1256 "mov "V_TEMP"(%5), %1 \n\t"
1257 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1258 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1259 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1260 "packuswb %%mm7, %%mm1 \n\t"
1263 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1264 "pop %%"REG_BP" \n\t"
1265 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1267 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1273 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1274 "mov %4, %%"REG_b" \n\t"
1275 "push %%"REG_BP" \n\t"
1276 YSCALEYUV2RGB(%%REGBP, %5)
1277 "pcmpeqd %%mm7, %%mm7 \n\t"
1278 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1279 "pop %%"REG_BP" \n\t"
1280 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1282 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1289 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1290 "mov %4, %%"REG_b" \n\t"
1291 "push %%"REG_BP" \n\t"
1292 YSCALEYUV2RGB(%%REGBP, %5)
1293 "pxor %%mm7, %%mm7 \n\t"
1294 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1295 "pop %%"REG_BP" \n\t"
1296 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1297 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1301 case PIX_FMT_RGB555:
1303 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1304 "mov %4, %%"REG_b" \n\t"
1305 "push %%"REG_BP" \n\t"
1306 YSCALEYUV2RGB(%%REGBP, %5)
1307 "pxor %%mm7, %%mm7 \n\t"
1308 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1310 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1311 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1312 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1315 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1316 "pop %%"REG_BP" \n\t"
1317 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1319 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1323 case PIX_FMT_RGB565:
1325 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1326 "mov %4, %%"REG_b" \n\t"
1327 "push %%"REG_BP" \n\t"
1328 YSCALEYUV2RGB(%%REGBP, %5)
1329 "pxor %%mm7, %%mm7 \n\t"
1330 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1332 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1333 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1334 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1337 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1338 "pop %%"REG_BP" \n\t"
1339 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1340 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1344 case PIX_FMT_YUYV422:
1346 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1347 "mov %4, %%"REG_b" \n\t"
1348 "push %%"REG_BP" \n\t"
1349 YSCALEYUV2PACKED(%%REGBP, %5)
1350 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1351 "pop %%"REG_BP" \n\t"
1352 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1353 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1360 #endif //COMPILE_TEMPLATE_MMX
1361 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1365 * YV12 to RGB without scaling or interpolating
1367 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1368 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1370 const int yalpha1=0;
1373 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1374 const int yalpha= 4096; //FIXME ...
1376 if (flags&SWS_FULL_CHR_H_INT)
1378 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1382 #if COMPILE_TEMPLATE_MMX
1383 if(!(flags & SWS_BITEXACT)){
1384 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1389 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1391 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1392 "mov %4, %%"REG_b" \n\t"
1393 "push %%"REG_BP" \n\t"
1394 YSCALEYUV2RGB1(%%REGBP, %5)
1395 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1396 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1397 "pop %%"REG_BP" \n\t"
1398 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1400 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1405 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1406 "mov %4, %%"REG_b" \n\t"
1407 "push %%"REG_BP" \n\t"
1408 YSCALEYUV2RGB1(%%REGBP, %5)
1409 "pcmpeqd %%mm7, %%mm7 \n\t"
1410 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1411 "pop %%"REG_BP" \n\t"
1412 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1414 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1421 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1422 "mov %4, %%"REG_b" \n\t"
1423 "push %%"REG_BP" \n\t"
1424 YSCALEYUV2RGB1(%%REGBP, %5)
1425 "pxor %%mm7, %%mm7 \n\t"
1426 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1427 "pop %%"REG_BP" \n\t"
1428 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1430 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1434 case PIX_FMT_RGB555:
1436 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1437 "mov %4, %%"REG_b" \n\t"
1438 "push %%"REG_BP" \n\t"
1439 YSCALEYUV2RGB1(%%REGBP, %5)
1440 "pxor %%mm7, %%mm7 \n\t"
1441 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1443 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1444 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1445 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1447 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1448 "pop %%"REG_BP" \n\t"
1449 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1451 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1455 case PIX_FMT_RGB565:
1457 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1458 "mov %4, %%"REG_b" \n\t"
1459 "push %%"REG_BP" \n\t"
1460 YSCALEYUV2RGB1(%%REGBP, %5)
1461 "pxor %%mm7, %%mm7 \n\t"
1462 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1464 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1465 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1466 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1469 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1470 "pop %%"REG_BP" \n\t"
1471 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1473 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1477 case PIX_FMT_YUYV422:
1479 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1480 "mov %4, %%"REG_b" \n\t"
1481 "push %%"REG_BP" \n\t"
1482 YSCALEYUV2PACKED1(%%REGBP, %5)
1483 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1484 "pop %%"REG_BP" \n\t"
1485 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1487 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1498 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1500 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501 "mov %4, %%"REG_b" \n\t"
1502 "push %%"REG_BP" \n\t"
1503 YSCALEYUV2RGB1b(%%REGBP, %5)
1504 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1505 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1506 "pop %%"REG_BP" \n\t"
1507 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1509 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1515 "mov %4, %%"REG_b" \n\t"
1516 "push %%"REG_BP" \n\t"
1517 YSCALEYUV2RGB1b(%%REGBP, %5)
1518 "pcmpeqd %%mm7, %%mm7 \n\t"
1519 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1520 "pop %%"REG_BP" \n\t"
1521 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1523 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1530 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1531 "mov %4, %%"REG_b" \n\t"
1532 "push %%"REG_BP" \n\t"
1533 YSCALEYUV2RGB1b(%%REGBP, %5)
1534 "pxor %%mm7, %%mm7 \n\t"
1535 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1536 "pop %%"REG_BP" \n\t"
1537 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1539 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1543 case PIX_FMT_RGB555:
1545 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1546 "mov %4, %%"REG_b" \n\t"
1547 "push %%"REG_BP" \n\t"
1548 YSCALEYUV2RGB1b(%%REGBP, %5)
1549 "pxor %%mm7, %%mm7 \n\t"
1550 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1552 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1553 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1554 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1556 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1557 "pop %%"REG_BP" \n\t"
1558 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1560 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1564 case PIX_FMT_RGB565:
1566 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1567 "mov %4, %%"REG_b" \n\t"
1568 "push %%"REG_BP" \n\t"
1569 YSCALEYUV2RGB1b(%%REGBP, %5)
1570 "pxor %%mm7, %%mm7 \n\t"
1571 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1573 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1574 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1575 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1578 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1579 "pop %%"REG_BP" \n\t"
1580 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1582 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1586 case PIX_FMT_YUYV422:
1588 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1589 "mov %4, %%"REG_b" \n\t"
1590 "push %%"REG_BP" \n\t"
1591 YSCALEYUV2PACKED1b(%%REGBP, %5)
1592 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1593 "pop %%"REG_BP" \n\t"
1594 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1596 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1603 #endif /* COMPILE_TEMPLATE_MMX */
1606 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1608 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1612 //FIXME yuy2* can read up to 7 samples too much
1614 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1616 #if COMPILE_TEMPLATE_MMX
1618 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1619 "mov %0, %%"REG_a" \n\t"
1621 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1622 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1623 "pand %%mm2, %%mm0 \n\t"
1624 "pand %%mm2, %%mm1 \n\t"
1625 "packuswb %%mm1, %%mm0 \n\t"
1626 "movq %%mm0, (%2, %%"REG_a") \n\t"
1627 "add $8, %%"REG_a" \n\t"
1629 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1634 for (i=0; i<width; i++)
1639 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1641 #if COMPILE_TEMPLATE_MMX
1643 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1644 "mov %0, %%"REG_a" \n\t"
1646 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1647 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1648 "psrlw $8, %%mm0 \n\t"
1649 "psrlw $8, %%mm1 \n\t"
1650 "packuswb %%mm1, %%mm0 \n\t"
1651 "movq %%mm0, %%mm1 \n\t"
1652 "psrlw $8, %%mm0 \n\t"
1653 "pand %%mm4, %%mm1 \n\t"
1654 "packuswb %%mm0, %%mm0 \n\t"
1655 "packuswb %%mm1, %%mm1 \n\t"
1656 "movd %%mm0, (%3, %%"REG_a") \n\t"
1657 "movd %%mm1, (%2, %%"REG_a") \n\t"
1658 "add $4, %%"REG_a" \n\t"
1660 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1665 for (i=0; i<width; i++)
1667 dstU[i]= src1[4*i + 1];
1668 dstV[i]= src1[4*i + 3];
1671 assert(src1 == src2);
1674 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1676 #if COMPILE_TEMPLATE_MMX
1678 "mov %0, %%"REG_a" \n\t"
1680 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1681 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1682 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1683 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1684 "psrlw $8, %%mm0 \n\t"
1685 "psrlw $8, %%mm1 \n\t"
1686 "psrlw $8, %%mm2 \n\t"
1687 "psrlw $8, %%mm3 \n\t"
1688 "packuswb %%mm1, %%mm0 \n\t"
1689 "packuswb %%mm3, %%mm2 \n\t"
1690 "movq %%mm0, (%3, %%"REG_a") \n\t"
1691 "movq %%mm2, (%4, %%"REG_a") \n\t"
1692 "add $8, %%"REG_a" \n\t"
1694 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1699 for (i=0; i<width; i++)
1701 dstU[i]= src1[2*i + 1];
1702 dstV[i]= src2[2*i + 1];
1707 /* This is almost identical to the previous, end exists only because
1708 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1709 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1711 #if COMPILE_TEMPLATE_MMX
1713 "mov %0, %%"REG_a" \n\t"
1715 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1716 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1717 "psrlw $8, %%mm0 \n\t"
1718 "psrlw $8, %%mm1 \n\t"
1719 "packuswb %%mm1, %%mm0 \n\t"
1720 "movq %%mm0, (%2, %%"REG_a") \n\t"
1721 "add $8, %%"REG_a" \n\t"
1723 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1728 for (i=0; i<width; i++)
1733 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1735 #if COMPILE_TEMPLATE_MMX
1737 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1738 "mov %0, %%"REG_a" \n\t"
1740 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1741 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1742 "pand %%mm4, %%mm0 \n\t"
1743 "pand %%mm4, %%mm1 \n\t"
1744 "packuswb %%mm1, %%mm0 \n\t"
1745 "movq %%mm0, %%mm1 \n\t"
1746 "psrlw $8, %%mm0 \n\t"
1747 "pand %%mm4, %%mm1 \n\t"
1748 "packuswb %%mm0, %%mm0 \n\t"
1749 "packuswb %%mm1, %%mm1 \n\t"
1750 "movd %%mm0, (%3, %%"REG_a") \n\t"
1751 "movd %%mm1, (%2, %%"REG_a") \n\t"
1752 "add $4, %%"REG_a" \n\t"
1754 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1759 for (i=0; i<width; i++)
1761 dstU[i]= src1[4*i + 0];
1762 dstV[i]= src1[4*i + 2];
1765 assert(src1 == src2);
1768 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1770 #if COMPILE_TEMPLATE_MMX
1772 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1773 "mov %0, %%"REG_a" \n\t"
1775 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1776 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1777 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1778 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1779 "pand %%mm4, %%mm0 \n\t"
1780 "pand %%mm4, %%mm1 \n\t"
1781 "pand %%mm4, %%mm2 \n\t"
1782 "pand %%mm4, %%mm3 \n\t"
1783 "packuswb %%mm1, %%mm0 \n\t"
1784 "packuswb %%mm3, %%mm2 \n\t"
1785 "movq %%mm0, (%3, %%"REG_a") \n\t"
1786 "movq %%mm2, (%4, %%"REG_a") \n\t"
1787 "add $8, %%"REG_a" \n\t"
1789 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1794 for (i=0; i<width; i++)
1802 #if COMPILE_TEMPLATE_MMX
1803 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat)
1806 if(srcFormat == PIX_FMT_BGR24){
1808 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1809 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1814 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1815 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1821 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1822 "mov %2, %%"REG_a" \n\t"
1823 "pxor %%mm7, %%mm7 \n\t"
1825 PREFETCH" 64(%0) \n\t"
1826 "movd (%0), %%mm0 \n\t"
1827 "movd 2(%0), %%mm1 \n\t"
1828 "movd 6(%0), %%mm2 \n\t"
1829 "movd 8(%0), %%mm3 \n\t"
1831 "punpcklbw %%mm7, %%mm0 \n\t"
1832 "punpcklbw %%mm7, %%mm1 \n\t"
1833 "punpcklbw %%mm7, %%mm2 \n\t"
1834 "punpcklbw %%mm7, %%mm3 \n\t"
1835 "pmaddwd %%mm5, %%mm0 \n\t"
1836 "pmaddwd %%mm6, %%mm1 \n\t"
1837 "pmaddwd %%mm5, %%mm2 \n\t"
1838 "pmaddwd %%mm6, %%mm3 \n\t"
1839 "paddd %%mm1, %%mm0 \n\t"
1840 "paddd %%mm3, %%mm2 \n\t"
1841 "paddd %%mm4, %%mm0 \n\t"
1842 "paddd %%mm4, %%mm2 \n\t"
1843 "psrad $15, %%mm0 \n\t"
1844 "psrad $15, %%mm2 \n\t"
1845 "packssdw %%mm2, %%mm0 \n\t"
1846 "packuswb %%mm0, %%mm0 \n\t"
1847 "movd %%mm0, (%1, %%"REG_a") \n\t"
1848 "add $4, %%"REG_a" \n\t"
1851 : "r" (dst+width), "g" ((x86_reg)-width)
1856 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, int srcFormat)
1859 "movq 24+%4, %%mm6 \n\t"
1860 "mov %3, %%"REG_a" \n\t"
1861 "pxor %%mm7, %%mm7 \n\t"
1863 PREFETCH" 64(%0) \n\t"
1864 "movd (%0), %%mm0 \n\t"
1865 "movd 2(%0), %%mm1 \n\t"
1866 "punpcklbw %%mm7, %%mm0 \n\t"
1867 "punpcklbw %%mm7, %%mm1 \n\t"
1868 "movq %%mm0, %%mm2 \n\t"
1869 "movq %%mm1, %%mm3 \n\t"
1870 "pmaddwd %4, %%mm0 \n\t"
1871 "pmaddwd 8+%4, %%mm1 \n\t"
1872 "pmaddwd 16+%4, %%mm2 \n\t"
1873 "pmaddwd %%mm6, %%mm3 \n\t"
1874 "paddd %%mm1, %%mm0 \n\t"
1875 "paddd %%mm3, %%mm2 \n\t"
1877 "movd 6(%0), %%mm1 \n\t"
1878 "movd 8(%0), %%mm3 \n\t"
1880 "punpcklbw %%mm7, %%mm1 \n\t"
1881 "punpcklbw %%mm7, %%mm3 \n\t"
1882 "movq %%mm1, %%mm4 \n\t"
1883 "movq %%mm3, %%mm5 \n\t"
1884 "pmaddwd %4, %%mm1 \n\t"
1885 "pmaddwd 8+%4, %%mm3 \n\t"
1886 "pmaddwd 16+%4, %%mm4 \n\t"
1887 "pmaddwd %%mm6, %%mm5 \n\t"
1888 "paddd %%mm3, %%mm1 \n\t"
1889 "paddd %%mm5, %%mm4 \n\t"
1891 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1892 "paddd %%mm3, %%mm0 \n\t"
1893 "paddd %%mm3, %%mm2 \n\t"
1894 "paddd %%mm3, %%mm1 \n\t"
1895 "paddd %%mm3, %%mm4 \n\t"
1896 "psrad $15, %%mm0 \n\t"
1897 "psrad $15, %%mm2 \n\t"
1898 "psrad $15, %%mm1 \n\t"
1899 "psrad $15, %%mm4 \n\t"
1900 "packssdw %%mm1, %%mm0 \n\t"
1901 "packssdw %%mm4, %%mm2 \n\t"
1902 "packuswb %%mm0, %%mm0 \n\t"
1903 "packuswb %%mm2, %%mm2 \n\t"
1904 "movd %%mm0, (%1, %%"REG_a") \n\t"
1905 "movd %%mm2, (%2, %%"REG_a") \n\t"
1906 "add $4, %%"REG_a" \n\t"
1909 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1915 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1917 #if COMPILE_TEMPLATE_MMX
1918 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1921 for (i=0; i<width; i++)
1927 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1929 #endif /* COMPILE_TEMPLATE_MMX */
1932 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1934 #if COMPILE_TEMPLATE_MMX
1935 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1938 for (i=0; i<width; i++)
1940 int b= src1[3*i + 0];
1941 int g= src1[3*i + 1];
1942 int r= src1[3*i + 2];
1944 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1945 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1947 #endif /* COMPILE_TEMPLATE_MMX */
1948 assert(src1 == src2);
1951 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1954 for (i=0; i<width; i++)
1956 int b= src1[6*i + 0] + src1[6*i + 3];
1957 int g= src1[6*i + 1] + src1[6*i + 4];
1958 int r= src1[6*i + 2] + src1[6*i + 5];
1960 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1961 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1963 assert(src1 == src2);
1966 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1968 #if COMPILE_TEMPLATE_MMX
1969 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1972 for (i=0; i<width; i++)
1978 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1983 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1985 #if COMPILE_TEMPLATE_MMX
1987 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1991 for (i=0; i<width; i++)
1993 int r= src1[3*i + 0];
1994 int g= src1[3*i + 1];
1995 int b= src1[3*i + 2];
1997 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1998 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2003 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2007 for (i=0; i<width; i++)
2009 int r= src1[6*i + 0] + src1[6*i + 3];
2010 int g= src1[6*i + 1] + src1[6*i + 4];
2011 int b= src1[6*i + 2] + src1[6*i + 5];
2013 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2014 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2019 // bilinear / bicubic scaling
2020 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2021 const int16_t *filter, const int16_t *filterPos, long filterSize)
2023 #if COMPILE_TEMPLATE_MMX
2024 assert(filterSize % 4 == 0 && filterSize>0);
2025 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2027 x86_reg counter= -2*dstW;
2029 filterPos-= counter/2;
2033 "push %%"REG_b" \n\t"
2035 "pxor %%mm7, %%mm7 \n\t"
2036 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2037 "mov %%"REG_a", %%"REG_BP" \n\t"
2040 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2041 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2042 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2043 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2044 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2045 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2046 "punpcklbw %%mm7, %%mm0 \n\t"
2047 "punpcklbw %%mm7, %%mm2 \n\t"
2048 "pmaddwd %%mm1, %%mm0 \n\t"
2049 "pmaddwd %%mm2, %%mm3 \n\t"
2050 "movq %%mm0, %%mm4 \n\t"
2051 "punpckldq %%mm3, %%mm0 \n\t"
2052 "punpckhdq %%mm3, %%mm4 \n\t"
2053 "paddd %%mm4, %%mm0 \n\t"
2054 "psrad $7, %%mm0 \n\t"
2055 "packssdw %%mm0, %%mm0 \n\t"
2056 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2057 "add $4, %%"REG_BP" \n\t"
2060 "pop %%"REG_BP" \n\t"
2062 "pop %%"REG_b" \n\t"
2065 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2071 else if (filterSize==8)
2073 x86_reg counter= -2*dstW;
2075 filterPos-= counter/2;
2079 "push %%"REG_b" \n\t"
2081 "pxor %%mm7, %%mm7 \n\t"
2082 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2083 "mov %%"REG_a", %%"REG_BP" \n\t"
2086 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2087 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2088 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2089 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2090 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2091 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2092 "punpcklbw %%mm7, %%mm0 \n\t"
2093 "punpcklbw %%mm7, %%mm2 \n\t"
2094 "pmaddwd %%mm1, %%mm0 \n\t"
2095 "pmaddwd %%mm2, %%mm3 \n\t"
2097 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2098 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2099 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2100 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2101 "punpcklbw %%mm7, %%mm4 \n\t"
2102 "punpcklbw %%mm7, %%mm2 \n\t"
2103 "pmaddwd %%mm1, %%mm4 \n\t"
2104 "pmaddwd %%mm2, %%mm5 \n\t"
2105 "paddd %%mm4, %%mm0 \n\t"
2106 "paddd %%mm5, %%mm3 \n\t"
2107 "movq %%mm0, %%mm4 \n\t"
2108 "punpckldq %%mm3, %%mm0 \n\t"
2109 "punpckhdq %%mm3, %%mm4 \n\t"
2110 "paddd %%mm4, %%mm0 \n\t"
2111 "psrad $7, %%mm0 \n\t"
2112 "packssdw %%mm0, %%mm0 \n\t"
2113 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2114 "add $4, %%"REG_BP" \n\t"
2117 "pop %%"REG_BP" \n\t"
2119 "pop %%"REG_b" \n\t"
2122 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2130 uint8_t *offset = src+filterSize;
2131 x86_reg counter= -2*dstW;
2132 //filter-= counter*filterSize/2;
2133 filterPos-= counter/2;
2136 "pxor %%mm7, %%mm7 \n\t"
2139 "mov %2, %%"REG_c" \n\t"
2140 "movzwl (%%"REG_c", %0), %%eax \n\t"
2141 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2142 "mov %5, %%"REG_c" \n\t"
2143 "pxor %%mm4, %%mm4 \n\t"
2144 "pxor %%mm5, %%mm5 \n\t"
2146 "movq (%1), %%mm1 \n\t"
2147 "movq (%1, %6), %%mm3 \n\t"
2148 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2149 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2150 "punpcklbw %%mm7, %%mm0 \n\t"
2151 "punpcklbw %%mm7, %%mm2 \n\t"
2152 "pmaddwd %%mm1, %%mm0 \n\t"
2153 "pmaddwd %%mm2, %%mm3 \n\t"
2154 "paddd %%mm3, %%mm5 \n\t"
2155 "paddd %%mm0, %%mm4 \n\t"
2157 "add $4, %%"REG_c" \n\t"
2158 "cmp %4, %%"REG_c" \n\t"
2161 "movq %%mm4, %%mm0 \n\t"
2162 "punpckldq %%mm5, %%mm4 \n\t"
2163 "punpckhdq %%mm5, %%mm0 \n\t"
2164 "paddd %%mm0, %%mm4 \n\t"
2165 "psrad $7, %%mm4 \n\t"
2166 "packssdw %%mm4, %%mm4 \n\t"
2167 "mov %3, %%"REG_a" \n\t"
2168 "movd %%mm4, (%%"REG_a", %0) \n\t"
2172 : "+r" (counter), "+r" (filter)
2173 : "m" (filterPos), "m" (dst), "m"(offset),
2174 "m" (src), "r" ((x86_reg)filterSize*2)
2175 : "%"REG_a, "%"REG_c, "%"REG_d
2179 #if COMPILE_TEMPLATE_ALTIVEC
2180 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2183 for (i=0; i<dstW; i++)
2186 int srcPos= filterPos[i];
2188 //printf("filterPos: %d\n", filterPos[i]);
2189 for (j=0; j<filterSize; j++)
2191 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2192 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2194 //filter += hFilterSize;
2195 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2198 #endif /* COMPILE_ALTIVEC */
2199 #endif /* COMPILE_MMX */
2202 #define FAST_BILINEAR_X86 \
2203 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2204 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2205 "shll $16, %%edi \n\t" \
2206 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2207 "mov %1, %%"REG_D"\n\t" \
2208 "shrl $9, %%esi \n\t" \
2210 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2211 int dstWidth, const uint8_t *src, int srcW,
2215 unsigned int xpos=0;
2216 for (i=0;i<dstWidth;i++)
2218 register unsigned int xx=xpos>>16;
2219 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2220 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2225 // *** horizontal scale Y line to temp buffer
2226 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2227 int flags, const int16_t *hLumFilter,
2228 const int16_t *hLumFilterPos, int hLumFilterSize,
2229 int srcFormat, uint8_t *formatConvBuffer,
2230 uint32_t *pal, int isAlpha)
2232 int32_t av_unused *mmx2FilterPos = c->lumMmx2FilterPos;
2233 int16_t av_unused *mmx2Filter = c->lumMmx2Filter;
2234 int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
2235 void av_unused *funnyYCode = c->funnyYCode;
2236 void (*internal_func)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->hascale_internal : c->hyscale_internal;
2239 if (srcFormat == PIX_FMT_RGB32 || srcFormat == PIX_FMT_BGR32 )
2242 if (srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1)
2246 if (srcFormat == PIX_FMT_RGB48LE)
2249 if (internal_func) {
2250 internal_func(formatConvBuffer, src, srcW, pal);
2251 src= formatConvBuffer;
2254 #if COMPILE_TEMPLATE_MMX
2255 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2256 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2258 if (!(flags&SWS_FAST_BILINEAR))
2261 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2263 else // fast bilinear upscale / crap downscale
2265 #if ARCH_X86 && CONFIG_GPL
2266 #if COMPILE_TEMPLATE_MMX2
2269 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2275 "mov %%"REG_b", %5 \n\t"
2277 "pxor %%mm7, %%mm7 \n\t"
2278 "mov %0, %%"REG_c" \n\t"
2279 "mov %1, %%"REG_D" \n\t"
2280 "mov %2, %%"REG_d" \n\t"
2281 "mov %3, %%"REG_b" \n\t"
2282 "xor %%"REG_a", %%"REG_a" \n\t" // i
2283 PREFETCH" (%%"REG_c") \n\t"
2284 PREFETCH" 32(%%"REG_c") \n\t"
2285 PREFETCH" 64(%%"REG_c") \n\t"
2289 #define FUNNY_Y_CODE \
2290 "movl (%%"REG_b"), %%esi \n\t"\
2292 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2293 "add %%"REG_S", %%"REG_c" \n\t"\
2294 "add %%"REG_a", %%"REG_D" \n\t"\
2295 "xor %%"REG_a", %%"REG_a" \n\t"\
2299 #define FUNNY_Y_CODE \
2300 "movl (%%"REG_b"), %%esi \n\t"\
2302 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2303 "add %%"REG_a", %%"REG_D" \n\t"\
2304 "xor %%"REG_a", %%"REG_a" \n\t"\
2306 #endif /* ARCH_X86_64 */
2318 "mov %5, %%"REG_b" \n\t"
2320 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2325 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2330 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2334 #endif /* COMPILE_TEMPLATE_MMX2 */
2335 x86_reg xInc_shr16 = xInc >> 16;
2336 uint16_t xInc_mask = xInc & 0xffff;
2337 //NO MMX just normal asm ...
2339 "xor %%"REG_a", %%"REG_a" \n\t" // i
2340 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2341 "xorl %%ecx, %%ecx \n\t" // xalpha
2344 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2345 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2347 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2348 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2349 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2351 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2352 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2354 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2355 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2356 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2359 "add $2, %%"REG_a" \n\t"
2360 "cmp %2, %%"REG_a" \n\t"
2364 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2365 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2367 #if COMPILE_TEMPLATE_MMX2
2368 } //if MMX2 can't be used
2371 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2372 #endif /* ARCH_X86 */
2375 if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2377 //FIXME all pal and rgb srcFormats could do this convertion as well
2378 //FIXME all scalers more complex than bilinear could do half of this transform
2380 for (i=0; i<dstWidth; i++)
2381 dst[i]= (dst[i]*14071 + 33561947)>>14;
2383 for (i=0; i<dstWidth; i++)
2384 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2389 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2390 int dstWidth, const uint8_t *src1,
2391 const uint8_t *src2, int srcW, int xInc)
2394 unsigned int xpos=0;
2395 for (i=0;i<dstWidth;i++)
2397 register unsigned int xx=xpos>>16;
2398 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2399 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2400 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2402 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2403 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2409 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2410 int srcW, int xInc, int flags, const int16_t *hChrFilter,
2411 const int16_t *hChrFilterPos, int hChrFilterSize,
2412 int srcFormat, uint8_t *formatConvBuffer,
2415 int32_t av_unused *mmx2FilterPos = c->chrMmx2FilterPos;
2416 int16_t av_unused *mmx2Filter = c->chrMmx2Filter;
2417 int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
2418 void av_unused *funnyUVCode = c->funnyUVCode;
2420 if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2423 if (srcFormat==PIX_FMT_RGB32_1 || srcFormat==PIX_FMT_BGR32_1) {
2428 if (srcFormat==PIX_FMT_RGB48LE) {
2433 if (c->hcscale_internal) {
2434 c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2435 src1= formatConvBuffer;
2436 src2= formatConvBuffer+VOFW;
2439 #if COMPILE_TEMPLATE_MMX
2440 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2441 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2443 if (!(flags&SWS_FAST_BILINEAR))
2446 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2447 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2449 else // fast bilinear upscale / crap downscale
2451 #if ARCH_X86 && CONFIG_GPL
2452 #if COMPILE_TEMPLATE_MMX2
2455 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2461 "mov %%"REG_b", %6 \n\t"
2463 "pxor %%mm7, %%mm7 \n\t"
2464 "mov %0, %%"REG_c" \n\t"
2465 "mov %1, %%"REG_D" \n\t"
2466 "mov %2, %%"REG_d" \n\t"
2467 "mov %3, %%"REG_b" \n\t"
2468 "xor %%"REG_a", %%"REG_a" \n\t" // i
2469 PREFETCH" (%%"REG_c") \n\t"
2470 PREFETCH" 32(%%"REG_c") \n\t"
2471 PREFETCH" 64(%%"REG_c") \n\t"
2475 #define FUNNY_UV_CODE \
2476 "movl (%%"REG_b"), %%esi \n\t"\
2478 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2479 "add %%"REG_S", %%"REG_c" \n\t"\
2480 "add %%"REG_a", %%"REG_D" \n\t"\
2481 "xor %%"REG_a", %%"REG_a" \n\t"\
2485 #define FUNNY_UV_CODE \
2486 "movl (%%"REG_b"), %%esi \n\t"\
2488 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2489 "add %%"REG_a", %%"REG_D" \n\t"\
2490 "xor %%"REG_a", %%"REG_a" \n\t"\
2492 #endif /* ARCH_X86_64 */
2498 "xor %%"REG_a", %%"REG_a" \n\t" // i
2499 "mov %5, %%"REG_c" \n\t" // src
2500 "mov %1, %%"REG_D" \n\t" // buf1
2501 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2502 PREFETCH" (%%"REG_c") \n\t"
2503 PREFETCH" 32(%%"REG_c") \n\t"
2504 PREFETCH" 64(%%"REG_c") \n\t"
2512 "mov %6, %%"REG_b" \n\t"
2514 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2515 "m" (funnyUVCode), "m" (src2)
2519 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2524 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2526 //printf("%d %d %d\n", dstWidth, i, srcW);
2527 dst[i] = src1[srcW-1]*128;
2528 dst[i+VOFW] = src2[srcW-1]*128;
2533 #endif /* COMPILE_TEMPLATE_MMX2 */
2534 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2535 uint16_t xInc_mask = xInc & 0xffff;
2537 "xor %%"REG_a", %%"REG_a" \n\t" // i
2538 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2539 "xorl %%ecx, %%ecx \n\t" // xalpha
2542 "mov %0, %%"REG_S" \n\t"
2543 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2544 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2546 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2548 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2549 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2551 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2553 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2554 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2555 "add $1, %%"REG_a" \n\t"
2556 "cmp %2, %%"REG_a" \n\t"
2559 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2560 which is needed to support GCC 4.0. */
2561 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2562 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2564 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2567 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2569 #if COMPILE_TEMPLATE_MMX2
2570 } //if MMX2 can't be used
2573 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2574 #endif /* ARCH_X86 */
2576 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2578 //FIXME all pal and rgb srcFormats could do this convertion as well
2579 //FIXME all scalers more complex than bilinear could do half of this transform
2581 for (i=0; i<dstWidth; i++){
2582 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2583 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2586 for (i=0; i<dstWidth; i++){
2587 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2588 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2594 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2595 int srcSliceH, uint8_t* dst[], int dstStride[]){
2597 /* load a few things into local vars to make the code more readable? and faster */
2598 const int srcW= c->srcW;
2599 const int dstW= c->dstW;
2600 const int dstH= c->dstH;
2601 const int chrDstW= c->chrDstW;
2602 const int chrSrcW= c->chrSrcW;
2603 const int lumXInc= c->lumXInc;
2604 const int chrXInc= c->chrXInc;
2605 const int dstFormat= c->dstFormat;
2606 const int srcFormat= c->srcFormat;
2607 const int flags= c->flags;
2608 int16_t *vLumFilterPos= c->vLumFilterPos;
2609 int16_t *vChrFilterPos= c->vChrFilterPos;
2610 int16_t *hLumFilterPos= c->hLumFilterPos;
2611 int16_t *hChrFilterPos= c->hChrFilterPos;
2612 int16_t *vLumFilter= c->vLumFilter;
2613 int16_t *vChrFilter= c->vChrFilter;
2614 int16_t *hLumFilter= c->hLumFilter;
2615 int16_t *hChrFilter= c->hChrFilter;
2616 int32_t *lumMmxFilter= c->lumMmxFilter;
2617 int32_t *chrMmxFilter= c->chrMmxFilter;
2618 int32_t *alpMmxFilter= c->alpMmxFilter;
2619 const int vLumFilterSize= c->vLumFilterSize;
2620 const int vChrFilterSize= c->vChrFilterSize;
2621 const int hLumFilterSize= c->hLumFilterSize;
2622 const int hChrFilterSize= c->hChrFilterSize;
2623 int16_t **lumPixBuf= c->lumPixBuf;
2624 int16_t **chrPixBuf= c->chrPixBuf;
2625 int16_t **alpPixBuf= c->alpPixBuf;
2626 const int vLumBufSize= c->vLumBufSize;
2627 const int vChrBufSize= c->vChrBufSize;
2628 uint8_t *formatConvBuffer= c->formatConvBuffer;
2629 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2630 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2632 uint32_t *pal=c->pal_yuv;
2634 /* vars which will change and which we need to store back in the context */
2636 int lumBufIndex= c->lumBufIndex;
2637 int chrBufIndex= c->chrBufIndex;
2638 int lastInLumBuf= c->lastInLumBuf;
2639 int lastInChrBuf= c->lastInChrBuf;
2641 if (isPacked(c->srcFormat)){
2649 srcStride[3]= srcStride[0];
2651 srcStride[1]<<= c->vChrDrop;
2652 srcStride[2]<<= c->vChrDrop;
2654 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2655 // (int)dst[0], (int)dst[1], (int)dst[2]);
2657 #if 0 //self test FIXME move to a vfilter or something
2659 static volatile int i=0;
2661 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2662 selfTest(src, srcStride, c->srcW, c->srcH);
2667 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2668 //dstStride[0],dstStride[1],dstStride[2]);
2670 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0)
2672 static int warnedAlready=0; //FIXME move this into the context perhaps
2673 if (flags & SWS_PRINT_INFO && !warnedAlready)
2675 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2676 " ->cannot do aligned memory accesses anymore\n");
2681 /* Note the user might start scaling the picture in the middle so this
2682 will not get executed. This is not really intended but works
2683 currently, so people might do it. */
2694 for (;dstY < dstH; dstY++){
2695 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2696 const int chrDstY= dstY>>c->chrDstVSubSample;
2697 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2698 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2699 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2701 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2702 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2703 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2704 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2707 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2708 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2709 //handle holes (FAST_BILINEAR & weird filters)
2710 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2711 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2712 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2713 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2714 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2716 // Do we have enough lines in this slice to output the dstY line
2717 enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2718 if (!enough_lines) {
2719 lastLumSrcY = srcSliceY + srcSliceH - 1;
2720 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2723 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2724 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2725 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2726 vChrBufSize, vLumBufSize);*/
2728 //Do horizontal scaling
2729 while(lastInLumBuf < lastLumSrcY)
2731 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2732 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2734 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2735 assert(lumBufIndex < 2*vLumBufSize);
2736 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2737 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2738 //printf("%d %d\n", lumBufIndex, vLumBufSize);
2739 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2740 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2741 c->srcFormat, formatConvBuffer,
2743 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2744 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2745 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2746 c->srcFormat, formatConvBuffer,
2750 while(lastInChrBuf < lastChrSrcY)
2752 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2753 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2755 assert(chrBufIndex < 2*vChrBufSize);
2756 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2757 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2758 //FIXME replace parameters through context struct (some at least)
2760 if (!(isGray(srcFormat) || isGray(dstFormat)))
2761 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2762 flags, hChrFilter, hChrFilterPos, hChrFilterSize,
2763 c->srcFormat, formatConvBuffer,
2767 //wrap buf index around to stay inside the ring buffer
2768 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2769 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2771 break; //we can't output a dstY line so let's try with the next slice
2773 #if COMPILE_TEMPLATE_MMX
2774 c->blueDither= ff_dither8[dstY&1];
2775 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2776 c->greenDither= ff_dither8[dstY&1];
2778 c->greenDither= ff_dither4[dstY&1];
2779 c->redDither= ff_dither8[(dstY+1)&1];
2783 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2784 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2785 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2786 #if COMPILE_TEMPLATE_MMX
2788 if (flags & SWS_ACCURATE_RND){
2789 int s= APCK_SIZE / 8;
2790 for (i=0; i<vLumFilterSize; i+=2){
2791 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2792 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2793 lumMmxFilter[s*i+APCK_COEF/4 ]=
2794 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2795 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2796 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
2797 *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2798 *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2799 alpMmxFilter[s*i+APCK_COEF/4 ]=
2800 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2803 for (i=0; i<vChrFilterSize; i+=2){
2804 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2805 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2806 chrMmxFilter[s*i+APCK_COEF/4 ]=
2807 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2808 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2811 for (i=0; i<vLumFilterSize; i++)
2813 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2814 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2815 lumMmxFilter[4*i+2]=
2816 lumMmxFilter[4*i+3]=
2817 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2818 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
2819 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2820 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2821 alpMmxFilter[4*i+2]=
2822 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2825 for (i=0; i<vChrFilterSize; i++)
2827 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2828 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2829 chrMmxFilter[4*i+2]=
2830 chrMmxFilter[4*i+3]=
2831 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2835 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
2836 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2837 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2839 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2840 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2841 dest, uDest, dstW, chrDstW, dstFormat);
2843 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
2845 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2846 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2847 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
2849 int16_t *lumBuf = lumPixBuf[0];
2850 int16_t *chrBuf= chrPixBuf[0];
2851 int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf[0] : NULL;
2852 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2857 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2858 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2859 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2864 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2865 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2866 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
2868 int chrAlpha= vChrFilter[2*dstY+1];
2869 if(flags & SWS_FULL_CHR_H_INT){
2870 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2871 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2872 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2873 alpSrcPtr, dest, dstW, dstY);
2875 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2876 alpPixBuf ? *alpSrcPtr : NULL,
2877 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2880 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
2882 int lumAlpha= vLumFilter[2*dstY+1];
2883 int chrAlpha= vChrFilter[2*dstY+1];
2885 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2887 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2888 if(flags & SWS_FULL_CHR_H_INT){
2889 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2890 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2891 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2892 alpSrcPtr, dest, dstW, dstY);
2894 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2895 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2896 dest, dstW, lumAlpha, chrAlpha, dstY);
2901 if(flags & SWS_FULL_CHR_H_INT){
2903 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2904 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2905 alpSrcPtr, dest, dstW, dstY);
2908 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2909 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2910 alpSrcPtr, dest, dstW, dstY);
2915 else // hmm looks like we can't use MMX here without overwriting this array's tail
2917 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2918 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2919 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2920 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
2921 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2922 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2924 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2925 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2926 dest, uDest, dstW, chrDstW, dstFormat);
2928 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
2930 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2931 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2933 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2934 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2935 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2939 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2940 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2941 if(flags & SWS_FULL_CHR_H_INT){
2943 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2944 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2945 alpSrcPtr, dest, dstW, dstY);
2948 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2949 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2950 alpSrcPtr, dest, dstW, dstY);
2956 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2957 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2959 #if COMPILE_TEMPLATE_MMX
2960 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
2961 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2962 if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
2963 else __asm__ volatile("emms" :::"memory");
2965 /* store changed local vars back in the context */
2967 c->lumBufIndex= lumBufIndex;
2968 c->chrBufIndex= chrBufIndex;
2969 c->lastInLumBuf= lastInLumBuf;
2970 c->lastInChrBuf= lastInChrBuf;
2972 return dstY - lastDstY;
2975 static void RENAME(sws_init_swScale)(SwsContext *c)
2977 enum PixelFormat srcFormat = c->srcFormat;
2979 c->yuv2nv12X = RENAME(yuv2nv12X );
2980 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2981 c->yuv2yuvX = RENAME(yuv2yuvX );
2982 c->yuv2packed1 = RENAME(yuv2packed1 );
2983 c->yuv2packed2 = RENAME(yuv2packed2 );
2984 c->yuv2packedX = RENAME(yuv2packedX );
2986 c->hScale = RENAME(hScale );
2988 c->hyscale_fast = RENAME(hyscale_fast);
2989 c->hcscale_fast = RENAME(hcscale_fast);
2991 c->hcscale_internal = NULL;
2993 case PIX_FMT_YUYV422 : c->hcscale_internal = RENAME(yuy2ToUV); break;
2994 case PIX_FMT_UYVY422 : c->hcscale_internal = RENAME(uyvyToUV); break;
2998 case PIX_FMT_BGR4_BYTE:
2999 case PIX_FMT_RGB4_BYTE: c->hcscale_internal = palToUV; break;
3000 case PIX_FMT_YUV420PBE:
3001 case PIX_FMT_YUV422PBE:
3002 case PIX_FMT_YUV444PBE: c->hcscale_internal = RENAME(BEToUV); break;
3003 case PIX_FMT_YUV420PLE:
3004 case PIX_FMT_YUV422PLE:
3005 case PIX_FMT_YUV444PLE: c->hcscale_internal = RENAME(LEToUV); break;
3007 if (c->chrSrcHSubSample) {
3009 case PIX_FMT_RGB48BE:
3010 case PIX_FMT_RGB48LE: c->hcscale_internal = rgb48ToUV_half; break;
3011 case PIX_FMT_RGB32 :
3012 case PIX_FMT_RGB32_1: c->hcscale_internal = bgr32ToUV_half; break;
3013 case PIX_FMT_BGR24 : c->hcscale_internal = RENAME(bgr24ToUV_half); break;
3014 case PIX_FMT_BGR565 : c->hcscale_internal = bgr16ToUV_half; break;
3015 case PIX_FMT_BGR555 : c->hcscale_internal = bgr15ToUV_half; break;
3016 case PIX_FMT_BGR32 :
3017 case PIX_FMT_BGR32_1: c->hcscale_internal = rgb32ToUV_half; break;
3018 case PIX_FMT_RGB24 : c->hcscale_internal = RENAME(rgb24ToUV_half); break;
3019 case PIX_FMT_RGB565 : c->hcscale_internal = rgb16ToUV_half; break;
3020 case PIX_FMT_RGB555 : c->hcscale_internal = rgb15ToUV_half; break;
3024 case PIX_FMT_RGB48BE:
3025 case PIX_FMT_RGB48LE: c->hcscale_internal = rgb48ToUV; break;
3026 case PIX_FMT_RGB32 :
3027 case PIX_FMT_RGB32_1: c->hcscale_internal = bgr32ToUV; break;
3028 case PIX_FMT_BGR24 : c->hcscale_internal = RENAME(bgr24ToUV); break;
3029 case PIX_FMT_BGR565 : c->hcscale_internal = bgr16ToUV; break;
3030 case PIX_FMT_BGR555 : c->hcscale_internal = bgr15ToUV; break;
3031 case PIX_FMT_BGR32 :
3032 case PIX_FMT_BGR32_1: c->hcscale_internal = rgb32ToUV; break;
3033 case PIX_FMT_RGB24 : c->hcscale_internal = RENAME(rgb24ToUV); break;
3034 case PIX_FMT_RGB565 : c->hcscale_internal = rgb16ToUV; break;
3035 case PIX_FMT_RGB555 : c->hcscale_internal = rgb15ToUV; break;
3039 c->hyscale_internal = NULL;
3040 c->hascale_internal = NULL;
3041 switch (srcFormat) {
3042 case PIX_FMT_YUYV422 :
3043 case PIX_FMT_YUV420PBE:
3044 case PIX_FMT_YUV422PBE:
3045 case PIX_FMT_YUV444PBE:
3046 case PIX_FMT_GRAY16BE : c->hyscale_internal = RENAME(yuy2ToY); break;
3047 case PIX_FMT_UYVY422 :
3048 case PIX_FMT_YUV420PLE:
3049 case PIX_FMT_YUV422PLE:
3050 case PIX_FMT_YUV444PLE:
3051 case PIX_FMT_GRAY16LE : c->hyscale_internal = RENAME(uyvyToY); break;
3052 case PIX_FMT_BGR24 : c->hyscale_internal = RENAME(bgr24ToY); break;
3053 case PIX_FMT_BGR565 : c->hyscale_internal = bgr16ToY; break;
3054 case PIX_FMT_BGR555 : c->hyscale_internal = bgr15ToY; break;
3055 case PIX_FMT_RGB24 : c->hyscale_internal = RENAME(rgb24ToY); break;
3056 case PIX_FMT_RGB565 : c->hyscale_internal = rgb16ToY; break;
3057 case PIX_FMT_RGB555 : c->hyscale_internal = rgb15ToY; break;
3061 case PIX_FMT_BGR4_BYTE:
3062 case PIX_FMT_RGB4_BYTE: c->hyscale_internal = palToY; break;
3063 case PIX_FMT_MONOBLACK: c->hyscale_internal = monoblack2Y; break;
3064 case PIX_FMT_MONOWHITE: c->hyscale_internal = monowhite2Y; break;
3065 case PIX_FMT_RGB32 :
3066 case PIX_FMT_RGB32_1: c->hyscale_internal = bgr32ToY; break;
3067 case PIX_FMT_BGR32 :
3068 case PIX_FMT_BGR32_1: c->hyscale_internal = rgb32ToY; break;
3069 case PIX_FMT_RGB48BE:
3070 case PIX_FMT_RGB48LE: c->hyscale_internal = rgb48ToY; break;
3073 switch (srcFormat) {
3074 case PIX_FMT_RGB32 :
3075 case PIX_FMT_RGB32_1:
3076 case PIX_FMT_BGR32 :
3077 case PIX_FMT_BGR32_1: c->hascale_internal = abgrToA; break;