2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
/*
 * Prefetch mnemonics spliced into the inline asm strings.
 * COMPILE_TEMPLATE_AMD3DNOW selects prefetch/prefetchw, COMPILE_TEMPLATE_MMX2 selects
 * prefetchnta/prefetcht0; the final pair expands to the text " # nop" (presumably an
 * assembler comment, so no instruction is emitted).
 * NOTE(review): no #else/#endif is visible around the final pair in this view -- confirm
 * against the full file.
 */
30 #if COMPILE_TEMPLATE_AMD3DNOW
31 #define PREFETCH "prefetch"
32 #define PREFETCHW "prefetchw"
33 #elif COMPILE_TEMPLATE_MMX2
34 #define PREFETCH "prefetchnta"
35 #define PREFETCHW "prefetcht0"
37 #define PREFETCH " # nop"
38 #define PREFETCHW " # nop"
/*
 * PAVGB(a,b): emits one byte-average instruction with operands a, b (stringified).
 * The mnemonic is pavgb under COMPILE_TEMPLATE_MMX2 and pavgusb under
 * COMPILE_TEMPLATE_AMD3DNOW; no definition is visible for other configurations.
 * NOTE(review): the closing #endif is not visible in this view.
 */
41 #if COMPILE_TEMPLATE_MMX2
42 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
43 #elif COMPILE_TEMPLATE_AMD3DNOW
44 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
/*
 * MOVNTQ(a,b): 64-bit store of a to b, emitted as movntq under COMPILE_TEMPLATE_MMX2
 * and as a plain movq in the other definition.
 * MOVNTQ forwards to REAL_MOVNTQ so that macro arguments are expanded before the
 * # operator stringifies them.
 * NOTE(review): the #else/#endif separating the two REAL_MOVNTQ definitions is not
 * visible in this view.
 */
47 #if COMPILE_TEMPLATE_MMX2
48 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
50 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
52 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
54 #if COMPILE_TEMPLATE_ALTIVEC
55 #include "ppc/swscale_altivec_template.c"
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 * Inline-asm fragment: multi-tap filter that writes 8 output bytes per pass.
 *   x      - string, byte displacement added to every source load
 *   offset - string, displacement from operand %0 to the filter list
 *   dest   - output pointer (operand %1)
 *   width  - limit compared against the running index in REG_a (operand %2)
 * Operand %0 is &c->redDither. Each filter entry is 16 bytes: a source pointer at +0 and
 * the coefficients at +8; the next entry's pointer is tested for zero after each tap.
 * mm3/mm4 start from the value at VROUNDER_OFFSET(%0), accumulate the pmulhw products,
 * are shifted right by 3 and packed to bytes with unsigned saturation.
 * Declared clobbers: REG_a, REG_d, REG_S. MMX registers used: mm0, mm2-mm5.
 * NOTE(review): the asm statement opener, loop labels and branch instructions are not
 * visible in this view.
 */
58 #define YSCALEYUV2YV12X(x, offset, dest, width) \
60 "xor %%"REG_a", %%"REG_a" \n\t"\
61 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
62 "movq %%mm3, %%mm4 \n\t"\
63 "lea " offset "(%0), %%"REG_d" \n\t"\
64 "mov (%%"REG_d"), %%"REG_S" \n\t"\
65 ASMALIGN(4) /* FIXME Unroll? */\
67 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
68 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
69 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
70 "add $16, %%"REG_d" \n\t"\
71 "mov (%%"REG_d"), %%"REG_S" \n\t"\
72 "test %%"REG_S", %%"REG_S" \n\t"\
73 "pmulhw %%mm0, %%mm2 \n\t"\
74 "pmulhw %%mm0, %%mm5 \n\t"\
75 "paddw %%mm2, %%mm3 \n\t"\
76 "paddw %%mm5, %%mm4 \n\t"\
78 "psraw $3, %%mm3 \n\t"\
79 "psraw $3, %%mm4 \n\t"\
80 "packuswb %%mm4, %%mm3 \n\t"\
81 MOVNTQ(%%mm3, (%1, %%REGa))\
82 "add $8, %%"REG_a" \n\t"\
83 "cmp %2, %%"REG_a" \n\t"\
84 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
85 "movq %%mm3, %%mm4 \n\t"\
86 "lea " offset "(%0), %%"REG_d" \n\t"\
87 "mov (%%"REG_d"), %%"REG_S" \n\t"\
89 :: "r" (&c->redDither),\
90 "r" (dest), "g" (width)\
91 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 * Same parameters and operands as YSCALEYUV2YV12X, but sums are kept in 32 bits.
 * Each filter entry spans APCK_SIZE bytes and carries two source pointers (at +0 and at
 * APCK_PTR2) plus a coefficient word pair at APCK_COEF. Words from the two sources are
 * interleaved (punpcklwd/punpckhwd) and multiplied-and-added with pmaddwd into the
 * dword accumulators mm4..mm7.
 * After the taps: psrad 16, packssdw to words, add the value at VROUNDER_OFFSET(%0),
 * psraw 3, packuswb, then store 8 bytes at (%1 + REG_a).
 * Declared clobbers: REG_a, REG_d, REG_S. MMX registers used: mm0-mm7.
 * NOTE(review): the asm statement opener, loop labels and branch instructions are not
 * visible in this view.
 */
94 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
96 "lea " offset "(%0), %%"REG_d" \n\t"\
97 "xor %%"REG_a", %%"REG_a" \n\t"\
98 "pxor %%mm4, %%mm4 \n\t"\
99 "pxor %%mm5, %%mm5 \n\t"\
100 "pxor %%mm6, %%mm6 \n\t"\
101 "pxor %%mm7, %%mm7 \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
105 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
106 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
107 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
108 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
109 "movq %%mm0, %%mm3 \n\t"\
110 "punpcklwd %%mm1, %%mm0 \n\t"\
111 "punpckhwd %%mm1, %%mm3 \n\t"\
112 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
113 "pmaddwd %%mm1, %%mm0 \n\t"\
114 "pmaddwd %%mm1, %%mm3 \n\t"\
115 "paddd %%mm0, %%mm4 \n\t"\
116 "paddd %%mm3, %%mm5 \n\t"\
117 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
118 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
119 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
120 "test %%"REG_S", %%"REG_S" \n\t"\
121 "movq %%mm2, %%mm0 \n\t"\
122 "punpcklwd %%mm3, %%mm2 \n\t"\
123 "punpckhwd %%mm3, %%mm0 \n\t"\
124 "pmaddwd %%mm1, %%mm2 \n\t"\
125 "pmaddwd %%mm1, %%mm0 \n\t"\
126 "paddd %%mm2, %%mm6 \n\t"\
127 "paddd %%mm0, %%mm7 \n\t"\
129 "psrad $16, %%mm4 \n\t"\
130 "psrad $16, %%mm5 \n\t"\
131 "psrad $16, %%mm6 \n\t"\
132 "psrad $16, %%mm7 \n\t"\
133 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
134 "packssdw %%mm5, %%mm4 \n\t"\
135 "packssdw %%mm7, %%mm6 \n\t"\
136 "paddw %%mm0, %%mm4 \n\t"\
137 "paddw %%mm0, %%mm6 \n\t"\
138 "psraw $3, %%mm4 \n\t"\
139 "psraw $3, %%mm6 \n\t"\
140 "packuswb %%mm6, %%mm4 \n\t"\
141 MOVNTQ(%%mm4, (%1, %%REGa))\
142 "add $8, %%"REG_a" \n\t"\
143 "cmp %2, %%"REG_a" \n\t"\
144 "lea " offset "(%0), %%"REG_d" \n\t"\
145 "pxor %%mm4, %%mm4 \n\t"\
146 "pxor %%mm5, %%mm5 \n\t"\
147 "pxor %%mm6, %%mm6 \n\t"\
148 "pxor %%mm7, %%mm7 \n\t"\
149 "mov (%%"REG_d"), %%"REG_S" \n\t"\
151 :: "r" (&c->redDither),\
152 "r" (dest), "g" (width)\
153 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV121
 * Inline-asm fragment converting 16-bit samples to bytes without filtering.
 * REG_a is loaded from operand %2 and advanced by 8 per pass. Each pass reads 8 words at
 * (%0 + 2*REG_a), shifts them right arithmetically by 7, packs them with unsigned
 * saturation and stores 8 bytes at (%1 + REG_a).
 * NOTE(review): the loop label, the branch and the operand list are not visible in this
 * view; presumably %2 is a negative start index counting up to zero -- confirm against
 * the caller.
 */
156 #define YSCALEYUV2YV121 \
157 "mov %2, %%"REG_a" \n\t"\
158 ASMALIGN(4) /* FIXME Unroll? */\
160 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
161 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
162 "psraw $7, %%mm0 \n\t"\
163 "psraw $7, %%mm1 \n\t"\
164 "packuswb %%mm1, %%mm0 \n\t"\
165 MOVNTQ(%%mm0, (%1, %%REGa))\
166 "add $8, %%"REG_a" \n\t"\
/*
 * YSCALEYUV2YV121_ACCURATE
 * Rounding variant of YSCALEYUV2YV121: the same loads, >>7, pack and store, but a
 * constant of 64 per word is added with signed saturation before the shift.
 * mm7 holds that constant for the whole fragment.
 * NOTE(review): the loop label, the branch and the operand list are not visible in this
 * view.
 */
169 #define YSCALEYUV2YV121_ACCURATE \
170 "mov %2, %%"REG_a" \n\t"\
171 "pcmpeqw %%mm7, %%mm7 \n\t" /* every word = 0xFFFF */\
172 "psrlw $15, %%mm7 \n\t" /* every word = 1 */\
173 "psllw $6, %%mm7 \n\t" /* every word = 64, i.e. half of 1<<7 */\
174 ASMALIGN(4) /* FIXME Unroll? */\
176 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
177 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
178 "paddsw %%mm7, %%mm0 \n\t"\
179 "paddsw %%mm7, %%mm1 \n\t"\
180 "psraw $7, %%mm0 \n\t"\
181 "psraw $7, %%mm1 \n\t"\
182 "packuswb %%mm1, %%mm0 \n\t"\
183 MOVNTQ(%%mm0, (%1, %%REGa))\
184 "add $8, %%"REG_a" \n\t"\
188 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
189 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
190 "r" (dest), "m" (dstW),
191 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
192 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/*
 * YSCALEYUV2PACKEDX_UV
 * Chroma half of the multi-tap packed-output filter. Zeroes the index REG_a, then walks
 * the filter list at CHR_MMX_FILTER_OFFSET(%0) (16-byte entries: source pointer at +0,
 * coefficients at +8). U is read at (ptr + REG_a) and V at (ptr + VOF + REG_a).
 * Results: mm3 = U accumulator, mm4 = V accumulator, both seeded from
 * VROUNDER_OFFSET(%0) and fed with pmulhw products.
 * Uses REG_a, REG_d, REG_S and mm0, mm2-mm5.
 * NOTE(review): the asm statement opener, loop labels and branches are not visible in
 * this view.
 */
194 #define YSCALEYUV2PACKEDX_UV \
196 "xor %%"REG_a", %%"REG_a" \n\t"\
200 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
201 "mov (%%"REG_d"), %%"REG_S" \n\t"\
202 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
203 "movq %%mm3, %%mm4 \n\t"\
206 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
207 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
208 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
209 "add $16, %%"REG_d" \n\t"\
210 "mov (%%"REG_d"), %%"REG_S" \n\t"\
211 "pmulhw %%mm0, %%mm2 \n\t"\
212 "pmulhw %%mm0, %%mm5 \n\t"\
213 "paddw %%mm2, %%mm3 \n\t"\
214 "paddw %%mm5, %%mm4 \n\t"\
215 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 * Multi-tap filter for one plane with caller-chosen MMX registers.
 *   offset     - string, displacement from %0 to the filter list
 *   coeff      - register receiving each entry's coefficients
 *   src1, src2 - scratch registers for the two groups of four source words
 *   dst1, dst2 - accumulators, seeded from VROUNDER_OFFSET(%0)
 * Source words are read at (ptr + 2*REG_a) and (ptr + 8 + 2*REG_a). REG_a is used as
 * found; this fragment does not initialise it.
 * NOTE(review): loop labels and branches are not visible in this view.
 */
218 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
219 "lea "offset"(%0), %%"REG_d" \n\t"\
220 "mov (%%"REG_d"), %%"REG_S" \n\t"\
221 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
222 "movq "#dst1", "#dst2" \n\t"\
225 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
226 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
227 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
228 "add $16, %%"REG_d" \n\t"\
229 "mov (%%"REG_d"), %%"REG_S" \n\t"\
230 "pmulhw "#coeff", "#src1" \n\t"\
231 "pmulhw "#coeff", "#src2" \n\t"\
232 "paddw "#src1", "#dst1" \n\t"\
233 "paddw "#src2", "#dst2" \n\t"\
234 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX
 * Chroma pass (YSCALEYUV2PACKEDX_UV) followed by a luma pass over the list at
 * LUM_MMX_FILTER_OFFSET with coeff = mm0, sources = mm2/mm5, accumulators = mm1/mm7.
 * Afterwards mm3/mm4 hold the chroma accumulators and mm1/mm7 the luma accumulators.
 */
237 #define YSCALEYUV2PACKEDX \
238 YSCALEYUV2PACKEDX_UV \
239 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
/*
 * YSCALEYUV2PACKEDX_END
 * Operand and clobber lists closing an asm statement built from the PACKEDX fragments.
 * Inputs: %0 = &c->redDither (register), %1..%3 = dummy (memory placeholders that keep
 * the operand numbering), %4 = dest (register), %5 = dstW (memory).
 * Clobbers: REG_a, REG_d, REG_S.
 * NOTE(review): the closing parenthesis of the asm statement is not visible in this view.
 */
241 #define YSCALEYUV2PACKEDX_END \
242 :: "r" (&c->redDither), \
243 "m" (dummy), "m" (dummy), "m" (dummy),\
244 "r" (dest), "m" (dstW) \
245 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV
 * 32-bit-accumulator version of the chroma pass. Filter entries at
 * CHR_MMX_FILTER_OFFSET(%0) span APCK_SIZE bytes and hold two source pointers (+0 and
 * APCK_PTR2) and a coefficient pair at APCK_COEF. U words from both sources are
 * interleaved and pmaddwd-accumulated in mm4/mm5; V words (at + VOF) in mm6/mm7.
 * After the taps the sums are shifted right by 16, packed to words, offset by the value
 * at VROUNDER_OFFSET(%0) and spilled to U_TEMP(%0) / V_TEMP(%0).
 * Zeroes REG_a; uses REG_d, REG_S and mm0-mm7.
 * NOTE(review): the asm statement opener, loop labels and branches are not visible in
 * this view.
 */
248 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
250 "xor %%"REG_a", %%"REG_a" \n\t"\
254 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
255 "mov (%%"REG_d"), %%"REG_S" \n\t"\
256 "pxor %%mm4, %%mm4 \n\t"\
257 "pxor %%mm5, %%mm5 \n\t"\
258 "pxor %%mm6, %%mm6 \n\t"\
259 "pxor %%mm7, %%mm7 \n\t"\
262 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
263 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
264 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
265 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
266 "movq %%mm0, %%mm3 \n\t"\
267 "punpcklwd %%mm1, %%mm0 \n\t"\
268 "punpckhwd %%mm1, %%mm3 \n\t"\
269 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
270 "pmaddwd %%mm1, %%mm0 \n\t"\
271 "pmaddwd %%mm1, %%mm3 \n\t"\
272 "paddd %%mm0, %%mm4 \n\t"\
273 "paddd %%mm3, %%mm5 \n\t"\
274 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
275 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
276 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
277 "test %%"REG_S", %%"REG_S" \n\t"\
278 "movq %%mm2, %%mm0 \n\t"\
279 "punpcklwd %%mm3, %%mm2 \n\t"\
280 "punpckhwd %%mm3, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm2 \n\t"\
282 "pmaddwd %%mm1, %%mm0 \n\t"\
283 "paddd %%mm2, %%mm6 \n\t"\
284 "paddd %%mm0, %%mm7 \n\t"\
286 "psrad $16, %%mm4 \n\t"\
287 "psrad $16, %%mm5 \n\t"\
288 "psrad $16, %%mm6 \n\t"\
289 "psrad $16, %%mm7 \n\t"\
290 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
291 "packssdw %%mm5, %%mm4 \n\t"\
292 "packssdw %%mm7, %%mm6 \n\t"\
293 "paddw %%mm0, %%mm4 \n\t"\
294 "paddw %%mm0, %%mm6 \n\t"\
295 "movq %%mm4, "U_TEMP"(%0) \n\t" /* spill U result */\
296 "movq %%mm6, "V_TEMP"(%0) \n\t" /* spill V result */\
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 * 32-bit-accumulator filter for one 16-bit plane, read at (ptr + 2*REG_a) and
 * (ptr + 8 + 2*REG_a). offset is the string displacement from %0 to the filter list,
 * which uses the APCK_SIZE / APCK_PTR2 / APCK_COEF entry layout.
 * Dword sums are kept in mm1/mm5 (first four samples) and mm7/mm6 (next four), then
 * shifted right by 16, packed to words and offset by the value at VROUNDER_OFFSET(%0).
 * On exit: mm1 and mm7 hold the results, and mm3/mm4 are reloaded from U_TEMP(%0) /
 * V_TEMP(%0).
 * REG_a is used as found; this fragment does not initialise it.
 * NOTE(review): loop labels and branches are not visible in this view.
 */
298 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
299 "lea "offset"(%0), %%"REG_d" \n\t"\
300 "mov (%%"REG_d"), %%"REG_S" \n\t"\
301 "pxor %%mm1, %%mm1 \n\t"\
302 "pxor %%mm5, %%mm5 \n\t"\
303 "pxor %%mm7, %%mm7 \n\t"\
304 "pxor %%mm6, %%mm6 \n\t"\
307 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
308 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
309 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
310 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
311 "movq %%mm0, %%mm3 \n\t"\
312 "punpcklwd %%mm4, %%mm0 \n\t"\
313 "punpckhwd %%mm4, %%mm3 \n\t"\
314 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
315 "pmaddwd %%mm4, %%mm0 \n\t"\
316 "pmaddwd %%mm4, %%mm3 \n\t"\
317 "paddd %%mm0, %%mm1 \n\t"\
318 "paddd %%mm3, %%mm5 \n\t"\
319 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
320 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
321 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
322 "test %%"REG_S", %%"REG_S" \n\t"\
323 "movq %%mm2, %%mm0 \n\t"\
324 "punpcklwd %%mm3, %%mm2 \n\t"\
325 "punpckhwd %%mm3, %%mm0 \n\t"\
326 "pmaddwd %%mm4, %%mm2 \n\t"\
327 "pmaddwd %%mm4, %%mm0 \n\t"\
328 "paddd %%mm2, %%mm7 \n\t"\
329 "paddd %%mm0, %%mm6 \n\t"\
331 "psrad $16, %%mm1 \n\t"\
332 "psrad $16, %%mm5 \n\t"\
333 "psrad $16, %%mm7 \n\t"\
334 "psrad $16, %%mm6 \n\t"\
335 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
336 "packssdw %%mm5, %%mm1 \n\t"\
337 "packssdw %%mm6, %%mm7 \n\t"\
338 "paddw %%mm0, %%mm1 \n\t"\
339 "paddw %%mm0, %%mm7 \n\t"\
340 "movq "U_TEMP"(%0), %%mm3 \n\t" /* reload spilled U */\
341 "movq "V_TEMP"(%0), %%mm4 \n\t" /* reload spilled V */\
/*
 * YSCALEYUV2PACKEDX_ACCURATE
 * Accurate chroma pass followed by the accurate luma pass over the list at
 * LUM_MMX_FILTER_OFFSET. Afterwards mm1/mm7 hold luma and mm3/mm4 hold U/V, the same
 * register layout that YSCALEYUV2PACKEDX leaves behind.
 */
343 #define YSCALEYUV2PACKEDX_ACCURATE \
344 YSCALEYUV2PACKEDX_ACCURATE_UV \
345 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/*
 * YSCALEYUV2RGBX
 * YUV -> RGB arithmetic on already-filtered words; constants are read relative to %0.
 * In:  mm1, mm7 = luma words; mm3 = U words; mm4 = V words.
 * Steps: subtract U_OFFSET/V_OFFSET/Y_OFFSET, multiply (pmulhw) by UG/VG/UB/VR/Y
 * coefficients, duplicate each chroma term across a pair of luma samples
 * (punpcklwd/punpckhwd of a register with itself), add luma, pack to bytes with unsigned
 * saturation.
 * Out: mm2, mm4, mm5 = packed bytes of the blue, green and red terms respectively (the
 * terms formed from UB, UG+VG and VR).
 * Also overwrites mm0, mm3, mm6.
 */
347 #define YSCALEYUV2RGBX \
348 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
349 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
350 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
351 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
352 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
353 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
354 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
355 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
356 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
357 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
358 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
359 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
360 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
361 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
362 "paddw %%mm3, %%mm4 \n\t"\
363 "movq %%mm2, %%mm0 \n\t"\
364 "movq %%mm5, %%mm6 \n\t"\
365 "movq %%mm4, %%mm3 \n\t"\
366 "punpcklwd %%mm2, %%mm2 \n\t"\
367 "punpcklwd %%mm5, %%mm5 \n\t"\
368 "punpcklwd %%mm4, %%mm4 \n\t"\
369 "paddw %%mm1, %%mm2 \n\t"\
370 "paddw %%mm1, %%mm5 \n\t"\
371 "paddw %%mm1, %%mm4 \n\t"\
372 "punpckhwd %%mm0, %%mm0 \n\t"\
373 "punpckhwd %%mm6, %%mm6 \n\t"\
374 "punpckhwd %%mm3, %%mm3 \n\t"\
375 "paddw %%mm7, %%mm0 \n\t"\
376 "paddw %%mm7, %%mm6 \n\t"\
377 "paddw %%mm7, %%mm3 \n\t"\
378 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
379 "packuswb %%mm0, %%mm2 \n\t"\
380 "packuswb %%mm6, %%mm5 \n\t"\
381 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * REAL_YSCALEYUV2PACKED(index, c) / YSCALEYUV2PACKED(index, c)
 * Two-line vertical blend for packed YUV output.
 *   index - register used as running index (zeroed here)
 *   c     - register holding the context base for the *_OFFSET displacements
 * Operands: %0/%1 = luma lines buf0/buf1, %2/%3 = chroma lines uvbuf0/uvbuf1 (names
 * taken from the in-line comments).
 * Side effect: the words stored at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back in place before
 * they are used as blend weights.
 * Result per word: (line1 >> 7) + pmulhw(line0 - line1, weight).
 * Out: mm3 = U, mm4 = V, mm1 and mm7 = luma. Also overwrites mm0, mm2, mm5, mm6.
 * YSCALEYUV2PACKED is the expanding wrapper so arguments are macro-expanded before
 * stringification.
 * NOTE(review): the loop label between the setup and the loads is not visible in this
 * view.
 */
383 #define REAL_YSCALEYUV2PACKED(index, c) \
384 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
385 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
386 "psraw $3, %%mm0 \n\t"\
387 "psraw $3, %%mm1 \n\t"\
388 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
389 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
390 "xor "#index", "#index" \n\t"\
393 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
394 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
395 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
396 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
397 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
398 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
401 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
402 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
403 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
404 "paddw %%mm2, %%mm3 \n\t" /* (uvbuf1[eax] >>7) + ((uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16)*/\
405 "paddw %%mm5, %%mm4 \n\t" /* same for the samples at +2048*/\
406 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
407 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
408 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
409 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
410 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
411 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
412 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
413 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
414 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
415 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
416 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
417 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
419 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/*
 * REAL_YSCALEYUV2RGB_UV(index, c)
 * Chroma stage of the two-line blend for RGB output.
 *   index - register used as running index (zeroed here)
 *   c     - register holding the context base for the *_OFFSET / *_COEFF displacements
 * Blends the chroma lines %2 and %3 (uvbuf0/uvbuf1 per the in-line comments) as
 * (line1 >> 4) + pmulhw(line0 - line1, weight at CHR_MMX_FILTER_OFFSET+8(c)), subtracts
 * U_OFFSET/V_OFFSET and forms the green contributions.
 * Out: mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg. Also overwrites mm0.
 * NOTE(review): the loop label after the xor is not visible in this view.
 */
421 #define REAL_YSCALEYUV2RGB_UV(index, c) \
422 "xor "#index", "#index" \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
430 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
431 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
432 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
433 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
434 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
435 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
436 "paddw %%mm2, %%mm3 \n\t" /* (uvbuf1[eax] >>4) + ((uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16)*/\
437 "paddw %%mm5, %%mm4 \n\t" /* same for the samples at +2048*/\
438 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
439 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
440 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
441 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
442 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
443 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
444 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 * Two-line blend of one 16-bit plane for RGB output.
 *   index  - running index register (not modified here)
 *   c      - register holding the context base
 *   b1, b2 - operands naming the two source lines
 * Per word: (b2 >> 4) + pmulhw(b1 - b2, weight at LUM_MMX_FILTER_OFFSET+8(c)).
 * Out: mm1 = first four results, mm7 = next four. Also overwrites mm0 and mm6.
 */
446 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
447 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
448 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
449 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
450 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
451 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
452 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
456 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
457 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
458 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/*
 * REAL_YSCALEYUV2RGB_COEFF(c)
 * Final colour-matrix stage; c is the register holding the context base.
 * In:  mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg, mm1/mm7 = luma words.
 * Applies UB_COEFF / VR_COEFF to the chroma and Y_OFFSET / Y_COEFF to the luma, sums the
 * two green terms, duplicates each chroma term across a pair of luma samples, adds luma
 * and packs to bytes with unsigned saturation.
 * Out: mm2, mm4, mm5 = packed bytes of the blue, green and red terms respectively.
 * Also overwrites mm0, mm3, mm6.
 */
460 #define REAL_YSCALEYUV2RGB_COEFF(c) \
461 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
462 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
463 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
464 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
465 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
466 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
467 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
468 "paddw %%mm3, %%mm4 \n\t"\
469 "movq %%mm2, %%mm0 \n\t"\
470 "movq %%mm5, %%mm6 \n\t"\
471 "movq %%mm4, %%mm3 \n\t"\
472 "punpcklwd %%mm2, %%mm2 \n\t"\
473 "punpcklwd %%mm5, %%mm5 \n\t"\
474 "punpcklwd %%mm4, %%mm4 \n\t"\
475 "paddw %%mm1, %%mm2 \n\t"\
476 "paddw %%mm1, %%mm5 \n\t"\
477 "paddw %%mm1, %%mm4 \n\t"\
478 "punpckhwd %%mm0, %%mm0 \n\t"\
479 "punpckhwd %%mm6, %%mm6 \n\t"\
480 "punpckhwd %%mm3, %%mm3 \n\t"\
481 "paddw %%mm7, %%mm0 \n\t"\
482 "paddw %%mm7, %%mm6 \n\t"\
483 "paddw %%mm7, %%mm3 \n\t"\
484 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
485 "packuswb %%mm0, %%mm2 \n\t"\
486 "packuswb %%mm6, %%mm5 \n\t"\
487 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * YSCALEYUV2RGB_YA is the expanding wrapper around REAL_YSCALEYUV2RGB_YA (arguments are
 * macro-expanded before stringification).
 * YSCALEYUV2RGB(index, c) chains the three stages of the two-line blend to RGB: chroma
 * blend, luma blend of operands %0 and %1, then the colour matrix.
 */
489 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
491 #define YSCALEYUV2RGB(index, c) \
492 REAL_YSCALEYUV2RGB_UV(index, c) \
493 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
494 REAL_YSCALEYUV2RGB_COEFF(c)
/*
 * REAL_YSCALEYUV2PACKED1(index, c) / YSCALEYUV2PACKED1(index, c)
 * Single-line (no vertical blend) load for packed YUV output.
 *   index - register used as running index (zeroed here)
 *   c     - not referenced in the body
 * Reads chroma from operand %2 (and %2 + VOF) and luma from operand %0, shifting every
 * word right arithmetically by 7.
 * Out: mm3 = U, mm4 = V, mm1 and mm7 = luma.
 * NOTE(review): the loop label after the xor is not visible in this view.
 */
496 #define REAL_YSCALEYUV2PACKED1(index, c) \
497 "xor "#index", "#index" \n\t"\
500 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
501 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
502 "psraw $7, %%mm3 \n\t" \
503 "psraw $7, %%mm4 \n\t" \
504 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
505 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
506 "psraw $7, %%mm1 \n\t" \
507 "psraw $7, %%mm7 \n\t" \
509 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * REAL_YSCALEYUV2RGB1(index, c) / YSCALEYUV2RGB1(index, c)
 * Single-line YUV -> RGB: no vertical blend, chroma taken from operand %2 only and luma
 * from operand %0.
 *   index - register used as running index (zeroed here)
 *   c     - register holding the context base for the *_OFFSET / *_COEFF displacements
 * All samples are shifted right by 4, then run through the same offset/coefficient/pack
 * sequence as the two-line variant.
 * Out: mm2, mm4, mm5 = packed bytes of the blue, green and red terms respectively.
 * Overwrites mm0-mm7.
 * NOTE(review): the loop label after the xor is not visible in this view.
 */
511 #define REAL_YSCALEYUV2RGB1(index, c) \
512 "xor "#index", "#index" \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
517 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
518 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
519 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
520 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
521 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
522 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
523 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
524 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
525 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
526 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
527 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
528 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
529 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
530 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
531 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
532 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
533 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
534 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
535 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
536 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
537 "paddw %%mm3, %%mm4 \n\t"\
538 "movq %%mm2, %%mm0 \n\t"\
539 "movq %%mm5, %%mm6 \n\t"\
540 "movq %%mm4, %%mm3 \n\t"\
541 "punpcklwd %%mm2, %%mm2 \n\t"\
542 "punpcklwd %%mm5, %%mm5 \n\t"\
543 "punpcklwd %%mm4, %%mm4 \n\t"\
544 "paddw %%mm1, %%mm2 \n\t"\
545 "paddw %%mm1, %%mm5 \n\t"\
546 "paddw %%mm1, %%mm4 \n\t"\
547 "punpckhwd %%mm0, %%mm0 \n\t"\
548 "punpckhwd %%mm6, %%mm6 \n\t"\
549 "punpckhwd %%mm3, %%mm3 \n\t"\
550 "paddw %%mm7, %%mm0 \n\t"\
551 "paddw %%mm7, %%mm6 \n\t"\
552 "paddw %%mm7, %%mm3 \n\t"\
553 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
554 "packuswb %%mm0, %%mm2 \n\t"\
555 "packuswb %%mm6, %%mm5 \n\t"\
556 "packuswb %%mm3, %%mm4 \n\t"\
558 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * REAL_YSCALEYUV2PACKED1b(index, c) / YSCALEYUV2PACKED1b(index, c)
 * Packed YUV load that sums the two chroma lines (operands %2 and %3) and takes luma
 * from a single line (operand %0).
 *   index - register used as running index (zeroed here)
 *   c     - not referenced in the body
 * Chroma: (line0 + line1) logically shifted right by 8; luma: arithmetic shift right by 7.
 * Out: mm3 = U, mm4 = V, mm1 and mm7 = luma. Also overwrites mm2 and mm5.
 * NOTE(review): the loop label after the xor is not visible in this view.
 */
560 #define REAL_YSCALEYUV2PACKED1b(index, c) \
561 "xor "#index", "#index" \n\t"\
564 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
565 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
566 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
567 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
568 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
569 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
570 "psrlw $8, %%mm3 \n\t" \
571 "psrlw $8, %%mm4 \n\t" \
572 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
573 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
574 "psraw $7, %%mm1 \n\t" \
575 "psraw $7, %%mm7 \n\t"
576 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
578 // do vertical chrominance interpolation
/*
 * REAL_YSCALEYUV2RGB1b(index, c) / YSCALEYUV2RGB1b(index, c)
 * YUV -> RGB with chroma taken as the sum of two lines (operands %2 and %3) and luma
 * from a single line (operand %0).
 *   index - register used as running index (zeroed here)
 *   c     - register holding the context base for the *_OFFSET / *_COEFF displacements
 * Chroma: (line0 + line1) logically shifted right by 5; the 16-bit paddw before the
 * shift is what the existing FIXME flags as a possible overflow. Luma: >> 4.
 * The rest is the same offset/coefficient/pack sequence as the other RGB variants.
 * Out: mm2, mm4, mm5 = packed bytes of the blue, green and red terms respectively.
 * Overwrites mm0-mm7.
 * NOTE(review): the loop label after the xor is not visible in this view.
 */
579 #define REAL_YSCALEYUV2RGB1b(index, c) \
580 "xor "#index", "#index" \n\t"\
583 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
584 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
585 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
586 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
587 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
588 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
589 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
590 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
591 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
592 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
593 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
594 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
595 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
596 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
597 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
598 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
599 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
600 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
601 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
602 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
603 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
604 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
605 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
606 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
607 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
608 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
609 "paddw %%mm3, %%mm4 \n\t"\
610 "movq %%mm2, %%mm0 \n\t"\
611 "movq %%mm5, %%mm6 \n\t"\
612 "movq %%mm4, %%mm3 \n\t"\
613 "punpcklwd %%mm2, %%mm2 \n\t"\
614 "punpcklwd %%mm5, %%mm5 \n\t"\
615 "punpcklwd %%mm4, %%mm4 \n\t"\
616 "paddw %%mm1, %%mm2 \n\t"\
617 "paddw %%mm1, %%mm5 \n\t"\
618 "paddw %%mm1, %%mm4 \n\t"\
619 "punpckhwd %%mm0, %%mm0 \n\t"\
620 "punpckhwd %%mm6, %%mm6 \n\t"\
621 "punpckhwd %%mm3, %%mm3 \n\t"\
622 "paddw %%mm7, %%mm0 \n\t"\
623 "paddw %%mm7, %%mm6 \n\t"\
624 "paddw %%mm7, %%mm3 \n\t"\
625 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
626 "packuswb %%mm0, %%mm2 \n\t"\
627 "packuswb %%mm6, %%mm5 \n\t"\
628 "packuswb %%mm3, %%mm4 \n\t"\
630 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * REAL_YSCALEYUV2RGB1_ALPHA(index) / YSCALEYUV2RGB1_ALPHA(index)
 * Loads 8 alpha words from operand %1 at (base + 2*index), shifts them right
 * arithmetically by 7 and packs them to bytes with unsigned saturation.
 * Out: mm7 = 8 alpha bytes. Also overwrites mm1.
 */
632 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
633 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
634 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
635 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
636 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
637 "packuswb %%mm1, %%mm7 \n\t"
638 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
/*
 * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) / WRITEBGR32(...)
 * Interleaves four byte planes into eight 32-bit pixels and stores them.
 *   dst        - destination base register
 *   dstw       - operand compared against index after the store
 *   index      - pixel index register, advanced by 8
 *   b, g, r, a - registers holding 8 bytes each; in memory order every pixel is b, g, r, a
 *   q0, q2, q3, t - scratch registers (b and r are overwritten as well)
 * The 32 output bytes go to dst + 4*index via four MOVNTQ stores.
 * NOTE(review): the branch that consumes the cmp result is not visible in this view.
 */
640 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
641 "movq "#b", "#q2" \n\t" /* B */\
642 "movq "#r", "#t" \n\t" /* R */\
643 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
644 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
645 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
646 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
647 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
648 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
649 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
650 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
651 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
652 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
654 MOVNTQ( q0, (dst, index, 4))\
655 MOVNTQ( b, 8(dst, index, 4))\
656 MOVNTQ( q2, 16(dst, index, 4))\
657 MOVNTQ( q3, 24(dst, index, 4))\
659 "add $8, "#index" \n\t"\
660 "cmp "#dstw", "#index" \n\t"\
662 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
/*
 * REAL_WRITERGB16(dst, dstw, index) / WRITERGB16(dst, dstw, index)
 * Packs eight pixels into 16-bit words and stores 16 bytes at dst + 2*index.
 * In: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes (per the in-line comments).
 * B and R are masked with bF8, G with bFC; B is shifted right by 3. B is byte-interleaved
 * with R, G is byte-interleaved with mm7, shifted left by 3 and OR-ed in.
 * index is advanced by 8 and compared against dstw. Overwrites mm1-mm5.
 * NOTE(review): assumes mm7 is zero on entry (it is only read here) -- confirm against
 * the callers. The branch that consumes the cmp result is not visible in this view.
 */
664 #define REAL_WRITERGB16(dst, dstw, index) \
665 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
666 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
667 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
668 "psrlq $3, %%mm2 \n\t"\
670 "movq %%mm2, %%mm1 \n\t"\
671 "movq %%mm4, %%mm3 \n\t"\
673 "punpcklbw %%mm7, %%mm3 \n\t"\
674 "punpcklbw %%mm5, %%mm2 \n\t"\
675 "punpckhbw %%mm7, %%mm4 \n\t"\
676 "punpckhbw %%mm5, %%mm1 \n\t"\
678 "psllq $3, %%mm3 \n\t"\
679 "psllq $3, %%mm4 \n\t"\
681 "por %%mm3, %%mm2 \n\t"\
682 "por %%mm4, %%mm1 \n\t"\
684 MOVNTQ(%%mm2, (dst, index, 2))\
685 MOVNTQ(%%mm1, 8(dst, index, 2))\
687 "add $8, "#index" \n\t"\
688 "cmp "#dstw", "#index" \n\t"\
690 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/*
 * REAL_WRITERGB15(dst, dstw, index) / WRITERGB15(dst, dstw, index)
 * Same flow as the 16-bit writer, but all three channels are masked with bF8, R is
 * additionally shifted right by 1 and G is shifted left by 2 instead of 3.
 * In: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes (per the in-line comments).
 * Stores 16 bytes at dst + 2*index, advances index by 8 and compares it against dstw.
 * Overwrites mm1-mm5.
 * NOTE(review): assumes mm7 is zero on entry (it is only read here) -- confirm against
 * the callers. The branch that consumes the cmp result is not visible in this view.
 */
692 #define REAL_WRITERGB15(dst, dstw, index) \
693 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
694 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
695 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
696 "psrlq $3, %%mm2 \n\t"\
697 "psrlq $1, %%mm5 \n\t"\
699 "movq %%mm2, %%mm1 \n\t"\
700 "movq %%mm4, %%mm3 \n\t"\
702 "punpcklbw %%mm7, %%mm3 \n\t"\
703 "punpcklbw %%mm5, %%mm2 \n\t"\
704 "punpckhbw %%mm7, %%mm4 \n\t"\
705 "punpckhbw %%mm5, %%mm1 \n\t"\
707 "psllq $2, %%mm3 \n\t"\
708 "psllq $2, %%mm4 \n\t"\
710 "por %%mm3, %%mm2 \n\t"\
711 "por %%mm4, %%mm1 \n\t"\
713 MOVNTQ(%%mm2, (dst, index, 2))\
714 MOVNTQ(%%mm1, 8(dst, index, 2))\
716 "add $8, "#index" \n\t"\
717 "cmp "#dstw", "#index" \n\t"\
719 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/*
 * WRITEBGR24OLD(dst, dstw, index)
 * Packs eight pixels into 24 bytes using shift/mask/or sequences.
 * In: mm2 = B, mm4 = G, mm5 = R, mm7 = 0 (per the first in-line comment).
 * The planes are first expanded to four registers of two 0RGB dwords each; the padding
 * bytes are then squeezed out with the bm00000111 / bm11111000 / bm00001111 masks.
 * Three quadwords are stored at dst, dst+8 and dst+16; dst is advanced by 24 and index
 * by 8, then index is compared against dstw. Overwrites mm0-mm6.
 * NOTE(review): no selection macro visible in this chunk expands to this variant -- confirm
 * it is still needed. The branch that consumes the cmp result is not visible here.
 */
721 #define WRITEBGR24OLD(dst, dstw, index) \
722 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
723 "movq %%mm2, %%mm1 \n\t" /* B */\
724 "movq %%mm5, %%mm6 \n\t" /* R */\
725 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
726 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
727 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
728 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
729 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
730 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
731 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
732 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
733 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
734 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
736 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
737 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
738 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
739 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
740 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
741 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
742 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
743 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
745 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
746 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
747 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
748 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
749 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
750 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
751 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
752 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
753 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
754 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
755 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
756 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
757 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
759 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
760 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
761 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
762 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
763 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
764 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
765 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
766 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
768 MOVNTQ(%%mm0, (dst))\
769 MOVNTQ(%%mm2, 8(dst))\
770 MOVNTQ(%%mm3, 16(dst))\
771 "add $24, "#dst" \n\t"\
773 "add $8, "#index" \n\t"\
774 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMX(dst, dstw, index)
 * Packs eight pixels into 24 bytes without mask constants.
 * In: mm2 = B, mm4 = G, mm5 = R, mm7 = 0 (per the first in-line comment).
 * The planes are expanded to four registers of two 0RGB dwords each; a shift by 40 plus
 * punpckhdq removes the padding byte inside each pair, and the pairs are then shifted
 * and OR-ed into three output quadwords.
 * Stores at dst, dst+8 and dst+16; dst is advanced by 24 and index by 8, then index is
 * compared against dstw. Overwrites mm0-mm7.
 * NOTE(review): the branch that consumes the cmp result is not visible in this view.
 */
777 #define WRITEBGR24MMX(dst, dstw, index) \
778 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
779 "movq %%mm2, %%mm1 \n\t" /* B */\
780 "movq %%mm5, %%mm6 \n\t" /* R */\
781 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
782 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
783 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
784 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
785 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
786 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
787 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
788 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
789 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
790 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
792 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
793 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
794 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
795 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
797 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
798 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
799 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
800 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
802 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
803 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
804 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
805 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
807 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
808 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
809 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
810 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
811 MOVNTQ(%%mm0, (dst))\
813 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
814 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
815 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
816 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
817 MOVNTQ(%%mm6, 8(dst))\
819 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
820 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
821 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
822 MOVNTQ(%%mm5, 16(dst))\
824 "add $24, "#dst" \n\t"\
826 "add $8, "#index" \n\t"\
827 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMX2(dst, dstw, index)
 * Packs eight pixels into 24 bytes with pshufw and the ff_M24A / ff_M24B / ff_M24C
 * byte-select masks.
 * In: mm2 = B, mm4 = G, mm5 = R (per the first in-line comment). mm7 is overwritten with
 * ff_M24C straight away, so any value it held on entry is not used. mm4 is shifted right
 * by one byte part-way through.
 * Each of the three output quadwords is built by replicating the needed words of every
 * plane, masking out the byte positions that belong to that plane, and OR-ing the three
 * planes together.
 * Stores at dst, dst+8 and dst+16; dst is advanced by 24 and index by 8, then index is
 * compared against dstw. Overwrites mm0, mm1, mm3, mm4, mm6, mm7.
 * NOTE(review): the branch that consumes the cmp result is not visible in this view.
 */
830 #define WRITEBGR24MMX2(dst, dstw, index) \
831 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
832 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
833 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
834 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
835 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
836 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
838 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
839 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
840 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
842 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
843 "por %%mm1, %%mm6 \n\t"\
844 "por %%mm3, %%mm6 \n\t"\
845 MOVNTQ(%%mm6, (dst))\
847 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
848 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
849 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
850 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
852 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
853 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
854 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
856 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
857 "por %%mm3, %%mm6 \n\t"\
858 MOVNTQ(%%mm6, 8(dst))\
860 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
861 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
862 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
864 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
865 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
866 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
868 "por %%mm1, %%mm3 \n\t"\
869 "por %%mm3, %%mm6 \n\t"\
870 MOVNTQ(%%mm6, 16(dst))\
872 "add $24, "#dst" \n\t"\
874 "add $8, "#index" \n\t"\
875 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24 forwards its three arguments to one of the two 24-bit store
 * macros: WRITEBGR24MMX2 when COMPILE_TEMPLATE_MMX2 is set. The second
 * definition forwards to WRITEBGR24MMX; presumably it is the alternative
 * branch -- the directive separating the two is not visible in this excerpt.
 */
878 #if COMPILE_TEMPLATE_MMX2
880 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
883 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/*
 * REAL_WRITEYUY2(dst, dstw, index): packs the 16-bit lanes of mm3, mm4 and
 * mm1/mm7 to bytes with unsigned saturation, interleaves the mm3/mm4 bytes
 * with each other and then with the mm1 bytes, and stores the resulting
 * 16 bytes at (dst, index, 2) and 8(dst, index, 2). index is then advanced
 * by 8 and compared with dstw; the instruction consuming that comparison is
 * not among the visible lines.
 * Presumably mm1/mm7 carry luma and mm3/mm4 the two chroma components, given
 * the byte order produced -- confirm against the callers.
 *
 * WRITEYUY2 adds one level of macro indirection, so arguments that are
 * themselves macros are expanded before REAL_WRITEYUY2 applies the #
 * operator to them.
 */
886 #define REAL_WRITEYUY2(dst, dstw, index) \
887 "packuswb %%mm3, %%mm3 \n\t"\
888 "packuswb %%mm4, %%mm4 \n\t"\
889 "packuswb %%mm7, %%mm1 \n\t"\
890 "punpcklbw %%mm4, %%mm3 \n\t"\
891 "movq %%mm1, %%mm7 \n\t"\
892 "punpcklbw %%mm3, %%mm1 \n\t"\
893 "punpckhbw %%mm3, %%mm7 \n\t"\
895 MOVNTQ(%%mm1, (dst, index, 2))\
896 MOVNTQ(%%mm7, 8(dst, index, 2))\
898 "add $8, "#index" \n\t"\
899 "cmp "#dstw", "#index" \n\t"\
901 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/**
 * Vertical multi-tap filter writing the separate output planes dest, uDest,
 * vDest and, optionally, aDest.
 *
 * MMX builds, when SWS_BITEXACT is not set in c->flags: the YSCALEYUV2YV12X
 * macro (YSCALEYUV2YV12X_ACCURATE when SWS_ACCURATE_RND is set) is expanded
 * once per plane:
 *   - U with source offset "0" and V with source offset VOF, both using
 *     CHR_MMX_FILTER_OFFSET and chrDstW;
 *   - alpha with ALP_MMX_FILTER_OFFSET and dstW, only when
 *     CONFIG_SWSCALE_ALPHA && aDest;
 *   - luma with LUM_MMX_FILTER_OFFSET and dstW.
 *
 * The remaining calls delegate to yuv2yuvX_altivec_real() on AltiVec builds
 * and to yuv2yuvXinC() otherwise. Note that the AltiVec call is passed
 * neither alpSrc nor aDest.
 */
904 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
905 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
906 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
908 #if COMPILE_TEMPLATE_MMX
909 if(!(c->flags & SWS_BITEXACT)) {
910 if (c->flags & SWS_ACCURATE_RND) {
912 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
913 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
915 if (CONFIG_SWSCALE_ALPHA && aDest) {
916 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
919 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
922 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
923 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
925 if (CONFIG_SWSCALE_ALPHA && aDest) {
926 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
929 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
934 #if COMPILE_TEMPLATE_ALTIVEC
935 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
936 chrFilter, chrSrc, chrFilterSize,
937 dest, uDest, vDest, dstW, chrDstW);
938 #else //COMPILE_TEMPLATE_ALTIVEC
939 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
940 chrFilter, chrSrc, chrFilterSize,
941 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
942 #endif //!COMPILE_TEMPLATE_ALTIVEC
/**
 * Thin wrapper around yuv2nv12XinC(): every argument except the context c
 * is forwarded unchanged. No template-specific (MMX/AltiVec) code is visible
 * for this function.
 */
945 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
946 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
947 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
949 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
950 chrFilter, chrSrc, chrFilterSize,
951 dest, uDest, dstW, chrDstW, dstFormat);
/**
 * Vertical pass over a single source line per plane: converts the 16-bit
 * samples in lumSrc, chrSrc and alpSrc to 8-bit output without filtering.
 *
 * MMX builds, when SWS_BITEXACT is not set: the four planes (alpha, luma,
 * U, V) are described by the src[], dst[] and counter[] tables, indexed by
 * p. Source and destination pointers are advanced by the plane width before
 * being handed to the asm. YSCALEYUV2YV121_ACCURATE is used when
 * SWS_ACCURATE_RND is set; a second asm statement with the same two operands
 * follows it.
 *
 * C code: each sample becomes (x + 64) >> 7. V samples are read from
 * chrSrc[i + VOFW]; chroma values above 255 are clamped to 255; alpha goes
 * through av_clip_uint8() and is only written when
 * CONFIG_SWSCALE_ALPHA && aDest.
 */
954 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
955 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
958 #if COMPILE_TEMPLATE_MMX
959 if(!(c->flags & SWS_BITEXACT)) {
/* NOTE(review): the offsets below are applied in int16_t units, and the
 * resulting const int16_t pointers are then stored in uint8_t * elements
 * without a cast. */
961 uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
962 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
963 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
965 if (c->flags & SWS_ACCURATE_RND) {
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src[p]), "r" (dst[p] + counter[p]),
981 :: "r" (src[p]), "r" (dst[p] + counter[p]),
/* C code: luma */
991 for (i=0; i<dstW; i++) {
992 int val= (lumSrc[i]+64)>>7;
/* C code: chroma, V stored VOFW samples after U in chrSrc */
1003 for (i=0; i<chrDstW; i++) {
1004 int u=(chrSrc[i ]+64)>>7;
1005 int v=(chrSrc[i + VOFW]+64)>>7;
1009 else if (u>255) u=255;
1011 else if (v>255) v=255;
/* C code: optional alpha plane */
1018 if (CONFIG_SWSCALE_ALPHA && aDest)
1019 for (i=0; i<dstW; i++) {
1020 int val= (alpSrc[i]+64)>>7;
1021 aDest[i]= av_clip_uint8(val);
1027 * vertical scale YV12 to RGB
/**
 * Vertical multi-tap filter producing one line of packed output in dest.
 *
 * MMX builds, when SWS_BITEXACT is not set in c->flags: c->dstFormat selects
 * an inline-asm variant. There are two parallel switch statements, the first
 * built on the YSCALEYUV2PACKEDX_ACCURATE macros (SWS_ACCURATE_RND set), the
 * second on the plain ones. Variants visible here:
 *   - 32-bit output through WRITEBGR32. With
 *     CONFIG_SWSCALE_ALPHA && c->alpPixBuf the alpha channel is filtered
 *     with ALP_MMX_FILTER_OFFSET, shifted right by 3 and packed; otherwise
 *     the alpha register mm7 is set to all ones with pcmpeqd.
 *   - 24-bit output through WRITEBGR24, with REG_c = dest + 3 * REG_a
 *     computed by the lea/add pair.
 *   - PIX_FMT_RGB555 (WRITERGB15) and PIX_FMT_RGB565 (WRITERGB16); the
 *     visible lines add the BLUE/GREEN/RED_DITHER values addressed from
 *     operand %0 with paddusb before writing (any preprocessor guard around
 *     those lines is not visible in this excerpt).
 *   - PIX_FMT_YUYV422 through WRITEYUY2, after shifting mm3, mm4, mm1 and
 *     mm7 right by 3.
 * Where the operand lists are visible, %0 is &c->redDither, %4 is dest and
 * %5 is dstW (as a memory operand).
 *
 * AltiVec builds: ff_yuv2packedX_altivec() is used when SWS_BITEXACT is not
 * set, c->alpPixBuf is NULL and c->dstFormat is one of the six formats in
 * the condition below. yuv2packedXinC() is the generic implementation.
 */
1029 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1030 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1031 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1033 #if COMPILE_TEMPLATE_MMX
1035 if(!(c->flags & SWS_BITEXACT)) {
1036 if (c->flags & SWS_ACCURATE_RND) {
1037 switch(c->dstFormat) {
1039 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1040 YSCALEYUV2PACKEDX_ACCURATE
1042 "movq %%mm2, "U_TEMP"(%0) \n\t"
1043 "movq %%mm4, "V_TEMP"(%0) \n\t"
1044 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1045 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1046 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1047 "psraw $3, %%mm1 \n\t"
1048 "psraw $3, %%mm7 \n\t"
1049 "packuswb %%mm7, %%mm1 \n\t"
1050 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1052 YSCALEYUV2PACKEDX_END
1054 YSCALEYUV2PACKEDX_ACCURATE
1056 "pcmpeqd %%mm7, %%mm7 \n\t"
1057 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1059 YSCALEYUV2PACKEDX_END
1063 YSCALEYUV2PACKEDX_ACCURATE
1065 "pxor %%mm7, %%mm7 \n\t"
1066 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1067 "add %4, %%"REG_c" \n\t"
1068 WRITEBGR24(%%REGc, %5, %%REGa)
1071 :: "r" (&c->redDither),
1072 "m" (dummy), "m" (dummy), "m" (dummy),
1073 "r" (dest), "m" (dstW)
1074 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1077 case PIX_FMT_RGB555:
1078 YSCALEYUV2PACKEDX_ACCURATE
1080 "pxor %%mm7, %%mm7 \n\t"
1081 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1083 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1084 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1085 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1088 WRITERGB15(%4, %5, %%REGa)
1089 YSCALEYUV2PACKEDX_END
1091 case PIX_FMT_RGB565:
1092 YSCALEYUV2PACKEDX_ACCURATE
1094 "pxor %%mm7, %%mm7 \n\t"
1095 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1097 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1098 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1099 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1102 WRITERGB16(%4, %5, %%REGa)
1103 YSCALEYUV2PACKEDX_END
1105 case PIX_FMT_YUYV422:
1106 YSCALEYUV2PACKEDX_ACCURATE
1107 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1109 "psraw $3, %%mm3 \n\t"
1110 "psraw $3, %%mm4 \n\t"
1111 "psraw $3, %%mm1 \n\t"
1112 "psraw $3, %%mm7 \n\t"
1113 WRITEYUY2(%4, %5, %%REGa)
1114 YSCALEYUV2PACKEDX_END
1118 switch(c->dstFormat) {
1120 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1123 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1124 "psraw $3, %%mm1 \n\t"
1125 "psraw $3, %%mm7 \n\t"
1126 "packuswb %%mm7, %%mm1 \n\t"
1127 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1128 YSCALEYUV2PACKEDX_END
1132 "pcmpeqd %%mm7, %%mm7 \n\t"
1133 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1134 YSCALEYUV2PACKEDX_END
1140 "pxor %%mm7, %%mm7 \n\t"
1141 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1142 "add %4, %%"REG_c" \n\t"
1143 WRITEBGR24(%%REGc, %5, %%REGa)
1145 :: "r" (&c->redDither),
1146 "m" (dummy), "m" (dummy), "m" (dummy),
1147 "r" (dest), "m" (dstW)
1148 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1151 case PIX_FMT_RGB555:
1154 "pxor %%mm7, %%mm7 \n\t"
1155 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1157 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1158 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1159 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1162 WRITERGB15(%4, %5, %%REGa)
1163 YSCALEYUV2PACKEDX_END
1165 case PIX_FMT_RGB565:
1168 "pxor %%mm7, %%mm7 \n\t"
1169 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1171 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1172 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1173 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1176 WRITERGB16(%4, %5, %%REGa)
1177 YSCALEYUV2PACKEDX_END
1179 case PIX_FMT_YUYV422:
1181 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1183 "psraw $3, %%mm3 \n\t"
1184 "psraw $3, %%mm4 \n\t"
1185 "psraw $3, %%mm1 \n\t"
1186 "psraw $3, %%mm7 \n\t"
1187 WRITEYUY2(%4, %5, %%REGa)
1188 YSCALEYUV2PACKEDX_END
1193 #endif /* COMPILE_TEMPLATE_MMX */
1194 #if COMPILE_TEMPLATE_ALTIVEC
1195 /* The following list of supported dstFormat values should
1196 match what's found in the body of ff_yuv2packedX_altivec() */
1197 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1198 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1199 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1200 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1201 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1202 chrFilter, chrSrc, chrFilterSize,
1206 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1207 chrFilter, chrSrc, chrFilterSize,
1208 alpSrc, dest, dstW, dstY);
1212 * vertical bilinear scale YV12 to RGB
/**
 * Blends two source lines (buf0/buf1 for luma, uvbuf0/uvbuf1 for chroma and
 * optionally abuf0/abuf1 for alpha) into one line of packed output in dest.
 *
 * yalpha1 and uvalpha1 are the complementary weights 4095 - yalpha and
 * 4095 - uvalpha.
 *
 * MMX builds, when SWS_BITEXACT is not set: c->dstFormat selects an
 * inline-asm variant built on YSCALEYUV2RGB (YSCALEYUV2PACKED for
 * PIX_FMT_YUYV422) followed by WRITEBGR32, WRITEBGR24, WRITERGB15,
 * WRITERGB16 or WRITEYUY2. Most variants share one register protocol:
 * REG_b is saved at ESP_OFFSET(%5) and loaded with dest (%4), REG_BP is
 * pushed and passed to the macros as the index register, and both are
 * restored at the end of the statement. The width is read from 8280(%5)
 * (see the DSTW_OFFSET note below).
 *
 * For 32-bit output with CONFIG_SWSCALE_ALPHA && c->alpPixBuf two variants
 * are visible: one receives abuf0/abuf1 as extra "r" operands (%6, %7); the
 * other stores them in c->u_temp and c->v_temp and reloads them into %0 and
 * %1 inside the asm before YSCALEYUV2RGB_YA. The condition choosing between
 * them is not visible in this excerpt. Without alpha, mm7 is set to all
 * ones with pcmpeqd.
 *
 * The final YSCALE_YUV_2_ANYRGB_C expansion is the generic C
 * implementation.
 */
1214 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1215 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1217 int yalpha1=4095- yalpha;
1218 int uvalpha1=4095-uvalpha;
1221 #if COMPILE_TEMPLATE_MMX
1222 if(!(c->flags & SWS_BITEXACT)) {
1223 switch(c->dstFormat) {
1224 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1226 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1229 YSCALEYUV2RGB(%%REGBP, %5)
1230 YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1231 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1232 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1233 "packuswb %%mm7, %%mm1 \n\t"
1234 WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1236 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1238 ,"r" (abuf0), "r" (abuf1)
1242 *(uint16_t **)(&c->u_temp)=abuf0;
1243 *(uint16_t **)(&c->v_temp)=abuf1;
1245 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1246 "mov %4, %%"REG_b" \n\t"
1247 "push %%"REG_BP" \n\t"
1248 YSCALEYUV2RGB(%%REGBP, %5)
1251 "mov "U_TEMP"(%5), %0 \n\t"
1252 "mov "V_TEMP"(%5), %1 \n\t"
1253 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1254 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1255 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1256 "packuswb %%mm7, %%mm1 \n\t"
1259 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1260 "pop %%"REG_BP" \n\t"
1261 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1263 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1269 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1270 "mov %4, %%"REG_b" \n\t"
1271 "push %%"REG_BP" \n\t"
1272 YSCALEYUV2RGB(%%REGBP, %5)
1273 "pcmpeqd %%mm7, %%mm7 \n\t"
1274 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1275 "pop %%"REG_BP" \n\t"
1276 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1278 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1285 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1286 "mov %4, %%"REG_b" \n\t"
1287 "push %%"REG_BP" \n\t"
1288 YSCALEYUV2RGB(%%REGBP, %5)
1289 "pxor %%mm7, %%mm7 \n\t"
1290 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1291 "pop %%"REG_BP" \n\t"
1292 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1293 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1297 case PIX_FMT_RGB555:
1299 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1300 "mov %4, %%"REG_b" \n\t"
1301 "push %%"REG_BP" \n\t"
1302 YSCALEYUV2RGB(%%REGBP, %5)
1303 "pxor %%mm7, %%mm7 \n\t"
1304 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1306 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1307 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1308 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1311 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1312 "pop %%"REG_BP" \n\t"
1313 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1315 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1319 case PIX_FMT_RGB565:
1321 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1322 "mov %4, %%"REG_b" \n\t"
1323 "push %%"REG_BP" \n\t"
1324 YSCALEYUV2RGB(%%REGBP, %5)
1325 "pxor %%mm7, %%mm7 \n\t"
1326 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1328 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1329 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1330 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1333 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1334 "pop %%"REG_BP" \n\t"
1335 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1336 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1340 case PIX_FMT_YUYV422:
1342 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1343 "mov %4, %%"REG_b" \n\t"
1344 "push %%"REG_BP" \n\t"
1345 YSCALEYUV2PACKED(%%REGBP, %5)
1346 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1347 "pop %%"REG_BP" \n\t"
1348 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1349 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1356 #endif //COMPILE_TEMPLATE_MMX
1357 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1361 * YV12 to RGB without scaling or interpolating
/**
 * Produces one line of packed output from a single luma line (buf0), the
 * chroma lines uvbuf0/uvbuf1 and, optionally, the alpha line abuf0.
 *
 * buf1 is aliased to buf0 and yalpha/yalpha1 are fixed constants (see the
 * FIXME notes on those declarations).
 *
 * With SWS_FULL_CHR_H_INT set in flags, the work is delegated to
 * c->yuv2packed2() with buf0 passed for both luma lines, abuf0 for both
 * alpha lines and a yalpha of 0.
 *
 * MMX builds, when SWS_BITEXACT is not set in flags: two families of
 * inline-asm variants are visible. The first, guarded by uvalpha < 2048,
 * uses YSCALEYUV2RGB1 / YSCALEYUV2PACKED1; the second uses the "1b" forms
 * YSCALEYUV2RGB1b / YSCALEYUV2PACKED1b. Both use the same register protocol:
 * REG_b is saved at ESP_OFFSET(%5) and loaded with dest (%4), REG_BP is
 * pushed and used as the index register, and both are restored afterwards.
 * The width is read from 8280(%5).
 *
 * NOTE(review): in the two alpha variants (YSCALEYUV2RGB1_ALPHA) the "d"
 * operand is bound to abuf0, whereas every other variant binds it to buf1.
 *
 * C code: YSCALE_YUV_2_ANYRGB_C is expanded with the RGB1/PACKED1 helpers
 * when uvalpha < 2048 and with the RGB1B/PACKED1B helpers in the other
 * expansion.
 */
1363 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1364 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1366 const int yalpha1=0;
1369 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1370 const int yalpha= 4096; //FIXME ...
1372 if (flags&SWS_FULL_CHR_H_INT) {
1373 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1377 #if COMPILE_TEMPLATE_MMX
1378 if(!(flags & SWS_BITEXACT)) {
1379 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1382 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1384 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1385 "mov %4, %%"REG_b" \n\t"
1386 "push %%"REG_BP" \n\t"
1387 YSCALEYUV2RGB1(%%REGBP, %5)
1388 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1389 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1390 "pop %%"REG_BP" \n\t"
1391 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1393 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1398 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1399 "mov %4, %%"REG_b" \n\t"
1400 "push %%"REG_BP" \n\t"
1401 YSCALEYUV2RGB1(%%REGBP, %5)
1402 "pcmpeqd %%mm7, %%mm7 \n\t"
1403 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1404 "pop %%"REG_BP" \n\t"
1405 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1407 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1414 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1415 "mov %4, %%"REG_b" \n\t"
1416 "push %%"REG_BP" \n\t"
1417 YSCALEYUV2RGB1(%%REGBP, %5)
1418 "pxor %%mm7, %%mm7 \n\t"
1419 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1420 "pop %%"REG_BP" \n\t"
1421 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1423 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1427 case PIX_FMT_RGB555:
1429 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1430 "mov %4, %%"REG_b" \n\t"
1431 "push %%"REG_BP" \n\t"
1432 YSCALEYUV2RGB1(%%REGBP, %5)
1433 "pxor %%mm7, %%mm7 \n\t"
1434 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1436 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1437 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1438 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1440 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1441 "pop %%"REG_BP" \n\t"
1442 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1444 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1448 case PIX_FMT_RGB565:
1450 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1451 "mov %4, %%"REG_b" \n\t"
1452 "push %%"REG_BP" \n\t"
1453 YSCALEYUV2RGB1(%%REGBP, %5)
1454 "pxor %%mm7, %%mm7 \n\t"
1455 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1457 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1458 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1459 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1462 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1463 "pop %%"REG_BP" \n\t"
1464 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1466 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1470 case PIX_FMT_YUYV422:
1472 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1473 "mov %4, %%"REG_b" \n\t"
1474 "push %%"REG_BP" \n\t"
1475 YSCALEYUV2PACKED1(%%REGBP, %5)
1476 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1477 "pop %%"REG_BP" \n\t"
1478 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1480 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1490 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1491 "mov %4, %%"REG_b" \n\t"
1492 "push %%"REG_BP" \n\t"
1493 YSCALEYUV2RGB1b(%%REGBP, %5)
1494 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1495 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1496 "pop %%"REG_BP" \n\t"
1497 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1499 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1504 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1505 "mov %4, %%"REG_b" \n\t"
1506 "push %%"REG_BP" \n\t"
1507 YSCALEYUV2RGB1b(%%REGBP, %5)
1508 "pcmpeqd %%mm7, %%mm7 \n\t"
1509 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1510 "pop %%"REG_BP" \n\t"
1511 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1513 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1520 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1521 "mov %4, %%"REG_b" \n\t"
1522 "push %%"REG_BP" \n\t"
1523 YSCALEYUV2RGB1b(%%REGBP, %5)
1524 "pxor %%mm7, %%mm7 \n\t"
1525 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1526 "pop %%"REG_BP" \n\t"
1527 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1529 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1533 case PIX_FMT_RGB555:
1535 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1536 "mov %4, %%"REG_b" \n\t"
1537 "push %%"REG_BP" \n\t"
1538 YSCALEYUV2RGB1b(%%REGBP, %5)
1539 "pxor %%mm7, %%mm7 \n\t"
1540 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1542 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1543 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1544 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1546 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1547 "pop %%"REG_BP" \n\t"
1548 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1550 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1554 case PIX_FMT_RGB565:
1556 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1557 "mov %4, %%"REG_b" \n\t"
1558 "push %%"REG_BP" \n\t"
1559 YSCALEYUV2RGB1b(%%REGBP, %5)
1560 "pxor %%mm7, %%mm7 \n\t"
1561 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1563 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1564 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1565 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1568 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1569 "pop %%"REG_BP" \n\t"
1570 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1572 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1576 case PIX_FMT_YUYV422:
1578 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1579 "mov %4, %%"REG_b" \n\t"
1580 "push %%"REG_BP" \n\t"
1581 YSCALEYUV2PACKED1b(%%REGBP, %5)
1582 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1583 "pop %%"REG_BP" \n\t"
1584 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1586 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1593 #endif /* COMPILE_TEMPLATE_MMX */
1594 if (uvalpha < 2048) {
1595 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1597 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1601 //FIXME yuy2* can read up to 7 samples too many
/**
 * Writes width output bytes to dst, each taken from a 2-byte group of src.
 *
 * MMX path: REG_a starts at -width (operand %0) and is advanced by 8 per
 * pass; the src and dst operands are pre-advanced to the end of their
 * buffers so that (base, REG_a) addresses the current position. Each pass
 * loads 16 source bytes, ANDs them with bm01010101 (presumably 0x00FF in
 * every 16-bit lane, i.e. keeping bytes 0, 2, 4, ... -- confirm against the
 * definition of that constant), packs the lanes to 8 bytes and stores them.
 * The loop label and branch are not among the visible lines.
 *
 * The body of the C fallback loop is not visible in this excerpt, and the
 * visible lines never reference the unused parameter.
 */
1603 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1605 #if COMPILE_TEMPLATE_MMX
1607 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1608 "mov %0, %%"REG_a" \n\t"
1610 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1611 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1612 "pand %%mm2, %%mm0 \n\t"
1613 "pand %%mm2, %%mm1 \n\t"
1614 "packuswb %%mm1, %%mm0 \n\t"
1615 "movq %%mm0, (%2, %%"REG_a") \n\t"
1616 "add $8, %%"REG_a" \n\t"
1618 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1623 for (i=0; i<width; i++)
/**
 * Splits the bytes at offsets 1 and 3 of every 4-byte group of src1 into
 * dstU and dstV respectively (see the C fallback: dstU[i] = src1[4*i + 1],
 * dstV[i] = src1[4*i + 3]).
 *
 * MMX path: REG_a starts at -width and advances by 4 per pass, with the
 * pointer operands pre-advanced by the full length. Each pass loads 16
 * bytes, keeps the high byte of every 16-bit lane (psrlw $8), packs them to
 * 8 bytes, then separates those again: the high bytes of each lane go to
 * operand %3 (dstV), the lanes masked with bm01010101 go to operand %2
 * (dstU). Four bytes are stored to each destination per pass.
 *
 * src2 is only used in the assertion, which checks src1 == src2.
 */
1628 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1630 #if COMPILE_TEMPLATE_MMX
1632 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1633 "mov %0, %%"REG_a" \n\t"
1635 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1636 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1637 "psrlw $8, %%mm0 \n\t"
1638 "psrlw $8, %%mm1 \n\t"
1639 "packuswb %%mm1, %%mm0 \n\t"
1640 "movq %%mm0, %%mm1 \n\t"
1641 "psrlw $8, %%mm0 \n\t"
1642 "pand %%mm4, %%mm1 \n\t"
1643 "packuswb %%mm0, %%mm0 \n\t"
1644 "packuswb %%mm1, %%mm1 \n\t"
1645 "movd %%mm0, (%3, %%"REG_a") \n\t"
1646 "movd %%mm1, (%2, %%"REG_a") \n\t"
1647 "add $4, %%"REG_a" \n\t"
1649 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1654 for (i=0; i<width; i++) {
1655 dstU[i]= src1[4*i + 1];
1656 dstV[i]= src1[4*i + 3];
1659 assert(src1 == src2);
/**
 * Reduces two streams of 2-byte samples to 8 bits by keeping the byte at
 * offset 1 of each sample: dstU[i] = src1[2*i + 1], dstV[i] = src2[2*i + 1]
 * (see the C fallback). Going by the function name these are presumably
 * little-endian 16-bit samples, so offset 1 is the most significant byte.
 *
 * MMX path: REG_a starts at -width and advances by 8 per pass, with all
 * pointer operands pre-advanced by the full length. Each pass loads 16 bytes
 * from each source, shifts every 16-bit lane right by 8, packs to 8 bytes
 * and stores the src1 result through operand %3 (dstU) and the src2 result
 * through operand %4 (dstV).
 */
1662 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1664 #if COMPILE_TEMPLATE_MMX
1666 "mov %0, %%"REG_a" \n\t"
1668 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1669 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1670 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1671 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1672 "psrlw $8, %%mm0 \n\t"
1673 "psrlw $8, %%mm1 \n\t"
1674 "psrlw $8, %%mm2 \n\t"
1675 "psrlw $8, %%mm3 \n\t"
1676 "packuswb %%mm1, %%mm0 \n\t"
1677 "packuswb %%mm3, %%mm2 \n\t"
1678 "movq %%mm0, (%3, %%"REG_a") \n\t"
1679 "movq %%mm2, (%4, %%"REG_a") \n\t"
1680 "add $8, %%"REG_a" \n\t"
1682 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1687 for (i=0; i<width; i++) {
1688 dstU[i]= src1[2*i + 1];
1689 dstV[i]= src2[2*i + 1];
1694 /* This is almost identical to the previous, and exists only because
1695 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/*
 * MMX path: REG_a starts at -width and advances by 8 per pass, with the src
 * and dst operands pre-advanced by the full length. Each pass loads 16
 * source bytes, shifts every 16-bit lane right by 8 (keeping the bytes at
 * odd offsets), packs the lanes to 8 bytes and stores them to dst.
 * The body of the C fallback loop is not visible in this excerpt.
 */
1696 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1698 #if COMPILE_TEMPLATE_MMX
1700 "mov %0, %%"REG_a" \n\t"
1702 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1703 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1704 "psrlw $8, %%mm0 \n\t"
1705 "psrlw $8, %%mm1 \n\t"
1706 "packuswb %%mm1, %%mm0 \n\t"
1707 "movq %%mm0, (%2, %%"REG_a") \n\t"
1708 "add $8, %%"REG_a" \n\t"
1710 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1715 for (i=0; i<width; i++)
/**
 * Splits the bytes at offsets 0 and 2 of every 4-byte group of src1 into
 * dstU and dstV respectively (see the C fallback: dstU[i] = src1[4*i + 0],
 * dstV[i] = src1[4*i + 2]).
 *
 * MMX path: same structure as the psrlw-based variant for the other byte
 * order, except that the first step masks each 16-bit lane with bm01010101
 * instead of shifting it. After packing, the high byte of each lane goes to
 * operand %3 (dstV) and the masked lane to operand %2 (dstU); four bytes are
 * stored to each destination per pass and REG_a advances by 4 from -width.
 *
 * src2 is only used in the assertion, which checks src1 == src2.
 */
1720 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1722 #if COMPILE_TEMPLATE_MMX
1724 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1725 "mov %0, %%"REG_a" \n\t"
1727 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1728 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1729 "pand %%mm4, %%mm0 \n\t"
1730 "pand %%mm4, %%mm1 \n\t"
1731 "packuswb %%mm1, %%mm0 \n\t"
1732 "movq %%mm0, %%mm1 \n\t"
1733 "psrlw $8, %%mm0 \n\t"
1734 "pand %%mm4, %%mm1 \n\t"
1735 "packuswb %%mm0, %%mm0 \n\t"
1736 "packuswb %%mm1, %%mm1 \n\t"
1737 "movd %%mm0, (%3, %%"REG_a") \n\t"
1738 "movd %%mm1, (%2, %%"REG_a") \n\t"
1739 "add $4, %%"REG_a" \n\t"
1741 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1746 for (i=0; i<width; i++) {
1747 dstU[i]= src1[4*i + 0];
1748 dstV[i]= src1[4*i + 2];
1751 assert(src1 == src2);
/**
 * Counterpart of the psrlw-based two-source reducer: each 16-bit lane loaded
 * from src1 and src2 is masked with bm01010101 instead of being shifted, so
 * the other byte of every 2-byte sample is kept (presumably the byte at
 * offset 0, i.e. the most significant byte of a big-endian sample -- confirm
 * against the definition of bm01010101).
 *
 * MMX path: REG_a starts at -width and advances by 8 per pass, with all
 * pointer operands pre-advanced by the full length. The src1 result is
 * stored through operand %3 (dstU), the src2 result through operand %4
 * (dstV), 8 bytes each per pass.
 *
 * The body of the C fallback loop is not visible in this excerpt.
 */
1754 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1756 #if COMPILE_TEMPLATE_MMX
1758 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1759 "mov %0, %%"REG_a" \n\t"
1761 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1762 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1763 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1764 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1765 "pand %%mm4, %%mm0 \n\t"
1766 "pand %%mm4, %%mm1 \n\t"
1767 "pand %%mm4, %%mm2 \n\t"
1768 "pand %%mm4, %%mm3 \n\t"
1769 "packuswb %%mm1, %%mm0 \n\t"
1770 "packuswb %%mm3, %%mm2 \n\t"
1771 "movq %%mm0, (%3, %%"REG_a") \n\t"
1772 "movq %%mm2, (%4, %%"REG_a") \n\t"
1773 "add $8, %%"REG_a" \n\t"
1775 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1780 for (i=0; i<width; i++) {
/**
 * De-interleaves a stream of byte pairs: dst1[i] = src[2*i + 0] and
 * dst2[i] = src[2*i + 1] (see the C fallback).
 *
 * MMX path: REG_a starts at -width and advances by 8 per pass, with the
 * pointer operands pre-advanced by the full length. Each pass loads 16
 * bytes and keeps two copies: one masked with bm01010101 and one shifted
 * right by 8 per 16-bit lane. Both are packed to 8 bytes; the masked copy is
 * stored through operand %2 (dst1) and the shifted copy through operand %3
 * (dst2).
 */
1787 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1788 const uint8_t *src, long width)
1790 #if COMPILE_TEMPLATE_MMX
1792 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1793 "mov %0, %%"REG_a" \n\t"
1795 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1796 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1797 "movq %%mm0, %%mm2 \n\t"
1798 "movq %%mm1, %%mm3 \n\t"
1799 "pand %%mm4, %%mm0 \n\t"
1800 "pand %%mm4, %%mm1 \n\t"
1801 "psrlw $8, %%mm2 \n\t"
1802 "psrlw $8, %%mm3 \n\t"
1803 "packuswb %%mm1, %%mm0 \n\t"
1804 "packuswb %%mm3, %%mm2 \n\t"
1805 "movq %%mm0, (%2, %%"REG_a") \n\t"
1806 "movq %%mm2, (%3, %%"REG_a") \n\t"
1807 "add $8, %%"REG_a" \n\t"
1809 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1814 for (i = 0; i < width; i++) {
1815 dst1[i] = src[2*i+0];
1816 dst2[i] = src[2*i+1];
/**
 * De-interleaves src1 via nvXXtoUV(): the first byte of every pair goes to
 * dstU, the second to dstV. src2 and unused are not passed on.
 */
1821 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1822 const uint8_t *src1, const uint8_t *src2,
1823 long width, uint32_t *unused)
1825 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
/**
 * De-interleaves src1 via nvXXtoUV() with the destinations swapped: the
 * first byte of every pair goes to dstV, the second to dstU. src2 and
 * unused are not passed on.
 */
1828 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1829 const uint8_t *src1, const uint8_t *src2,
1830 long width, uint32_t *unused)
1832 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1835 #if COMPILE_TEMPLATE_MMX
/**
 * MMX conversion of packed 24-bit pixels to 8-bit luma, four output bytes
 * per pass.
 *
 * Coefficients: when srcFormat == PIX_FMT_BGR24, mm5/mm6 are loaded with
 * ff_bgr24toY1Coeff/ff_bgr24toY2Coeff; the other visible pair is
 * ff_rgb24toY1Coeff/ff_rgb24toY2Coeff. mm4 holds ff_bgr24toYOffset and mm7
 * is zero.
 *
 * Loop body: four 32-bit loads at byte offsets 0, 2, 6 and 8 from operand
 * %0 are widened to 16-bit lanes, multiplied and pair-summed with pmaddwd,
 * added together per pixel, offset by mm4, shifted right by 15, and packed
 * down to four bytes that are stored at (%1, REG_a). REG_a starts at -width
 * (operand %2) and advances by 4; operand %1 is dst + width.
 * The constraint for operand %0, the loop label/branch and the source
 * pointer update are not among the visible lines (%0 is presumably src).
 */
1836 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1839 if(srcFormat == PIX_FMT_BGR24) {
1841 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1842 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1847 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1848 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1854 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1855 "mov %2, %%"REG_a" \n\t"
1856 "pxor %%mm7, %%mm7 \n\t"
1858 PREFETCH" 64(%0) \n\t"
1859 "movd (%0), %%mm0 \n\t"
1860 "movd 2(%0), %%mm1 \n\t"
1861 "movd 6(%0), %%mm2 \n\t"
1862 "movd 8(%0), %%mm3 \n\t"
1864 "punpcklbw %%mm7, %%mm0 \n\t"
1865 "punpcklbw %%mm7, %%mm1 \n\t"
1866 "punpcklbw %%mm7, %%mm2 \n\t"
1867 "punpcklbw %%mm7, %%mm3 \n\t"
1868 "pmaddwd %%mm5, %%mm0 \n\t"
1869 "pmaddwd %%mm6, %%mm1 \n\t"
1870 "pmaddwd %%mm5, %%mm2 \n\t"
1871 "pmaddwd %%mm6, %%mm3 \n\t"
1872 "paddd %%mm1, %%mm0 \n\t"
1873 "paddd %%mm3, %%mm2 \n\t"
1874 "paddd %%mm4, %%mm0 \n\t"
1875 "paddd %%mm4, %%mm2 \n\t"
1876 "psrad $15, %%mm0 \n\t"
1877 "psrad $15, %%mm2 \n\t"
1878 "packssdw %%mm2, %%mm0 \n\t"
1879 "packuswb %%mm0, %%mm0 \n\t"
1880 "movd %%mm0, (%1, %%"REG_a") \n\t"
1881 "add $4, %%"REG_a" \n\t"
1884 : "r" (dst+width), "g" ((x86_reg)-width)
/**
 * MMX conversion of packed 24-bit pixels to 8-bit U and V, four output
 * bytes per plane per pass.
 *
 * Operand %4 is the memory operand
 * ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0]; the coefficient quadwords at
 * %4 and 8+%4 feed the value stored to dstU, those at 16+%4 and 24+%4 (the
 * latter cached in mm6) feed the value stored to dstV.
 *
 * Loop body: pixels are loaded with 32-bit loads at byte offsets 0, 2, 6
 * and 8 from operand %0, widened to 16-bit lanes, multiplied and pair-summed
 * with pmaddwd, offset by ff_bgr24toUVOffset, shifted right by 15 and packed
 * to bytes. The mm0 result is stored at (%1, REG_a) with %1 = dstU + width,
 * the mm2 result at (%2, REG_a) with %2 = dstV + width. REG_a starts at
 * -width (operand %3) and advances by 4.
 * The constraint for operand %0, the loop label/branch and the source
 * pointer update are not among the visible lines (%0 is presumably src).
 */
1889 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1892 "movq 24+%4, %%mm6 \n\t"
1893 "mov %3, %%"REG_a" \n\t"
1894 "pxor %%mm7, %%mm7 \n\t"
1896 PREFETCH" 64(%0) \n\t"
1897 "movd (%0), %%mm0 \n\t"
1898 "movd 2(%0), %%mm1 \n\t"
1899 "punpcklbw %%mm7, %%mm0 \n\t"
1900 "punpcklbw %%mm7, %%mm1 \n\t"
1901 "movq %%mm0, %%mm2 \n\t"
1902 "movq %%mm1, %%mm3 \n\t"
1903 "pmaddwd %4, %%mm0 \n\t"
1904 "pmaddwd 8+%4, %%mm1 \n\t"
1905 "pmaddwd 16+%4, %%mm2 \n\t"
1906 "pmaddwd %%mm6, %%mm3 \n\t"
1907 "paddd %%mm1, %%mm0 \n\t"
1908 "paddd %%mm3, %%mm2 \n\t"
1910 "movd 6(%0), %%mm1 \n\t"
1911 "movd 8(%0), %%mm3 \n\t"
1913 "punpcklbw %%mm7, %%mm1 \n\t"
1914 "punpcklbw %%mm7, %%mm3 \n\t"
1915 "movq %%mm1, %%mm4 \n\t"
1916 "movq %%mm3, %%mm5 \n\t"
1917 "pmaddwd %4, %%mm1 \n\t"
1918 "pmaddwd 8+%4, %%mm3 \n\t"
1919 "pmaddwd 16+%4, %%mm4 \n\t"
1920 "pmaddwd %%mm6, %%mm5 \n\t"
1921 "paddd %%mm3, %%mm1 \n\t"
1922 "paddd %%mm5, %%mm4 \n\t"
1924 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1925 "paddd %%mm3, %%mm0 \n\t"
1926 "paddd %%mm3, %%mm2 \n\t"
1927 "paddd %%mm3, %%mm1 \n\t"
1928 "paddd %%mm3, %%mm4 \n\t"
1929 "psrad $15, %%mm0 \n\t"
1930 "psrad $15, %%mm2 \n\t"
1931 "psrad $15, %%mm1 \n\t"
1932 "psrad $15, %%mm4 \n\t"
1933 "packssdw %%mm1, %%mm0 \n\t"
1934 "packssdw %%mm4, %%mm2 \n\t"
1935 "packuswb %%mm0, %%mm0 \n\t"
1936 "packuswb %%mm2, %%mm2 \n\t"
1937 "movd %%mm0, (%1, %%"REG_a") \n\t"
1938 "movd %%mm2, (%2, %%"REG_a") \n\t"
1939 "add $4, %%"REG_a" \n\t"
1942 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
/**
 * Luma from packed BGR24.
 * MMX builds call bgr24ToY_mmx() with PIX_FMT_BGR24. The C code computes,
 * per pixel,
 *   (RY*r + GY*g + BY*b + (33 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
 * the lines that read b, g and r from src are not visible in this excerpt.
 * The visible lines never reference the unused parameter.
 */
1948 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1950 #if COMPILE_TEMPLATE_MMX
1951 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1954 for (i=0; i<width; i++) {
1959 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1961 #endif /* COMPILE_TEMPLATE_MMX */
/**
 * U and V from packed BGR24, one output sample per input pixel.
 * MMX builds call bgr24ToUV_mmx() with PIX_FMT_BGR24. The C code reads
 * b, g, r from src1[3*i + 0..2] and computes
 *   (Rx*r + Gx*g + Bx*b + (257 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT
 * with the U and V coefficient sets respectively.
 * src2 is only used in the assertion, which checks src1 == src2.
 */
1964 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1966 #if COMPILE_TEMPLATE_MMX
1967 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1970 for (i=0; i<width; i++) {
1971 int b= src1[3*i + 0];
1972 int g= src1[3*i + 1];
1973 int r= src1[3*i + 2];
1975 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1976 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1978 #endif /* COMPILE_TEMPLATE_MMX */
1979 assert(src1 == src2);
/**
 * U and V from packed BGR24 with 2:1 horizontal reduction: each output
 * sample is computed from the component sums of two adjacent pixels
 * (src1[6*i + k] + src1[6*i + k + 3]). The extra factor of two is removed by
 * shifting one bit further (RGB2YUV_SHIFT + 1), with the constant term
 * scaled to 257 << RGB2YUV_SHIFT accordingly.
 * Only C code is visible for this function. src2 is only used in the
 * assertion, which checks src1 == src2.
 */
1982 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1985 for (i=0; i<width; i++) {
1986 int b= src1[6*i + 0] + src1[6*i + 3];
1987 int g= src1[6*i + 1] + src1[6*i + 4];
1988 int r= src1[6*i + 2] + src1[6*i + 5];
1990 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1991 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1993 assert(src1 == src2);
/**
 * Luma from packed RGB24.
 * MMX builds call bgr24ToY_mmx() with PIX_FMT_RGB24. The C code computes,
 * per pixel,
 *   (RY*r + GY*g + BY*b + (33 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
 * the lines that read r, g and b from src are not visible in this excerpt.
 */
1996 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1998 #if COMPILE_TEMPLATE_MMX
1999 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2002 for (i=0; i<width; i++) {
2007 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/**
 * U and V from packed RGB24, one output sample per input pixel.
 * MMX builds call bgr24ToUV_mmx() with PIX_FMT_RGB24. The C code reads
 * r, g, b from src1[3*i + 0..2] and computes
 *   (Rx*r + Gx*g + Bx*b + (257 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT
 * with the U and V coefficient sets respectively.
 * src2 is not referenced in the visible lines.
 */
2012 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2014 #if COMPILE_TEMPLATE_MMX
2016 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2020 for (i=0; i<width; i++) {
2021 int r= src1[3*i + 0];
2022 int g= src1[3*i + 1];
2023 int b= src1[3*i + 2];
2025 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2026 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
/**
 * U and V from packed RGB24 with 2:1 horizontal reduction: each output
 * sample is computed from the component sums of two adjacent pixels
 * (src1[6*i + k] + src1[6*i + k + 3]). The extra factor of two is removed by
 * shifting one bit further (RGB2YUV_SHIFT + 1), with the constant term
 * scaled to 257 << RGB2YUV_SHIFT accordingly.
 * Only C code is visible for this function; src2 is not referenced in the
 * visible lines.
 */
2031 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2035 for (i=0; i<width; i++) {
2036 int r= src1[6*i + 0] + src1[6*i + 3];
2037 int g= src1[6*i + 1] + src1[6*i + 4];
2038 int b= src1[6*i + 2] + src1[6*i + 5];
2040 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2041 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2046 // bilinear / bicubic scaling
/**
 * Horizontal FIR scaler. As spelled out by the C fallback at the end of this
 * block, output sample dst[i] is the sum over j of
 *     src[filterPos[i] + j] * filter[filterSize*i + j]
 * shifted right by 7.
 *
 * The MMX build has three variants: filterSize == 4, filterSize == 8, and a
 * generic loop that consumes 4 taps per inner step (filterSize is asserted to
 * be a positive multiple of 4). Each asm iteration stores two output samples.
 *
 * srcW and xInc are not used by the visible code of this block, apart from
 * being forwarded to hScale_altivec_real() in the AltiVec build.
 *
 * NOTE(review): several structural lines (asm statement openers, loop labels
 * and branches, output operand lists, some pointer pre-adjustments, braces and
 * the declaration of i/j/val) are not visible in this view. Operand numbers
 * mentioned below for the first two variants are inferred from the visible
 * input lists and should be confirmed against the full file.
 */
2047 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2048 const int16_t *filter, const int16_t *filterPos, long filterSize)
2050 #if COMPILE_TEMPLATE_MMX
2051 assert(filterSize % 4 == 0 && filterSize>0);
2052 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
// counter is a negative byte offset into the int16 arrays (2 bytes per output)
2053 x86_reg counter= -2*dstW;
// advance filterPos by dstW entries so that filterPos + counter bytes is entry 0
2055 filterPos-= counter/2;
2059 "push %%"REG_b" \n\t"
// mm7 = 0, used to zero-extend source bytes to 16-bit words
2061 "pxor %%mm7, %%mm7 \n\t"
2062 "push %%"REG_BP" \n\t" // we use 7 regs here ...
// the loop index lives in REG_BP; REG_a/REG_b are reused for the two source positions
2063 "mov %%"REG_a", %%"REG_BP" \n\t"
// load the source positions of two consecutive outputs
2066 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2067 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
// load the two 4-tap coefficient rows (8 bytes each)
2068 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2069 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
// load 4 source bytes for each output and widen them to words
2070 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2071 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2072 "punpcklbw %%mm7, %%mm0 \n\t"
2073 "punpcklbw %%mm7, %%mm2 \n\t"
// multiply-accumulate: each register now holds two 32-bit partial sums
2074 "pmaddwd %%mm1, %%mm0 \n\t"
2075 "pmaddwd %%mm2, %%mm3 \n\t"
// horizontal add: mm0 = { sum(output 0), sum(output 1) }
2076 "movq %%mm0, %%mm4 \n\t"
2077 "punpckldq %%mm3, %%mm0 \n\t"
2078 "punpckhdq %%mm3, %%mm4 \n\t"
2079 "paddd %%mm4, %%mm0 \n\t"
// >>7, pack to signed 16-bit with saturation, store two outputs (4 bytes)
2080 "psrad $7, %%mm0 \n\t"
2081 "packssdw %%mm0, %%mm0 \n\t"
2082 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2083 "add $4, %%"REG_BP" \n\t"
2086 "pop %%"REG_BP" \n\t"
2088 "pop %%"REG_b" \n\t"
// inputs referenced as %1..%4 above: filter, filterPos, src, dst
2091 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2096 } else if (filterSize==8) {
// same scheme as the 4-tap variant, with two 4-tap halves accumulated per output
2097 x86_reg counter= -2*dstW;
2099 filterPos-= counter/2;
2103 "push %%"REG_b" \n\t"
2105 "pxor %%mm7, %%mm7 \n\t"
2106 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2107 "mov %%"REG_a", %%"REG_BP" \n\t"
2110 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2111 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
// taps 0..3 of both outputs (coefficient rows are 16 bytes apart)
2112 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2113 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2114 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2115 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2116 "punpcklbw %%mm7, %%mm0 \n\t"
2117 "punpcklbw %%mm7, %%mm2 \n\t"
2118 "pmaddwd %%mm1, %%mm0 \n\t"
2119 "pmaddwd %%mm2, %%mm3 \n\t"
// taps 4..7 of both outputs
2121 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2122 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2123 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2124 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2125 "punpcklbw %%mm7, %%mm4 \n\t"
2126 "punpcklbw %%mm7, %%mm2 \n\t"
2127 "pmaddwd %%mm1, %%mm4 \n\t"
2128 "pmaddwd %%mm2, %%mm5 \n\t"
// combine both halves, then reduce, shift, pack and store as in the 4-tap case
2129 "paddd %%mm4, %%mm0 \n\t"
2130 "paddd %%mm5, %%mm3 \n\t"
2131 "movq %%mm0, %%mm4 \n\t"
2132 "punpckldq %%mm3, %%mm0 \n\t"
2133 "punpckhdq %%mm3, %%mm4 \n\t"
2134 "paddd %%mm4, %%mm0 \n\t"
2135 "psrad $7, %%mm0 \n\t"
2136 "packssdw %%mm0, %%mm0 \n\t"
2137 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2138 "add $4, %%"REG_BP" \n\t"
2141 "pop %%"REG_BP" \n\t"
2143 "pop %%"REG_b" \n\t"
2146 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// Generic variant: any filterSize that is a multiple of 4.
// 'offset' marks where the inner-loop source base pointer stops (src + filterSize).
2152 uint8_t *offset = src+filterSize;
2153 x86_reg counter= -2*dstW;
2154 //filter-= counter*filterSize/2;
2155 filterPos-= counter/2;
// Operands: %0 counter, %1 filter, %2 filterPos, %3 dst, %4 offset, %5 src,
// %6 filterSize*2 (byte size of one coefficient row)
2158 "pxor %%mm7, %%mm7 \n\t"
// per output pair: fetch both source positions, reset the accumulators mm4/mm5
2161 "mov %2, %%"REG_c" \n\t"
2162 "movzwl (%%"REG_c", %0), %%eax \n\t"
2163 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2164 "mov %5, %%"REG_c" \n\t"
2165 "pxor %%mm4, %%mm4 \n\t"
2166 "pxor %%mm5, %%mm5 \n\t"
// inner step: 4 taps for output 0 (row at %1) and output 1 (row at %1 + %6)
2168 "movq (%1), %%mm1 \n\t"
2169 "movq (%1, %6), %%mm3 \n\t"
2170 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2171 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2172 "punpcklbw %%mm7, %%mm0 \n\t"
2173 "punpcklbw %%mm7, %%mm2 \n\t"
2174 "pmaddwd %%mm1, %%mm0 \n\t"
2175 "pmaddwd %%mm2, %%mm3 \n\t"
2176 "paddd %%mm3, %%mm5 \n\t"
2177 "paddd %%mm0, %%mm4 \n\t"
// move the source base 4 bytes on and compare against 'offset'
2179 "add $4, %%"REG_c" \n\t"
2180 "cmp %4, %%"REG_c" \n\t"
// reduce mm4/mm5 to one 32-bit sum per output, >>7, saturate to int16, store both
2183 "movq %%mm4, %%mm0 \n\t"
2184 "punpckldq %%mm5, %%mm4 \n\t"
2185 "punpckhdq %%mm5, %%mm0 \n\t"
2186 "paddd %%mm0, %%mm4 \n\t"
2187 "psrad $7, %%mm4 \n\t"
2188 "packssdw %%mm4, %%mm4 \n\t"
2189 "mov %3, %%"REG_a" \n\t"
2190 "movd %%mm4, (%%"REG_a", %0) \n\t"
2194 : "+r" (counter), "+r" (filter)
2195 : "m" (filterPos), "m" (dst), "m"(offset),
2196 "m" (src), "r" ((x86_reg)filterSize*2)
2197 : "%"REG_a, "%"REG_c, "%"REG_d
2201 #if COMPILE_TEMPLATE_ALTIVEC
2202 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
// Plain C reference implementation
2205 for (i=0; i<dstW; i++) {
2207 int srcPos= filterPos[i];
2209 //printf("filterPos: %d\n", filterPos[i]);
2210 for (j=0; j<filterSize; j++) {
2211 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2212 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2214 //filter += hFilterSize;
// only the upper bound is clamped here (to 32767); see the original remark below
2215 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2218 #endif /* COMPILE_ALTIVEC */
2219 #endif /* COMPILE_MMX */
/*
 * One interpolation step of the x86 fast-bilinear scaler, as an inline-asm
 * string fragment.
 *
 * On entry: edi = src[xx], esi = src[xx+1], ecx = xalpha (fractional position).
 * On exit:  esi = ((src[xx] << 16) + (src[xx+1] - src[xx]) * xalpha) >> 9
 *           REG_D is reloaded from asm operand %1 (edi was used as scratch).
 *
 * NOTE(review): the last visible line ends with a line continuation; whatever
 * follows it in the full file is still part of this macro.
 */
2222 #define FAST_BILINEAR_X86 \
2223 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2224 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2225 "shll $16, %%edi \n\t" \
2226 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2227 "mov %1, %%"REG_D"\n\t" \
2228 "shrl $9, %%esi \n\t" \
/**
 * Plain C fast-bilinear horizontal scaler for one luma line.
 *
 * Each output is a linear blend of src[xx] and src[xx+1] with a 7-bit weight,
 * giving results scaled by 128 (src << 7).
 *
 * c and srcW are not referenced by the visible code.
 *
 * NOTE(review): the end of the parameter list, the declaration of i and the
 * statement that advances xpos per output are not visible in this view;
 * presumably xpos is stepped by the horizontal increment — confirm.
 */
2230 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2231 int dstWidth, const uint8_t *src, int srcW,
// xpos: source position with a 16-bit fractional part
2235 unsigned int xpos=0;
2236 for (i=0;i<dstWidth;i++) {
// xx: integer source index; xalpha: top 7 bits of the fraction (0..127)
2237 register unsigned int xx=xpos>>16;
2238 register unsigned int xalpha=(xpos&0xFFFF)>>9;
// reads src[xx+1], so one byte past index xx must be readable
2239 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2244 // *** horizontal scale Y line to temp buffer
/**
 * Horizontally scale one luma line (or one alpha line when isAlpha is set)
 * from src into dst.
 *
 * Visible steps:
 *  1. Select the input converter: c->hascale_internal for alpha, otherwise
 *     c->hyscale_internal. If one is set, the line is first converted into
 *     formatConvBuffer and scaling reads from that buffer instead of src.
 *  2. Scale: c->hScale() with the hLumFilter tables, unless SWS_FAST_BILINEAR
 *     is requested, in which case either the generated MMX2 code
 *     (c->lumMmx2FilterCode), the plain x86 asm loop, or c->hyscale_fast()
 *     is used depending on the build.
 *  3. If this is not alpha, source and destination ranges differ and the
 *     destination is not RGB/BGR, rescale the values in dst in place.
 *
 * NOTE(review): the statements guarded by the three srcFormat tests, several
 * preprocessor branches, the asm statement openers/labels/branches and the
 * condition selecting between the two range loops are not visible here.
 */
2245 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2246 int flags, const int16_t *hLumFilter,
2247 const int16_t *hLumFilterPos, int hLumFilterSize,
2248 enum PixelFormat srcFormat, uint8_t *formatConvBuffer,
2249 uint32_t *pal, int isAlpha)
2251 int32_t av_unused *mmx2FilterPos = c->lumMmx2FilterPos;
2252 int16_t av_unused *mmx2Filter = c->lumMmx2Filter;
2253 int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
2254 void av_unused *mmx2FilterCode= c->lumMmx2FilterCode;
// per-format line converter: alpha extractor or luma extractor
2255 void (*internal_func)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->hascale_internal : c->hyscale_internal;
// NOTE(review): the bodies of the following three tests are not visible
2258 if (srcFormat == PIX_FMT_RGB32 || srcFormat == PIX_FMT_BGR32 )
2261 if (srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1)
2265 if (srcFormat == PIX_FMT_RGB48LE)
2268 if (internal_func) {
2269 internal_func(formatConvBuffer, src, srcW, pal);
2270 src= formatConvBuffer;
2273 #if COMPILE_TEMPLATE_MMX
2274 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2275 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2277 if (!(flags&SWS_FAST_BILINEAR))
2280 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2281 } else { // fast bilinear upscale / crap downscale
2282 #if ARCH_X86 && CONFIG_GPL
2283 #if COMPILE_TEMPLATE_MMX2
// scratch slot used by the asm below to preserve REG_b
2286 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2288 if (canMMX2BeUsed) {
// Operands: %0 src, %1 dst, %2 mmx2Filter, %3 mmx2FilterPos, %4 mmx2FilterCode;
// %5 is presumably ebxsave (its operand line is not visible) — confirm.
2291 "mov %%"REG_b", %5 \n\t"
2293 "pxor %%mm7, %%mm7 \n\t"
2294 "mov %0, %%"REG_c" \n\t"
2295 "mov %1, %%"REG_D" \n\t"
2296 "mov %2, %%"REG_d" \n\t"
2297 "mov %3, %%"REG_b" \n\t"
2298 "xor %%"REG_a", %%"REG_a" \n\t" // i
2299 PREFETCH" (%%"REG_c") \n\t"
2300 PREFETCH" 32(%%"REG_c") \n\t"
2301 PREFETCH" 64(%%"REG_c") \n\t"
/* CALL_MMX2_FILTER_CODE has two definitions below; the closing comment
   "ARCH_X86_64" indicates they are selected by architecture. Both advance the
   source pointer (REG_c) by a value read from the table at REG_b, advance the
   destination pointer (REG_D) by REG_a and reset REG_a. The line that invokes
   the generated code is not visible in this view. */
2305 #define CALL_MMX2_FILTER_CODE \
2306 "movl (%%"REG_b"), %%esi \n\t"\
2308 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2309 "add %%"REG_S", %%"REG_c" \n\t"\
2310 "add %%"REG_a", %%"REG_D" \n\t"\
2311 "xor %%"REG_a", %%"REG_a" \n\t"\
2315 #define CALL_MMX2_FILTER_CODE \
2316 "movl (%%"REG_b"), %%esi \n\t"\
2318 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2319 "add %%"REG_a", %%"REG_D" \n\t"\
2320 "xor %%"REG_a", %%"REG_a" \n\t"\
2322 #endif /* ARCH_X86_64 */
2324 CALL_MMX2_FILTER_CODE
2325 CALL_MMX2_FILTER_CODE
2326 CALL_MMX2_FILTER_CODE
2327 CALL_MMX2_FILTER_CODE
2328 CALL_MMX2_FILTER_CODE
2329 CALL_MMX2_FILTER_CODE
2330 CALL_MMX2_FILTER_CODE
2331 CALL_MMX2_FILTER_CODE
// restore REG_b from the save slot
2334 "mov %5, %%"REG_b" \n\t"
2336 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2337 "m" (mmx2FilterCode)
2341 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Overwrite the trailing outputs whose source index would reach the last
// source pixel with that pixel scaled by 128.
2346 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2348 #endif /* COMPILE_TEMPLATE_MMX2 */
// split the 16.16 increment into integer and fractional parts for add/adc
2349 x86_reg xInc_shr16 = xInc >> 16;
2350 uint16_t xInc_mask = xInc & 0xffff;
2351 //NO MMX just normal asm ...
// Operands: %0 src, %1 dst, %2 dstWidth, %3 xInc_shr16, %4 xInc_mask.
// Two outputs are produced per iteration (see "add $2" below).
2353 "xor %%"REG_a", %%"REG_a" \n\t" // i
2354 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2355 "xorl %%ecx, %%ecx \n\t" // xalpha
2358 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2359 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2361 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2362 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2363 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2365 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2366 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2368 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2369 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2370 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2373 "add $2, %%"REG_a" \n\t"
2374 "cmp %2, %%"REG_a" \n\t"
2378 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2379 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2381 #if COMPILE_TEMPLATE_MMX2
2382 } //if MMX2 can't be used
2385 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2386 #endif /* ARCH_X86 */
// In-place luma range conversion (skipped for alpha and for RGB/BGR output)
2389 if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
2391 //FIXME all pal and rgb srcFormats could do this conversion as well
2392 //FIXME all scalers more complex than bilinear could do half of this transform
// scale by 14071/2^14 (about 0.859) plus offset: compresses the value range
2394 for (i=0; i<dstWidth; i++)
2395 dst[i]= (dst[i]*14071 + 33561947)>>14;
// scale by 19077/2^14 (about 1.164) minus offset: expands the value range;
// the input is clamped to 30189 before multiplying
2397 for (i=0; i<dstWidth; i++)
2398 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
/**
 * Plain C fast-bilinear horizontal scaler for one pair of chroma lines.
 *
 * The scaled src1 line is written to dst[0..dstWidth-1] and the scaled src2
 * line to dst[VOFW..VOFW+dstWidth-1]; both use the same position/weight.
 *
 * c and srcW are not referenced by the visible code.
 *
 * NOTE(review): two pairs of assignments to dst[i] / dst[i+VOFW] are visible
 * below. As shown, the second pair would overwrite the first; lines between
 * them are not visible in this view, so check whether one pair is disabled
 * (e.g. commented out) in the full file. The declaration of i and the
 * statement advancing xpos are also not visible.
 */
2403 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2404 int dstWidth, const uint8_t *src1,
2405 const uint8_t *src2, int srcW, int xInc)
// xpos: source position with a 16-bit fractional part
2408 unsigned int xpos=0;
2409 for (i=0;i<dstWidth;i++) {
// xx: integer source index; xalpha: top 7 bits of the fraction (0..127)
2410 register unsigned int xx=xpos>>16;
2411 register unsigned int xalpha=(xpos&0xFFFF)>>9;
// variant A: weights (xalpha^127) and xalpha, i.e. the weights sum to 127
2412 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2413 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
// variant B: src*128 plus weighted difference (same form as the luma scaler)
2415 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2416 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
/**
 * Horizontally scale one pair of chroma lines (src1, src2) into dst.
 * The first plane is written at dst, the second at dst + VOFW elements
 * (the asm paths use the byte offset VOF for the same purpose).
 *
 * Visible steps:
 *  1. If c->hcscale_internal is set, convert the input into formatConvBuffer
 *     (first plane) and formatConvBuffer+VOFW (second plane) and scale from
 *     there.
 *  2. Scale both planes: c->hScale() with the hChrFilter tables, unless
 *     SWS_FAST_BILINEAR is requested, in which case the generated MMX2 code
 *     (c->chrMmx2FilterCode), the plain x86 asm loop, or c->hcscale_fast()
 *     is used depending on the build.
 *  3. If source and destination ranges differ and the destination is not
 *     RGB/BGR, rescale both planes in dst in place.
 *
 * NOTE(review): the end of the parameter list, the statements guarded by the
 * three srcFormat tests, several preprocessor branches, the asm statement
 * openers/labels/branches, some operand lines and the condition selecting
 * between the two range loops are not visible in this view.
 */
2422 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2423 int srcW, int xInc, int flags, const int16_t *hChrFilter,
2424 const int16_t *hChrFilterPos, int hChrFilterSize,
2425 enum PixelFormat srcFormat, uint8_t *formatConvBuffer,
2428 int32_t av_unused *mmx2FilterPos = c->chrMmx2FilterPos;
2429 int16_t av_unused *mmx2Filter = c->chrMmx2Filter;
2430 int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
2431 void av_unused *mmx2FilterCode= c->chrMmx2FilterCode;
// NOTE(review): the bodies of the following three tests are not visible
2433 if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2436 if (srcFormat==PIX_FMT_RGB32_1 || srcFormat==PIX_FMT_BGR32_1) {
2441 if (srcFormat==PIX_FMT_RGB48LE) {
2446 if (c->hcscale_internal) {
2447 c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2448 src1= formatConvBuffer;
2449 src2= formatConvBuffer+VOFW;
2452 #if COMPILE_TEMPLATE_MMX
2453 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2454 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2456 if (!(flags&SWS_FAST_BILINEAR))
2459 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2460 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2461 } else { // fast bilinear upscale / crap downscale
2462 #if ARCH_X86 && CONFIG_GPL
2463 #if COMPILE_TEMPLATE_MMX2
// scratch slot used by the asm below to preserve REG_b
2466 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2468 if (canMMX2BeUsed) {
// Operands: %0 src1, %1 dst, %2 mmx2Filter, %3 mmx2FilterPos,
// %4 mmx2FilterCode, %5 src2; %6 is presumably ebxsave (its operand line is
// not visible) — confirm.
2471 "mov %%"REG_b", %6 \n\t"
2473 "pxor %%mm7, %%mm7 \n\t"
2474 "mov %0, %%"REG_c" \n\t"
2475 "mov %1, %%"REG_D" \n\t"
2476 "mov %2, %%"REG_d" \n\t"
2477 "mov %3, %%"REG_b" \n\t"
2478 "xor %%"REG_a", %%"REG_a" \n\t" // i
2479 PREFETCH" (%%"REG_c") \n\t"
2480 PREFETCH" 32(%%"REG_c") \n\t"
2481 PREFETCH" 64(%%"REG_c") \n\t"
// first plane
2483 CALL_MMX2_FILTER_CODE
2484 CALL_MMX2_FILTER_CODE
2485 CALL_MMX2_FILTER_CODE
2486 CALL_MMX2_FILTER_CODE
// second plane: restart from src2 and write VOF bytes past the start of dst
2487 "xor %%"REG_a", %%"REG_a" \n\t" // i
2488 "mov %5, %%"REG_c" \n\t" // src
2489 "mov %1, %%"REG_D" \n\t" // buf1
2490 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2491 PREFETCH" (%%"REG_c") \n\t"
2492 PREFETCH" 32(%%"REG_c") \n\t"
2493 PREFETCH" 64(%%"REG_c") \n\t"
2495 CALL_MMX2_FILTER_CODE
2496 CALL_MMX2_FILTER_CODE
2497 CALL_MMX2_FILTER_CODE
2498 CALL_MMX2_FILTER_CODE
// restore REG_b from the save slot
2501 "mov %6, %%"REG_b" \n\t"
2503 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2504 "m" (mmx2FilterCode), "m" (src2)
2508 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Overwrite the trailing outputs whose source index would reach the last
// source pixel with that pixel scaled by 128 (both planes).
2513 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2514 //printf("%d %d %d\n", dstWidth, i, srcW);
2515 dst[i] = src1[srcW-1]*128;
2516 dst[i+VOFW] = src2[srcW-1]*128;
2519 #endif /* COMPILE_TEMPLATE_MMX2 */
// split the 16.16 increment into integer and fractional parts for add/adc
2520 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2521 uint16_t xInc_mask = xInc & 0xffff;
// Operands: %0 src1, %1 dst, %2 dstWidth, %3 xInc_shr16, %4 xInc_mask;
// %5 is used as the base of the second plane (its operand line is not
// visible; presumably src2 — confirm). One output per plane per iteration.
2523 "xor %%"REG_a", %%"REG_a" \n\t" // i
2524 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2525 "xorl %%ecx, %%ecx \n\t" // xalpha
2528 "mov %0, %%"REG_S" \n\t"
2529 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2530 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2532 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2534 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2535 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2537 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2539 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2540 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2541 "add $1, %%"REG_a" \n\t"
2542 "cmp %2, %%"REG_a" \n\t"
2545 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2546 which is needed to support GCC 4.0. */
2547 #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2548 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2550 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2553 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2555 #if COMPILE_TEMPLATE_MMX2
2556 } //if MMX2 can't be used
2559 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2560 #endif /* ARCH_X86 */
// In-place chroma range conversion (skipped for RGB/BGR output)
2562 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
2564 //FIXME all pal and rgb srcFormats could do this conversion as well
2565 //FIXME all scalers more complex than bilinear could do half of this transform
// scale by 1799/2^11 (about 0.878) plus offset: compresses the value range
2567 for (i=0; i<dstWidth; i++) {
2568 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2569 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
// scale by 4663/2^12 (about 1.138) minus offset: expands the value range;
// the input is clamped to 30775 before multiplying
2572 for (i=0; i<dstWidth; i++) {
2573 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2574 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
/* Set to 1 to make DEBUG_BUFFERS() emit its messages at AV_LOG_DEBUG level. */
#define DEBUG_SWSCALE_BUFFERS 0
/*
 * printf-style debug logging through av_log(). The macro uses the identifier
 * `c` from the calling scope as the logging context.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to a
 * single statement: a bare `if (...) av_log(...)` would capture the `else` of
 * an enclosing unbraced if/else at the call site.
 */
#define DEBUG_BUFFERS(...) do { if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__); } while (0)
/**
 * Scale one horizontal slice of the source picture.
 *
 * src / srcStride describe the input slice, which starts at source row
 * srcSliceY and is srcSliceH rows high; dst / dstStride describe the output
 * picture. For every output row, the source rows it needs are horizontally
 * scaled into the lumPixBuf / chrPixBuf / alpPixBuf ring buffers
 * (hyscale / hcscale) and then vertically filtered and written in dstFormat by
 * one of the c->yuv2* output functions.
 *
 * The ring-buffer state (lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf)
 * is read from the context on entry and stored back before returning, so a
 * later call with the next slice continues from it.
 *
 * Side effects visible here: srcStride[] entries of the caller's array are
 * modified in place, and the dither fields of the context are updated per row
 * in the MMX build.
 *
 * Returns dstY - lastDstY.
 * NOTE(review): the declarations and updates of dstY, lastDstY, i and
 * enough_lines, several braces/else branches and the names of some called
 * output functions are not visible in this view; the return value is
 * presumably the number of output rows produced by this call — confirm.
 */
2583 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2584 int srcSliceH, uint8_t* dst[], int dstStride[])
2586 /* load a few things into local vars to make the code more readable? and faster */
2587 const int srcW= c->srcW;
2588 const int dstW= c->dstW;
2589 const int dstH= c->dstH;
2590 const int chrDstW= c->chrDstW;
2591 const int chrSrcW= c->chrSrcW;
2592 const int lumXInc= c->lumXInc;
2593 const int chrXInc= c->chrXInc;
2594 const enum PixelFormat dstFormat= c->dstFormat;
2595 const enum PixelFormat srcFormat= c->srcFormat;
2596 const int flags= c->flags;
2597 int16_t *vLumFilterPos= c->vLumFilterPos;
2598 int16_t *vChrFilterPos= c->vChrFilterPos;
2599 int16_t *hLumFilterPos= c->hLumFilterPos;
2600 int16_t *hChrFilterPos= c->hChrFilterPos;
2601 int16_t *vLumFilter= c->vLumFilter;
2602 int16_t *vChrFilter= c->vChrFilter;
2603 int16_t *hLumFilter= c->hLumFilter;
2604 int16_t *hChrFilter= c->hChrFilter;
2605 int32_t *lumMmxFilter= c->lumMmxFilter;
2606 int32_t *chrMmxFilter= c->chrMmxFilter;
2607 int32_t *alpMmxFilter= c->alpMmxFilter;
2608 const int vLumFilterSize= c->vLumFilterSize;
2609 const int vChrFilterSize= c->vChrFilterSize;
2610 const int hLumFilterSize= c->hLumFilterSize;
2611 const int hChrFilterSize= c->hChrFilterSize;
2612 int16_t **lumPixBuf= c->lumPixBuf;
2613 int16_t **chrPixBuf= c->chrPixBuf;
2614 int16_t **alpPixBuf= c->alpPixBuf;
2615 const int vLumBufSize= c->vLumBufSize;
2616 const int vChrBufSize= c->vChrBufSize;
2617 uint8_t *formatConvBuffer= c->formatConvBuffer;
// slice position/height in chroma rows; the double negation makes the height
// round up (assuming an arithmetic right shift of negative values)
2618 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2619 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2621 uint32_t *pal=c->pal_yuv;
2623 /* vars which will change and which we need to store back in the context */
2625 int lumBufIndex= c->lumBufIndex;
2626 int chrBufIndex= c->chrBufIndex;
2627 int lastInLumBuf= c->lastInLumBuf;
2628 int lastInChrBuf= c->lastInChrBuf;
// NOTE(review): most of the packed-format branch body is not visible
2630 if (isPacked(c->srcFormat)) {
2638 srcStride[3]= srcStride[0];
// these writes modify the caller's srcStride array
2640 srcStride[1]<<= c->vChrDrop;
2641 srcStride[2]<<= c->vChrDrop;
2643 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2644 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2645 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2646 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2647 srcSliceY, srcSliceH, dstY, dstH);
2648 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2649 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
// warn (when SWS_PRINT_INFO is set) if any destination stride is not a multiple of 8
2651 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2652 static int warnedAlready=0; //FIXME move this into the context perhaps
2653 if (flags & SWS_PRINT_INFO && !warnedAlready) {
2654 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2655 " ->cannot do aligned memory accesses anymore\n");
2660 /* Note the user might start scaling the picture in the middle so this
2661 will not get executed. This is not really intended but works
2662 currently, so people might do it. */
// NOTE(review): the body of this first-slice branch is not visible
2663 if (srcSliceY ==0) {
// main loop: one iteration per output row
2673 for (;dstY < dstH; dstY++) {
2674 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2675 const int chrDstY= dstY>>c->chrDstVSubSample;
2676 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2677 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2678 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2680 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2681 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2682 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2683 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2686 //handle holes (FAST_BILINEAR & weird filters)
2687 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2688 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2689 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2690 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2692 // Do we have enough lines in this slice to output the dstY line
2693 enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
// if not, only buffer the rows this slice does contain
2694 if (!enough_lines) {
2695 lastLumSrcY = srcSliceY + srcSliceH - 1;
2696 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2699 DEBUG_BUFFERS("dstY: %d\n", dstY);
2700 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2701 firstLumSrcY, lastLumSrcY, lastInLumBuf);
2702 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2703 firstChrSrcY, lastChrSrcY, lastInChrBuf);
2705 //Do horizontal scaling
// luma (and alpha, taken from plane 3) rows up to lastLumSrcY
// NOTE(review): the increments of lumBufIndex/lastInLumBuf are not visible
2706 while(lastInLumBuf < lastLumSrcY) {
2707 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2708 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2710 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2711 lumBufIndex, lastInLumBuf);
2712 assert(lumBufIndex < 2*vLumBufSize);
2713 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2714 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2715 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2716 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2717 c->srcFormat, formatConvBuffer,
2719 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2720 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2721 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2722 c->srcFormat, formatConvBuffer,
// chroma rows up to lastChrSrcY (planes 1 and 2)
2726 while(lastInChrBuf < lastChrSrcY) {
2727 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2728 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2730 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2731 chrBufIndex, lastInChrBuf);
2732 assert(chrBufIndex < 2*vChrBufSize);
2733 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2734 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2735 //FIXME replace parameters through context struct (some at least)
// chroma scaling is skipped when either format is gray
2737 if (!(isGray(srcFormat) || isGray(dstFormat)))
2738 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2739 flags, hChrFilter, hChrFilterPos, hChrFilterSize,
2740 c->srcFormat, formatConvBuffer,
2744 //wrap buf index around to stay inside the ring buffer
2745 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2746 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
// NOTE(review): the condition guarding this break is not visible (presumably !enough_lines)
2748 break; //we can't output a dstY line so let's try with the next slice
2750 #if COMPILE_TEMPLATE_MMX
// select per-row dither tables; they alternate with the parity of dstY
2751 c->blueDither= ff_dither8[dstY&1];
2752 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2753 c->greenDither= ff_dither8[dstY&1];
2755 c->greenDither= ff_dither4[dstY&1];
2756 c->redDither= ff_dither8[(dstY+1)&1];
// all rows except the last two take this path; see the comment on the else branch
2758 if (dstY < dstH-2) {
// pointers to the first buffered row needed for this output row
2759 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2760 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2761 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2762 #if COMPILE_TEMPLATE_MMX
// Fill the *MmxFilter tables with row pointers and coefficients.
// Accurate-rounding layout: two rows per entry, two coefficients packed per 32-bit word.
2764 if (flags & SWS_ACCURATE_RND) {
2765 int s= APCK_SIZE / 8;
2766 for (i=0; i<vLumFilterSize; i+=2) {
2767 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2768 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2769 lumMmxFilter[s*i+APCK_COEF/4 ]=
2770 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2771 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2772 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2773 *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2774 *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2775 alpMmxFilter[s*i+APCK_COEF/4 ]=
2776 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2779 for (i=0; i<vChrFilterSize; i+=2) {
2780 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2781 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2782 chrMmxFilter[s*i+APCK_COEF/4 ]=
2783 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2784 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
// Default layout: 4 words per tap = row pointer (low/high 32 bits) followed by
// the coefficient replicated into both 16-bit halves of two words (*0x10001).
2787 for (i=0; i<vLumFilterSize; i++) {
2788 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2789 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2790 lumMmxFilter[4*i+2]=
2791 lumMmxFilter[4*i+3]=
2792 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2793 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2794 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2795 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2796 alpMmxFilter[4*i+2]=
2797 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2800 for (i=0; i<vChrFilterSize; i++) {
2801 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2802 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2803 chrMmxFilter[4*i+2]=
2804 chrMmxFilter[4*i+3]=
2805 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
// Vertical filtering + output, dispatched on the destination format.
// NOTE(review): the names of several called output functions are on lines not visible here.
2809 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
// chroma is only written on rows where the low chrDstVSubSample bits of dstY are zero
2810 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2811 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2813 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2814 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2815 dest, uDest, dstW, chrDstW, dstFormat);
2816 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2817 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2818 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2819 if (is16BPS(dstFormat)) {
2821 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2822 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2823 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2825 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2826 int16_t *lumBuf = lumSrcPtr[0];
2827 int16_t *chrBuf= chrSrcPtr[0];
2828 int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2829 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2830 } else { //General YV12
2832 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2833 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2834 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
// packed output formats
2837 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2838 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2839 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2840 int chrAlpha= vChrFilter[2*dstY+1];
2841 if(flags & SWS_FULL_CHR_H_INT) {
2842 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2843 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2844 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2845 alpSrcPtr, dest, dstW, dstY);
2847 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2848 alpPixBuf ? *alpSrcPtr : NULL,
2849 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2851 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2852 int lumAlpha= vLumFilter[2*dstY+1];
2853 int chrAlpha= vChrFilter[2*dstY+1];
2855 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2857 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2858 if(flags & SWS_FULL_CHR_H_INT) {
2859 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2860 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2861 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2862 alpSrcPtr, dest, dstW, dstY);
2864 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2865 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2866 dest, dstW, lumAlpha, chrAlpha, dstY);
2868 } else { //general RGB
2869 if(flags & SWS_FULL_CHR_H_INT) {
2871 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2872 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2873 alpSrcPtr, dest, dstW, dstY);
2876 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2877 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2878 alpSrcPtr, dest, dstW, dstY);
// last two output rows: same dispatch as above, without the MMX table setup
2882 } else { // hmm looks like we can't use MMX here without overwriting this array's tail
2883 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2884 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2885 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2886 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2887 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2888 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2890 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2891 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2892 dest, uDest, dstW, chrDstW, dstFormat);
2893 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2894 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2895 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2896 if (is16BPS(dstFormat)) {
2898 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2899 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2900 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2904 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2905 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2906 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2909 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2910 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2911 if(flags & SWS_FULL_CHR_H_INT) {
2913 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2914 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2915 alpSrcPtr, dest, dstW, dstY);
2918 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2919 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2920 alpSrcPtr, dest, dstW, dstY);
// YUVA420P output without an alpha source: fill the rows written by this call with 255
2926 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2927 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2929 #if COMPILE_TEMPLATE_MMX
// fence the stores when MMX2 is in use, then leave MMX state (femms/emms)
2930 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
2931 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2932 if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
2933 else __asm__ volatile("emms" :::"memory");
2935 /* store changed local vars back in the context */
2937 c->lumBufIndex= lumBufIndex;
2938 c->chrBufIndex= chrBufIndex;
2939 c->lastInLumBuf= lastInLumBuf;
2940 c->lastInChrBuf= lastInChrBuf;
2942 return dstY - lastDstY;
2945 static void RENAME(sws_init_swScale)(SwsContext *c)
2947 enum PixelFormat srcFormat = c->srcFormat;
2949 c->yuv2nv12X = RENAME(yuv2nv12X );
2950 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2951 c->yuv2yuvX = RENAME(yuv2yuvX );
2952 c->yuv2packed1 = RENAME(yuv2packed1 );
2953 c->yuv2packed2 = RENAME(yuv2packed2 );
2954 c->yuv2packedX = RENAME(yuv2packedX );
2956 c->hScale = RENAME(hScale );
2958 c->hyscale_fast = RENAME(hyscale_fast);
2959 c->hcscale_fast = RENAME(hcscale_fast);
2961 c->hcscale_internal = NULL;
2963 case PIX_FMT_YUYV422 : c->hcscale_internal = RENAME(yuy2ToUV); break;
2964 case PIX_FMT_UYVY422 : c->hcscale_internal = RENAME(uyvyToUV); break;
2965 case PIX_FMT_NV12 : c->hcscale_internal = RENAME(nv12ToUV); break;
2966 case PIX_FMT_NV21 : c->hcscale_internal = RENAME(nv21ToUV); break;
2970 case PIX_FMT_BGR4_BYTE:
2971 case PIX_FMT_RGB4_BYTE: c->hcscale_internal = palToUV; break;
2972 case PIX_FMT_YUV420P16BE:
2973 case PIX_FMT_YUV422P16BE:
2974 case PIX_FMT_YUV444P16BE: c->hcscale_internal = RENAME(BEToUV); break;
2975 case PIX_FMT_YUV420P16LE:
2976 case PIX_FMT_YUV422P16LE:
2977 case PIX_FMT_YUV444P16LE: c->hcscale_internal = RENAME(LEToUV); break;
2979 if (c->chrSrcHSubSample) {
2981 case PIX_FMT_RGB48BE:
2982 case PIX_FMT_RGB48LE: c->hcscale_internal = rgb48ToUV_half; break;
2983 case PIX_FMT_RGB32 :
2984 case PIX_FMT_RGB32_1: c->hcscale_internal = bgr32ToUV_half; break;
2985 case PIX_FMT_BGR24 : c->hcscale_internal = RENAME(bgr24ToUV_half); break;
2986 case PIX_FMT_BGR565 : c->hcscale_internal = bgr16ToUV_half; break;
2987 case PIX_FMT_BGR555 : c->hcscale_internal = bgr15ToUV_half; break;
2988 case PIX_FMT_BGR32 :
2989 case PIX_FMT_BGR32_1: c->hcscale_internal = rgb32ToUV_half; break;
2990 case PIX_FMT_RGB24 : c->hcscale_internal = RENAME(rgb24ToUV_half); break;
2991 case PIX_FMT_RGB565 : c->hcscale_internal = rgb16ToUV_half; break;
2992 case PIX_FMT_RGB555 : c->hcscale_internal = rgb15ToUV_half; break;
2996 case PIX_FMT_RGB48BE:
2997 case PIX_FMT_RGB48LE: c->hcscale_internal = rgb48ToUV; break;
2998 case PIX_FMT_RGB32 :
2999 case PIX_FMT_RGB32_1: c->hcscale_internal = bgr32ToUV; break;
3000 case PIX_FMT_BGR24 : c->hcscale_internal = RENAME(bgr24ToUV); break;
3001 case PIX_FMT_BGR565 : c->hcscale_internal = bgr16ToUV; break;
3002 case PIX_FMT_BGR555 : c->hcscale_internal = bgr15ToUV; break;
3003 case PIX_FMT_BGR32 :
3004 case PIX_FMT_BGR32_1: c->hcscale_internal = rgb32ToUV; break;
3005 case PIX_FMT_RGB24 : c->hcscale_internal = RENAME(rgb24ToUV); break;
3006 case PIX_FMT_RGB565 : c->hcscale_internal = rgb16ToUV; break;
3007 case PIX_FMT_RGB555 : c->hcscale_internal = rgb15ToUV; break;
3011 c->hyscale_internal = NULL;
3012 c->hascale_internal = NULL;
3013 switch (srcFormat) {
3014 case PIX_FMT_YUYV422 :
3015 case PIX_FMT_YUV420P16BE:
3016 case PIX_FMT_YUV422P16BE:
3017 case PIX_FMT_YUV444P16BE:
3018 case PIX_FMT_GRAY16BE : c->hyscale_internal = RENAME(yuy2ToY); break;
3019 case PIX_FMT_UYVY422 :
3020 case PIX_FMT_YUV420P16LE:
3021 case PIX_FMT_YUV422P16LE:
3022 case PIX_FMT_YUV444P16LE:
3023 case PIX_FMT_GRAY16LE : c->hyscale_internal = RENAME(uyvyToY); break;
3024 case PIX_FMT_BGR24 : c->hyscale_internal = RENAME(bgr24ToY); break;
3025 case PIX_FMT_BGR565 : c->hyscale_internal = bgr16ToY; break;
3026 case PIX_FMT_BGR555 : c->hyscale_internal = bgr15ToY; break;
3027 case PIX_FMT_RGB24 : c->hyscale_internal = RENAME(rgb24ToY); break;
3028 case PIX_FMT_RGB565 : c->hyscale_internal = rgb16ToY; break;
3029 case PIX_FMT_RGB555 : c->hyscale_internal = rgb15ToY; break;
3033 case PIX_FMT_BGR4_BYTE:
3034 case PIX_FMT_RGB4_BYTE: c->hyscale_internal = palToY; break;
3035 case PIX_FMT_MONOBLACK: c->hyscale_internal = monoblack2Y; break;
3036 case PIX_FMT_MONOWHITE: c->hyscale_internal = monowhite2Y; break;
3037 case PIX_FMT_RGB32 :
3038 case PIX_FMT_RGB32_1: c->hyscale_internal = bgr32ToY; break;
3039 case PIX_FMT_BGR32 :
3040 case PIX_FMT_BGR32_1: c->hyscale_internal = rgb32ToY; break;
3041 case PIX_FMT_RGB48BE:
3042 case PIX_FMT_RGB48LE: c->hyscale_internal = rgb48ToY; break;
3045 switch (srcFormat) {
3046 case PIX_FMT_RGB32 :
3047 case PIX_FMT_RGB32_1:
3048 case PIX_FMT_BGR32 :
3049 case PIX_FMT_BGR32_1: c->hascale_internal = abgrToA; break;