2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
/*
 * PREFETCH / PREFETCHW: mnemonic strings spliced into inline-asm text.
 * 3DNow! builds get prefetch / prefetchw, MMX2 builds get
 * prefetchnta / prefetcht0. The last pair expands to the text " # nop";
 * the # character begins a comment in GNU as on x86, so that text emits
 * no instruction (presumably the fallback for builds with neither
 * extension - confirm against the surrounding conditionals).
 */
30 #if COMPILE_TEMPLATE_AMD3DNOW
31 #define PREFETCH "prefetch"
32 #define PREFETCHW "prefetchw"
33 #elif COMPILE_TEMPLATE_MMX2
34 #define PREFETCH "prefetchnta"
35 #define PREFETCHW "prefetcht0"
37 #define PREFETCH " # nop"
38 #define PREFETCHW " # nop"
/*
 * PAVGB(a,b): one line of asm text performing a packed byte average.
 * MMX2 builds use pavgb, 3DNow! builds use pavgusb. Both arguments are
 * stringified, so they are pasted verbatim as the instruction operands
 * (a is the source operand, b the destination in AT&T order).
 */
41 #if COMPILE_TEMPLATE_MMX2
42 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
43 #elif COMPILE_TEMPLATE_AMD3DNOW
44 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
/*
 * REAL_MOVNTQ(a,b): one line of asm text storing 64-bit register a to b.
 * MMX2 builds use movntq (a non-temporal store); the other definition
 * uses a plain movq.
 * MOVNTQ(a,b) adds one level of macro expansion so that macros passed as
 * arguments are expanded before REAL_MOVNTQ stringifies them.
 */
47 #if COMPILE_TEMPLATE_MMX2
48 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
50 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
52 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
54 #if COMPILE_TEMPLATE_ALTIVEC
55 #include "ppc/swscale_altivec_template.c"
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 * Asm text plus operand and clobber lists for a multi-tap filter that
 * produces 8 output bytes per store.
 *   x      - string: byte displacement added to every source address
 *   offset - string: displacement from operand %0 (&c->redDither) of the
 *            filter table. Entries are 16 bytes apart: a source pointer
 *            at +0 and the coefficient quadword at +8.
 *   dest   - operand %1, output base address
 *   width  - operand %2, compared against REG_a after each store
 * mm3 and mm4 start from the rounder at VROUNDER_OFFSET(%0) and
 * accumulate pmulhw(source, coefficient) for two groups of four 16-bit
 * samples. The test instruction sets ZF when the next table entry holds
 * a zero source pointer. The sums are shifted right by 3, packed to
 * bytes with unsigned saturation and stored at (%1, REGa); REG_a then
 * advances by 8 and the accumulators and table pointer are reset.
 * Declared clobbers: REG_a, REG_d, REG_S. mm0 and mm2..mm5 are written
 * as well but are not listed.
 */
58 #define YSCALEYUV2YV12X(x, offset, dest, width) \
60 "xor %%"REG_a", %%"REG_a" \n\t"\
61 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
62 "movq %%mm3, %%mm4 \n\t"\
63 "lea " offset "(%0), %%"REG_d" \n\t"\
64 "mov (%%"REG_d"), %%"REG_S" \n\t"\
65 ASMALIGN(4) /* FIXME Unroll? */\
67 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
68 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
69 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
70 "add $16, %%"REG_d" \n\t"\
71 "mov (%%"REG_d"), %%"REG_S" \n\t"\
72 "test %%"REG_S", %%"REG_S" \n\t"\
73 "pmulhw %%mm0, %%mm2 \n\t"\
74 "pmulhw %%mm0, %%mm5 \n\t"\
75 "paddw %%mm2, %%mm3 \n\t"\
76 "paddw %%mm5, %%mm4 \n\t"\
78 "psraw $3, %%mm3 \n\t"\
79 "psraw $3, %%mm4 \n\t"\
80 "packuswb %%mm4, %%mm3 \n\t"\
81 MOVNTQ(%%mm3, (%1, %%REGa))\
82 "add $8, %%"REG_a" \n\t"\
83 "cmp %2, %%"REG_a" \n\t"\
84 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
85 "movq %%mm3, %%mm4 \n\t"\
86 "lea " offset "(%0), %%"REG_d" \n\t"\
87 "mov (%%"REG_d"), %%"REG_S" \n\t"\
89 :: "r" (&c->redDither),\
90 "r" (dest), "g" (width)\
91 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 * Same parameters and operand layout as YSCALEYUV2YV12X, but the taps
 * are accumulated in 32 bits.
 * Each table entry is APCK_SIZE bytes and supplies two source pointers
 * (at +0 and at APCK_PTR2) plus a coefficient quadword at APCK_COEF.
 * Samples from the two rows are interleaved with punpcklwd/punpckhwd and
 * fed to pmaddwd, so each 32-bit lane receives the sum of two products.
 * mm4/mm5 accumulate the first four output samples, mm6/mm7 the next
 * four. The test instruction sets ZF when the next entry holds a zero
 * source pointer.
 * Afterwards the sums are shifted right by 16, packed to signed words,
 * the rounder at VROUNDER_OFFSET(%0) is added, the result is shifted
 * right by 3, packed to bytes with unsigned saturation and stored at
 * (%1, REGa). REG_a advances by 8 and the accumulators are cleared.
 * Declared clobbers: REG_a, REG_d, REG_S. mm0..mm7 are all written.
 */
94 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
96 "lea " offset "(%0), %%"REG_d" \n\t"\
97 "xor %%"REG_a", %%"REG_a" \n\t"\
98 "pxor %%mm4, %%mm4 \n\t"\
99 "pxor %%mm5, %%mm5 \n\t"\
100 "pxor %%mm6, %%mm6 \n\t"\
101 "pxor %%mm7, %%mm7 \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
105 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
106 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
107 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
108 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
109 "movq %%mm0, %%mm3 \n\t"\
110 "punpcklwd %%mm1, %%mm0 \n\t"\
111 "punpckhwd %%mm1, %%mm3 \n\t"\
112 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
113 "pmaddwd %%mm1, %%mm0 \n\t"\
114 "pmaddwd %%mm1, %%mm3 \n\t"\
115 "paddd %%mm0, %%mm4 \n\t"\
116 "paddd %%mm3, %%mm5 \n\t"\
117 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
118 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
119 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
120 "test %%"REG_S", %%"REG_S" \n\t"\
121 "movq %%mm2, %%mm0 \n\t"\
122 "punpcklwd %%mm3, %%mm2 \n\t"\
123 "punpckhwd %%mm3, %%mm0 \n\t"\
124 "pmaddwd %%mm1, %%mm2 \n\t"\
125 "pmaddwd %%mm1, %%mm0 \n\t"\
126 "paddd %%mm2, %%mm6 \n\t"\
127 "paddd %%mm0, %%mm7 \n\t"\
129 "psrad $16, %%mm4 \n\t"\
130 "psrad $16, %%mm5 \n\t"\
131 "psrad $16, %%mm6 \n\t"\
132 "psrad $16, %%mm7 \n\t"\
133 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
134 "packssdw %%mm5, %%mm4 \n\t"\
135 "packssdw %%mm7, %%mm6 \n\t"\
136 "paddw %%mm0, %%mm4 \n\t"\
137 "paddw %%mm0, %%mm6 \n\t"\
138 "psraw $3, %%mm4 \n\t"\
139 "psraw $3, %%mm6 \n\t"\
140 "packuswb %%mm6, %%mm4 \n\t"\
141 MOVNTQ(%%mm4, (%1, %%REGa))\
142 "add $8, %%"REG_a" \n\t"\
143 "cmp %2, %%"REG_a" \n\t"\
144 "lea " offset "(%0), %%"REG_d" \n\t"\
145 "pxor %%mm4, %%mm4 \n\t"\
146 "pxor %%mm5, %%mm5 \n\t"\
147 "pxor %%mm6, %%mm6 \n\t"\
148 "pxor %%mm7, %%mm7 \n\t"\
149 "mov (%%"REG_d"), %%"REG_S" \n\t"\
151 :: "r" (&c->redDither),\
152 "r" (dest), "g" (width)\
153 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV121: asm text only; the operand list is supplied by the
 * user of the macro.
 * REG_a is loaded from operand %2 and used as the index. Per pass, eight
 * 16-bit samples are read from (%0, REG_a, 2), arithmetically shifted
 * right by 7 (truncating, no rounding term), packed to bytes with
 * unsigned saturation and stored at (%1, REGa); REG_a then advances by 8.
 * Writes mm0, mm1 and REG_a.
 * NOTE(review): the index appears to start negative and count up to
 * zero - confirm against the caller that sets up %0..%2.
 */
156 #define YSCALEYUV2YV121 \
157 "mov %2, %%"REG_a" \n\t"\
158 ASMALIGN(4) /* FIXME Unroll? */\
160 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
161 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
162 "psraw $7, %%mm0 \n\t"\
163 "psraw $7, %%mm1 \n\t"\
164 "packuswb %%mm1, %%mm0 \n\t"\
165 MOVNTQ(%%mm0, (%1, %%REGa))\
166 "add $8, %%"REG_a" \n\t"\
/*
 * YSCALEYUV2YV121_ACCURATE: rounding variant of YSCALEYUV2YV121.
 * mm7 is built as a constant: pcmpeqw sets every bit, psrlw by 15 leaves
 * 1 in each word and psllw by 6 turns that into 64 (half of 1 << 7).
 * That constant is added with signed saturation (paddsw) before the
 * arithmetic shift right by 7, so the result is rounded rather than
 * truncated. Loads, pack and store are the same as in YSCALEYUV2YV121.
 * Writes mm0, mm1, mm7 and REG_a.
 */
169 #define YSCALEYUV2YV121_ACCURATE \
170 "mov %2, %%"REG_a" \n\t"\
171 "pcmpeqw %%mm7, %%mm7 \n\t"\
172 "psrlw $15, %%mm7 \n\t"\
173 "psllw $6, %%mm7 \n\t"\
174 ASMALIGN(4) /* FIXME Unroll? */\
176 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
177 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
178 "paddsw %%mm7, %%mm0 \n\t"\
179 "paddsw %%mm7, %%mm1 \n\t"\
180 "psraw $7, %%mm0 \n\t"\
181 "psraw $7, %%mm1 \n\t"\
182 "packuswb %%mm1, %%mm0 \n\t"\
183 MOVNTQ(%%mm0, (%1, %%REGa))\
184 "add $8, %%"REG_a" \n\t"\
188 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
189 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
190 "r" (dest), "m" (dstW),
191 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
192 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/*
 * YSCALEYUV2PACKEDX_UV: chroma half of the packed-output multi-tap
 * filter. Zeroes REG_a (the pixel index), then walks the table at
 * CHR_MMX_FILTER_OFFSET(%0): each 16-byte entry holds a source pointer
 * at +0 and a coefficient quadword at +8.
 * mm3 (U) and mm4 (V) start from the rounder at VROUNDER_OFFSET(%0) and
 * accumulate pmulhw(source, coefficient). V samples are read
 * AV_STRINGIFY(VOF) bytes after the U samples of the same row.
 * The closing test sets ZF when the next entry holds a zero pointer.
 * Writes REG_a, REG_d, REG_S, mm0, mm2, mm3, mm4, mm5.
 */
194 #define YSCALEYUV2PACKEDX_UV \
196 "xor %%"REG_a", %%"REG_a" \n\t"\
200 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
201 "mov (%%"REG_d"), %%"REG_S" \n\t"\
202 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
203 "movq %%mm3, %%mm4 \n\t"\
206 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
207 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
208 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
209 "add $16, %%"REG_d" \n\t"\
210 "mov (%%"REG_d"), %%"REG_S" \n\t"\
211 "pmulhw %%mm0, %%mm2 \n\t"\
212 "pmulhw %%mm0, %%mm5 \n\t"\
213 "paddw %%mm2, %%mm3 \n\t"\
214 "paddw %%mm5, %%mm4 \n\t"\
215 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 * Luma/alpha half of the packed-output multi-tap filter, with the
 * register allocation chosen by the caller.
 *   offset     - string: displacement of the filter table from %0
 *   coeff      - register receiving the coefficient quadword (entry +8)
 *   src1, src2 - registers receiving samples 0..3 and 4..7 of the row
 *   dst1, dst2 - accumulators, both initialised from the rounder at
 *                VROUNDER_OFFSET(%0)
 * Register arguments are stringified and pasted verbatim. Table entries
 * are 16 bytes apart; the closing test sets ZF when the next entry holds
 * a zero source pointer. REG_a is used as the index but is not
 * initialised here. Writes REG_d and REG_S.
 */
218 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
219 "lea "offset"(%0), %%"REG_d" \n\t"\
220 "mov (%%"REG_d"), %%"REG_S" \n\t"\
221 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
222 "movq "#dst1", "#dst2" \n\t"\
225 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
226 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
227 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
228 "add $16, %%"REG_d" \n\t"\
229 "mov (%%"REG_d"), %%"REG_S" \n\t"\
230 "pmulhw "#coeff", "#src1" \n\t"\
231 "pmulhw "#coeff", "#src2" \n\t"\
232 "paddw "#src1", "#dst1" \n\t"\
233 "paddw "#src2", "#dst2" \n\t"\
234 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX: chroma pass followed by the luma pass over the
 * table at LUM_MMX_FILTER_OFFSET. The luma pass uses mm0 for the
 * coefficient, mm2/mm5 as sample scratch and leaves its sums in mm1 and
 * mm7; the chroma sums stay in mm3 and mm4.
 */
237 #define YSCALEYUV2PACKEDX \
238 YSCALEYUV2PACKEDX_UV \
239 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
/*
 * YSCALEYUV2PACKEDX_END: operand and clobber lists for the packed
 * multi-tap asm statements. There are no outputs. Inputs:
 *   %0      - &c->redDither, in a register
 *   %1..%3  - three memory operands named dummy, which keep the
 *             numbering of the later operands fixed
 *   %4      - dest, in a register
 *   %5      - dstW, in memory
 * Declared clobbers: REG_a, REG_d, REG_S. No MMX register is listed.
 */
241 #define YSCALEYUV2PACKEDX_END \
242 :: "r" (&c->redDither), \
243 "m" (dummy), "m" (dummy), "m" (dummy),\
244 "r" (dest), "m" (dstW) \
245 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV: chroma half of the packed-output filter
 * with 32-bit accumulation. Zeroes REG_a, then walks the table at
 * CHR_MMX_FILTER_OFFSET(%0) in steps of APCK_SIZE bytes. Each entry
 * supplies two row pointers (at +0 and at APCK_PTR2) and a coefficient
 * quadword at APCK_COEF. The two rows are interleaved word by word and
 * multiplied with pmaddwd; mm4/mm5 accumulate U and mm6/mm7 accumulate V
 * (read AV_STRINGIFY(VOF) bytes after U).
 * The sums are shifted right by 16, packed to signed words, the rounder
 * at VROUNDER_OFFSET(%0) is added, and the results are spilled to
 * U_TEMP(%0) and V_TEMP(%0), which frees the registers for the luma
 * pass. This macro therefore writes to the memory that %0 points at.
 */
248 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
250 "xor %%"REG_a", %%"REG_a" \n\t"\
254 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
255 "mov (%%"REG_d"), %%"REG_S" \n\t"\
256 "pxor %%mm4, %%mm4 \n\t"\
257 "pxor %%mm5, %%mm5 \n\t"\
258 "pxor %%mm6, %%mm6 \n\t"\
259 "pxor %%mm7, %%mm7 \n\t"\
262 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
263 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
264 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
265 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
266 "movq %%mm0, %%mm3 \n\t"\
267 "punpcklwd %%mm1, %%mm0 \n\t"\
268 "punpckhwd %%mm1, %%mm3 \n\t"\
269 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
270 "pmaddwd %%mm1, %%mm0 \n\t"\
271 "pmaddwd %%mm1, %%mm3 \n\t"\
272 "paddd %%mm0, %%mm4 \n\t"\
273 "paddd %%mm3, %%mm5 \n\t"\
274 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
275 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
276 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
277 "test %%"REG_S", %%"REG_S" \n\t"\
278 "movq %%mm2, %%mm0 \n\t"\
279 "punpcklwd %%mm3, %%mm2 \n\t"\
280 "punpckhwd %%mm3, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm2 \n\t"\
282 "pmaddwd %%mm1, %%mm0 \n\t"\
283 "paddd %%mm2, %%mm6 \n\t"\
284 "paddd %%mm0, %%mm7 \n\t"\
286 "psrad $16, %%mm4 \n\t"\
287 "psrad $16, %%mm5 \n\t"\
288 "psrad $16, %%mm6 \n\t"\
289 "psrad $16, %%mm7 \n\t"\
290 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
291 "packssdw %%mm5, %%mm4 \n\t"\
292 "packssdw %%mm7, %%mm6 \n\t"\
293 "paddw %%mm0, %%mm4 \n\t"\
294 "paddw %%mm0, %%mm6 \n\t"\
295 "movq %%mm4, "U_TEMP"(%0) \n\t"\
296 "movq %%mm6, "V_TEMP"(%0) \n\t"\
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 * Luma/alpha half of the packed-output filter with 32-bit accumulation.
 *   offset - string: displacement of the filter table from %0
 * Table layout and pmaddwd scheme match the accurate chroma pass:
 * entries of APCK_SIZE bytes, row pointers at +0 and APCK_PTR2, and the
 * coefficient quadword at APCK_COEF (held in mm4 here). mm1/mm5
 * accumulate samples 0..3 and mm7/mm6 accumulate samples 4..7.
 * On exit mm1 and mm7 hold the rounded 16-bit sums, and mm3 and mm4 are
 * reloaded from U_TEMP(%0) and V_TEMP(%0).
 * REG_a is used as the index but is not initialised here.
 */
298 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
299 "lea "offset"(%0), %%"REG_d" \n\t"\
300 "mov (%%"REG_d"), %%"REG_S" \n\t"\
301 "pxor %%mm1, %%mm1 \n\t"\
302 "pxor %%mm5, %%mm5 \n\t"\
303 "pxor %%mm7, %%mm7 \n\t"\
304 "pxor %%mm6, %%mm6 \n\t"\
307 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
308 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
309 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
310 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
311 "movq %%mm0, %%mm3 \n\t"\
312 "punpcklwd %%mm4, %%mm0 \n\t"\
313 "punpckhwd %%mm4, %%mm3 \n\t"\
314 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
315 "pmaddwd %%mm4, %%mm0 \n\t"\
316 "pmaddwd %%mm4, %%mm3 \n\t"\
317 "paddd %%mm0, %%mm1 \n\t"\
318 "paddd %%mm3, %%mm5 \n\t"\
319 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
320 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
321 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
322 "test %%"REG_S", %%"REG_S" \n\t"\
323 "movq %%mm2, %%mm0 \n\t"\
324 "punpcklwd %%mm3, %%mm2 \n\t"\
325 "punpckhwd %%mm3, %%mm0 \n\t"\
326 "pmaddwd %%mm4, %%mm2 \n\t"\
327 "pmaddwd %%mm4, %%mm0 \n\t"\
328 "paddd %%mm2, %%mm7 \n\t"\
329 "paddd %%mm0, %%mm6 \n\t"\
331 "psrad $16, %%mm1 \n\t"\
332 "psrad $16, %%mm5 \n\t"\
333 "psrad $16, %%mm7 \n\t"\
334 "psrad $16, %%mm6 \n\t"\
335 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
336 "packssdw %%mm5, %%mm1 \n\t"\
337 "packssdw %%mm6, %%mm7 \n\t"\
338 "paddw %%mm0, %%mm1 \n\t"\
339 "paddw %%mm0, %%mm7 \n\t"\
340 "movq "U_TEMP"(%0), %%mm3 \n\t"\
341 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/*
 * YSCALEYUV2PACKEDX_ACCURATE: accurate chroma pass followed by the
 * accurate luma pass over the table at LUM_MMX_FILTER_OFFSET. Leaves
 * mm1/mm7 = luma and mm3/mm4 = U/V, the same register layout the
 * non-accurate YSCALEYUV2PACKEDX produces.
 */
343 #define YSCALEYUV2PACKEDX_ACCURATE \
344 YSCALEYUV2PACKEDX_ACCURATE_UV \
345 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/*
 * YSCALEYUV2RGBX: YUV to RGB conversion on the register layout left by
 * the packed filters, with every constant addressed relative to %0.
 * In  (16-bit words): mm3 = U, mm4 = V, mm1 = luma 0..3, mm7 = luma 4..7.
 * Out (8 bytes each, unsigned saturated): mm2 = B, mm4 = G, mm5 = R.
 * The chroma and luma offsets are subtracted, each term is scaled with
 * pmulhw by its coefficient, the two green terms are summed, and each
 * chroma word is duplicated (punpcklwd/punpckhwd of a register with
 * itself) so one chroma sample serves two neighbouring luma samples.
 * Overwrites mm0..mm7.
 */
347 #define YSCALEYUV2RGBX \
348 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
349 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
350 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
351 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
352 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
353 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
354 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
355 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
356 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
357 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
358 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
359 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
360 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
361 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
362 "paddw %%mm3, %%mm4 \n\t"\
363 "movq %%mm2, %%mm0 \n\t"\
364 "movq %%mm5, %%mm6 \n\t"\
365 "movq %%mm4, %%mm3 \n\t"\
366 "punpcklwd %%mm2, %%mm2 \n\t"\
367 "punpcklwd %%mm5, %%mm5 \n\t"\
368 "punpcklwd %%mm4, %%mm4 \n\t"\
369 "paddw %%mm1, %%mm2 \n\t"\
370 "paddw %%mm1, %%mm5 \n\t"\
371 "paddw %%mm1, %%mm4 \n\t"\
372 "punpckhwd %%mm0, %%mm0 \n\t"\
373 "punpckhwd %%mm6, %%mm6 \n\t"\
374 "punpckhwd %%mm3, %%mm3 \n\t"\
375 "paddw %%mm7, %%mm0 \n\t"\
376 "paddw %%mm7, %%mm6 \n\t"\
377 "paddw %%mm7, %%mm3 \n\t"\
378 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
379 "packuswb %%mm0, %%mm2 \n\t"\
380 "packuswb %%mm6, %%mm5 \n\t"\
381 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * REAL_YSCALEYUV2PACKED(index, c)
 * Weighted blend of two luma rows (%0, %1) and two chroma rows (%2, %3),
 * kept as 16-bit words for a packed YUV writer.
 *   index - register used as the pixel index; zeroed here
 *   c     - register holding the base address for the *_OFFSET fields
 * Side effect: the quadwords at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back to
 * memory before they are used as the blend weights, so the stored
 * coefficients are modified each time this text runs.
 * Out: mm3 = U, mm4 = V, mm1 = luma 0..3, mm7 = luma 4..7, each computed
 * as pmulhw(row0 - row1, weight) + (row1 >> 7).
 * YSCALEYUV2PACKED adds a level of expansion so that macro arguments are
 * expanded before they are stringified.
 */
383 #define REAL_YSCALEYUV2PACKED(index, c) \
384 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
385 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
386 "psraw $3, %%mm0 \n\t"\
387 "psraw $3, %%mm1 \n\t"\
388 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
389 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
390 "xor "#index", "#index" \n\t"\
393 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
394 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
395 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
396 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
397 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
398 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
401 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
402 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
403 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
404 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
405 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
406 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
407 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
408 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
409 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
410 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
411 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
412 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
413 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
414 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
415 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
416 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
417 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
419 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/*
 * REAL_YSCALEYUV2RGB_UV(index, c)
 * Chroma stage of the two-row blend for RGB output.
 *   index - register used as the pixel index; zeroed here
 *   c     - register holding the base address for the *_OFFSET fields
 * Blends the chroma rows at %2 and %3 as
 * pmulhw(row0 - row1, weight) + (row1 >> 4), using the weight quadword
 * at CHR_MMX_FILTER_OFFSET+8(c). It then subtracts U_OFFSET/V_OFFSET and
 * forms the green contributions.
 * Out: mm2 = U - offset, mm5 = V - offset, mm3 = U * UG_COEFF (high
 * word), mm4 = V * VG_COEFF (high word). Also writes mm0.
 */
421 #define REAL_YSCALEYUV2RGB_UV(index, c) \
422 "xor "#index", "#index" \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
430 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
431 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
432 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
433 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
434 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
435 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
436 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
437 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
438 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
439 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
440 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
441 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
442 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
443 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
444 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 * Luma (or alpha) stage of the two-row blend for RGB output.
 *   index  - pixel index register (not modified here)
 *   c      - register holding the base address for the *_OFFSET fields
 *   b1, b2 - operands naming the two row base addresses
 * Computes pmulhw(b1row - b2row, weight) + (b2row >> 4) with the weight
 * quadword at LUM_MMX_FILTER_OFFSET+8(c).
 * Out: mm1 = samples 0..3, mm7 = samples 4..7. Also writes mm0 and mm6.
 */
446 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
447 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
448 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
449 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
450 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
451 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
452 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
456 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
457 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
458 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/*
 * REAL_YSCALEYUV2RGB_COEFF(c)
 * Final matrix stage of the RGB conversion; c is the register holding
 * the base address for the coefficient fields.
 * In : mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg,
 *      mm1 = luma 0..3, mm7 = luma 4..7 (16-bit words).
 * Out: mm2 = B, mm4 = G, mm5 = R, 8 unsigned saturated bytes each.
 * Each chroma word is duplicated so that one chroma sample is applied to
 * two neighbouring luma samples. Overwrites mm0..mm7.
 */
460 #define REAL_YSCALEYUV2RGB_COEFF(c) \
461 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
462 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
463 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
464 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
465 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
466 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
467 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
468 "paddw %%mm3, %%mm4 \n\t"\
469 "movq %%mm2, %%mm0 \n\t"\
470 "movq %%mm5, %%mm6 \n\t"\
471 "movq %%mm4, %%mm3 \n\t"\
472 "punpcklwd %%mm2, %%mm2 \n\t"\
473 "punpcklwd %%mm5, %%mm5 \n\t"\
474 "punpcklwd %%mm4, %%mm4 \n\t"\
475 "paddw %%mm1, %%mm2 \n\t"\
476 "paddw %%mm1, %%mm5 \n\t"\
477 "paddw %%mm1, %%mm4 \n\t"\
478 "punpckhwd %%mm0, %%mm0 \n\t"\
479 "punpckhwd %%mm6, %%mm6 \n\t"\
480 "punpckhwd %%mm3, %%mm3 \n\t"\
481 "paddw %%mm7, %%mm0 \n\t"\
482 "paddw %%mm7, %%mm6 \n\t"\
483 "paddw %%mm7, %%mm3 \n\t"\
484 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
485 "packuswb %%mm0, %%mm2 \n\t"\
486 "packuswb %%mm6, %%mm5 \n\t"\
487 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * YSCALEYUV2RGB_YA adds a level of expansion so that macro arguments are
 * expanded before REAL_YSCALEYUV2RGB_YA stringifies them.
 * YSCALEYUV2RGB(index, c) chains the chroma stage, the luma stage (rows
 * taken from operands %0 and %1) and the coefficient stage, leaving
 * mm2 = B, mm4 = G, mm5 = R as packed bytes.
 */
489 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
491 #define YSCALEYUV2RGB(index, c) \
492 REAL_YSCALEYUV2RGB_UV(index, c) \
493 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
494 REAL_YSCALEYUV2RGB_COEFF(c)
/*
 * REAL_YSCALEYUV2PACKED1(index, c)
 * Single-row variant for packed YUV output: no blending, only operands
 * %0 (luma row) and %2 (chroma row) are read. The c argument is not
 * referenced by the body.
 *   index - register used as the pixel index; zeroed here
 * Out (16-bit words, each source value shifted right by 7):
 *   mm3 = U, mm4 = V (read AV_STRINGIFY(VOF) bytes after U),
 *   mm1 = luma 0..3, mm7 = luma 4..7.
 * YSCALEYUV2PACKED1 adds a level of expansion so that macro arguments
 * are expanded before they are stringified.
 */
496 #define REAL_YSCALEYUV2PACKED1(index, c) \
497 "xor "#index", "#index" \n\t"\
500 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
501 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
502 "psraw $7, %%mm3 \n\t" \
503 "psraw $7, %%mm4 \n\t" \
504 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
505 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
506 "psraw $7, %%mm1 \n\t" \
507 "psraw $7, %%mm7 \n\t" \
509 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * REAL_YSCALEYUV2RGB1(index, c)
 * Single-row YUV to RGB conversion: reads only operand %0 (luma row) and
 * operand %2 (chroma row), with no vertical blending.
 *   index - register used as the pixel index; zeroed here
 *   c     - register holding the base address for offsets/coefficients
 * Samples are shifted right by 4, the offsets are subtracted and the
 * same coefficient matrix as in REAL_YSCALEYUV2RGB_COEFF is applied.
 * Out: mm2 = B, mm4 = G, mm5 = R, 8 unsigned saturated bytes each.
 * Overwrites mm0..mm7.
 * YSCALEYUV2RGB1 adds a level of expansion so that macro arguments are
 * expanded before they are stringified.
 */
511 #define REAL_YSCALEYUV2RGB1(index, c) \
512 "xor "#index", "#index" \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
517 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
518 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
519 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
520 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
521 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
522 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
523 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
524 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
525 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
526 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
527 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
528 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
529 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
530 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
531 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
532 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
533 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
534 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
535 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
536 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
537 "paddw %%mm3, %%mm4 \n\t"\
538 "movq %%mm2, %%mm0 \n\t"\
539 "movq %%mm5, %%mm6 \n\t"\
540 "movq %%mm4, %%mm3 \n\t"\
541 "punpcklwd %%mm2, %%mm2 \n\t"\
542 "punpcklwd %%mm5, %%mm5 \n\t"\
543 "punpcklwd %%mm4, %%mm4 \n\t"\
544 "paddw %%mm1, %%mm2 \n\t"\
545 "paddw %%mm1, %%mm5 \n\t"\
546 "paddw %%mm1, %%mm4 \n\t"\
547 "punpckhwd %%mm0, %%mm0 \n\t"\
548 "punpckhwd %%mm6, %%mm6 \n\t"\
549 "punpckhwd %%mm3, %%mm3 \n\t"\
550 "paddw %%mm7, %%mm0 \n\t"\
551 "paddw %%mm7, %%mm6 \n\t"\
552 "paddw %%mm7, %%mm3 \n\t"\
553 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
554 "packuswb %%mm0, %%mm2 \n\t"\
555 "packuswb %%mm6, %%mm5 \n\t"\
556 "packuswb %%mm3, %%mm4 \n\t"\
558 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * REAL_YSCALEYUV2PACKED1b(index, c)
 * Packed YUV variant that takes luma from a single row (%0) but averages
 * the two chroma rows at %2 and %3. The c argument is not referenced.
 *   index - register used as the pixel index; zeroed here
 * The chroma rows are summed with paddw and logically shifted right by
 * 8, i.e. the mean of the two rows shifted right by 7. Luma is shifted
 * right by 7 arithmetically.
 * Out (16-bit words): mm3 = U, mm4 = V, mm1 = luma 0..3, mm7 = luma 4..7.
 * Also writes mm2 and mm5.
 * YSCALEYUV2PACKED1b adds a level of expansion so that macro arguments
 * are expanded before they are stringified.
 */
560 #define REAL_YSCALEYUV2PACKED1b(index, c) \
561 "xor "#index", "#index" \n\t"\
564 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
565 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
566 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
567 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
568 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
569 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
570 "psrlw $8, %%mm3 \n\t" \
571 "psrlw $8, %%mm4 \n\t" \
572 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
573 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
574 "psraw $7, %%mm1 \n\t" \
575 "psraw $7, %%mm7 \n\t"
576 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
/*
 * REAL_YSCALEYUV2RGB1b(index, c)
 * RGB conversion that takes luma from a single row (%0) and averages
 * the two chroma rows at %2 and %3.
 *   index - register used as the pixel index; zeroed here
 *   c     - register holding the base address for offsets/coefficients
 * The chroma rows are summed with paddw and logically shifted right by
 * 5, i.e. the mean of the two rows shifted right by 4. The sum is a
 * wrapping 16-bit add, which is what the FIXME notes below refer to.
 * Out: mm2 = B, mm4 = G, mm5 = R, 8 unsigned saturated bytes each.
 * Overwrites mm0..mm7.
 * YSCALEYUV2RGB1b adds a level of expansion so that macro arguments are
 * expanded before they are stringified.
 */
578 // do vertical chrominance interpolation
579 #define REAL_YSCALEYUV2RGB1b(index, c) \
580 "xor "#index", "#index" \n\t"\
583 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
584 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
585 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
586 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
587 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
588 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
589 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
590 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
591 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
592 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
593 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
594 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
595 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
596 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
597 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
598 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
599 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
600 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
601 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
602 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
603 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
604 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
605 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
606 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
607 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
608 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
609 "paddw %%mm3, %%mm4 \n\t"\
610 "movq %%mm2, %%mm0 \n\t"\
611 "movq %%mm5, %%mm6 \n\t"\
612 "movq %%mm4, %%mm3 \n\t"\
613 "punpcklwd %%mm2, %%mm2 \n\t"\
614 "punpcklwd %%mm5, %%mm5 \n\t"\
615 "punpcklwd %%mm4, %%mm4 \n\t"\
616 "paddw %%mm1, %%mm2 \n\t"\
617 "paddw %%mm1, %%mm5 \n\t"\
618 "paddw %%mm1, %%mm4 \n\t"\
619 "punpckhwd %%mm0, %%mm0 \n\t"\
620 "punpckhwd %%mm6, %%mm6 \n\t"\
621 "punpckhwd %%mm3, %%mm3 \n\t"\
622 "paddw %%mm7, %%mm0 \n\t"\
623 "paddw %%mm7, %%mm6 \n\t"\
624 "paddw %%mm7, %%mm3 \n\t"\
625 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
626 "packuswb %%mm0, %%mm2 \n\t"\
627 "packuswb %%mm6, %%mm5 \n\t"\
628 "packuswb %%mm3, %%mm4 \n\t"\
630 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * REAL_YSCALEYUV2RGB1_ALPHA(index)
 * Loads eight 16-bit alpha samples from (%1, index, 2), shifts them
 * right by 7 arithmetically and packs them with unsigned saturation, so
 * mm7 ends up holding 8 alpha bytes. mm1 is used as scratch.
 * YSCALEYUV2RGB1_ALPHA adds a level of expansion so that a macro passed
 * as index is expanded before it is stringified.
 */
632 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
633 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
634 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
635 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
636 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
637 "packuswb %%mm1, %%mm7 \n\t"
638 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
/*
 * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
 * Interleaves four planes of 8 bytes into 8 four-byte pixels and stores
 * them with MOVNTQ.
 *   dst        - output base; stores go to (dst, index, 4) plus 0..24
 *   dstw       - operand compared against index after the stores
 *   index      - pixel index register, advanced by 8
 *   b, g, r, a - input registers, 8 bytes per channel; b and r are
 *                overwritten
 *   q0, q2, q3, t - scratch registers
 * Byte order in memory for each pixel is b, g, r, a (lowest address
 * first); the ARGB labels below list bytes from most to least
 * significant. The final cmp leaves the flags set for the caller.
 * WRITEBGR32 adds a level of expansion so that macro arguments are
 * expanded before they are stringified.
 */
640 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
641 "movq "#b", "#q2" \n\t" /* B */\
642 "movq "#r", "#t" \n\t" /* R */\
643 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
644 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
645 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
646 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
647 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
648 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
649 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
650 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
651 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
652 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
654 MOVNTQ( q0, (dst, index, 4))\
655 MOVNTQ( b, 8(dst, index, 4))\
656 MOVNTQ( q2, 16(dst, index, 4))\
657 MOVNTQ( q3, 24(dst, index, 4))\
659 "add $8, "#index" \n\t"\
660 "cmp "#dstw", "#index" \n\t"\
662 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
/*
 * REAL_WRITERGB16(dst, dstw, index)
 * Packs 8 pixels into 16-bit words with 5 bits of blue in bits 0..4,
 * 6 bits of green in bits 5..10 and 5 bits of red in bits 11..15, then
 * stores 16 bytes at (dst, index, 2).
 * In: mm2 = B, mm4 = G, mm5 = R (8 bytes each). mm7 is interleaved as
 * the high byte of every green word, so it is expected to be zero
 * (NOTE(review): confirm that callers clear mm7).
 * The masks at bF8/bFC drop the low 3/2 bits of each byte before the
 * shifts, so no bits cross byte boundaries.
 * Advances index by 8 and compares it with dstw; writes mm1..mm4.
 * WRITERGB16 adds a level of expansion so that macro arguments are
 * expanded before they are stringified.
 */
664 #define REAL_WRITERGB16(dst, dstw, index) \
665 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
666 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
667 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
668 "psrlq $3, %%mm2 \n\t"\
670 "movq %%mm2, %%mm1 \n\t"\
671 "movq %%mm4, %%mm3 \n\t"\
673 "punpcklbw %%mm7, %%mm3 \n\t"\
674 "punpcklbw %%mm5, %%mm2 \n\t"\
675 "punpckhbw %%mm7, %%mm4 \n\t"\
676 "punpckhbw %%mm5, %%mm1 \n\t"\
678 "psllq $3, %%mm3 \n\t"\
679 "psllq $3, %%mm4 \n\t"\
681 "por %%mm3, %%mm2 \n\t"\
682 "por %%mm4, %%mm1 \n\t"\
684 MOVNTQ(%%mm2, (dst, index, 2))\
685 MOVNTQ(%%mm1, 8(dst, index, 2))\
687 "add $8, "#index" \n\t"\
688 "cmp "#dstw", "#index" \n\t"\
690 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/*
 * REAL_WRITERGB15(dst, dstw, index)
 * Packs 8 pixels into 16-bit words with 5 bits per channel: blue in bits
 * 0..4, green in bits 5..9, red in bits 10..14. Stores 16 bytes at
 * (dst, index, 2).
 * In: mm2 = B, mm4 = G, mm5 = R (8 bytes each). mm7 is interleaved as
 * the high byte of every green word, so it is expected to be zero
 * (NOTE(review): confirm that callers clear mm7).
 * All three channels are masked with bF8. Red is additionally shifted
 * right by one so that bit 15 of each output word stays clear.
 * Advances index by 8 and compares it with dstw; writes mm1..mm5.
 * WRITERGB15 adds a level of expansion so that macro arguments are
 * expanded before they are stringified.
 */
692 #define REAL_WRITERGB15(dst, dstw, index) \
693 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
694 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
695 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
696 "psrlq $3, %%mm2 \n\t"\
697 "psrlq $1, %%mm5 \n\t"\
699 "movq %%mm2, %%mm1 \n\t"\
700 "movq %%mm4, %%mm3 \n\t"\
702 "punpcklbw %%mm7, %%mm3 \n\t"\
703 "punpcklbw %%mm5, %%mm2 \n\t"\
704 "punpckhbw %%mm7, %%mm4 \n\t"\
705 "punpckhbw %%mm5, %%mm1 \n\t"\
707 "psllq $2, %%mm3 \n\t"\
708 "psllq $2, %%mm4 \n\t"\
710 "por %%mm3, %%mm2 \n\t"\
711 "por %%mm4, %%mm1 \n\t"\
713 MOVNTQ(%%mm2, (dst, index, 2))\
714 MOVNTQ(%%mm1, 8(dst, index, 2))\
716 "add $8, "#index" \n\t"\
717 "cmp "#dstw", "#index" \n\t"\
719 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/*
 * WRITEBGR24OLD(dst, dstw, index)
 * Packs 8 pixels into 24 bytes of 3-byte pixels using mask-and-shift
 * steps with the constants bm00000111, bm11111000 and bm00001111.
 * In: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * The planes are first expanded to four quadwords of 0RGB0RGB, then the
 * padding byte of every pixel is squeezed out and the pieces are merged
 * into three quadwords stored at (dst), 8(dst) and 16(dst).
 * dst itself is advanced by 24 (it is not indexed), index is advanced by
 * 8 and compared with dstw. Overwrites mm0..mm6.
 * Nothing else in this part of the file refers to this variant; the
 * WRITEBGR24 selector picks the MMX or MMX2 versions.
 */
721 #define WRITEBGR24OLD(dst, dstw, index) \
722 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
723 "movq %%mm2, %%mm1 \n\t" /* B */\
724 "movq %%mm5, %%mm6 \n\t" /* R */\
725 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
726 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
727 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
728 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
729 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
730 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
731 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
732 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
733 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
734 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
736 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
737 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
738 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
739 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
740 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
741 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
742 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
743 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
745 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
746 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
747 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
748 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
749 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
750 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
751 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
752 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
753 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
754 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
755 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
756 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
757 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
759 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
760 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
761 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
762 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
763 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
764 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
765 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
766 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
768 MOVNTQ(%%mm0, (dst))\
769 MOVNTQ(%%mm2, 8(dst))\
770 MOVNTQ(%%mm3, 16(dst))\
771 "add $24, "#dst" \n\t"\
773 "add $8, "#index" \n\t"\
774 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMX(dst, dstw, index)
 * Plain-MMX packer for 8 pixels into 24 bytes of 3-byte pixels, using
 * only shifts and unpacks (no mask constants from memory).
 * In: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * The planes are expanded to four quadwords of 0RGB0RGB. psllq by 40
 * followed by punpckhdq with the unshifted copy turns each into
 * 0RGBRGB0, and the four results are then spliced into three quadwords
 * that are stored as soon as each one is complete.
 * dst itself is advanced by 24, index is advanced by 8 and compared with
 * dstw. Overwrites mm0..mm7, including the zero held in mm7.
 */
777 #define WRITEBGR24MMX(dst, dstw, index) \
778 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
779 "movq %%mm2, %%mm1 \n\t" /* B */\
780 "movq %%mm5, %%mm6 \n\t" /* R */\
781 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
782 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
783 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
784 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
785 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
786 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
787 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
788 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
789 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
790 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
792 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
793 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
794 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
795 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
797 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
798 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
799 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
800 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
802 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
803 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
804 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
805 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
807 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
808 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
809 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
810 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
811 MOVNTQ(%%mm0, (dst))\
813 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
814 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
815 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
816 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
817 MOVNTQ(%%mm6, 8(dst))\
819 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
820 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
821 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
822 MOVNTQ(%%mm5, 16(dst))\
824 "add $24, "#dst" \n\t"\
826 "add $8, "#index" \n\t"\
827 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMX2(dst, dstw, index)
 * MMX2 packer for 8 pixels into 24 bytes of 3-byte pixels, built on
 * pshufw and three byte masks (ff_M24A, ff_M24B, ff_M24C).
 * In: mm2 = B, mm4 = G, mm5 = R (8 bytes each). The header comment
 * inside the macro also lists mm7 = 0, but mm7 is overwritten with
 * ff_M24C on entry, so its incoming value is not used.
 * For each of the three output quadwords, pshufw replicates the source
 * words that hold the needed bytes, a mask keeps only the bytes that
 * belong in that channel's positions, and the three channels are ORed
 * together. mm4 (G) is shifted right by one byte after the first
 * quadword so that green lines up in the later shuffles.
 * dst itself is advanced by 24, index is advanced by 8 and compared with
 * dstw. Overwrites mm0, mm1, mm3, mm4, mm6 and mm7; mm2 and mm5 are
 * only read.
 */
830 #define WRITEBGR24MMX2(dst, dstw, index) \
831 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
832 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
833 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
834 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
835 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
836 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
838 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
839 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
840 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
842 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
843 "por %%mm1, %%mm6 \n\t"\
844 "por %%mm3, %%mm6 \n\t"\
845 MOVNTQ(%%mm6, (dst))\
847 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
848 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
849 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
850 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
852 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
853 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
854 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
856 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
857 "por %%mm3, %%mm6 \n\t"\
858 MOVNTQ(%%mm6, 8(dst))\
860 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
861 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
862 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
864 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
865 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
866 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
868 "por %%mm1, %%mm3 \n\t"\
869 "por %%mm3, %%mm6 \n\t"\
870 MOVNTQ(%%mm6, 16(dst))\
872 "add $24, "#dst" \n\t"\
874 "add $8, "#index" \n\t"\
875 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24 is an alias for the 24-bit writer: WRITEBGR24MMX2 when
 * COMPILE_TEMPLATE_MMX2 is set, with WRITEBGR24MMX as the alternative
 * definition.
 */
878 #if COMPILE_TEMPLATE_MMX2
880 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
883 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/*
 * REAL_WRITEYUY2(dst, dstw, index): packs word samples to bytes and stores
 * 16 interleaved bytes at dst + index*2, then advances index by 8 and
 * compares it with dstw.
 * Inputs are mm1/mm7 (packed together into mm1) and mm3/mm4 (byte-interleaved
 * into mm3) -- presumably luma in mm1/mm7 and U/V in mm3/mm4, as the callers
 * use this for PIX_FMT_YUYV422; confirm against the producing macros.
 * Clobbers mm1, mm3, mm4 and mm7.
 *
 * WRITEYUY2 is a forwarding wrapper, so its arguments are macro-expanded
 * before REAL_WRITEYUY2 stringifies them with '#'.
 */
886 #define REAL_WRITEYUY2(dst, dstw, index) \
887 "packuswb %%mm3, %%mm3 \n\t" /* mm3 words -> unsigned-saturated bytes */\
888 "packuswb %%mm4, %%mm4 \n\t" /* mm4 words -> unsigned-saturated bytes */\
889 "packuswb %%mm7, %%mm1 \n\t" /* mm1 = bytes of mm1 (low half) and mm7 (high half) */\
890 "punpcklbw %%mm4, %%mm3 \n\t" /* interleave mm3 and mm4 bytes */\
891 "movq %%mm1, %%mm7 \n\t" /* keep a copy for the high half */\
892 "punpcklbw %%mm3, %%mm1 \n\t" /* low 4 bytes of mm1 interleaved with mm3 */\
893 "punpckhbw %%mm3, %%mm7 \n\t" /* high 4 bytes interleaved with mm3 */\
895 MOVNTQ(%%mm1, (dst, index, 2))\
896 MOVNTQ(%%mm7, 8(dst, index, 2))\
898 "add $8, "#index" \n\t" /* index += 8 */\
899 "cmp "#dstw", "#index" \n\t" /* loop bound test: index vs. dstw */\
901 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/*
 * Vertical multi-tap filter to planar output (dest, uDest, vDest, aDest).
 *
 * MMX path (COMPILE_TEMPLATE_MMX, SWS_BITEXACT clear): expands
 * YSCALEYUV2YV12X, or YSCALEYUV2YV12X_ACCURATE when SWS_ACCURATE_RND is set,
 * once per plane: U at source offset "0", V at offset VOF (both with the
 * chroma filter and chrDstW), alpha when CONFIG_SWSCALE_ALPHA && aDest, and
 * luma with dstW.
 * Other paths: yuv2yuvX_altivec_real() under COMPILE_TEMPLATE_ALTIVEC (note
 * that call takes no alpha arguments), otherwise yuv2yuvXinC().
 */
904 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
905 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
906 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
908 #if COMPILE_TEMPLATE_MMX
909 if(!(c->flags & SWS_BITEXACT)) {
910 if (c->flags & SWS_ACCURATE_RND) {
/* accurate-rounding variants: U, V, optional alpha, then luma */
912 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
913 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
915 if (CONFIG_SWSCALE_ALPHA && aDest) {
916 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
919 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
/* default-rounding variants, same plane order */
922 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
923 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
925 if (CONFIG_SWSCALE_ALPHA && aDest) {
926 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
929 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
934 #if COMPILE_TEMPLATE_ALTIVEC
935 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
936 chrFilter, chrSrc, chrFilterSize,
937 dest, uDest, vDest, dstW, chrDstW);
938 #else //COMPILE_TEMPLATE_ALTIVEC
939 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
940 chrFilter, chrSrc, chrFilterSize,
941 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
942 #endif //!COMPILE_TEMPLATE_ALTIVEC
/*
 * Vertical filter for NV12-style output: a thin wrapper that forwards every
 * argument except the context c to yuv2nv12XinC().
 */
945 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
946 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
947 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
949 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
950 chrFilter, chrSrc, chrFilterSize,
951 dest, uDest, dstW, chrDstW, dstFormat);
/*
 * Single-line (no vertical filtering) conversion of 16-bit intermediate
 * samples to 8-bit planes.
 *
 * C path: each sample becomes (x + 64) >> 7; chroma V samples are read at
 * chrSrc[i + VOFW]; alpha is clipped with av_clip_uint8().
 * MMX path (SWS_BITEXACT clear): src[], dst[] and counter[] describe four
 * planes in the order alpha, luma, U, V and are indexed by p; the asm gets
 * src[p] and dst[p] + counter[p] (source and destination advanced by the
 * plane width). SWS_ACCURATE_RND selects YSCALEYUV2YV121_ACCURATE.
 */
954 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
955 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
958 #if COMPILE_TEMPLATE_MMX
959 if(!(c->flags & SWS_BITEXACT)) {
/* NOTE(review): src[] is uint8_t* but is initialised from const int16_t*
 * expressions (the additions are done in int16_t units); the element type
 * probably should be const int16_t * -- confirm and fix. */
961 uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
962 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
963 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
965 if (c->flags & SWS_ACCURATE_RND) {
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src[p]), "r" (dst[p] + counter[p]),
981 :: "r" (src[p]), "r" (dst[p] + counter[p]),
/* C fallback: luma plane */
991 for (i=0; i<dstW; i++) {
992 int val= (lumSrc[i]+64)>>7;
/* C fallback: chroma planes, U at chrSrc[i] and V at chrSrc[i + VOFW] */
1003 for (i=0; i<chrDstW; i++) {
1004 int u=(chrSrc[i ]+64)>>7;
1005 int v=(chrSrc[i + VOFW]+64)>>7;
1009 else if (u>255) u=255;
1011 else if (v>255) v=255;
/* C fallback: optional alpha plane */
1018 if (CONFIG_SWSCALE_ALPHA && aDest)
1019 for (i=0; i<dstW; i++) {
1020 int val= (alpSrc[i]+64)>>7;
1021 aDest[i]= av_clip_uint8(val);
1027 * vertical scale YV12 to RGB
/*
 * Vertical multi-tap filter producing one packed output line in
 * c->dstFormat.
 *
 * MMX path (COMPILE_TEMPLATE_MMX, SWS_BITEXACT clear): two switch statements
 * on c->dstFormat, the first for SWS_ACCURATE_RND (YSCALEYUV2PACKEDX_ACCURATE
 * front end) and the second for default rounding. Both cover a 32-bit writer
 * (WRITEBGR32, with filtered alpha when CONFIG_SWSCALE_ALPHA && c->alpPixBuf,
 * otherwise alpha bytes from an all-ones mm7), a 24-bit writer (WRITEBGR24),
 * PIX_FMT_RGB555, PIX_FMT_RGB565 and PIX_FMT_YUYV422.
 * Asm operands visible here: %0 = &c->redDither (base for the *_DITHER,
 * *_TEMP and filter-offset displacements), %4 = dest, %5 = dstW.
 *
 * Other paths: ff_yuv2packedX_altivec() under COMPILE_TEMPLATE_ALTIVEC for the
 * listed formats when not bit-exact and without alpha; yuv2packedXinC().
 */
1029 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1030 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1031 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1033 #if COMPILE_TEMPLATE_MMX
1035 if(!(c->flags & SWS_BITEXACT)) {
1036 if (c->flags & SWS_ACCURATE_RND) {
/* ---- accurate-rounding variants ---- */
1037 switch(c->dstFormat) {
1039 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1040 YSCALEYUV2PACKEDX_ACCURATE
/* spill mm2/mm4/mm5 to the context temporaries while the alpha filter runs */
1042 "movq %%mm2, "U_TEMP"(%0) \n\t"
1043 "movq %%mm4, "V_TEMP"(%0) \n\t"
1044 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1045 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1046 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1047 "psraw $3, %%mm1 \n\t"
1048 "psraw $3, %%mm7 \n\t"
1049 "packuswb %%mm7, %%mm1 \n\t"
1050 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1052 YSCALEYUV2PACKEDX_END
1054 YSCALEYUV2PACKEDX_ACCURATE
/* no alpha plane: mm7 = all ones is passed as the alpha operand */
1056 "pcmpeqd %%mm7, %%mm7 \n\t"
1057 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1059 YSCALEYUV2PACKEDX_END
1063 YSCALEYUV2PACKEDX_ACCURATE
1065 "pxor %%mm7, %%mm7 \n\t"
/* REG_c = dest + 3*index: byte address of the current 24-bit pixel */
1066 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1067 "add %4, %%"REG_c" \n\t"
1068 WRITEBGR24(%%REGc, %5, %%REGa)
1071 :: "r" (&c->redDither),
1072 "m" (dummy), "m" (dummy), "m" (dummy),
1073 "r" (dest), "m" (dstW)
1074 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1077 case PIX_FMT_RGB555:
1078 YSCALEYUV2PACKEDX_ACCURATE
1080 "pxor %%mm7, %%mm7 \n\t"
1081 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* add the per-channel dither values with unsigned byte saturation */
1083 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1084 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1085 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1088 WRITERGB15(%4, %5, %%REGa)
1089 YSCALEYUV2PACKEDX_END
1091 case PIX_FMT_RGB565:
1092 YSCALEYUV2PACKEDX_ACCURATE
1094 "pxor %%mm7, %%mm7 \n\t"
1095 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1097 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1098 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1099 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1102 WRITERGB16(%4, %5, %%REGa)
1103 YSCALEYUV2PACKEDX_END
1105 case PIX_FMT_YUYV422:
1106 YSCALEYUV2PACKEDX_ACCURATE
1107 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* NOTE(review): the register note above looks copied from the RGB cases;
 * this case shifts mm3, mm4, mm1 and mm7 and hands them to WRITEYUY2. */
1109 "psraw $3, %%mm3 \n\t"
1110 "psraw $3, %%mm4 \n\t"
1111 "psraw $3, %%mm1 \n\t"
1112 "psraw $3, %%mm7 \n\t"
1113 WRITEYUY2(%4, %5, %%REGa)
1114 YSCALEYUV2PACKEDX_END
/* ---- default-rounding variants ---- */
1118 switch(c->dstFormat) {
1120 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1123 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1124 "psraw $3, %%mm1 \n\t"
1125 "psraw $3, %%mm7 \n\t"
1126 "packuswb %%mm7, %%mm1 \n\t"
1127 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1128 YSCALEYUV2PACKEDX_END
1132 "pcmpeqd %%mm7, %%mm7 \n\t"
1133 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1134 YSCALEYUV2PACKEDX_END
1140 "pxor %%mm7, %%mm7 \n\t"
/* REG_c = dest + 3*index, as in the accurate-rounding 24-bit case */
1141 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1142 "add %4, %%"REG_c" \n\t"
1143 WRITEBGR24(%%REGc, %5, %%REGa)
1145 :: "r" (&c->redDither),
1146 "m" (dummy), "m" (dummy), "m" (dummy),
1147 "r" (dest), "m" (dstW)
1148 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1151 case PIX_FMT_RGB555:
1154 "pxor %%mm7, %%mm7 \n\t"
1155 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1157 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1158 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1159 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1162 WRITERGB15(%4, %5, %%REGa)
1163 YSCALEYUV2PACKEDX_END
1165 case PIX_FMT_RGB565:
1168 "pxor %%mm7, %%mm7 \n\t"
1169 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1171 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1172 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1173 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1176 WRITERGB16(%4, %5, %%REGa)
1177 YSCALEYUV2PACKEDX_END
1179 case PIX_FMT_YUYV422:
1181 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* NOTE(review): as above, the RGB register note does not match this case;
 * mm3, mm4, mm1 and mm7 are the registers consumed by WRITEYUY2. */
1183 "psraw $3, %%mm3 \n\t"
1184 "psraw $3, %%mm4 \n\t"
1185 "psraw $3, %%mm1 \n\t"
1186 "psraw $3, %%mm7 \n\t"
1187 WRITEYUY2(%4, %5, %%REGa)
1188 YSCALEYUV2PACKEDX_END
1193 #endif /* COMPILE_TEMPLATE_MMX */
1194 #if COMPILE_TEMPLATE_ALTIVEC
1195 /* The following list of supported dstFormat values should
1196 match what's found in the body of ff_yuv2packedX_altivec() */
1197 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1198 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1199 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1200 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1201 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1202 chrFilter, chrSrc, chrFilterSize,
/* generic C implementation */
1206 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1207 chrFilter, chrSrc, chrFilterSize,
1208 alpSrc, dest, dstW, dstY);
1212 * vertical bilinear scale YV12 to RGB
/*
 * Two-line vertical blend to packed output: buf0/buf1 (luma), uvbuf0/uvbuf1
 * (chroma) and abuf0/abuf1 (alpha) are combined using yalpha / uvalpha.
 * yalpha1 and uvalpha1 are the complementary weights, 4095 - alpha.
 *
 * MMX path (COMPILE_TEMPLATE_MMX, SWS_BITEXACT clear), switch on c->dstFormat.
 * Asm operands: %0 = buf0 (REG_c), %1 = buf1 (REG_d), %2 = uvbuf0 (REG_S),
 * %3 = uvbuf1 (REG_D), %4 = dest; %5 is the base register for the
 * ESP_OFFSET / *_TEMP / *_DITHER / 8280 displacements (its operand line is
 * not part of this excerpt). Most variants save REG_b to ESP_OFFSET(%5), load
 * dest into REG_b, push REG_BP so the macros can use it as the pixel index,
 * and restore both registers afterwards.
 *
 * The C fallback is generated by YSCALE_YUV_2_ANYRGB_C at the end.
 */
1214 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1215 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1217 int yalpha1=4095- yalpha;
1218 int uvalpha1=4095-uvalpha;
1221 #if COMPILE_TEMPLATE_MMX
1222 if(!(c->flags & SWS_BITEXACT)) {
1223 switch(c->dstFormat) {
1224 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1226 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
/* variant that takes abuf0/abuf1 as two extra register operands (%6, %7) */
1229 YSCALEYUV2RGB(%%REGBP, %5)
1230 YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1231 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1232 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1233 "packuswb %%mm7, %%mm1 \n\t"
1234 WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1236 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1238 ,"r" (abuf0), "r" (abuf1)
/* variant without spare registers: the alpha line pointers are parked in
 * c->u_temp / c->v_temp and reloaded into %0 / %1 inside the asm */
1242 *(uint16_t **)(&c->u_temp)=abuf0;
1243 *(uint16_t **)(&c->v_temp)=abuf1;
1245 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1246 "mov %4, %%"REG_b" \n\t"
1247 "push %%"REG_BP" \n\t"
1248 YSCALEYUV2RGB(%%REGBP, %5)
1251 "mov "U_TEMP"(%5), %0 \n\t"
1252 "mov "V_TEMP"(%5), %1 \n\t"
1253 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1254 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1255 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1256 "packuswb %%mm7, %%mm1 \n\t"
1259 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1260 "pop %%"REG_BP" \n\t"
1261 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1263 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* 32-bit output without an alpha plane: alpha operand mm7 = all ones */
1269 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1270 "mov %4, %%"REG_b" \n\t"
1271 "push %%"REG_BP" \n\t"
1272 YSCALEYUV2RGB(%%REGBP, %5)
1273 "pcmpeqd %%mm7, %%mm7 \n\t"
1274 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1275 "pop %%"REG_BP" \n\t"
1276 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1278 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* 24-bit output */
1285 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1286 "mov %4, %%"REG_b" \n\t"
1287 "push %%"REG_BP" \n\t"
1288 YSCALEYUV2RGB(%%REGBP, %5)
1289 "pxor %%mm7, %%mm7 \n\t"
1290 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1291 "pop %%"REG_BP" \n\t"
1292 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1293 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1297 case PIX_FMT_RGB555:
1299 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1300 "mov %4, %%"REG_b" \n\t"
1301 "push %%"REG_BP" \n\t"
1302 YSCALEYUV2RGB(%%REGBP, %5)
1303 "pxor %%mm7, %%mm7 \n\t"
1304 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* add the per-channel dither values with unsigned byte saturation */
1306 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1307 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1308 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1311 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1312 "pop %%"REG_BP" \n\t"
1313 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1315 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1319 case PIX_FMT_RGB565:
1321 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1322 "mov %4, %%"REG_b" \n\t"
1323 "push %%"REG_BP" \n\t"
1324 YSCALEYUV2RGB(%%REGBP, %5)
1325 "pxor %%mm7, %%mm7 \n\t"
1326 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1328 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1329 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1330 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1333 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1334 "pop %%"REG_BP" \n\t"
1335 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1336 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1340 case PIX_FMT_YUYV422:
1342 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1343 "mov %4, %%"REG_b" \n\t"
1344 "push %%"REG_BP" \n\t"
1345 YSCALEYUV2PACKED(%%REGBP, %5)
1346 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1347 "pop %%"REG_BP" \n\t"
1348 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1349 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1356 #endif //COMPILE_TEMPLATE_MMX
/* C fallback for every format */
1357 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1361 * YV12 to RGB without scaling or interpolating
/*
 * Single luma line (buf0) to packed output, no vertical luma blend.
 *
 * - With SWS_FULL_CHR_H_INT in flags the work is delegated to c->yuv2packed2
 *   with buf0 / abuf0 passed as both lines and a yalpha of 0.
 * - MMX path (COMPILE_TEMPLATE_MMX, SWS_BITEXACT clear): uvalpha < 2048 uses
 *   the YSCALEYUV2RGB1 / YSCALEYUV2PACKED1 macros, the other half uses the
 *   ...1b macros. Each half handles a 32-bit writer (with or without alpha),
 *   a 24-bit writer, PIX_FMT_RGB555, PIX_FMT_RGB565 and PIX_FMT_YUYV422.
 *   Operands: %0 = buf0 (REG_c), %1 = buf1 or, in the alpha variants, abuf0
 *   (REG_d), %2 = uvbuf0 (REG_S), %3 = uvbuf1 (REG_D), %4 = dest; %5 is the
 *   base for the ESP_OFFSET / *_DITHER / 8280 displacements (its operand line
 *   is not part of this excerpt). REG_b is saved to ESP_OFFSET(%5) and holds
 *   dest, REG_BP is pushed and used as the pixel index; both are restored.
 * - C fallback: YSCALE_YUV_2_ANYRGB_C with the RGB1 / RGB1B macro sets, again
 *   split on uvalpha < 2048.
 *
 * yalpha1, buf1 and yalpha are fixed locals kept for the shared macros.
 */
1363 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1364 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1366 const int yalpha1=0;
1369 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1370 const int yalpha= 4096; //FIXME ...
1372 if (flags&SWS_FULL_CHR_H_INT) {
1373 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1377 #if COMPILE_TEMPLATE_MMX
1378 if(!(flags & SWS_BITEXACT)) {
1379 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
/* ---- uvalpha < 2048: YSCALEYUV2RGB1 / YSCALEYUV2PACKED1 variants ---- */
1382 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1384 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1385 "mov %4, %%"REG_b" \n\t"
1386 "push %%"REG_BP" \n\t"
1387 YSCALEYUV2RGB1(%%REGBP, %5)
1388 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1389 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1390 "pop %%"REG_BP" \n\t"
1391 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
/* alpha variant: REG_d carries abuf0 instead of buf1 */
1393 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1398 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1399 "mov %4, %%"REG_b" \n\t"
1400 "push %%"REG_BP" \n\t"
1401 YSCALEYUV2RGB1(%%REGBP, %5)
/* no alpha plane: alpha operand mm7 = all ones */
1402 "pcmpeqd %%mm7, %%mm7 \n\t"
1403 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1404 "pop %%"REG_BP" \n\t"
1405 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1407 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* 24-bit output */
1414 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1415 "mov %4, %%"REG_b" \n\t"
1416 "push %%"REG_BP" \n\t"
1417 YSCALEYUV2RGB1(%%REGBP, %5)
1418 "pxor %%mm7, %%mm7 \n\t"
1419 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1420 "pop %%"REG_BP" \n\t"
1421 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1423 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1427 case PIX_FMT_RGB555:
1429 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1430 "mov %4, %%"REG_b" \n\t"
1431 "push %%"REG_BP" \n\t"
1432 YSCALEYUV2RGB1(%%REGBP, %5)
1433 "pxor %%mm7, %%mm7 \n\t"
1434 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* add the per-channel dither values with unsigned byte saturation */
1436 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1437 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1438 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1440 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1441 "pop %%"REG_BP" \n\t"
1442 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1444 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1448 case PIX_FMT_RGB565:
1450 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1451 "mov %4, %%"REG_b" \n\t"
1452 "push %%"REG_BP" \n\t"
1453 YSCALEYUV2RGB1(%%REGBP, %5)
1454 "pxor %%mm7, %%mm7 \n\t"
1455 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1457 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1458 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1459 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1462 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1463 "pop %%"REG_BP" \n\t"
1464 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1466 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1470 case PIX_FMT_YUYV422:
1472 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1473 "mov %4, %%"REG_b" \n\t"
1474 "push %%"REG_BP" \n\t"
1475 YSCALEYUV2PACKED1(%%REGBP, %5)
1476 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1477 "pop %%"REG_BP" \n\t"
1478 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1480 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* ---- other half of the uvalpha split: ...1b macro variants ---- */
1488 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1490 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1491 "mov %4, %%"REG_b" \n\t"
1492 "push %%"REG_BP" \n\t"
1493 YSCALEYUV2RGB1b(%%REGBP, %5)
1494 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1495 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1496 "pop %%"REG_BP" \n\t"
1497 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1499 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1504 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1505 "mov %4, %%"REG_b" \n\t"
1506 "push %%"REG_BP" \n\t"
1507 YSCALEYUV2RGB1b(%%REGBP, %5)
1508 "pcmpeqd %%mm7, %%mm7 \n\t"
1509 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1510 "pop %%"REG_BP" \n\t"
1511 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1513 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1520 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1521 "mov %4, %%"REG_b" \n\t"
1522 "push %%"REG_BP" \n\t"
1523 YSCALEYUV2RGB1b(%%REGBP, %5)
1524 "pxor %%mm7, %%mm7 \n\t"
1525 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1526 "pop %%"REG_BP" \n\t"
1527 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1529 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1533 case PIX_FMT_RGB555:
1535 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1536 "mov %4, %%"REG_b" \n\t"
1537 "push %%"REG_BP" \n\t"
1538 YSCALEYUV2RGB1b(%%REGBP, %5)
1539 "pxor %%mm7, %%mm7 \n\t"
1540 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1542 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1543 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1544 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1546 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1547 "pop %%"REG_BP" \n\t"
1548 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1550 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1554 case PIX_FMT_RGB565:
1556 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1557 "mov %4, %%"REG_b" \n\t"
1558 "push %%"REG_BP" \n\t"
1559 YSCALEYUV2RGB1b(%%REGBP, %5)
1560 "pxor %%mm7, %%mm7 \n\t"
1561 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1563 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1564 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1565 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1568 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1569 "pop %%"REG_BP" \n\t"
1570 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1572 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1576 case PIX_FMT_YUYV422:
1578 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1579 "mov %4, %%"REG_b" \n\t"
1580 "push %%"REG_BP" \n\t"
1581 YSCALEYUV2PACKED1b(%%REGBP, %5)
1582 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1583 "pop %%"REG_BP" \n\t"
1584 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1586 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1593 #endif /* COMPILE_TEMPLATE_MMX */
/* C fallback, with the same uvalpha split as the MMX path */
1594 if (uvalpha < 2048) {
1595 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1597 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1601 //FIXME yuy2* can read up to 7 samples too much
/*
 * Extract the luma plane from packed YUY2 input: width output bytes in dst,
 * taken from 2*width input bytes in src.
 *
 * MMX path: REG_a runs upward from -width (operand %0); src and dst are
 * passed pre-advanced to their ends (src+width*2, dst+width). Each pass loads
 * 16 bytes, masks every 16-bit word with bm01010101 (presumably 0x00FF per
 * word, i.e. keeps the even bytes -- confirm against its definition), packs
 * to 8 bytes and stores them. Data is handled in whole 8-sample groups.
 * A C loop over i in [0, width) is the non-MMX fallback.
 */
1603 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1605 #if COMPILE_TEMPLATE_MMX
1607 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1608 "mov %0, %%"REG_a" \n\t"
1610 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1611 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1612 "pand %%mm2, %%mm0 \n\t"
1613 "pand %%mm2, %%mm1 \n\t"
1614 "packuswb %%mm1, %%mm0 \n\t"
1615 "movq %%mm0, (%2, %%"REG_a") \n\t"
1616 "add $8, %%"REG_a" \n\t"
1618 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1623 for (i=0; i<width; i++)
/*
 * Extract the chroma planes from packed YUY2 input: for each of width output
 * samples, dstU[i] = src1[4*i + 1] and dstV[i] = src1[4*i + 3] (C fallback).
 * src2 is unused apart from the trailing assert(src1 == src2).
 *
 * MMX path: REG_a runs upward from -width; src1, dstU and dstV are passed
 * pre-advanced to their ends. Each pass reads 16 bytes, keeps the high byte
 * of every word (psrlw 8 + pack), then separates that stream into the
 * bytes selected by the bm01010101 mask (stored via %2 = dstU) and the
 * remaining ones (psrlw 8, stored via %3 = dstV), 4 bytes each.
 */
1628 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1630 #if COMPILE_TEMPLATE_MMX
1632 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1633 "mov %0, %%"REG_a" \n\t"
1635 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1636 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1637 "psrlw $8, %%mm0 \n\t"
1638 "psrlw $8, %%mm1 \n\t"
1639 "packuswb %%mm1, %%mm0 \n\t"
1640 "movq %%mm0, %%mm1 \n\t"
1641 "psrlw $8, %%mm0 \n\t"
1642 "pand %%mm4, %%mm1 \n\t"
1643 "packuswb %%mm0, %%mm0 \n\t"
1644 "packuswb %%mm1, %%mm1 \n\t"
1645 "movd %%mm0, (%3, %%"REG_a") \n\t"
1646 "movd %%mm1, (%2, %%"REG_a") \n\t"
1647 "add $4, %%"REG_a" \n\t"
1649 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1654 for (i=0; i<width; i++) {
1655 dstU[i]= src1[4*i + 1];
1656 dstV[i]= src1[4*i + 3];
/* both source pointers are expected to be the same packed line */
1659 assert(src1 == src2);
/*
 * Reduce two planes of 2-byte samples to 8 bits by keeping the second byte
 * of every pair: dstU[i] = src1[2*i + 1], dstV[i] = src2[2*i + 1]
 * (C fallback). Presumably used for little-endian 16-bit chroma, where that
 * byte is the most significant one.
 *
 * MMX path: REG_a runs upward from -width; all four pointers are passed
 * pre-advanced to their ends. Each pass reads 16 bytes from each source,
 * shifts every word right by 8, packs, and stores 8 bytes to each plane.
 */
1662 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1664 #if COMPILE_TEMPLATE_MMX
1666 "mov %0, %%"REG_a" \n\t"
1668 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1669 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1670 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1671 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1672 "psrlw $8, %%mm0 \n\t"
1673 "psrlw $8, %%mm1 \n\t"
1674 "psrlw $8, %%mm2 \n\t"
1675 "psrlw $8, %%mm3 \n\t"
1676 "packuswb %%mm1, %%mm0 \n\t"
1677 "packuswb %%mm3, %%mm2 \n\t"
1678 "movq %%mm0, (%3, %%"REG_a") \n\t"
1679 "movq %%mm2, (%4, %%"REG_a") \n\t"
1680 "add $8, %%"REG_a" \n\t"
1682 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1687 for (i=0; i<width; i++) {
1688 dstU[i]= src1[2*i + 1];
1689 dstV[i]= src2[2*i + 1];
1694 /* This is almost identical to the previous, and exists only because
1695 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/*
 * Extract the luma plane from packed UYVY input: width output bytes in dst
 * from 2*width input bytes in src.
 *
 * MMX path: REG_a runs upward from -width; src and dst are passed
 * pre-advanced to their ends. Each pass loads 16 bytes, keeps the high byte
 * of every 16-bit word (psrlw 8), packs to 8 bytes and stores them.
 * A C loop over i in [0, width) is the non-MMX fallback.
 */
1696 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1698 #if COMPILE_TEMPLATE_MMX
1700 "mov %0, %%"REG_a" \n\t"
1702 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1703 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1704 "psrlw $8, %%mm0 \n\t"
1705 "psrlw $8, %%mm1 \n\t"
1706 "packuswb %%mm1, %%mm0 \n\t"
1707 "movq %%mm0, (%2, %%"REG_a") \n\t"
1708 "add $8, %%"REG_a" \n\t"
1710 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1715 for (i=0; i<width; i++)
/*
 * Extract the chroma planes from packed UYVY input: for each of width output
 * samples, dstU[i] = src1[4*i + 0] and dstV[i] = src1[4*i + 2] (C fallback).
 * src2 is unused apart from the trailing assert(src1 == src2).
 *
 * MMX path: same structure as the YUY2 chroma reader, except that the first
 * step masks every word with bm01010101 instead of shifting, so the other
 * byte of each pair is kept. 4 bytes go to each plane per pass (%2 = dstU,
 * %3 = dstV).
 */
1720 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1722 #if COMPILE_TEMPLATE_MMX
1724 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1725 "mov %0, %%"REG_a" \n\t"
1727 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1728 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1729 "pand %%mm4, %%mm0 \n\t"
1730 "pand %%mm4, %%mm1 \n\t"
1731 "packuswb %%mm1, %%mm0 \n\t"
1732 "movq %%mm0, %%mm1 \n\t"
1733 "psrlw $8, %%mm0 \n\t"
1734 "pand %%mm4, %%mm1 \n\t"
1735 "packuswb %%mm0, %%mm0 \n\t"
1736 "packuswb %%mm1, %%mm1 \n\t"
1737 "movd %%mm0, (%3, %%"REG_a") \n\t"
1738 "movd %%mm1, (%2, %%"REG_a") \n\t"
1739 "add $4, %%"REG_a" \n\t"
1741 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1746 for (i=0; i<width; i++) {
1747 dstU[i]= src1[4*i + 0];
1748 dstV[i]= src1[4*i + 2];
/* both source pointers are expected to be the same packed line */
1751 assert(src1 == src2);
/*
 * Counterpart of the LE reader for two planes of 2-byte samples: the MMX
 * path masks every 16-bit word with bm01010101 rather than shifting it, so
 * the other byte of each pair is kept (presumably the first byte in memory,
 * the most significant one for big-endian data -- confirm against the mask
 * definition).
 *
 * MMX path: REG_a runs upward from -width; all four pointers are passed
 * pre-advanced to their ends. Each pass reads 16 bytes from each source and
 * stores 8 bytes to each of dstU (%3) and dstV (%4).
 * A C loop over i in [0, width) is the non-MMX fallback.
 */
1754 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1756 #if COMPILE_TEMPLATE_MMX
1758 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1759 "mov %0, %%"REG_a" \n\t"
1761 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1762 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1763 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1764 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1765 "pand %%mm4, %%mm0 \n\t"
1766 "pand %%mm4, %%mm1 \n\t"
1767 "pand %%mm4, %%mm2 \n\t"
1768 "pand %%mm4, %%mm3 \n\t"
1769 "packuswb %%mm1, %%mm0 \n\t"
1770 "packuswb %%mm3, %%mm2 \n\t"
1771 "movq %%mm0, (%3, %%"REG_a") \n\t"
1772 "movq %%mm2, (%4, %%"REG_a") \n\t"
1773 "add $8, %%"REG_a" \n\t"
1775 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1780 for (i=0; i<width; i++) {
1787 #if COMPILE_TEMPLATE_MMX
/*
 * MMX luma conversion for packed 24-bit input. srcFormat picks the
 * coefficient pair: ff_bgr24toY1Coeff / ff_bgr24toY2Coeff for PIX_FMT_BGR24,
 * ff_rgb24toY1Coeff / ff_rgb24toY2Coeff otherwise; they stay in mm5 / mm6
 * for the main loop.
 *
 * Main loop: REG_a runs upward from -width (operand %2), dst is passed
 * pre-advanced (dst+width, operand %1) and %0 addresses the source. Each
 * pass loads dwords at byte offsets 0, 2, 6 and 8, widens them to words,
 * multiply-accumulates with the two coefficient registers, adds
 * ff_bgr24toYOffset, shifts right by 15 and stores 4 output bytes.
 */
1788 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat)
1791 if(srcFormat == PIX_FMT_BGR24) {
1793 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1794 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1799 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1800 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1806 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1807 "mov %2, %%"REG_a" \n\t"
1808 "pxor %%mm7, %%mm7 \n\t"
1810 PREFETCH" 64(%0) \n\t"
1811 "movd (%0), %%mm0 \n\t"
1812 "movd 2(%0), %%mm1 \n\t"
1813 "movd 6(%0), %%mm2 \n\t"
1814 "movd 8(%0), %%mm3 \n\t"
/* widen bytes to words using the zero in mm7 */
1816 "punpcklbw %%mm7, %%mm0 \n\t"
1817 "punpcklbw %%mm7, %%mm1 \n\t"
1818 "punpcklbw %%mm7, %%mm2 \n\t"
1819 "punpcklbw %%mm7, %%mm3 \n\t"
1820 "pmaddwd %%mm5, %%mm0 \n\t"
1821 "pmaddwd %%mm6, %%mm1 \n\t"
1822 "pmaddwd %%mm5, %%mm2 \n\t"
1823 "pmaddwd %%mm6, %%mm3 \n\t"
1824 "paddd %%mm1, %%mm0 \n\t"
1825 "paddd %%mm3, %%mm2 \n\t"
/* add the offset constant, scale down by 2^15, pack to bytes */
1826 "paddd %%mm4, %%mm0 \n\t"
1827 "paddd %%mm4, %%mm2 \n\t"
1828 "psrad $15, %%mm0 \n\t"
1829 "psrad $15, %%mm2 \n\t"
1830 "packssdw %%mm2, %%mm0 \n\t"
1831 "packuswb %%mm0, %%mm0 \n\t"
1832 "movd %%mm0, (%1, %%"REG_a") \n\t"
1833 "add $4, %%"REG_a" \n\t"
1836 : "r" (dst+width), "g" ((x86_reg)-width)
/*
 * MMX chroma conversion for packed 24-bit input. The coefficient table is
 * the memory operand %4 = ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0], read
 * at byte offsets 0, 8, 16 and 24 (the last one cached in mm6).
 *
 * REG_a runs upward from -width (operand %3); dstU / dstV are passed
 * pre-advanced (operands %1 / %2) and %0 addresses the source. Each pass
 * processes the dwords at byte offsets 0, 2, 6 and 8: offsets 0/8 of the
 * table feed the results stored via %1 (dstU) and offsets 16/24 those stored
 * via %2 (dstV). After adding ff_bgr24toUVOffset and shifting right by 15,
 * 4 bytes are written to each plane.
 */
1841 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, int srcFormat)
1844 "movq 24+%4, %%mm6 \n\t"
1845 "mov %3, %%"REG_a" \n\t"
1846 "pxor %%mm7, %%mm7 \n\t"
1848 PREFETCH" 64(%0) \n\t"
/* first pair of source dwords (byte offsets 0 and 2) */
1849 "movd (%0), %%mm0 \n\t"
1850 "movd 2(%0), %%mm1 \n\t"
1851 "punpcklbw %%mm7, %%mm0 \n\t"
1852 "punpcklbw %%mm7, %%mm1 \n\t"
1853 "movq %%mm0, %%mm2 \n\t"
1854 "movq %%mm1, %%mm3 \n\t"
1855 "pmaddwd %4, %%mm0 \n\t"
1856 "pmaddwd 8+%4, %%mm1 \n\t"
1857 "pmaddwd 16+%4, %%mm2 \n\t"
1858 "pmaddwd %%mm6, %%mm3 \n\t"
1859 "paddd %%mm1, %%mm0 \n\t"
1860 "paddd %%mm3, %%mm2 \n\t"
/* second pair of source dwords (byte offsets 6 and 8) */
1862 "movd 6(%0), %%mm1 \n\t"
1863 "movd 8(%0), %%mm3 \n\t"
1865 "punpcklbw %%mm7, %%mm1 \n\t"
1866 "punpcklbw %%mm7, %%mm3 \n\t"
1867 "movq %%mm1, %%mm4 \n\t"
1868 "movq %%mm3, %%mm5 \n\t"
1869 "pmaddwd %4, %%mm1 \n\t"
1870 "pmaddwd 8+%4, %%mm3 \n\t"
1871 "pmaddwd 16+%4, %%mm4 \n\t"
1872 "pmaddwd %%mm6, %%mm5 \n\t"
1873 "paddd %%mm3, %%mm1 \n\t"
1874 "paddd %%mm5, %%mm4 \n\t"
/* add the offset constant, scale down by 2^15, pack to bytes */
1876 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1877 "paddd %%mm3, %%mm0 \n\t"
1878 "paddd %%mm3, %%mm2 \n\t"
1879 "paddd %%mm3, %%mm1 \n\t"
1880 "paddd %%mm3, %%mm4 \n\t"
1881 "psrad $15, %%mm0 \n\t"
1882 "psrad $15, %%mm2 \n\t"
1883 "psrad $15, %%mm1 \n\t"
1884 "psrad $15, %%mm4 \n\t"
1885 "packssdw %%mm1, %%mm0 \n\t"
1886 "packssdw %%mm4, %%mm2 \n\t"
1887 "packuswb %%mm0, %%mm0 \n\t"
1888 "packuswb %%mm2, %%mm2 \n\t"
1889 "movd %%mm0, (%1, %%"REG_a") \n\t"
1890 "movd %%mm2, (%2, %%"REG_a") \n\t"
1891 "add $4, %%"REG_a" \n\t"
1894 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
/*
 * Luma from packed BGR24. With COMPILE_TEMPLATE_MMX this forwards to the
 * bgr24ToY_mmx helper with PIX_FMT_BGR24; the C loop computes
 * (RY*r + GY*g + BY*b + (33 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT
 * for each of the width pixels.
 */
1900 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1902 #if COMPILE_TEMPLATE_MMX
1903 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1906 for (i=0; i<width; i++) {
1911 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1913 #endif /* COMPILE_TEMPLATE_MMX */
/*
 * Chroma from packed BGR24, one U/V sample per input pixel. With
 * COMPILE_TEMPLATE_MMX this forwards to the bgr24ToUV_mmx helper with
 * PIX_FMT_BGR24; the C loop reads b, g, r from src1[3*i + 0..2] and uses the
 * rounding constant 257 << (RGB2YUV_SHIFT-1).
 * src2 is only checked by the trailing assert(src1 == src2).
 */
1916 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1918 #if COMPILE_TEMPLATE_MMX
1919 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1922 for (i=0; i<width; i++) {
1923 int b= src1[3*i + 0];
1924 int g= src1[3*i + 1];
1925 int r= src1[3*i + 2];
1927 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1928 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1930 #endif /* COMPILE_TEMPLATE_MMX */
1931 assert(src1 == src2);
/*
 * Chroma from packed BGR24 at half horizontal resolution: each output sample
 * uses the sum of two adjacent input pixels (bytes 6*i + 0..2 plus
 * 6*i + 3..5), so the rounding constant and the shift are one bit larger
 * than in the full-resolution version (257 << RGB2YUV_SHIFT, shift by
 * RGB2YUV_SHIFT+1). width counts output samples.
 * src2 is only checked by the trailing assert(src1 == src2).
 */
1934 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1937 for (i=0; i<width; i++) {
1938 int b= src1[6*i + 0] + src1[6*i + 3];
1939 int g= src1[6*i + 1] + src1[6*i + 4];
1940 int r= src1[6*i + 2] + src1[6*i + 5];
1942 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1943 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1945 assert(src1 == src2);
/*
 * Luma from packed RGB24. With COMPILE_TEMPLATE_MMX this forwards to the
 * shared bgr24ToY_mmx helper with PIX_FMT_RGB24 (which selects the rgb24
 * coefficient set there); the C loop computes
 * (RY*r + GY*g + BY*b + (33 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT.
 */
1948 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1950 #if COMPILE_TEMPLATE_MMX
1951 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1954 for (i=0; i<width; i++) {
1959 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/*
 * Chroma from packed RGB24, one U/V sample per input pixel. With
 * COMPILE_TEMPLATE_MMX this forwards to the shared bgr24ToUV_mmx helper with
 * PIX_FMT_RGB24; the C loop reads r, g, b from src1[3*i + 0..2] and uses the
 * rounding constant 257 << (RGB2YUV_SHIFT-1). src2 is not read in the lines
 * shown here.
 */
1964 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1966 #if COMPILE_TEMPLATE_MMX
1968 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1972 for (i=0; i<width; i++) {
1973 int r= src1[3*i + 0];
1974 int g= src1[3*i + 1];
1975 int b= src1[3*i + 2];
1977 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1978 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
/*
 * Chroma from packed RGB24 at half horizontal resolution: each output sample
 * uses the sum of two adjacent input pixels (bytes 6*i + 0..2 plus
 * 6*i + 3..5), with the rounding constant 257 << RGB2YUV_SHIFT and a shift
 * of RGB2YUV_SHIFT+1 to account for the doubled sums. width counts output
 * samples.
 */
1983 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1987 for (i=0; i<width; i++) {
1988 int r= src1[6*i + 0] + src1[6*i + 3];
1989 int g= src1[6*i + 1] + src1[6*i + 4];
1990 int b= src1[6*i + 2] + src1[6*i + 5];
1992 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1993 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1998 // bilinear / bicubic scaling
1999 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2000 const int16_t *filter, const int16_t *filterPos, long filterSize)
2002 #if COMPILE_TEMPLATE_MMX
2003 assert(filterSize % 4 == 0 && filterSize>0);
2004 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
2005 x86_reg counter= -2*dstW;
2007 filterPos-= counter/2;
2011 "push %%"REG_b" \n\t"
2013 "pxor %%mm7, %%mm7 \n\t"
2014 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2015 "mov %%"REG_a", %%"REG_BP" \n\t"
2018 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2019 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2020 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2021 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2022 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2023 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2024 "punpcklbw %%mm7, %%mm0 \n\t"
2025 "punpcklbw %%mm7, %%mm2 \n\t"
2026 "pmaddwd %%mm1, %%mm0 \n\t"
2027 "pmaddwd %%mm2, %%mm3 \n\t"
2028 "movq %%mm0, %%mm4 \n\t"
2029 "punpckldq %%mm3, %%mm0 \n\t"
2030 "punpckhdq %%mm3, %%mm4 \n\t"
2031 "paddd %%mm4, %%mm0 \n\t"
2032 "psrad $7, %%mm0 \n\t"
2033 "packssdw %%mm0, %%mm0 \n\t"
2034 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2035 "add $4, %%"REG_BP" \n\t"
2038 "pop %%"REG_BP" \n\t"
2040 "pop %%"REG_b" \n\t"
2043 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2048 } else if (filterSize==8) {
2049 x86_reg counter= -2*dstW;
2051 filterPos-= counter/2;
2055 "push %%"REG_b" \n\t"
2057 "pxor %%mm7, %%mm7 \n\t"
2058 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2059 "mov %%"REG_a", %%"REG_BP" \n\t"
2062 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2063 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2064 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2065 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2066 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2067 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2068 "punpcklbw %%mm7, %%mm0 \n\t"
2069 "punpcklbw %%mm7, %%mm2 \n\t"
2070 "pmaddwd %%mm1, %%mm0 \n\t"
2071 "pmaddwd %%mm2, %%mm3 \n\t"
2073 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2074 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2075 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2076 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2077 "punpcklbw %%mm7, %%mm4 \n\t"
2078 "punpcklbw %%mm7, %%mm2 \n\t"
2079 "pmaddwd %%mm1, %%mm4 \n\t"
2080 "pmaddwd %%mm2, %%mm5 \n\t"
2081 "paddd %%mm4, %%mm0 \n\t"
2082 "paddd %%mm5, %%mm3 \n\t"
2083 "movq %%mm0, %%mm4 \n\t"
2084 "punpckldq %%mm3, %%mm0 \n\t"
2085 "punpckhdq %%mm3, %%mm4 \n\t"
2086 "paddd %%mm4, %%mm0 \n\t"
2087 "psrad $7, %%mm0 \n\t"
2088 "packssdw %%mm0, %%mm0 \n\t"
2089 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2090 "add $4, %%"REG_BP" \n\t"
2093 "pop %%"REG_BP" \n\t"
2095 "pop %%"REG_b" \n\t"
2098 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2104 uint8_t *offset = src+filterSize;
2105 x86_reg counter= -2*dstW;
2106 //filter-= counter*filterSize/2;
2107 filterPos-= counter/2;
2110 "pxor %%mm7, %%mm7 \n\t"
2113 "mov %2, %%"REG_c" \n\t"
2114 "movzwl (%%"REG_c", %0), %%eax \n\t"
2115 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2116 "mov %5, %%"REG_c" \n\t"
2117 "pxor %%mm4, %%mm4 \n\t"
2118 "pxor %%mm5, %%mm5 \n\t"
2120 "movq (%1), %%mm1 \n\t"
2121 "movq (%1, %6), %%mm3 \n\t"
2122 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2123 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2124 "punpcklbw %%mm7, %%mm0 \n\t"
2125 "punpcklbw %%mm7, %%mm2 \n\t"
2126 "pmaddwd %%mm1, %%mm0 \n\t"
2127 "pmaddwd %%mm2, %%mm3 \n\t"
2128 "paddd %%mm3, %%mm5 \n\t"
2129 "paddd %%mm0, %%mm4 \n\t"
2131 "add $4, %%"REG_c" \n\t"
2132 "cmp %4, %%"REG_c" \n\t"
2135 "movq %%mm4, %%mm0 \n\t"
2136 "punpckldq %%mm5, %%mm4 \n\t"
2137 "punpckhdq %%mm5, %%mm0 \n\t"
2138 "paddd %%mm0, %%mm4 \n\t"
2139 "psrad $7, %%mm4 \n\t"
2140 "packssdw %%mm4, %%mm4 \n\t"
2141 "mov %3, %%"REG_a" \n\t"
2142 "movd %%mm4, (%%"REG_a", %0) \n\t"
2146 : "+r" (counter), "+r" (filter)
2147 : "m" (filterPos), "m" (dst), "m"(offset),
2148 "m" (src), "r" ((x86_reg)filterSize*2)
2149 : "%"REG_a, "%"REG_c, "%"REG_d
2153 #if COMPILE_TEMPLATE_ALTIVEC
2154 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2157 for (i=0; i<dstW; i++) {
2159 int srcPos= filterPos[i];
2161 //printf("filterPos: %d\n", filterPos[i]);
2162 for (j=0; j<filterSize; j++) {
2163 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2164 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2166 //filter += hFilterSize;
2167 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2170 #endif /* COMPILE_ALTIVEC */
2171 #endif /* COMPILE_MMX */
/*
 * FAST_BILINEAR_X86: inline-asm string fragment performing one bilinear
 * interpolation step in integer registers.
 *
 * Expected on entry (per the existing per-line comments):
 *   edi = src[xx], esi = src[xx+1], ecx = xalpha
 * On exit:
 *   esi   = ((src[xx+1] - src[xx]) * xalpha + (src[xx] << 16)) >> 9
 *   REG_D = operand %1 of the enclosing asm statement (edi is overwritten)
 *
 * NOTE(review): the expansion sites are not visible in this excerpt; it is
 * presumably used by the plain x86 fast-bilinear loops further down - confirm.
 */
2174 #define FAST_BILINEAR_X86 \
2175 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2176 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2177 "shll $16, %%edi \n\t" /* src[xx] << 16 */ \
2178 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2179 "mov %1, %%"REG_D"\n\t" /* reload REG_D from operand %1 (edi was used as scratch) */ \
2180 "shrl $9, %%esi \n\t" /* drop 9 fractional bits of the sum */ \
/*
 * Plain C fast bilinear horizontal scaler for one luma line.
 *
 * For every output sample i in [0, dstWidth):
 *   xx     = xpos >> 16            (integer part of the source position)
 *   xalpha = (xpos & 0xFFFF) >> 9  (top 7 bits of the fractional part)
 *   dst[i] = src[xx]*128 + (src[xx+1] - src[xx]) * xalpha
 * Note that src[xx+1] is read, i.e. one sample past the integer position.
 *
 * NOTE(review): the statement advancing xpos and the tail of the parameter
 * list are not visible in this excerpt; xpos presumably advances by xInc per
 * output sample - confirm against the full file.
 */
2182 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2183 int dstWidth, const uint8_t *src, int srcW,
2187 unsigned int xpos=0;
2188 for (i=0;i<dstWidth;i++) {
2189 register unsigned int xx=xpos>>16;
2190 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2191 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
/*
 * Horizontally scale one luma line (or one alpha line when isAlpha is set)
 * from src into dst.
 *
 * Steps, as far as this excerpt shows:
 *  1. If an input converter is installed (c->hascale_internal when isAlpha,
 *     c->hyscale_internal otherwise) the line is first converted into
 *     formatConvBuffer and scaling then reads from that buffer.
 *  2. The line is scaled either through c->hScale() with the hLumFilter
 *     tables, or through a fast bilinear path: the MMX2 generated filter code
 *     (when canMMX2BeUsed), a plain x86 asm loop, or c->hyscale_fast().
 *  3. For non-alpha lines, if c->srcRange differs from c->dstRange and the
 *     destination is not RGB/BGR, the samples are range-converted in place.
 */
2196 // *** horizontal scale Y line to temp buffer
2197 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2198 int flags, const int16_t *hLumFilter,
2199 const int16_t *hLumFilterPos, int hLumFilterSize,
2200 int srcFormat, uint8_t *formatConvBuffer,
2201 uint32_t *pal, int isAlpha)
2203 int32_t av_unused *mmx2FilterPos = c->lumMmx2FilterPos;
2204 int16_t av_unused *mmx2Filter = c->lumMmx2Filter;
2205 int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
2206 void av_unused *mmx2FilterCode= c->lumMmx2FilterCode;
// input converter: the alpha one for alpha lines, the luma one otherwise (may be NULL)
2207 void (*internal_func)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->hascale_internal : c->hyscale_internal;
// NOTE(review): the statements guarded by the three format checks below are not visible in this excerpt
2210 if (srcFormat == PIX_FMT_RGB32 || srcFormat == PIX_FMT_BGR32 )
2213 if (srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1)
2217 if (srcFormat == PIX_FMT_RGB48LE)
// convert the input line into formatConvBuffer and scale from there
2220 if (internal_func) {
2221 internal_func(formatConvBuffer, src, srcW, pal);
2222 src= formatConvBuffer;
2225 #if COMPILE_TEMPLATE_MMX
2226 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2227 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2229 if (!(flags&SWS_FAST_BILINEAR))
2232 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2233 } else { // fast bilinear upscale / crap downscale
2234 #if ARCH_X86 && CONFIG_GPL
2235 #if COMPILE_TEMPLATE_MMX2
// scratch slot; presumably asm operand %5, which the asm below uses to save and restore REG_b - confirm
2238 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2240 if (canMMX2BeUsed) {
2243 "mov %%"REG_b", %5 \n\t"
2245 "pxor %%mm7, %%mm7 \n\t"
// REG_c = src, REG_D = dst, REG_d = mmx2Filter, REG_b = mmx2FilterPos
2246 "mov %0, %%"REG_c" \n\t"
2247 "mov %1, %%"REG_D" \n\t"
2248 "mov %2, %%"REG_d" \n\t"
2249 "mov %3, %%"REG_b" \n\t"
2250 "xor %%"REG_a", %%"REG_a" \n\t" // i
2251 PREFETCH" (%%"REG_c") \n\t"
2252 PREFETCH" 32(%%"REG_c") \n\t"
2253 PREFETCH" 64(%%"REG_c") \n\t"
2257 #define CALL_MMX2_FILTER_CODE \
2258 "movl (%%"REG_b"), %%esi \n\t"\
2260 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2261 "add %%"REG_S", %%"REG_c" \n\t"\
2262 "add %%"REG_a", %%"REG_D" \n\t"\
2263 "xor %%"REG_a", %%"REG_a" \n\t"\
2267 #define CALL_MMX2_FILTER_CODE \
2268 "movl (%%"REG_b"), %%esi \n\t"\
2270 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2271 "add %%"REG_a", %%"REG_D" \n\t"\
2272 "xor %%"REG_a", %%"REG_a" \n\t"\
2274 #endif /* ARCH_X86_64 */
2276 CALL_MMX2_FILTER_CODE
2277 CALL_MMX2_FILTER_CODE
2278 CALL_MMX2_FILTER_CODE
2279 CALL_MMX2_FILTER_CODE
2280 CALL_MMX2_FILTER_CODE
2281 CALL_MMX2_FILTER_CODE
2282 CALL_MMX2_FILTER_CODE
2283 CALL_MMX2_FILTER_CODE
2286 "mov %5, %%"REG_b" \n\t"
2288 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2289 "m" (mmx2FilterCode)
2293 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// fix up the rightmost outputs whose source position is at or beyond the last pixel: replicate src[srcW-1] scaled by 128
2298 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2300 #endif /* COMPILE_TEMPLATE_MMX2 */
// split the 16.16 fixed-point step into its integer part and 16-bit fraction for the add/adc pair below
2301 x86_reg xInc_shr16 = xInc >> 16;
2302 uint16_t xInc_mask = xInc & 0xffff;
2303 //NO MMX just normal asm ...
2305 "xor %%"REG_a", %%"REG_a" \n\t" // i
2306 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2307 "xorl %%ecx, %%ecx \n\t" // xalpha
// the loop body is unrolled twice: outputs i and i+1 are stored per iteration and REG_a advances by 2
2310 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2311 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2313 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2314 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2315 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2317 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2318 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2320 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2321 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2322 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2325 "add $2, %%"REG_a" \n\t"
2326 "cmp %2, %%"REG_a" \n\t"
2330 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2331 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2333 #if COMPILE_TEMPLATE_MMX2
2334 } //if MMX2 can't be used
2337 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2338 #endif /* ARCH_X86 */
// luma range conversion; skipped for alpha lines and for RGB/BGR destinations
2341 if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
2343 //FIXME all pal and rgb srcFormats could do this conversion as well
2344 //FIXME all scalers more complex than bilinear could do half of this transform
// NOTE(review): the condition choosing between the two loops below is not visible in this excerpt; presumably it selects the conversion direction - confirm
2346 for (i=0; i<dstWidth; i++)
2347 dst[i]= (dst[i]*14071 + 33561947)>>14;
2349 for (i=0; i<dstWidth; i++)
2350 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
/*
 * Plain C fast bilinear horizontal scaler for one pair of chroma lines.
 *
 * src1 is scaled into dst[0 .. dstWidth-1] and src2 into
 * dst[VOFW .. VOFW+dstWidth-1], both using the same position:
 *   xx     = xpos >> 16            (integer part of the source position)
 *   xalpha = (xpos & 0xFFFF) >> 9  (top 7 bits of the fractional part)
 *
 * NOTE(review): two alternative formula pairs are visible below
 * (the "xalpha^127" weights and the "<<7 plus difference" form). In the full
 * file one pair is presumably commented out; the comment delimiters, the
 * xpos advance and the braces are not visible in this excerpt - confirm.
 */
2355 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2356 int dstWidth, const uint8_t *src1,
2357 const uint8_t *src2, int srcW, int xInc)
2360 unsigned int xpos=0;
2361 for (i=0;i<dstWidth;i++) {
2362 register unsigned int xx=xpos>>16;
2363 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2364 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2365 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2367 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2368 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
/*
 * Horizontally scale one pair of chroma lines (src1, src2) into dst; the
 * first plane goes to dst and the second to dst+VOFW.
 *
 * Steps, as far as this excerpt shows:
 *  1. If c->hcscale_internal is set, both inputs are first converted into
 *     formatConvBuffer / formatConvBuffer+VOFW and scaling reads from there.
 *  2. The lines are scaled either through two c->hScale() calls with the
 *     hChrFilter tables, or through a fast bilinear path: the MMX2 generated
 *     filter code (when canMMX2BeUsed), a plain x86 asm loop, or
 *     c->hcscale_fast().
 *  3. If c->srcRange differs from c->dstRange and the destination is not
 *     RGB/BGR, both output planes are range-converted in place.
 */
2374 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2375 int srcW, int xInc, int flags, const int16_t *hChrFilter,
2376 const int16_t *hChrFilterPos, int hChrFilterSize,
2377 int srcFormat, uint8_t *formatConvBuffer,
2380 int32_t av_unused *mmx2FilterPos = c->chrMmx2FilterPos;
2381 int16_t av_unused *mmx2Filter = c->chrMmx2Filter;
2382 int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
2383 void av_unused *mmx2FilterCode= c->chrMmx2FilterCode;
// NOTE(review): the statements guarded by the three format checks below are not visible in this excerpt
2385 if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2388 if (srcFormat==PIX_FMT_RGB32_1 || srcFormat==PIX_FMT_BGR32_1) {
2393 if (srcFormat==PIX_FMT_RGB48LE) {
// convert both inputs into formatConvBuffer (second plane at +VOFW) and scale from there
2398 if (c->hcscale_internal) {
2399 c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2400 src1= formatConvBuffer;
2401 src2= formatConvBuffer+VOFW;
2404 #if COMPILE_TEMPLATE_MMX
2405 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2406 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2408 if (!(flags&SWS_FAST_BILINEAR))
2411 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2412 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2413 } else { // fast bilinear upscale / crap downscale
2414 #if ARCH_X86 && CONFIG_GPL
2415 #if COMPILE_TEMPLATE_MMX2
// scratch slot; presumably asm operand %6, which the asm below uses to save and restore REG_b - confirm
2418 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2420 if (canMMX2BeUsed) {
2423 "mov %%"REG_b", %6 \n\t"
2425 "pxor %%mm7, %%mm7 \n\t"
// first pass: REG_c = src1, REG_D = dst, REG_d = mmx2Filter, REG_b = mmx2FilterPos
2426 "mov %0, %%"REG_c" \n\t"
2427 "mov %1, %%"REG_D" \n\t"
2428 "mov %2, %%"REG_d" \n\t"
2429 "mov %3, %%"REG_b" \n\t"
2430 "xor %%"REG_a", %%"REG_a" \n\t" // i
2431 PREFETCH" (%%"REG_c") \n\t"
2432 PREFETCH" 32(%%"REG_c") \n\t"
2433 PREFETCH" 64(%%"REG_c") \n\t"
2435 CALL_MMX2_FILTER_CODE
2436 CALL_MMX2_FILTER_CODE
2437 CALL_MMX2_FILTER_CODE
2438 CALL_MMX2_FILTER_CODE
// second pass: same filter code over src2 (operand %5), writing VOF bytes past dst
2439 "xor %%"REG_a", %%"REG_a" \n\t" // i
2440 "mov %5, %%"REG_c" \n\t" // src
2441 "mov %1, %%"REG_D" \n\t" // buf1
2442 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2443 PREFETCH" (%%"REG_c") \n\t"
2444 PREFETCH" 32(%%"REG_c") \n\t"
2445 PREFETCH" 64(%%"REG_c") \n\t"
2447 CALL_MMX2_FILTER_CODE
2448 CALL_MMX2_FILTER_CODE
2449 CALL_MMX2_FILTER_CODE
2450 CALL_MMX2_FILTER_CODE
2453 "mov %6, %%"REG_b" \n\t"
2455 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2456 "m" (mmx2FilterCode), "m" (src2)
2460 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// fix up the rightmost outputs whose source position is at or beyond the last pixel: replicate the last source sample of each plane scaled by 128
2465 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2466 //printf("%d %d %d\n", dstWidth, i, srcW);
2467 dst[i] = src1[srcW-1]*128;
2468 dst[i+VOFW] = src2[srcW-1]*128;
2471 #endif /* COMPILE_TEMPLATE_MMX2 */
// split the 16.16 fixed-point step into its integer part and 16-bit fraction for the add/adc pair below
2472 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2473 uint16_t xInc_mask = xInc & 0xffff;
2475 "xor %%"REG_a", %%"REG_a" \n\t" // i
2476 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2477 "xorl %%ecx, %%ecx \n\t" // xalpha
// one output index per iteration: a sample for dst[i] and one for the plane VOF bytes further on, then the position advances once
2480 "mov %0, %%"REG_S" \n\t"
2481 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2482 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2484 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2486 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2487 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2489 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2491 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2492 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2493 "add $1, %%"REG_a" \n\t"
2494 "cmp %2, %%"REG_a" \n\t"
2497 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2498 which is needed to support GCC 4.0. */
2499 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2500 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2502 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2505 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2507 #if COMPILE_TEMPLATE_MMX2
2508 } //if MMX2 can't be used
2511 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2512 #endif /* ARCH_X86 */
// chroma range conversion; skipped for RGB/BGR destinations
2514 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
2516 //FIXME all pal and rgb srcFormats could do this conversion as well
2517 //FIXME all scalers more complex than bilinear could do half of this transform
// NOTE(review): the condition choosing between the two loops below is not visible in this excerpt; presumably it selects the conversion direction - confirm
2519 for (i=0; i<dstWidth; i++) {
2520 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2521 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2524 for (i=0; i<dstWidth; i++) {
2525 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2526 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
/*
 * Scale one input slice (srcSliceH rows starting at source row srcSliceY) and
 * write every destination row that can be completed from it.
 *
 * For each destination row the main loop:
 *   - looks up the first/last source rows needed by the vertical luma and
 *     chroma filters (vLumFilterPos / vChrFilterPos plus the filter sizes);
 *   - horizontally scales any needed rows that are not yet buffered, via
 *     hyscale()/hcscale(), into lumPixBuf / chrPixBuf / alpPixBuf;
 *   - if the slice does not contain all needed rows, buffers what is
 *     available and breaks out so that the next slice can continue;
 *   - otherwise runs the vertical scaler / output writer selected by
 *     dstFormat and the vertical filter sizes.
 * The buffer indices and last-buffered-row counters are stored back into the
 * context before returning.
 *
 * Returns dstY - lastDstY, i.e. the number of destination rows advanced by
 * this call.
 */
2532 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2533 int srcSliceH, uint8_t* dst[], int dstStride[])
2535 /* load a few things into local vars to make the code more readable? and faster */
2536 const int srcW= c->srcW;
2537 const int dstW= c->dstW;
2538 const int dstH= c->dstH;
2539 const int chrDstW= c->chrDstW;
2540 const int chrSrcW= c->chrSrcW;
2541 const int lumXInc= c->lumXInc;
2542 const int chrXInc= c->chrXInc;
2543 const int dstFormat= c->dstFormat;
2544 const int srcFormat= c->srcFormat;
2545 const int flags= c->flags;
2546 int16_t *vLumFilterPos= c->vLumFilterPos;
2547 int16_t *vChrFilterPos= c->vChrFilterPos;
2548 int16_t *hLumFilterPos= c->hLumFilterPos;
2549 int16_t *hChrFilterPos= c->hChrFilterPos;
2550 int16_t *vLumFilter= c->vLumFilter;
2551 int16_t *vChrFilter= c->vChrFilter;
2552 int16_t *hLumFilter= c->hLumFilter;
2553 int16_t *hChrFilter= c->hChrFilter;
2554 int32_t *lumMmxFilter= c->lumMmxFilter;
2555 int32_t *chrMmxFilter= c->chrMmxFilter;
2556 int32_t *alpMmxFilter= c->alpMmxFilter;
2557 const int vLumFilterSize= c->vLumFilterSize;
2558 const int vChrFilterSize= c->vChrFilterSize;
2559 const int hLumFilterSize= c->hLumFilterSize;
2560 const int hChrFilterSize= c->hChrFilterSize;
2561 int16_t **lumPixBuf= c->lumPixBuf;
2562 int16_t **chrPixBuf= c->chrPixBuf;
2563 int16_t **alpPixBuf= c->alpPixBuf;
2564 const int vLumBufSize= c->vLumBufSize;
2565 const int vChrBufSize= c->vChrBufSize;
2566 uint8_t *formatConvBuffer= c->formatConvBuffer;
2567 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
// negate-shift-negate: with an arithmetic right shift this rounds the chroma slice height up instead of down
2568 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2570 uint32_t *pal=c->pal_yuv;
2572 /* vars which will change and which we need to store back in the context */
2574 int lumBufIndex= c->lumBufIndex;
2575 int chrBufIndex= c->chrBufIndex;
2576 int lastInLumBuf= c->lastInLumBuf;
2577 int lastInChrBuf= c->lastInChrBuf;
2579 if (isPacked(c->srcFormat)) {
2587 srcStride[3]= srcStride[0];
// note: this modifies the caller's srcStride array in place
2589 srcStride[1]<<= c->vChrDrop;
2590 srcStride[2]<<= c->vChrDrop;
2592 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2593 // (int)dst[0], (int)dst[1], (int)dst[2]);
2595 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2596 //dstStride[0],dstStride[1],dstStride[2]);
// warn (at most once per process, and only with SWS_PRINT_INFO) when any destination stride is not a multiple of 8
2598 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2599 static int warnedAlready=0; //FIXME move this into the context perhaps
2600 if (flags & SWS_PRINT_INFO && !warnedAlready) {
2601 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2602 " ->cannot do aligned memory accesses anymore\n");
2607 /* Note the user might start scaling the picture in the middle so this
2608 will not get executed. This is not really intended but works
2609 currently, so people might do it. */
2610 if (srcSliceY ==0) {
// main loop: one iteration per destination row
2620 for (;dstY < dstH; dstY++) {
2621 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2622 const int chrDstY= dstY>>c->chrDstVSubSample;
2623 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2624 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2625 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2627 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2628 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2629 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2630 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2633 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2634 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2635 //handle holes (FAST_BILINEAR & weird filters)
2636 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2637 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2638 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2639 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2640 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2642 // Do we have enough lines in this slice to output the dstY line
2643 enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
// not enough: clamp to the last rows of this slice so they are still buffered below
2644 if (!enough_lines) {
2645 lastLumSrcY = srcSliceY + srcSliceH - 1;
2646 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2649 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2650 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2651 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2652 vChrBufSize, vLumBufSize);*/
2654 //Do horizontal scaling
2655 while(lastInLumBuf < lastLumSrcY) {
// row addresses are relative to the start of the current slice
2656 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2657 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2659 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2660 assert(lumBufIndex < 2*vLumBufSize);
2661 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2662 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2663 //printf("%d %d\n", lumBufIndex, vLumBufSize);
2664 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2665 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2666 c->srcFormat, formatConvBuffer,
// the alpha plane is scaled with the same luma filter tables
2668 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2669 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2670 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2671 c->srcFormat, formatConvBuffer,
2675 while(lastInChrBuf < lastChrSrcY) {
2676 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2677 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2679 assert(chrBufIndex < 2*vChrBufSize);
2680 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2681 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2682 //FIXME replace parameters through context struct (some at least)
// chroma scaling is skipped entirely when either side is a gray format
2684 if (!(isGray(srcFormat) || isGray(dstFormat)))
2685 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2686 flags, hChrFilter, hChrFilterPos, hChrFilterSize,
2687 c->srcFormat, formatConvBuffer,
2691 //wrap buf index around to stay inside the ring buffer
2692 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2693 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2695 break; //we can't output a dstY line so let's try with the next slice
2697 #if COMPILE_TEMPLATE_MMX
// dither tables are selected by row parity; red uses the opposite parity to blue
2698 c->blueDither= ff_dither8[dstY&1];
2699 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2700 c->greenDither= ff_dither8[dstY&1];
2702 c->greenDither= ff_dither4[dstY&1];
2703 c->redDither= ff_dither8[(dstY+1)&1];
// all rows except the last two take this branch; the final rows take the else branch further down (see the comment there)
2705 if (dstY < dstH-2) {
2706 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2707 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2708 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2709 #if COMPILE_TEMPLATE_MMX
// accurate-rounding layout: entries are written two taps at a time (two row pointers plus both coefficients packed into one 32-bit word)
2711 if (flags & SWS_ACCURATE_RND) {
2712 int s= APCK_SIZE / 8;
2713 for (i=0; i<vLumFilterSize; i+=2) {
2714 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2715 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2716 lumMmxFilter[s*i+APCK_COEF/4 ]=
2717 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2718 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2719 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2720 *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2721 *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2722 alpMmxFilter[s*i+APCK_COEF/4 ]=
2723 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2726 for (i=0; i<vChrFilterSize; i+=2) {
2727 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2728 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2729 chrMmxFilter[s*i+APCK_COEF/4 ]=
2730 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2731 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
// default layout: four 32-bit words per tap - the row pointer split into low/high halves, then the 16-bit coefficient replicated (x 0x10001) into two words
2734 for (i=0; i<vLumFilterSize; i++) {
2735 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2736 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2737 lumMmxFilter[4*i+2]=
2738 lumMmxFilter[4*i+3]=
2739 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2740 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2741 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2742 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2743 alpMmxFilter[4*i+2]=
2744 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2747 for (i=0; i<vChrFilterSize; i++) {
2748 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2749 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2750 chrMmxFilter[4*i+2]=
2751 chrMmxFilter[4*i+3]=
2752 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
// output stage: the writer is chosen by destination format and vertical filter sizes
2756 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2757 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2758 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2760 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2761 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2762 dest, uDest, dstW, chrDstW, dstFormat);
2763 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2764 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2765 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2766 if (is16BPS(dstFormat)) {
2768 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2769 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2770 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2772 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2773 int16_t *lumBuf = lumPixBuf[0];
2774 int16_t *chrBuf= chrPixBuf[0];
2775 int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf[0] : NULL;
2776 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2777 } else { //General YV12
2779 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2780 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2781 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2784 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2785 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2786 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2787 int chrAlpha= vChrFilter[2*dstY+1];
2788 if(flags & SWS_FULL_CHR_H_INT) {
2789 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2790 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2791 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2792 alpSrcPtr, dest, dstW, dstY);
2794 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2795 alpPixBuf ? *alpSrcPtr : NULL,
2796 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2798 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2799 int lumAlpha= vLumFilter[2*dstY+1];
2800 int chrAlpha= vChrFilter[2*dstY+1];
2802 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2804 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2805 if(flags & SWS_FULL_CHR_H_INT) {
2806 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2807 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2808 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2809 alpSrcPtr, dest, dstW, dstY);
2811 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2812 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2813 dest, dstW, lumAlpha, chrAlpha, dstY);
2815 } else { //general RGB
2816 if(flags & SWS_FULL_CHR_H_INT) {
2818 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2819 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2820 alpSrcPtr, dest, dstW, dstY);
2823 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2824 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2825 alpSrcPtr, dest, dstW, dstY);
2829 } else { // hmm looks like we can't use MMX here without overwriting this array's tail
2830 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2831 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2832 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2833 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2834 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2835 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2837 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2838 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2839 dest, uDest, dstW, chrDstW, dstFormat);
2840 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2841 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2842 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2843 if (is16BPS(dstFormat)) {
2845 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2846 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2847 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2851 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2852 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2853 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2856 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2857 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2858 if(flags & SWS_FULL_CHR_H_INT) {
2860 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2861 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2862 alpSrcPtr, dest, dstW, dstY);
2865 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2866 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2867 alpSrcPtr, dest, dstW, dstY);
// YUVA420P output without an alpha source: presumably fills the rows just produced in plane 3 with 255 - confirm against fillPlane()
2873 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2874 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2876 #if COMPILE_TEMPLATE_MMX
2877 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
2878 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2879 if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
2880 else __asm__ volatile("emms" :::"memory");
2882 /* store changed local vars back in the context */
2884 c->lumBufIndex= lumBufIndex;
2885 c->chrBufIndex= chrBufIndex;
2886 c->lastInLumBuf= lastInLumBuf;
2887 c->lastInChrBuf= lastInChrBuf;
// number of destination rows advanced by this call
2889 return dstY - lastDstY;
2892 static void RENAME(sws_init_swScale)(SwsContext *c)
2894 enum PixelFormat srcFormat = c->srcFormat;
2896 c->yuv2nv12X = RENAME(yuv2nv12X );
2897 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2898 c->yuv2yuvX = RENAME(yuv2yuvX );
2899 c->yuv2packed1 = RENAME(yuv2packed1 );
2900 c->yuv2packed2 = RENAME(yuv2packed2 );
2901 c->yuv2packedX = RENAME(yuv2packedX );
2903 c->hScale = RENAME(hScale );
2905 c->hyscale_fast = RENAME(hyscale_fast);
2906 c->hcscale_fast = RENAME(hcscale_fast);
2908 c->hcscale_internal = NULL;
2910 case PIX_FMT_YUYV422 : c->hcscale_internal = RENAME(yuy2ToUV); break;
2911 case PIX_FMT_UYVY422 : c->hcscale_internal = RENAME(uyvyToUV); break;
2915 case PIX_FMT_BGR4_BYTE:
2916 case PIX_FMT_RGB4_BYTE: c->hcscale_internal = palToUV; break;
2917 case PIX_FMT_YUV420PBE:
2918 case PIX_FMT_YUV422PBE:
2919 case PIX_FMT_YUV444PBE: c->hcscale_internal = RENAME(BEToUV); break;
2920 case PIX_FMT_YUV420PLE:
2921 case PIX_FMT_YUV422PLE:
2922 case PIX_FMT_YUV444PLE: c->hcscale_internal = RENAME(LEToUV); break;
2924 if (c->chrSrcHSubSample) {
2926 case PIX_FMT_RGB48BE:
2927 case PIX_FMT_RGB48LE: c->hcscale_internal = rgb48ToUV_half; break;
2928 case PIX_FMT_RGB32 :
2929 case PIX_FMT_RGB32_1: c->hcscale_internal = bgr32ToUV_half; break;
2930 case PIX_FMT_BGR24 : c->hcscale_internal = RENAME(bgr24ToUV_half); break;
2931 case PIX_FMT_BGR565 : c->hcscale_internal = bgr16ToUV_half; break;
2932 case PIX_FMT_BGR555 : c->hcscale_internal = bgr15ToUV_half; break;
2933 case PIX_FMT_BGR32 :
2934 case PIX_FMT_BGR32_1: c->hcscale_internal = rgb32ToUV_half; break;
2935 case PIX_FMT_RGB24 : c->hcscale_internal = RENAME(rgb24ToUV_half); break;
2936 case PIX_FMT_RGB565 : c->hcscale_internal = rgb16ToUV_half; break;
2937 case PIX_FMT_RGB555 : c->hcscale_internal = rgb15ToUV_half; break;
2941 case PIX_FMT_RGB48BE:
2942 case PIX_FMT_RGB48LE: c->hcscale_internal = rgb48ToUV; break;
2943 case PIX_FMT_RGB32 :
2944 case PIX_FMT_RGB32_1: c->hcscale_internal = bgr32ToUV; break;
2945 case PIX_FMT_BGR24 : c->hcscale_internal = RENAME(bgr24ToUV); break;
2946 case PIX_FMT_BGR565 : c->hcscale_internal = bgr16ToUV; break;
2947 case PIX_FMT_BGR555 : c->hcscale_internal = bgr15ToUV; break;
2948 case PIX_FMT_BGR32 :
2949 case PIX_FMT_BGR32_1: c->hcscale_internal = rgb32ToUV; break;
2950 case PIX_FMT_RGB24 : c->hcscale_internal = RENAME(rgb24ToUV); break;
2951 case PIX_FMT_RGB565 : c->hcscale_internal = rgb16ToUV; break;
2952 case PIX_FMT_RGB555 : c->hcscale_internal = rgb15ToUV; break;
2956 c->hyscale_internal = NULL;
2957 c->hascale_internal = NULL;
2958 switch (srcFormat) {
2959 case PIX_FMT_YUYV422 :
2960 case PIX_FMT_YUV420PBE:
2961 case PIX_FMT_YUV422PBE:
2962 case PIX_FMT_YUV444PBE:
2963 case PIX_FMT_GRAY16BE : c->hyscale_internal = RENAME(yuy2ToY); break;
2964 case PIX_FMT_UYVY422 :
2965 case PIX_FMT_YUV420PLE:
2966 case PIX_FMT_YUV422PLE:
2967 case PIX_FMT_YUV444PLE:
2968 case PIX_FMT_GRAY16LE : c->hyscale_internal = RENAME(uyvyToY); break;
2969 case PIX_FMT_BGR24 : c->hyscale_internal = RENAME(bgr24ToY); break;
2970 case PIX_FMT_BGR565 : c->hyscale_internal = bgr16ToY; break;
2971 case PIX_FMT_BGR555 : c->hyscale_internal = bgr15ToY; break;
2972 case PIX_FMT_RGB24 : c->hyscale_internal = RENAME(rgb24ToY); break;
2973 case PIX_FMT_RGB565 : c->hyscale_internal = rgb16ToY; break;
2974 case PIX_FMT_RGB555 : c->hyscale_internal = rgb15ToY; break;
2978 case PIX_FMT_BGR4_BYTE:
2979 case PIX_FMT_RGB4_BYTE: c->hyscale_internal = palToY; break;
2980 case PIX_FMT_MONOBLACK: c->hyscale_internal = monoblack2Y; break;
2981 case PIX_FMT_MONOWHITE: c->hyscale_internal = monowhite2Y; break;
2982 case PIX_FMT_RGB32 :
2983 case PIX_FMT_RGB32_1: c->hyscale_internal = bgr32ToY; break;
2984 case PIX_FMT_BGR32 :
2985 case PIX_FMT_BGR32_1: c->hyscale_internal = rgb32ToY; break;
2986 case PIX_FMT_RGB48BE:
2987 case PIX_FMT_RGB48LE: c->hyscale_internal = rgb48ToY; break;
2990 switch (srcFormat) {
2991 case PIX_FMT_RGB32 :
2992 case PIX_FMT_RGB32_1:
2993 case PIX_FMT_BGR32 :
2994 case PIX_FMT_BGR32_1: c->hascale_internal = abgrToA; break;