2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
30 #if COMPILE_TEMPLATE_AMD3DNOW
31 #define PREFETCH "prefetch"
32 #define PREFETCHW "prefetchw"
33 #elif COMPILE_TEMPLATE_MMX2
34 #define PREFETCH "prefetchnta"
35 #define PREFETCHW "prefetcht0"
37 #define PREFETCH " # nop"
38 #define PREFETCHW " # nop"
41 #if COMPILE_TEMPLATE_MMX2
42 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
43 #elif COMPILE_TEMPLATE_AMD3DNOW
44 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
47 #if COMPILE_TEMPLATE_MMX2
48 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
50 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
52 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
54 #if COMPILE_TEMPLATE_ALTIVEC
55 #include "ppc/swscale_altivec_template.c"
58 #define YSCALEYUV2YV12X(x, offset, dest, width) \
60 "xor %%"REG_a", %%"REG_a" \n\t"\
61 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
62 "movq %%mm3, %%mm4 \n\t"\
63 "lea " offset "(%0), %%"REG_d" \n\t"\
64 "mov (%%"REG_d"), %%"REG_S" \n\t"\
65 ASMALIGN(4) /* FIXME Unroll? */\
67 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
68 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
69 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
70 "add $16, %%"REG_d" \n\t"\
71 "mov (%%"REG_d"), %%"REG_S" \n\t"\
72 "test %%"REG_S", %%"REG_S" \n\t"\
73 "pmulhw %%mm0, %%mm2 \n\t"\
74 "pmulhw %%mm0, %%mm5 \n\t"\
75 "paddw %%mm2, %%mm3 \n\t"\
76 "paddw %%mm5, %%mm4 \n\t"\
78 "psraw $3, %%mm3 \n\t"\
79 "psraw $3, %%mm4 \n\t"\
80 "packuswb %%mm4, %%mm3 \n\t"\
81 MOVNTQ(%%mm3, (%1, %%REGa))\
82 "add $8, %%"REG_a" \n\t"\
83 "cmp %2, %%"REG_a" \n\t"\
84 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
85 "movq %%mm3, %%mm4 \n\t"\
86 "lea " offset "(%0), %%"REG_d" \n\t"\
87 "mov (%%"REG_d"), %%"REG_S" \n\t"\
89 :: "r" (&c->redDither),\
90 "r" (dest), "g" (width)\
91 : "%"REG_a, "%"REG_d, "%"REG_S\
94 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
96 "lea " offset "(%0), %%"REG_d" \n\t"\
97 "xor %%"REG_a", %%"REG_a" \n\t"\
98 "pxor %%mm4, %%mm4 \n\t"\
99 "pxor %%mm5, %%mm5 \n\t"\
100 "pxor %%mm6, %%mm6 \n\t"\
101 "pxor %%mm7, %%mm7 \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
105 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
106 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
107 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
108 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
109 "movq %%mm0, %%mm3 \n\t"\
110 "punpcklwd %%mm1, %%mm0 \n\t"\
111 "punpckhwd %%mm1, %%mm3 \n\t"\
112 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
113 "pmaddwd %%mm1, %%mm0 \n\t"\
114 "pmaddwd %%mm1, %%mm3 \n\t"\
115 "paddd %%mm0, %%mm4 \n\t"\
116 "paddd %%mm3, %%mm5 \n\t"\
117 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
118 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
119 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
120 "test %%"REG_S", %%"REG_S" \n\t"\
121 "movq %%mm2, %%mm0 \n\t"\
122 "punpcklwd %%mm3, %%mm2 \n\t"\
123 "punpckhwd %%mm3, %%mm0 \n\t"\
124 "pmaddwd %%mm1, %%mm2 \n\t"\
125 "pmaddwd %%mm1, %%mm0 \n\t"\
126 "paddd %%mm2, %%mm6 \n\t"\
127 "paddd %%mm0, %%mm7 \n\t"\
129 "psrad $16, %%mm4 \n\t"\
130 "psrad $16, %%mm5 \n\t"\
131 "psrad $16, %%mm6 \n\t"\
132 "psrad $16, %%mm7 \n\t"\
133 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
134 "packssdw %%mm5, %%mm4 \n\t"\
135 "packssdw %%mm7, %%mm6 \n\t"\
136 "paddw %%mm0, %%mm4 \n\t"\
137 "paddw %%mm0, %%mm6 \n\t"\
138 "psraw $3, %%mm4 \n\t"\
139 "psraw $3, %%mm6 \n\t"\
140 "packuswb %%mm6, %%mm4 \n\t"\
141 MOVNTQ(%%mm4, (%1, %%REGa))\
142 "add $8, %%"REG_a" \n\t"\
143 "cmp %2, %%"REG_a" \n\t"\
144 "lea " offset "(%0), %%"REG_d" \n\t"\
145 "pxor %%mm4, %%mm4 \n\t"\
146 "pxor %%mm5, %%mm5 \n\t"\
147 "pxor %%mm6, %%mm6 \n\t"\
148 "pxor %%mm7, %%mm7 \n\t"\
149 "mov (%%"REG_d"), %%"REG_S" \n\t"\
151 :: "r" (&c->redDither),\
152 "r" (dest), "g" (width)\
153 : "%"REG_a, "%"REG_d, "%"REG_S\
156 #define YSCALEYUV2YV121 \
157 "mov %2, %%"REG_a" \n\t"\
158 ASMALIGN(4) /* FIXME Unroll? */\
160 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
161 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
162 "psraw $7, %%mm0 \n\t"\
163 "psraw $7, %%mm1 \n\t"\
164 "packuswb %%mm1, %%mm0 \n\t"\
165 MOVNTQ(%%mm0, (%1, %%REGa))\
166 "add $8, %%"REG_a" \n\t"\
169 #define YSCALEYUV2YV121_ACCURATE \
170 "mov %2, %%"REG_a" \n\t"\
171 "pcmpeqw %%mm7, %%mm7 \n\t"\
172 "psrlw $15, %%mm7 \n\t"\
173 "psllw $6, %%mm7 \n\t"\
174 ASMALIGN(4) /* FIXME Unroll? */\
176 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
177 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
178 "paddsw %%mm7, %%mm0 \n\t"\
179 "paddsw %%mm7, %%mm1 \n\t"\
180 "psraw $7, %%mm0 \n\t"\
181 "psraw $7, %%mm1 \n\t"\
182 "packuswb %%mm1, %%mm0 \n\t"\
183 MOVNTQ(%%mm0, (%1, %%REGa))\
184 "add $8, %%"REG_a" \n\t"\
188 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
189 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
190 "r" (dest), "m" (dstW),
191 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
192 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
194 #define YSCALEYUV2PACKEDX_UV \
196 "xor %%"REG_a", %%"REG_a" \n\t"\
200 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
201 "mov (%%"REG_d"), %%"REG_S" \n\t"\
202 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
203 "movq %%mm3, %%mm4 \n\t"\
206 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
207 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
208 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
209 "add $16, %%"REG_d" \n\t"\
210 "mov (%%"REG_d"), %%"REG_S" \n\t"\
211 "pmulhw %%mm0, %%mm2 \n\t"\
212 "pmulhw %%mm0, %%mm5 \n\t"\
213 "paddw %%mm2, %%mm3 \n\t"\
214 "paddw %%mm5, %%mm4 \n\t"\
215 "test %%"REG_S", %%"REG_S" \n\t"\
218 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
219 "lea "offset"(%0), %%"REG_d" \n\t"\
220 "mov (%%"REG_d"), %%"REG_S" \n\t"\
221 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
222 "movq "#dst1", "#dst2" \n\t"\
225 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
226 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
227 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
228 "add $16, %%"REG_d" \n\t"\
229 "mov (%%"REG_d"), %%"REG_S" \n\t"\
230 "pmulhw "#coeff", "#src1" \n\t"\
231 "pmulhw "#coeff", "#src2" \n\t"\
232 "paddw "#src1", "#dst1" \n\t"\
233 "paddw "#src2", "#dst2" \n\t"\
234 "test %%"REG_S", %%"REG_S" \n\t"\
237 #define YSCALEYUV2PACKEDX \
238 YSCALEYUV2PACKEDX_UV \
239 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
241 #define YSCALEYUV2PACKEDX_END \
242 :: "r" (&c->redDither), \
243 "m" (dummy), "m" (dummy), "m" (dummy),\
244 "r" (dest), "m" (dstW) \
245 : "%"REG_a, "%"REG_d, "%"REG_S \
248 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
250 "xor %%"REG_a", %%"REG_a" \n\t"\
254 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
255 "mov (%%"REG_d"), %%"REG_S" \n\t"\
256 "pxor %%mm4, %%mm4 \n\t"\
257 "pxor %%mm5, %%mm5 \n\t"\
258 "pxor %%mm6, %%mm6 \n\t"\
259 "pxor %%mm7, %%mm7 \n\t"\
262 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
263 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
264 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
265 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
266 "movq %%mm0, %%mm3 \n\t"\
267 "punpcklwd %%mm1, %%mm0 \n\t"\
268 "punpckhwd %%mm1, %%mm3 \n\t"\
269 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
270 "pmaddwd %%mm1, %%mm0 \n\t"\
271 "pmaddwd %%mm1, %%mm3 \n\t"\
272 "paddd %%mm0, %%mm4 \n\t"\
273 "paddd %%mm3, %%mm5 \n\t"\
274 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
275 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
276 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
277 "test %%"REG_S", %%"REG_S" \n\t"\
278 "movq %%mm2, %%mm0 \n\t"\
279 "punpcklwd %%mm3, %%mm2 \n\t"\
280 "punpckhwd %%mm3, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm2 \n\t"\
282 "pmaddwd %%mm1, %%mm0 \n\t"\
283 "paddd %%mm2, %%mm6 \n\t"\
284 "paddd %%mm0, %%mm7 \n\t"\
286 "psrad $16, %%mm4 \n\t"\
287 "psrad $16, %%mm5 \n\t"\
288 "psrad $16, %%mm6 \n\t"\
289 "psrad $16, %%mm7 \n\t"\
290 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
291 "packssdw %%mm5, %%mm4 \n\t"\
292 "packssdw %%mm7, %%mm6 \n\t"\
293 "paddw %%mm0, %%mm4 \n\t"\
294 "paddw %%mm0, %%mm6 \n\t"\
295 "movq %%mm4, "U_TEMP"(%0) \n\t"\
296 "movq %%mm6, "V_TEMP"(%0) \n\t"\
298 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
299 "lea "offset"(%0), %%"REG_d" \n\t"\
300 "mov (%%"REG_d"), %%"REG_S" \n\t"\
301 "pxor %%mm1, %%mm1 \n\t"\
302 "pxor %%mm5, %%mm5 \n\t"\
303 "pxor %%mm7, %%mm7 \n\t"\
304 "pxor %%mm6, %%mm6 \n\t"\
307 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
308 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
309 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
310 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
311 "movq %%mm0, %%mm3 \n\t"\
312 "punpcklwd %%mm4, %%mm0 \n\t"\
313 "punpckhwd %%mm4, %%mm3 \n\t"\
314 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
315 "pmaddwd %%mm4, %%mm0 \n\t"\
316 "pmaddwd %%mm4, %%mm3 \n\t"\
317 "paddd %%mm0, %%mm1 \n\t"\
318 "paddd %%mm3, %%mm5 \n\t"\
319 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
320 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
321 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
322 "test %%"REG_S", %%"REG_S" \n\t"\
323 "movq %%mm2, %%mm0 \n\t"\
324 "punpcklwd %%mm3, %%mm2 \n\t"\
325 "punpckhwd %%mm3, %%mm0 \n\t"\
326 "pmaddwd %%mm4, %%mm2 \n\t"\
327 "pmaddwd %%mm4, %%mm0 \n\t"\
328 "paddd %%mm2, %%mm7 \n\t"\
329 "paddd %%mm0, %%mm6 \n\t"\
331 "psrad $16, %%mm1 \n\t"\
332 "psrad $16, %%mm5 \n\t"\
333 "psrad $16, %%mm7 \n\t"\
334 "psrad $16, %%mm6 \n\t"\
335 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
336 "packssdw %%mm5, %%mm1 \n\t"\
337 "packssdw %%mm6, %%mm7 \n\t"\
338 "paddw %%mm0, %%mm1 \n\t"\
339 "paddw %%mm0, %%mm7 \n\t"\
340 "movq "U_TEMP"(%0), %%mm3 \n\t"\
341 "movq "V_TEMP"(%0), %%mm4 \n\t"\
343 #define YSCALEYUV2PACKEDX_ACCURATE \
344 YSCALEYUV2PACKEDX_ACCURATE_UV \
345 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
347 #define YSCALEYUV2RGBX \
348 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
349 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
350 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
351 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
352 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
353 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
354 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
355 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
356 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
357 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
358 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
359 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
360 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
361 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
362 "paddw %%mm3, %%mm4 \n\t"\
363 "movq %%mm2, %%mm0 \n\t"\
364 "movq %%mm5, %%mm6 \n\t"\
365 "movq %%mm4, %%mm3 \n\t"\
366 "punpcklwd %%mm2, %%mm2 \n\t"\
367 "punpcklwd %%mm5, %%mm5 \n\t"\
368 "punpcklwd %%mm4, %%mm4 \n\t"\
369 "paddw %%mm1, %%mm2 \n\t"\
370 "paddw %%mm1, %%mm5 \n\t"\
371 "paddw %%mm1, %%mm4 \n\t"\
372 "punpckhwd %%mm0, %%mm0 \n\t"\
373 "punpckhwd %%mm6, %%mm6 \n\t"\
374 "punpckhwd %%mm3, %%mm3 \n\t"\
375 "paddw %%mm7, %%mm0 \n\t"\
376 "paddw %%mm7, %%mm6 \n\t"\
377 "paddw %%mm7, %%mm3 \n\t"\
378 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
379 "packuswb %%mm0, %%mm2 \n\t"\
380 "packuswb %%mm6, %%mm5 \n\t"\
381 "packuswb %%mm3, %%mm4 \n\t"\
383 #define REAL_YSCALEYUV2PACKED(index, c) \
384 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
385 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
386 "psraw $3, %%mm0 \n\t"\
387 "psraw $3, %%mm1 \n\t"\
388 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
389 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
390 "xor "#index", "#index" \n\t"\
393 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
394 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
395 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
396 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
397 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
398 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
401 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
402 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
403 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
404 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
405 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
406 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
407 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
408 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
409 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
410 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
411 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
412 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
413 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
414 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
415 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
416 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
417 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
419 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
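/* Conceptually, the two-line (bilinear) macros above blend vertically adjacent
 * intermediate lines with a weight derived from yalpha/uvalpha; the asm
 * rewrites the blend as b + ((a - b) * alpha >> 16) to save one multiply.
 * A hedged scalar sketch, assuming a 12-bit weight: */
static inline int sketch_vertical_blend(int a, int b, int alpha /* 0..4096 */)
{
    return (a * (4096 - alpha) + b * alpha) >> 12;
}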
421 #define REAL_YSCALEYUV2RGB_UV(index, c) \
422 "xor "#index", "#index" \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
430 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
431 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
432 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
433 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
434 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
435 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
436 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
437 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
438 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
439 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
440 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
441 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
442 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
443 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
444 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
446 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
447 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
448 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
449 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
450 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
451 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
452 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
456 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
457 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
458 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
460 #define REAL_YSCALEYUV2RGB_COEFF(c) \
461 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
462 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
463 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
464 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
465 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
466 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
467 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
468 "paddw %%mm3, %%mm4 \n\t"\
469 "movq %%mm2, %%mm0 \n\t"\
470 "movq %%mm5, %%mm6 \n\t"\
471 "movq %%mm4, %%mm3 \n\t"\
472 "punpcklwd %%mm2, %%mm2 \n\t"\
473 "punpcklwd %%mm5, %%mm5 \n\t"\
474 "punpcklwd %%mm4, %%mm4 \n\t"\
475 "paddw %%mm1, %%mm2 \n\t"\
476 "paddw %%mm1, %%mm5 \n\t"\
477 "paddw %%mm1, %%mm4 \n\t"\
478 "punpckhwd %%mm0, %%mm0 \n\t"\
479 "punpckhwd %%mm6, %%mm6 \n\t"\
480 "punpckhwd %%mm3, %%mm3 \n\t"\
481 "paddw %%mm7, %%mm0 \n\t"\
482 "paddw %%mm7, %%mm6 \n\t"\
483 "paddw %%mm7, %%mm3 \n\t"\
484 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
485 "packuswb %%mm0, %%mm2 \n\t"\
486 "packuswb %%mm6, %%mm5 \n\t"\
487 "packuswb %%mm3, %%mm4 \n\t"\
489 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
491 #define YSCALEYUV2RGB(index, c) \
492 REAL_YSCALEYUV2RGB_UV(index, c) \
493 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
494 REAL_YSCALEYUV2RGB_COEFF(c)
496 #define REAL_YSCALEYUV2PACKED1(index, c) \
497 "xor "#index", "#index" \n\t"\
500 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
501 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
502 "psraw $7, %%mm3 \n\t" \
503 "psraw $7, %%mm4 \n\t" \
504 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
505 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
506 "psraw $7, %%mm1 \n\t" \
507 "psraw $7, %%mm7 \n\t" \
509 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
511 #define REAL_YSCALEYUV2RGB1(index, c) \
512 "xor "#index", "#index" \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
517 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
518 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
519 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
520 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
521 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
522 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
523 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
524 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
525 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
526 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
527 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
528 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
529 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
530 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
531 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
532 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
533 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
534 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
535 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
536 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
537 "paddw %%mm3, %%mm4 \n\t"\
538 "movq %%mm2, %%mm0 \n\t"\
539 "movq %%mm5, %%mm6 \n\t"\
540 "movq %%mm4, %%mm3 \n\t"\
541 "punpcklwd %%mm2, %%mm2 \n\t"\
542 "punpcklwd %%mm5, %%mm5 \n\t"\
543 "punpcklwd %%mm4, %%mm4 \n\t"\
544 "paddw %%mm1, %%mm2 \n\t"\
545 "paddw %%mm1, %%mm5 \n\t"\
546 "paddw %%mm1, %%mm4 \n\t"\
547 "punpckhwd %%mm0, %%mm0 \n\t"\
548 "punpckhwd %%mm6, %%mm6 \n\t"\
549 "punpckhwd %%mm3, %%mm3 \n\t"\
550 "paddw %%mm7, %%mm0 \n\t"\
551 "paddw %%mm7, %%mm6 \n\t"\
552 "paddw %%mm7, %%mm3 \n\t"\
553 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
554 "packuswb %%mm0, %%mm2 \n\t"\
555 "packuswb %%mm6, %%mm5 \n\t"\
556 "packuswb %%mm3, %%mm4 \n\t"\
558 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
560 #define REAL_YSCALEYUV2PACKED1b(index, c) \
561 "xor "#index", "#index" \n\t"\
564 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
565 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
566 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
567 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
568 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
569 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
570 "psrlw $8, %%mm3 \n\t" \
571 "psrlw $8, %%mm4 \n\t" \
572 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
573 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
574 "psraw $7, %%mm1 \n\t" \
575 "psraw $7, %%mm7 \n\t"
576 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
578 // do vertical chrominance interpolation
579 #define REAL_YSCALEYUV2RGB1b(index, c) \
580 "xor "#index", "#index" \n\t"\
583 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
584 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
585 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
586 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
587 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
588 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
589 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
590 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
591 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
592 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
593 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
594 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
595 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
596 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
597 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
598 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
599 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
600 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
601 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
602 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
603 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
604 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
605 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
606 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
607 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
608 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
609 "paddw %%mm3, %%mm4 \n\t"\
610 "movq %%mm2, %%mm0 \n\t"\
611 "movq %%mm5, %%mm6 \n\t"\
612 "movq %%mm4, %%mm3 \n\t"\
613 "punpcklwd %%mm2, %%mm2 \n\t"\
614 "punpcklwd %%mm5, %%mm5 \n\t"\
615 "punpcklwd %%mm4, %%mm4 \n\t"\
616 "paddw %%mm1, %%mm2 \n\t"\
617 "paddw %%mm1, %%mm5 \n\t"\
618 "paddw %%mm1, %%mm4 \n\t"\
619 "punpckhwd %%mm0, %%mm0 \n\t"\
620 "punpckhwd %%mm6, %%mm6 \n\t"\
621 "punpckhwd %%mm3, %%mm3 \n\t"\
622 "paddw %%mm7, %%mm0 \n\t"\
623 "paddw %%mm7, %%mm6 \n\t"\
624 "paddw %%mm7, %%mm3 \n\t"\
625 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
626 "packuswb %%mm0, %%mm2 \n\t"\
627 "packuswb %%mm6, %%mm5 \n\t"\
628 "packuswb %%mm3, %%mm4 \n\t"\
630 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
632 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
633 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
634 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
635 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
636 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
637 "packuswb %%mm1, %%mm7 \n\t"
638 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
640 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
641 "movq "#b", "#q2" \n\t" /* B */\
642 "movq "#r", "#t" \n\t" /* R */\
643 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
644 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
645 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
646 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
647 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
648 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
649 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
650 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
651 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
652 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
654 MOVNTQ( q0, (dst, index, 4))\
655 MOVNTQ( b, 8(dst, index, 4))\
656 MOVNTQ( q2, 16(dst, index, 4))\
657 MOVNTQ( q3, 24(dst, index, 4))\
659 "add $8, "#index" \n\t"\
660 "cmp "#dstw", "#index" \n\t"\
662 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
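/* Scalar view of WRITEBGR32: interleave eight bytes each of B, G, R and A into
 * eight 4-byte pixels stored b,g,r,a in memory; other channel orders simply
 * permute the four stores. The helper below is an illustration, not part of
 * the scaler. */
static inline void sketch_write_bgr32(uint8_t *dst, const uint8_t *b, const uint8_t *g,
                                      const uint8_t *r, const uint8_t *a, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        dst[4*i + 0] = b[i];
        dst[4*i + 1] = g[i];
        dst[4*i + 2] = r[i];
        dst[4*i + 3] = a[i];
    }
}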
664 #define REAL_WRITERGB16(dst, dstw, index) \
665 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
666 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
667 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
668 "psrlq $3, %%mm2 \n\t"\
670 "movq %%mm2, %%mm1 \n\t"\
671 "movq %%mm4, %%mm3 \n\t"\
673 "punpcklbw %%mm7, %%mm3 \n\t"\
674 "punpcklbw %%mm5, %%mm2 \n\t"\
675 "punpckhbw %%mm7, %%mm4 \n\t"\
676 "punpckhbw %%mm5, %%mm1 \n\t"\
678 "psllq $3, %%mm3 \n\t"\
679 "psllq $3, %%mm4 \n\t"\
681 "por %%mm3, %%mm2 \n\t"\
682 "por %%mm4, %%mm1 \n\t"\
684 MOVNTQ(%%mm2, (dst, index, 2))\
685 MOVNTQ(%%mm1, 8(dst, index, 2))\
687 "add $8, "#index" \n\t"\
688 "cmp "#dstw", "#index" \n\t"\
690 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
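/* Scalar equivalent of what WRITERGB16 computes per pixel: drop the low bits
 * of each 8-bit channel and pack into a 5:6:5 word with B in bits 0-4, G in
 * bits 5-10 and R in bits 11-15. Illustrative helper only. */
static inline uint16_t sketch_pack_rgb565(uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
}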
692 #define REAL_WRITERGB15(dst, dstw, index) \
693 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
694 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
695 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
696 "psrlq $3, %%mm2 \n\t"\
697 "psrlq $1, %%mm5 \n\t"\
699 "movq %%mm2, %%mm1 \n\t"\
700 "movq %%mm4, %%mm3 \n\t"\
702 "punpcklbw %%mm7, %%mm3 \n\t"\
703 "punpcklbw %%mm5, %%mm2 \n\t"\
704 "punpckhbw %%mm7, %%mm4 \n\t"\
705 "punpckhbw %%mm5, %%mm1 \n\t"\
707 "psllq $2, %%mm3 \n\t"\
708 "psllq $2, %%mm4 \n\t"\
710 "por %%mm3, %%mm2 \n\t"\
711 "por %%mm4, %%mm1 \n\t"\
713 MOVNTQ(%%mm2, (dst, index, 2))\
714 MOVNTQ(%%mm1, 8(dst, index, 2))\
716 "add $8, "#index" \n\t"\
717 "cmp "#dstw", "#index" \n\t"\
719 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
721 #define WRITEBGR24OLD(dst, dstw, index) \
722 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
723 "movq %%mm2, %%mm1 \n\t" /* B */\
724 "movq %%mm5, %%mm6 \n\t" /* R */\
725 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
726 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
727 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
728 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
729 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
730 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
731 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
732 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
733 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
734 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
736 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
737 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
738 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
739 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
740 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
741 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
742 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
743 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
745 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
746 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
747 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
748 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
749 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
750 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
751 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
752 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
753 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
754 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
755 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
756 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
757 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
759 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
760 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
761 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
762 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
763 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
764 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
765 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
766 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
768 MOVNTQ(%%mm0, (dst))\
769 MOVNTQ(%%mm2, 8(dst))\
770 MOVNTQ(%%mm3, 16(dst))\
771 "add $24, "#dst" \n\t"\
773 "add $8, "#index" \n\t"\
774 "cmp "#dstw", "#index" \n\t"\
777 #define WRITEBGR24MMX(dst, dstw, index) \
778 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
779 "movq %%mm2, %%mm1 \n\t" /* B */\
780 "movq %%mm5, %%mm6 \n\t" /* R */\
781 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
782 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
783 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
784 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
785 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
786 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
787 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
788 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
789 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
790 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
792 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
793 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
794 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
795 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
797 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
798 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
799 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
800 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
802 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
803 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
804 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
805 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
807 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
808 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
809 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
810 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
811 MOVNTQ(%%mm0, (dst))\
813 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
814 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
815 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
816 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
817 MOVNTQ(%%mm6, 8(dst))\
819 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
820 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
821 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
822 MOVNTQ(%%mm5, 16(dst))\
824 "add $24, "#dst" \n\t"\
826 "add $8, "#index" \n\t"\
827 "cmp "#dstw", "#index" \n\t"\
830 #define WRITEBGR24MMX2(dst, dstw, index) \
831 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
832 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
833 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
834 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
835 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
836 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
838 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
839 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
840 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
842 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
843 "por %%mm1, %%mm6 \n\t"\
844 "por %%mm3, %%mm6 \n\t"\
845 MOVNTQ(%%mm6, (dst))\
847 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
848 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
849 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
850 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
852 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
853 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
854 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
856 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
857 "por %%mm3, %%mm6 \n\t"\
858 MOVNTQ(%%mm6, 8(dst))\
860 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
861 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
862 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
864 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
865 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
866 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
868 "por %%mm1, %%mm3 \n\t"\
869 "por %%mm3, %%mm6 \n\t"\
870 MOVNTQ(%%mm6, 16(dst))\
872 "add $24, "#dst" \n\t"\
874 "add $8, "#index" \n\t"\
875 "cmp "#dstw", "#index" \n\t"\
878 #if COMPILE_TEMPLATE_MMX2
880 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
883 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
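/* Scalar view of the 24-bit writers: eight pixels of separate B, G and R bytes
 * become 24 bytes of tightly packed triplets; the MMX and MMX2 variants above
 * differ only in how they shuffle those 24 bytes into three aligned quadword
 * stores. Byte order shown for BGR24; other orders permute the three writes.
 * Illustrative helper only. */
static inline void sketch_write_bgr24(uint8_t *dst, const uint8_t *b,
                                      const uint8_t *g, const uint8_t *r, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        dst[3*i + 0] = b[i];
        dst[3*i + 1] = g[i];
        dst[3*i + 2] = r[i];
    }
}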
886 #define REAL_WRITEYUY2(dst, dstw, index) \
887 "packuswb %%mm3, %%mm3 \n\t"\
888 "packuswb %%mm4, %%mm4 \n\t"\
889 "packuswb %%mm7, %%mm1 \n\t"\
890 "punpcklbw %%mm4, %%mm3 \n\t"\
891 "movq %%mm1, %%mm7 \n\t"\
892 "punpcklbw %%mm3, %%mm1 \n\t"\
893 "punpckhbw %%mm3, %%mm7 \n\t"\
895 MOVNTQ(%%mm1, (dst, index, 2))\
896 MOVNTQ(%%mm7, 8(dst, index, 2))\
898 "add $8, "#index" \n\t"\
899 "cmp "#dstw", "#index" \n\t"\
901 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
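/* Scalar equivalent of WRITEYUY2: interleave full-resolution luma with 2:1
 * horizontally subsampled chroma in YUYV byte order (Y0 U0 Y1 V0). Helper is
 * illustrative only. */
static inline void sketch_write_yuyv(uint8_t *dst, const uint8_t *y,
                                     const uint8_t *u, const uint8_t *v, int pairs)
{
    int i;
    for (i = 0; i < pairs; i++) {   /* each iteration emits two pixels */
        dst[4*i + 0] = y[2*i + 0];
        dst[4*i + 1] = u[i];
        dst[4*i + 2] = y[2*i + 1];
        dst[4*i + 3] = v[i];
    }
}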
904 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
905 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
906 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
908 #if COMPILE_TEMPLATE_MMX
909 if(!(c->flags & SWS_BITEXACT)) {
910 if (c->flags & SWS_ACCURATE_RND) {
912 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
913 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
915 if (CONFIG_SWSCALE_ALPHA && aDest) {
916 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
919 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
922 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
923 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
925 if (CONFIG_SWSCALE_ALPHA && aDest) {
926 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
929 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
934 #if COMPILE_TEMPLATE_ALTIVEC
935 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
936 chrFilter, chrSrc, chrFilterSize,
937 dest, uDest, vDest, dstW, chrDstW);
938 #else //COMPILE_TEMPLATE_ALTIVEC
939 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
940 chrFilter, chrSrc, chrFilterSize,
941 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
942 #endif //!COMPILE_TEMPLATE_ALTIVEC
945 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
946 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
947 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
949 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
950 chrFilter, chrSrc, chrFilterSize,
951 dest, uDest, dstW, chrDstW, dstFormat);
954 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
955 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
958 #if COMPILE_TEMPLATE_MMX
959 if(!(c->flags & SWS_BITEXACT)) {
961 const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
962 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
963 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
965 if (c->flags & SWS_ACCURATE_RND) {
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src[p]), "r" (dst[p] + counter[p]),
981 :: "r" (src[p]), "r" (dst[p] + counter[p]),
991 for (i=0; i<dstW; i++) {
992 int val= (lumSrc[i]+64)>>7;
1003 for (i=0; i<chrDstW; i++) {
1004 int u=(chrSrc[i ]+64)>>7;
1005 int v=(chrSrc[i + VOFW]+64)>>7;
1009 else if (u>255) u=255;
1011 else if (v>255) v=255;
1018 if (CONFIG_SWSCALE_ALPHA && aDest)
1019 for (i=0; i<dstW; i++) {
1020 int val= (alpSrc[i]+64)>>7;
1021 aDest[i]= av_clip_uint8(val);
1027 * vertical scale YV12 to RGB
1029 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1030 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1031 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1033 #if COMPILE_TEMPLATE_MMX
1035 if(!(c->flags & SWS_BITEXACT)) {
1036 if (c->flags & SWS_ACCURATE_RND) {
1037 switch(c->dstFormat) {
1039 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1040 YSCALEYUV2PACKEDX_ACCURATE
1042 "movq %%mm2, "U_TEMP"(%0) \n\t"
1043 "movq %%mm4, "V_TEMP"(%0) \n\t"
1044 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1045 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1046 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1047 "psraw $3, %%mm1 \n\t"
1048 "psraw $3, %%mm7 \n\t"
1049 "packuswb %%mm7, %%mm1 \n\t"
1050 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1052 YSCALEYUV2PACKEDX_END
1054 YSCALEYUV2PACKEDX_ACCURATE
1056 "pcmpeqd %%mm7, %%mm7 \n\t"
1057 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1059 YSCALEYUV2PACKEDX_END
1063 YSCALEYUV2PACKEDX_ACCURATE
1065 "pxor %%mm7, %%mm7 \n\t"
1066 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1067 "add %4, %%"REG_c" \n\t"
1068 WRITEBGR24(%%REGc, %5, %%REGa)
1071 :: "r" (&c->redDither),
1072 "m" (dummy), "m" (dummy), "m" (dummy),
1073 "r" (dest), "m" (dstW)
1074 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1077 case PIX_FMT_RGB555:
1078 YSCALEYUV2PACKEDX_ACCURATE
1080 "pxor %%mm7, %%mm7 \n\t"
1081 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1083 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1084 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1085 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1088 WRITERGB15(%4, %5, %%REGa)
1089 YSCALEYUV2PACKEDX_END
1091 case PIX_FMT_RGB565:
1092 YSCALEYUV2PACKEDX_ACCURATE
1094 "pxor %%mm7, %%mm7 \n\t"
1095 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1097 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1098 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1099 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1102 WRITERGB16(%4, %5, %%REGa)
1103 YSCALEYUV2PACKEDX_END
1105 case PIX_FMT_YUYV422:
1106 YSCALEYUV2PACKEDX_ACCURATE
1107 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1109 "psraw $3, %%mm3 \n\t"
1110 "psraw $3, %%mm4 \n\t"
1111 "psraw $3, %%mm1 \n\t"
1112 "psraw $3, %%mm7 \n\t"
1113 WRITEYUY2(%4, %5, %%REGa)
1114 YSCALEYUV2PACKEDX_END
1118 switch(c->dstFormat) {
1120 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1123 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1124 "psraw $3, %%mm1 \n\t"
1125 "psraw $3, %%mm7 \n\t"
1126 "packuswb %%mm7, %%mm1 \n\t"
1127 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1128 YSCALEYUV2PACKEDX_END
1132 "pcmpeqd %%mm7, %%mm7 \n\t"
1133 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1134 YSCALEYUV2PACKEDX_END
1140 "pxor %%mm7, %%mm7 \n\t"
1141 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1142 "add %4, %%"REG_c" \n\t"
1143 WRITEBGR24(%%REGc, %5, %%REGa)
1145 :: "r" (&c->redDither),
1146 "m" (dummy), "m" (dummy), "m" (dummy),
1147 "r" (dest), "m" (dstW)
1148 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1151 case PIX_FMT_RGB555:
1154 "pxor %%mm7, %%mm7 \n\t"
1155 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1157 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1158 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1159 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1162 WRITERGB15(%4, %5, %%REGa)
1163 YSCALEYUV2PACKEDX_END
1165 case PIX_FMT_RGB565:
1168 "pxor %%mm7, %%mm7 \n\t"
1169 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1171 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1172 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1173 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1176 WRITERGB16(%4, %5, %%REGa)
1177 YSCALEYUV2PACKEDX_END
1179 case PIX_FMT_YUYV422:
1181 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1183 "psraw $3, %%mm3 \n\t"
1184 "psraw $3, %%mm4 \n\t"
1185 "psraw $3, %%mm1 \n\t"
1186 "psraw $3, %%mm7 \n\t"
1187 WRITEYUY2(%4, %5, %%REGa)
1188 YSCALEYUV2PACKEDX_END
1193 #endif /* COMPILE_TEMPLATE_MMX */
1194 #if COMPILE_TEMPLATE_ALTIVEC
1195 /* The following list of supported dstFormat values should
1196 match what's found in the body of ff_yuv2packedX_altivec() */
1197 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1198 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1199 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1200 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1201 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1202 chrFilter, chrSrc, chrFilterSize,
1206 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1207 chrFilter, chrSrc, chrFilterSize,
1208 alpSrc, dest, dstW, dstY);
1212 * vertical bilinear scale YV12 to RGB
1214 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1215 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1217 int yalpha1=4095- yalpha;
1218 int uvalpha1=4095-uvalpha;
1221 #if COMPILE_TEMPLATE_MMX
1222 if(!(c->flags & SWS_BITEXACT)) {
1223 switch(c->dstFormat) {
1224 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1226 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1229 YSCALEYUV2RGB(%%REGBP, %5)
1230 YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1231 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1232 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1233 "packuswb %%mm7, %%mm1 \n\t"
1234 WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1236 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1238 ,"r" (abuf0), "r" (abuf1)
1242 *(uint16_t **)(&c->u_temp)=abuf0;
1243 *(uint16_t **)(&c->v_temp)=abuf1;
1245 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1246 "mov %4, %%"REG_b" \n\t"
1247 "push %%"REG_BP" \n\t"
1248 YSCALEYUV2RGB(%%REGBP, %5)
1251 "mov "U_TEMP"(%5), %0 \n\t"
1252 "mov "V_TEMP"(%5), %1 \n\t"
1253 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1254 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1255 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1256 "packuswb %%mm7, %%mm1 \n\t"
1259 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1260 "pop %%"REG_BP" \n\t"
1261 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1263 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1269 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1270 "mov %4, %%"REG_b" \n\t"
1271 "push %%"REG_BP" \n\t"
1272 YSCALEYUV2RGB(%%REGBP, %5)
1273 "pcmpeqd %%mm7, %%mm7 \n\t"
1274 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1275 "pop %%"REG_BP" \n\t"
1276 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1278 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1285 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1286 "mov %4, %%"REG_b" \n\t"
1287 "push %%"REG_BP" \n\t"
1288 YSCALEYUV2RGB(%%REGBP, %5)
1289 "pxor %%mm7, %%mm7 \n\t"
1290 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1291 "pop %%"REG_BP" \n\t"
1292 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1293 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1297 case PIX_FMT_RGB555:
1299 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1300 "mov %4, %%"REG_b" \n\t"
1301 "push %%"REG_BP" \n\t"
1302 YSCALEYUV2RGB(%%REGBP, %5)
1303 "pxor %%mm7, %%mm7 \n\t"
1304 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1306 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1307 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1308 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1311 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1312 "pop %%"REG_BP" \n\t"
1313 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1315 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1319 case PIX_FMT_RGB565:
1321 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1322 "mov %4, %%"REG_b" \n\t"
1323 "push %%"REG_BP" \n\t"
1324 YSCALEYUV2RGB(%%REGBP, %5)
1325 "pxor %%mm7, %%mm7 \n\t"
1326 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1328 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1329 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1330 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1333 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1334 "pop %%"REG_BP" \n\t"
1335 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1336 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1340 case PIX_FMT_YUYV422:
1342 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1343 "mov %4, %%"REG_b" \n\t"
1344 "push %%"REG_BP" \n\t"
1345 YSCALEYUV2PACKED(%%REGBP, %5)
1346 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1347 "pop %%"REG_BP" \n\t"
1348 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1349 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1356 #endif //COMPILE_TEMPLATE_MMX
1357 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1361 * YV12 to RGB without scaling or interpolating
1363 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1364 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1366 const int yalpha1=0;
1369 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1370 const int yalpha= 4096; //FIXME ...
1372 if (flags&SWS_FULL_CHR_H_INT) {
1373 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1377 #if COMPILE_TEMPLATE_MMX
1378 if(!(flags & SWS_BITEXACT)) {
1379 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1382 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1384 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1385 "mov %4, %%"REG_b" \n\t"
1386 "push %%"REG_BP" \n\t"
1387 YSCALEYUV2RGB1(%%REGBP, %5)
1388 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1389 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1390 "pop %%"REG_BP" \n\t"
1391 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1393 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1398 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1399 "mov %4, %%"REG_b" \n\t"
1400 "push %%"REG_BP" \n\t"
1401 YSCALEYUV2RGB1(%%REGBP, %5)
1402 "pcmpeqd %%mm7, %%mm7 \n\t"
1403 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1404 "pop %%"REG_BP" \n\t"
1405 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1407 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1414 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1415 "mov %4, %%"REG_b" \n\t"
1416 "push %%"REG_BP" \n\t"
1417 YSCALEYUV2RGB1(%%REGBP, %5)
1418 "pxor %%mm7, %%mm7 \n\t"
1419 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1420 "pop %%"REG_BP" \n\t"
1421 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1423 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1427 case PIX_FMT_RGB555:
1429 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1430 "mov %4, %%"REG_b" \n\t"
1431 "push %%"REG_BP" \n\t"
1432 YSCALEYUV2RGB1(%%REGBP, %5)
1433 "pxor %%mm7, %%mm7 \n\t"
1434 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1436 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1437 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1438 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1440 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1441 "pop %%"REG_BP" \n\t"
1442 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1444 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1448 case PIX_FMT_RGB565:
1450 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1451 "mov %4, %%"REG_b" \n\t"
1452 "push %%"REG_BP" \n\t"
1453 YSCALEYUV2RGB1(%%REGBP, %5)
1454 "pxor %%mm7, %%mm7 \n\t"
1455 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1457 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1458 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1459 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1462 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1463 "pop %%"REG_BP" \n\t"
1464 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1466 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1470 case PIX_FMT_YUYV422:
1472 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1473 "mov %4, %%"REG_b" \n\t"
1474 "push %%"REG_BP" \n\t"
1475 YSCALEYUV2PACKED1(%%REGBP, %5)
1476 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1477 "pop %%"REG_BP" \n\t"
1478 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1480 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1490 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1491 "mov %4, %%"REG_b" \n\t"
1492 "push %%"REG_BP" \n\t"
1493 YSCALEYUV2RGB1b(%%REGBP, %5)
1494 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1495 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1496 "pop %%"REG_BP" \n\t"
1497 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1499 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1504 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1505 "mov %4, %%"REG_b" \n\t"
1506 "push %%"REG_BP" \n\t"
1507 YSCALEYUV2RGB1b(%%REGBP, %5)
1508 "pcmpeqd %%mm7, %%mm7 \n\t"
1509 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1510 "pop %%"REG_BP" \n\t"
1511 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1513 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1520 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1521 "mov %4, %%"REG_b" \n\t"
1522 "push %%"REG_BP" \n\t"
1523 YSCALEYUV2RGB1b(%%REGBP, %5)
1524 "pxor %%mm7, %%mm7 \n\t"
1525 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1526 "pop %%"REG_BP" \n\t"
1527 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1529 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1533 case PIX_FMT_RGB555:
1535 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1536 "mov %4, %%"REG_b" \n\t"
1537 "push %%"REG_BP" \n\t"
1538 YSCALEYUV2RGB1b(%%REGBP, %5)
1539 "pxor %%mm7, %%mm7 \n\t"
1540 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1542 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1543 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1544 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1546 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1547 "pop %%"REG_BP" \n\t"
1548 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1550 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1554 case PIX_FMT_RGB565:
1556 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1557 "mov %4, %%"REG_b" \n\t"
1558 "push %%"REG_BP" \n\t"
1559 YSCALEYUV2RGB1b(%%REGBP, %5)
1560 "pxor %%mm7, %%mm7 \n\t"
1561 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1563 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1564 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1565 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1568 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1569 "pop %%"REG_BP" \n\t"
1570 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1572 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1576 case PIX_FMT_YUYV422:
1578 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1579 "mov %4, %%"REG_b" \n\t"
1580 "push %%"REG_BP" \n\t"
1581 YSCALEYUV2PACKED1b(%%REGBP, %5)
1582 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1583 "pop %%"REG_BP" \n\t"
1584 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1586 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1593 #endif /* COMPILE_TEMPLATE_MMX */
1594 if (uvalpha < 2048) {
1595 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1597 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1601 //FIXME yuy2* can read up to 7 samples too many
1603 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1605 #if COMPILE_TEMPLATE_MMX
1607 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1608 "mov %0, %%"REG_a" \n\t"
1610 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1611 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1612 "pand %%mm2, %%mm0 \n\t"
1613 "pand %%mm2, %%mm1 \n\t"
1614 "packuswb %%mm1, %%mm0 \n\t"
1615 "movq %%mm0, (%2, %%"REG_a") \n\t"
1616 "add $8, %%"REG_a" \n\t"
1618 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1623 for (i=0; i<width; i++)
1628 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1630 #if COMPILE_TEMPLATE_MMX
1632 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1633 "mov %0, %%"REG_a" \n\t"
1635 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1636 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1637 "psrlw $8, %%mm0 \n\t"
1638 "psrlw $8, %%mm1 \n\t"
1639 "packuswb %%mm1, %%mm0 \n\t"
1640 "movq %%mm0, %%mm1 \n\t"
1641 "psrlw $8, %%mm0 \n\t"
1642 "pand %%mm4, %%mm1 \n\t"
1643 "packuswb %%mm0, %%mm0 \n\t"
1644 "packuswb %%mm1, %%mm1 \n\t"
1645 "movd %%mm0, (%3, %%"REG_a") \n\t"
1646 "movd %%mm1, (%2, %%"REG_a") \n\t"
1647 "add $4, %%"REG_a" \n\t"
1649 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1654 for (i=0; i<width; i++) {
1655 dstU[i]= src1[4*i + 1];
1656 dstV[i]= src1[4*i + 3];
1659 assert(src1 == src2);
1662 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1664 #if COMPILE_TEMPLATE_MMX
1666 "mov %0, %%"REG_a" \n\t"
1668 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1669 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1670 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1671 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1672 "psrlw $8, %%mm0 \n\t"
1673 "psrlw $8, %%mm1 \n\t"
1674 "psrlw $8, %%mm2 \n\t"
1675 "psrlw $8, %%mm3 \n\t"
1676 "packuswb %%mm1, %%mm0 \n\t"
1677 "packuswb %%mm3, %%mm2 \n\t"
1678 "movq %%mm0, (%3, %%"REG_a") \n\t"
1679 "movq %%mm2, (%4, %%"REG_a") \n\t"
1680 "add $8, %%"REG_a" \n\t"
1682 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1687 for (i=0; i<width; i++) {
1688 dstU[i]= src1[2*i + 1];
1689 dstV[i]= src2[2*i + 1];
1694 /* This is almost identical to the previous, and exists only because
1695 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1696 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1698 #if COMPILE_TEMPLATE_MMX
1700 "mov %0, %%"REG_a" \n\t"
1702 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1703 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1704 "psrlw $8, %%mm0 \n\t"
1705 "psrlw $8, %%mm1 \n\t"
1706 "packuswb %%mm1, %%mm0 \n\t"
1707 "movq %%mm0, (%2, %%"REG_a") \n\t"
1708 "add $8, %%"REG_a" \n\t"
1710 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1715 for (i=0; i<width; i++)
1720 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1722 #if COMPILE_TEMPLATE_MMX
1724 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1725 "mov %0, %%"REG_a" \n\t"
1727 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1728 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1729 "pand %%mm4, %%mm0 \n\t"
1730 "pand %%mm4, %%mm1 \n\t"
1731 "packuswb %%mm1, %%mm0 \n\t"
1732 "movq %%mm0, %%mm1 \n\t"
1733 "psrlw $8, %%mm0 \n\t"
1734 "pand %%mm4, %%mm1 \n\t"
1735 "packuswb %%mm0, %%mm0 \n\t"
1736 "packuswb %%mm1, %%mm1 \n\t"
1737 "movd %%mm0, (%3, %%"REG_a") \n\t"
1738 "movd %%mm1, (%2, %%"REG_a") \n\t"
1739 "add $4, %%"REG_a" \n\t"
1741 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1746 for (i=0; i<width; i++) {
1747 dstU[i]= src1[4*i + 0];
1748 dstV[i]= src1[4*i + 2];
1751 assert(src1 == src2);
1754 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1756 #if COMPILE_TEMPLATE_MMX
1758 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1759 "mov %0, %%"REG_a" \n\t"
1761 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1762 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1763 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1764 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1765 "pand %%mm4, %%mm0 \n\t"
1766 "pand %%mm4, %%mm1 \n\t"
1767 "pand %%mm4, %%mm2 \n\t"
1768 "pand %%mm4, %%mm3 \n\t"
1769 "packuswb %%mm1, %%mm0 \n\t"
1770 "packuswb %%mm3, %%mm2 \n\t"
1771 "movq %%mm0, (%3, %%"REG_a") \n\t"
1772 "movq %%mm2, (%4, %%"REG_a") \n\t"
1773 "add $8, %%"REG_a" \n\t"
1775 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1780 for (i=0; i<width; i++) {
1787 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1788 const uint8_t *src, long width)
1790 #if COMPILE_TEMPLATE_MMX
1792 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1793 "mov %0, %%"REG_a" \n\t"
1795 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1796 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1797 "movq %%mm0, %%mm2 \n\t"
1798 "movq %%mm1, %%mm3 \n\t"
1799 "pand %%mm4, %%mm0 \n\t"
1800 "pand %%mm4, %%mm1 \n\t"
1801 "psrlw $8, %%mm2 \n\t"
1802 "psrlw $8, %%mm3 \n\t"
1803 "packuswb %%mm1, %%mm0 \n\t"
1804 "packuswb %%mm3, %%mm2 \n\t"
1805 "movq %%mm0, (%2, %%"REG_a") \n\t"
1806 "movq %%mm2, (%3, %%"REG_a") \n\t"
1807 "add $8, %%"REG_a" \n\t"
1809 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1814 for (i = 0; i < width; i++) {
1815 dst1[i] = src[2*i+0];
1816 dst2[i] = src[2*i+1];
1821 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1822 const uint8_t *src1, const uint8_t *src2,
1823 long width, uint32_t *unused)
1825 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1828 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1829 const uint8_t *src1, const uint8_t *src2,
1830 long width, uint32_t *unused)
1832 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1835 #if COMPILE_TEMPLATE_MMX
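/* Descriptive note on the loop below: packed 24-bit BGR/RGB to luma, 4 pixels
 * per iteration.  Two pairs of overlapping movd loads (offsets 0/2 and 6/8)
 * cover 12 input bytes, punpcklbw widens the bytes to words, pmaddwd applies
 * the per-format Y coefficient pairs (mm5/mm6 selected above), the dword sums
 * get the rounding offset in mm4 added, are shifted right by 15 and packed
 * down to 4 luma bytes per store. */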
1836 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1839 if(srcFormat == PIX_FMT_BGR24) {
1841 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1842 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1847 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1848 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1854 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1855 "mov %2, %%"REG_a" \n\t"
1856 "pxor %%mm7, %%mm7 \n\t"
1858 PREFETCH" 64(%0) \n\t"
1859 "movd (%0), %%mm0 \n\t"
1860 "movd 2(%0), %%mm1 \n\t"
1861 "movd 6(%0), %%mm2 \n\t"
1862 "movd 8(%0), %%mm3 \n\t"
1864 "punpcklbw %%mm7, %%mm0 \n\t"
1865 "punpcklbw %%mm7, %%mm1 \n\t"
1866 "punpcklbw %%mm7, %%mm2 \n\t"
1867 "punpcklbw %%mm7, %%mm3 \n\t"
1868 "pmaddwd %%mm5, %%mm0 \n\t"
1869 "pmaddwd %%mm6, %%mm1 \n\t"
1870 "pmaddwd %%mm5, %%mm2 \n\t"
1871 "pmaddwd %%mm6, %%mm3 \n\t"
1872 "paddd %%mm1, %%mm0 \n\t"
1873 "paddd %%mm3, %%mm2 \n\t"
1874 "paddd %%mm4, %%mm0 \n\t"
1875 "paddd %%mm4, %%mm2 \n\t"
1876 "psrad $15, %%mm0 \n\t"
1877 "psrad $15, %%mm2 \n\t"
1878 "packssdw %%mm2, %%mm0 \n\t"
1879 "packuswb %%mm0, %%mm0 \n\t"
1880 "movd %%mm0, (%1, %%"REG_a") \n\t"
1881 "add $4, %%"REG_a" \n\t"
1884 : "r" (dst+width), "g" ((x86_reg)-width)
1889 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1892 "movq 24+%4, %%mm6 \n\t"
1893 "mov %3, %%"REG_a" \n\t"
1894 "pxor %%mm7, %%mm7 \n\t"
1896 PREFETCH" 64(%0) \n\t"
1897 "movd (%0), %%mm0 \n\t"
1898 "movd 2(%0), %%mm1 \n\t"
1899 "punpcklbw %%mm7, %%mm0 \n\t"
1900 "punpcklbw %%mm7, %%mm1 \n\t"
1901 "movq %%mm0, %%mm2 \n\t"
1902 "movq %%mm1, %%mm3 \n\t"
1903 "pmaddwd %4, %%mm0 \n\t"
1904 "pmaddwd 8+%4, %%mm1 \n\t"
1905 "pmaddwd 16+%4, %%mm2 \n\t"
1906 "pmaddwd %%mm6, %%mm3 \n\t"
1907 "paddd %%mm1, %%mm0 \n\t"
1908 "paddd %%mm3, %%mm2 \n\t"
1910 "movd 6(%0), %%mm1 \n\t"
1911 "movd 8(%0), %%mm3 \n\t"
1913 "punpcklbw %%mm7, %%mm1 \n\t"
1914 "punpcklbw %%mm7, %%mm3 \n\t"
1915 "movq %%mm1, %%mm4 \n\t"
1916 "movq %%mm3, %%mm5 \n\t"
1917 "pmaddwd %4, %%mm1 \n\t"
1918 "pmaddwd 8+%4, %%mm3 \n\t"
1919 "pmaddwd 16+%4, %%mm4 \n\t"
1920 "pmaddwd %%mm6, %%mm5 \n\t"
1921 "paddd %%mm3, %%mm1 \n\t"
1922 "paddd %%mm5, %%mm4 \n\t"
1924 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1925 "paddd %%mm3, %%mm0 \n\t"
1926 "paddd %%mm3, %%mm2 \n\t"
1927 "paddd %%mm3, %%mm1 \n\t"
1928 "paddd %%mm3, %%mm4 \n\t"
1929 "psrad $15, %%mm0 \n\t"
1930 "psrad $15, %%mm2 \n\t"
1931 "psrad $15, %%mm1 \n\t"
1932 "psrad $15, %%mm4 \n\t"
1933 "packssdw %%mm1, %%mm0 \n\t"
1934 "packssdw %%mm4, %%mm2 \n\t"
1935 "packuswb %%mm0, %%mm0 \n\t"
1936 "packuswb %%mm2, %%mm2 \n\t"
1937 "movd %%mm0, (%1, %%"REG_a") \n\t"
1938 "movd %%mm2, (%2, %%"REG_a") \n\t"
1939 "add $4, %%"REG_a" \n\t"
1942 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1948 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1950 #if COMPILE_TEMPLATE_MMX
1951 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1954 for (i=0; i<width; i++) {
1959 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1961 #endif /* COMPILE_TEMPLATE_MMX */
1964 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1966 #if COMPILE_TEMPLATE_MMX
1967 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1970 for (i=0; i<width; i++) {
1971 int b= src1[3*i + 0];
1972 int g= src1[3*i + 1];
1973 int r= src1[3*i + 2];
1975 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1976 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1978 #endif /* COMPILE_TEMPLATE_MMX */
1979 assert(src1 == src2);
1982 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1985 for (i=0; i<width; i++) {
1986 int b= src1[6*i + 0] + src1[6*i + 3];
1987 int g= src1[6*i + 1] + src1[6*i + 4];
1988 int r= src1[6*i + 2] + src1[6*i + 5];
1990 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1991 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1993 assert(src1 == src2);
1996 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1998 #if COMPILE_TEMPLATE_MMX
1999 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2002 for (i=0; i<width; i++) {
2007 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2012 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2014 #if COMPILE_TEMPLATE_MMX
2016 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2020 for (i=0; i<width; i++) {
2021 int r= src1[3*i + 0];
2022 int g= src1[3*i + 1];
2023 int b= src1[3*i + 2];
2025 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2026 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2031 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2035 for (i=0; i<width; i++) {
2036 int r= src1[6*i + 0] + src1[6*i + 3];
2037 int g= src1[6*i + 1] + src1[6*i + 4];
2038 int b= src1[6*i + 2] + src1[6*i + 5];
2040 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2041 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2046 // bilinear / bicubic scaling
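/* Horizontal scaling with an arbitrary FIR filter.  For each output sample i
 * the reference C loop further down computes
 *     val    = sum_j src[filterPos[i] + j] * filter[filterSize*i + j];
 *     dst[i] = FFMIN(val >> 7, (1<<15)-1);
 * The MMX paths below implement the same sum, specialised and unrolled for
 * filterSize 4, filterSize 8 and the generic case. */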
2047 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2048 const int16_t *filter, const int16_t *filterPos, long filterSize)
2050 #if COMPILE_TEMPLATE_MMX
2051 assert(filterSize % 4 == 0 && filterSize>0);
2052 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
2053 x86_reg counter= -2*dstW;
2055 filterPos-= counter/2;
2059 "push %%"REG_b" \n\t"
2061 "pxor %%mm7, %%mm7 \n\t"
2062 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2063 "mov %%"REG_a", %%"REG_BP" \n\t"
2066 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2067 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2068 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2069 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2070 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2071 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2072 "punpcklbw %%mm7, %%mm0 \n\t"
2073 "punpcklbw %%mm7, %%mm2 \n\t"
2074 "pmaddwd %%mm1, %%mm0 \n\t"
2075 "pmaddwd %%mm2, %%mm3 \n\t"
2076 "movq %%mm0, %%mm4 \n\t"
2077 "punpckldq %%mm3, %%mm0 \n\t"
2078 "punpckhdq %%mm3, %%mm4 \n\t"
2079 "paddd %%mm4, %%mm0 \n\t"
2080 "psrad $7, %%mm0 \n\t"
2081 "packssdw %%mm0, %%mm0 \n\t"
2082 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2083 "add $4, %%"REG_BP" \n\t"
2086 "pop %%"REG_BP" \n\t"
2088 "pop %%"REG_b" \n\t"
2091 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2096 } else if (filterSize==8) {
2097 x86_reg counter= -2*dstW;
2099 filterPos-= counter/2;
2103 "push %%"REG_b" \n\t"
2105 "pxor %%mm7, %%mm7 \n\t"
2106 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2107 "mov %%"REG_a", %%"REG_BP" \n\t"
2110 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2111 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2112 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2113 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2114 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2115 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2116 "punpcklbw %%mm7, %%mm0 \n\t"
2117 "punpcklbw %%mm7, %%mm2 \n\t"
2118 "pmaddwd %%mm1, %%mm0 \n\t"
2119 "pmaddwd %%mm2, %%mm3 \n\t"
2121 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2122 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2123 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2124 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2125 "punpcklbw %%mm7, %%mm4 \n\t"
2126 "punpcklbw %%mm7, %%mm2 \n\t"
2127 "pmaddwd %%mm1, %%mm4 \n\t"
2128 "pmaddwd %%mm2, %%mm5 \n\t"
2129 "paddd %%mm4, %%mm0 \n\t"
2130 "paddd %%mm5, %%mm3 \n\t"
2131 "movq %%mm0, %%mm4 \n\t"
2132 "punpckldq %%mm3, %%mm0 \n\t"
2133 "punpckhdq %%mm3, %%mm4 \n\t"
2134 "paddd %%mm4, %%mm0 \n\t"
2135 "psrad $7, %%mm0 \n\t"
2136 "packssdw %%mm0, %%mm0 \n\t"
2137 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2138 "add $4, %%"REG_BP" \n\t"
2141 "pop %%"REG_BP" \n\t"
2143 "pop %%"REG_b" \n\t"
2146 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2152 uint8_t *offset = src+filterSize;
2153 x86_reg counter= -2*dstW;
2154 //filter-= counter*filterSize/2;
2155 filterPos-= counter/2;
2158 "pxor %%mm7, %%mm7 \n\t"
2161 "mov %2, %%"REG_c" \n\t"
2162 "movzwl (%%"REG_c", %0), %%eax \n\t"
2163 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2164 "mov %5, %%"REG_c" \n\t"
2165 "pxor %%mm4, %%mm4 \n\t"
2166 "pxor %%mm5, %%mm5 \n\t"
2168 "movq (%1), %%mm1 \n\t"
2169 "movq (%1, %6), %%mm3 \n\t"
2170 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2171 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2172 "punpcklbw %%mm7, %%mm0 \n\t"
2173 "punpcklbw %%mm7, %%mm2 \n\t"
2174 "pmaddwd %%mm1, %%mm0 \n\t"
2175 "pmaddwd %%mm2, %%mm3 \n\t"
2176 "paddd %%mm3, %%mm5 \n\t"
2177 "paddd %%mm0, %%mm4 \n\t"
2179 "add $4, %%"REG_c" \n\t"
2180 "cmp %4, %%"REG_c" \n\t"
2183 "movq %%mm4, %%mm0 \n\t"
2184 "punpckldq %%mm5, %%mm4 \n\t"
2185 "punpckhdq %%mm5, %%mm0 \n\t"
2186 "paddd %%mm0, %%mm4 \n\t"
2187 "psrad $7, %%mm4 \n\t"
2188 "packssdw %%mm4, %%mm4 \n\t"
2189 "mov %3, %%"REG_a" \n\t"
2190 "movd %%mm4, (%%"REG_a", %0) \n\t"
2194 : "+r" (counter), "+r" (filter)
2195 : "m" (filterPos), "m" (dst), "m"(offset),
2196 "m" (src), "r" ((x86_reg)filterSize*2)
2197 : "%"REG_a, "%"REG_c, "%"REG_d
2201 #if COMPILE_TEMPLATE_ALTIVEC
2202 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2205 for (i=0; i<dstW; i++) {
2207 int srcPos= filterPos[i];
2209 //printf("filterPos: %d\n", filterPos[i]);
2210 for (j=0; j<filterSize; j++) {
2211 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2212 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2214 //filter += hFilterSize;
2215 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2218 #endif /* COMPILE_TEMPLATE_ALTIVEC */
2219 #endif /* COMPILE_TEMPLATE_MMX */
2222 //FIXME all pal and rgb srcFormats could do this conversion as well
2223 //FIXME all scalers more complex than bilinear could do half of this transform
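/* The constants below are fixed-point approximations of the usual
 * limited <-> full range affine transforms, applied to the ~15-bit
 * intermediate samples (8-bit values scaled by 1<<7), e.g. for luma
 *     Y_full = (Y_lim - 16) * 255/219   (and the inverse for FromJpeg)
 * and for chroma
 *     C_full = (C_lim - 128) * 255/224 + 128.
 * For instance 4663/4096 ~ 255/224 and 19077/16384 ~ 255/219; the additive
 * constants fold in the offsets together with a rounding bias. */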
2224 static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2227 for (i = 0; i < width; i++) {
2228 dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2229 dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2232 static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2235 for (i = 0; i < width; i++) {
2236 dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
2237 dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2240 static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2243 for (i = 0; i < width; i++)
2244 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2246 static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2249 for (i = 0; i < width; i++)
2250 dst[i] = (dst[i]*14071 + 33561947)>>14;
2253 #define FAST_BILINEAR_X86 \
2254 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2255 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2256 "shll $16, %%edi \n\t" \
2257 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2258 "mov %1, %%"REG_D"\n\t" \
2259 "shrl $9, %%esi \n\t" \
2261 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2262 long dstWidth, const uint8_t *src, int srcW,
2265 #if ARCH_X86 && CONFIG_GPL
2266 #if COMPILE_TEMPLATE_MMX2
2267 int32_t *mmx2FilterPos = c->lumMmx2FilterPos;
2268 int16_t *mmx2Filter = c->lumMmx2Filter;
2269 int canMMX2BeUsed = c->canMMX2BeUsed;
2270 void *mmx2FilterCode= c->lumMmx2FilterCode;
2273 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2275 if (canMMX2BeUsed) {
2278 "mov %%"REG_b", %5 \n\t"
2280 "pxor %%mm7, %%mm7 \n\t"
2281 "mov %0, %%"REG_c" \n\t"
2282 "mov %1, %%"REG_D" \n\t"
2283 "mov %2, %%"REG_d" \n\t"
2284 "mov %3, %%"REG_b" \n\t"
2285 "xor %%"REG_a", %%"REG_a" \n\t" // i
2286 PREFETCH" (%%"REG_c") \n\t"
2287 PREFETCH" 32(%%"REG_c") \n\t"
2288 PREFETCH" 64(%%"REG_c") \n\t"
2292 #define CALL_MMX2_FILTER_CODE \
2293 "movl (%%"REG_b"), %%esi \n\t"\
2295 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2296 "add %%"REG_S", %%"REG_c" \n\t"\
2297 "add %%"REG_a", %%"REG_D" \n\t"\
2298 "xor %%"REG_a", %%"REG_a" \n\t"\
2302 #define CALL_MMX2_FILTER_CODE \
2303 "movl (%%"REG_b"), %%esi \n\t"\
2305 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2306 "add %%"REG_a", %%"REG_D" \n\t"\
2307 "xor %%"REG_a", %%"REG_a" \n\t"\
2309 #endif /* ARCH_X86_64 */
2311 CALL_MMX2_FILTER_CODE
2312 CALL_MMX2_FILTER_CODE
2313 CALL_MMX2_FILTER_CODE
2314 CALL_MMX2_FILTER_CODE
2315 CALL_MMX2_FILTER_CODE
2316 CALL_MMX2_FILTER_CODE
2317 CALL_MMX2_FILTER_CODE
2318 CALL_MMX2_FILTER_CODE
2321 "mov %5, %%"REG_b" \n\t"
2323 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2324 "m" (mmx2FilterCode)
2328 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2333 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2335 #endif /* COMPILE_TEMPLATE_MMX2 */
2336 x86_reg xInc_shr16 = xInc >> 16;
2337 uint16_t xInc_mask = xInc & 0xffff;
2338 //NO MMX just normal asm ...
2340 "xor %%"REG_a", %%"REG_a" \n\t" // i
2341 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2342 "xorl %%ecx, %%ecx \n\t" // xalpha
2345 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2346 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2348 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2349 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2350 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2352 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2353 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2355 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2356 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2357 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2360 "add $2, %%"REG_a" \n\t"
2361 "cmp %2, %%"REG_a" \n\t"
2365 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2366 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2368 #if COMPILE_TEMPLATE_MMX2
2369 } //if MMX2 can't be used
2373 unsigned int xpos=0;
2374 for (i=0;i<dstWidth;i++) {
2375 register unsigned int xx=xpos>>16;
2376 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2377 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2380 #endif /* ARCH_X86 */
2383 // *** horizontal scale Y line to temp buffer
2384 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2385 int flags, const int16_t *hLumFilter,
2386 const int16_t *hLumFilterPos, int hLumFilterSize,
2387 enum PixelFormat srcFormat, uint8_t *formatConvBuffer,
2388 uint32_t *pal, int isAlpha)
2390 void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2391 void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2393 src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2396 toYV12(formatConvBuffer, src, srcW, pal);
2397 src= formatConvBuffer;
2400 if (!c->hyscale_fast)
2402 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2403 } else { // fast bilinear upscale / crap downscale
2404 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2408 convertRange(dst, dstWidth);
2411 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2412 long dstWidth, const uint8_t *src1,
2413 const uint8_t *src2, int srcW, int xInc)
2415 #if ARCH_X86 && CONFIG_GPL
2416 #if COMPILE_TEMPLATE_MMX2
2417 int32_t *mmx2FilterPos = c->chrMmx2FilterPos;
2418 int16_t *mmx2Filter = c->chrMmx2Filter;
2419 int canMMX2BeUsed = c->canMMX2BeUsed;
2420 void *mmx2FilterCode= c->chrMmx2FilterCode;
2423 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2425 if (canMMX2BeUsed) {
2428 "mov %%"REG_b", %6 \n\t"
2430 "pxor %%mm7, %%mm7 \n\t"
2431 "mov %0, %%"REG_c" \n\t"
2432 "mov %1, %%"REG_D" \n\t"
2433 "mov %2, %%"REG_d" \n\t"
2434 "mov %3, %%"REG_b" \n\t"
2435 "xor %%"REG_a", %%"REG_a" \n\t" // i
2436 PREFETCH" (%%"REG_c") \n\t"
2437 PREFETCH" 32(%%"REG_c") \n\t"
2438 PREFETCH" 64(%%"REG_c") \n\t"
2440 CALL_MMX2_FILTER_CODE
2441 CALL_MMX2_FILTER_CODE
2442 CALL_MMX2_FILTER_CODE
2443 CALL_MMX2_FILTER_CODE
2444 "xor %%"REG_a", %%"REG_a" \n\t" // i
2445 "mov %5, %%"REG_c" \n\t" // src
2446 "mov %1, %%"REG_D" \n\t" // buf1
2447 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2448 PREFETCH" (%%"REG_c") \n\t"
2449 PREFETCH" 32(%%"REG_c") \n\t"
2450 PREFETCH" 64(%%"REG_c") \n\t"
2452 CALL_MMX2_FILTER_CODE
2453 CALL_MMX2_FILTER_CODE
2454 CALL_MMX2_FILTER_CODE
2455 CALL_MMX2_FILTER_CODE
2458 "mov %6, %%"REG_b" \n\t"
2460 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2461 "m" (mmx2FilterCode), "m" (src2)
2465 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2470 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2471 //printf("%d %d %d\n", dstWidth, i, srcW);
2472 dst[i] = src1[srcW-1]*128;
2473 dst[i+VOFW] = src2[srcW-1]*128;
2476 #endif /* COMPILE_TEMPLATE_MMX2 */
2477 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2478 uint16_t xInc_mask = xInc & 0xffff;
2480 "xor %%"REG_a", %%"REG_a" \n\t" // i
2481 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2482 "xorl %%ecx, %%ecx \n\t" // xalpha
2485 "mov %0, %%"REG_S" \n\t"
2486 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2487 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2489 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2491 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2492 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2494 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2496 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2497 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2498 "add $1, %%"REG_a" \n\t"
2499 "cmp %2, %%"REG_a" \n\t"
2502 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2503 which is needed to support GCC 4.0. */
2504 #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2505 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2507 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2510 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2512 #if COMPILE_TEMPLATE_MMX2
2513 } //if MMX2 can't be used
2517 unsigned int xpos=0;
2518 for (i=0;i<dstWidth;i++) {
2519 register unsigned int xx=xpos>>16;
2520 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2521 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2522 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2524 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2525 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2529 #endif /* ARCH_X86 */
2532 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2533 int srcW, int xInc, int flags, const int16_t *hChrFilter,
2534 const int16_t *hChrFilterPos, int hChrFilterSize,
2535 enum PixelFormat srcFormat, uint8_t *formatConvBuffer,
2539 src1 += c->chrSrcOffset;
2540 src2 += c->chrSrcOffset;
2543 c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2544 src1= formatConvBuffer;
2545 src2= formatConvBuffer+VOFW;
2548 if (!c->hcscale_fast)
2550 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2551 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2552 } else { // fast bilinear upscale / crap downscale
2553 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2556 if (c->chrConvertRange)
2557 c->chrConvertRange(dst, dstWidth);
2560 #define DEBUG_SWSCALE_BUFFERS 0
2561 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
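/* Rough outline of the main loop below: for every output line, horizontally
 * scale the source lines that the vertical filter needs into the lumPixBuf /
 * chrPixBuf (and optionally alpPixBuf) ring buffers via hyscale()/hcscale(),
 * then vertically filter those buffered lines and write them out in the
 * destination format (planar YUV, NV12/NV21 or packed RGB paths). */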
2563 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2564 int srcSliceH, uint8_t* dst[], int dstStride[])
2566 /* load a few things into local vars to make the code more readable and faster */
2567 const int srcW= c->srcW;
2568 const int dstW= c->dstW;
2569 const int dstH= c->dstH;
2570 const int chrDstW= c->chrDstW;
2571 const int chrSrcW= c->chrSrcW;
2572 const int lumXInc= c->lumXInc;
2573 const int chrXInc= c->chrXInc;
2574 const enum PixelFormat dstFormat= c->dstFormat;
2575 const enum PixelFormat srcFormat= c->srcFormat;
2576 const int flags= c->flags;
2577 int16_t *vLumFilterPos= c->vLumFilterPos;
2578 int16_t *vChrFilterPos= c->vChrFilterPos;
2579 int16_t *hLumFilterPos= c->hLumFilterPos;
2580 int16_t *hChrFilterPos= c->hChrFilterPos;
2581 int16_t *vLumFilter= c->vLumFilter;
2582 int16_t *vChrFilter= c->vChrFilter;
2583 int16_t *hLumFilter= c->hLumFilter;
2584 int16_t *hChrFilter= c->hChrFilter;
2585 int32_t *lumMmxFilter= c->lumMmxFilter;
2586 int32_t *chrMmxFilter= c->chrMmxFilter;
2587 int32_t *alpMmxFilter= c->alpMmxFilter;
2588 const int vLumFilterSize= c->vLumFilterSize;
2589 const int vChrFilterSize= c->vChrFilterSize;
2590 const int hLumFilterSize= c->hLumFilterSize;
2591 const int hChrFilterSize= c->hChrFilterSize;
2592 int16_t **lumPixBuf= c->lumPixBuf;
2593 int16_t **chrPixBuf= c->chrPixBuf;
2594 int16_t **alpPixBuf= c->alpPixBuf;
2595 const int vLumBufSize= c->vLumBufSize;
2596 const int vChrBufSize= c->vChrBufSize;
2597 uint8_t *formatConvBuffer= c->formatConvBuffer;
2598 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2599 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2601 uint32_t *pal=c->pal_yuv;
2603 /* vars which will change and which we need to store back in the context */
2605 int lumBufIndex= c->lumBufIndex;
2606 int chrBufIndex= c->chrBufIndex;
2607 int lastInLumBuf= c->lastInLumBuf;
2608 int lastInChrBuf= c->lastInChrBuf;
2610 if (isPacked(c->srcFormat)) {
2618 srcStride[3]= srcStride[0];
2620 srcStride[1]<<= c->vChrDrop;
2621 srcStride[2]<<= c->vChrDrop;
2623 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2624 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2625 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2626 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2627 srcSliceY, srcSliceH, dstY, dstH);
2628 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2629 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
2631 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2632 static int warnedAlready=0; //FIXME move this into the context perhaps
2633 if (flags & SWS_PRINT_INFO && !warnedAlready) {
2634 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2635 " ->cannot do aligned memory accesses anymore\n");
2640 /* Note the user might start scaling the picture in the middle, so this
2641 will not get executed. This is not really intended but it works
2642 currently, so people might do it. */
2643 if (srcSliceY ==0) {
2653 for (;dstY < dstH; dstY++) {
2654 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2655 const int chrDstY= dstY>>c->chrDstVSubSample;
2656 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2657 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2658 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2660 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2661 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2662 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2663 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2666 //handle holes (FAST_BILINEAR & weird filters)
2667 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2668 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2669 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2670 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2672 // Do we have enough lines in this slice to output the dstY line?
2673 enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2674 if (!enough_lines) {
2675 lastLumSrcY = srcSliceY + srcSliceH - 1;
2676 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2679 DEBUG_BUFFERS("dstY: %d\n", dstY);
2680 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2681 firstLumSrcY, lastLumSrcY, lastInLumBuf);
2682 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2683 firstChrSrcY, lastChrSrcY, lastInChrBuf);
2685 //Do horizontal scaling
2686 while(lastInLumBuf < lastLumSrcY) {
2687 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2688 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2690 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2691 lumBufIndex, lastInLumBuf);
2692 assert(lumBufIndex < 2*vLumBufSize);
2693 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2694 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2695 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2696 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2697 c->srcFormat, formatConvBuffer,
2699 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2700 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2701 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2702 c->srcFormat, formatConvBuffer,
2706 while(lastInChrBuf < lastChrSrcY) {
2707 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2708 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2710 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2711 chrBufIndex, lastInChrBuf);
2712 assert(chrBufIndex < 2*vChrBufSize);
2713 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2714 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2715 //FIXME pass parameters through the context struct (at least some of them)
2717 if (c->needs_hcscale)
2718 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2719 flags, hChrFilter, hChrFilterPos, hChrFilterSize,
2720 c->srcFormat, formatConvBuffer,
2724 //wrap buf index around to stay inside the ring buffer
2725 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2726 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2728 break; //we can't output a dstY line so let's try with the next slice
2730 #if COMPILE_TEMPLATE_MMX
2731 c->blueDither= ff_dither8[dstY&1];
2732 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2733 c->greenDither= ff_dither8[dstY&1];
2735 c->greenDither= ff_dither4[dstY&1];
2736 c->redDither= ff_dither8[(dstY+1)&1];
2738 if (dstY < dstH-2) {
2739 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2740 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2741 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2742 #if COMPILE_TEMPLATE_MMX
2744 if (flags & SWS_ACCURATE_RND) {
2745 int s= APCK_SIZE / 8;
2746 for (i=0; i<vLumFilterSize; i+=2) {
2747 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2748 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2749 lumMmxFilter[s*i+APCK_COEF/4 ]=
2750 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2751 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2752 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2753 *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2754 *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2755 alpMmxFilter[s*i+APCK_COEF/4 ]=
2756 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2759 for (i=0; i<vChrFilterSize; i+=2) {
2760 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2761 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2762 chrMmxFilter[s*i+APCK_COEF/4 ]=
2763 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2764 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2767 for (i=0; i<vLumFilterSize; i++) {
2768 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2769 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2770 lumMmxFilter[4*i+2]=
2771 lumMmxFilter[4*i+3]=
2772 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2773 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2774 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2775 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2776 alpMmxFilter[4*i+2]=
2777 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2780 for (i=0; i<vChrFilterSize; i++) {
2781 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2782 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2783 chrMmxFilter[4*i+2]=
2784 chrMmxFilter[4*i+3]=
2785 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2789 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2790 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2791 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2793 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2794 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2795 dest, uDest, dstW, chrDstW, dstFormat);
2796 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2797 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2798 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2799 if (is16BPS(dstFormat)) {
2801 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2802 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2803 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2805 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2806 int16_t *lumBuf = lumSrcPtr[0];
2807 int16_t *chrBuf= chrSrcPtr[0];
2808 int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2809 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2810 } else { //General YV12
2812 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2813 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2814 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2817 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2818 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2819 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2820 int chrAlpha= vChrFilter[2*dstY+1];
2821 if(flags & SWS_FULL_CHR_H_INT) {
2822 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2823 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2824 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2825 alpSrcPtr, dest, dstW, dstY);
2827 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2828 alpPixBuf ? *alpSrcPtr : NULL,
2829 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2831 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2832 int lumAlpha= vLumFilter[2*dstY+1];
2833 int chrAlpha= vChrFilter[2*dstY+1];
2835 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2837 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2838 if(flags & SWS_FULL_CHR_H_INT) {
2839 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2840 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2841 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2842 alpSrcPtr, dest, dstW, dstY);
2844 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2845 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2846 dest, dstW, lumAlpha, chrAlpha, dstY);
2848 } else { //general RGB
2849 if(flags & SWS_FULL_CHR_H_INT) {
2851 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2852 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2853 alpSrcPtr, dest, dstW, dstY);
2856 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2857 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2858 alpSrcPtr, dest, dstW, dstY);
2862 } else { // hmm looks like we can't use MMX here without overwriting this array's tail
2863 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2864 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2865 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2866 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2867 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2868 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2870 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2871 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2872 dest, uDest, dstW, chrDstW, dstFormat);
2873 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2874 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2875 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2876 if (is16BPS(dstFormat)) {
2878 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2879 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2880 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2884 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2885 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2886 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2889 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2890 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2891 if(flags & SWS_FULL_CHR_H_INT) {
2893 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2894 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2895 alpSrcPtr, dest, dstW, dstY);
2898 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2899 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2900 alpSrcPtr, dest, dstW, dstY);
2906 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2907 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2909 #if COMPILE_TEMPLATE_MMX
2910 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
2911 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2912 if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
2913 else __asm__ volatile("emms" :::"memory");
2915 /* store changed local vars back in the context */
2917 c->lumBufIndex= lumBufIndex;
2918 c->chrBufIndex= chrBufIndex;
2919 c->lastInLumBuf= lastInLumBuf;
2920 c->lastInChrBuf= lastInChrBuf;
2922 return dstY - lastDstY;
2925 static void RENAME(sws_init_swScale)(SwsContext *c)
2927 enum PixelFormat srcFormat = c->srcFormat;
2929 c->yuv2nv12X = RENAME(yuv2nv12X );
2930 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2931 c->yuv2yuvX = RENAME(yuv2yuvX );
2932 c->yuv2packed1 = RENAME(yuv2packed1 );
2933 c->yuv2packed2 = RENAME(yuv2packed2 );
2934 c->yuv2packedX = RENAME(yuv2packedX );
2936 c->hScale = RENAME(hScale );
2938 #if COMPILE_TEMPLATE_MMX
2939 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2940 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2942 if (c->flags & SWS_FAST_BILINEAR)
2945 c->hyscale_fast = RENAME(hyscale_fast);
2946 c->hcscale_fast = RENAME(hcscale_fast);
2949 c->chrToYV12 = NULL;
2951 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
2952 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
2953 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
2954 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
2958 case PIX_FMT_BGR4_BYTE:
2959 case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
2960 case PIX_FMT_YUV420P16BE:
2961 case PIX_FMT_YUV422P16BE:
2962 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2963 case PIX_FMT_YUV420P16LE:
2964 case PIX_FMT_YUV422P16LE:
2965 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2967 if (c->chrSrcHSubSample) {
2969 case PIX_FMT_RGB48BE:
2970 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
2971 case PIX_FMT_RGB32 :
2972 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
2973 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
2974 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
2975 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
2976 case PIX_FMT_BGR32 :
2977 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
2978 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
2979 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
2980 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
2984 case PIX_FMT_RGB48BE:
2985 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
2986 case PIX_FMT_RGB32 :
2987 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
2988 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
2989 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
2990 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
2991 case PIX_FMT_BGR32 :
2992 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
2993 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
2994 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
2995 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
2999 c->lumToYV12 = NULL;
3000 c->alpToYV12 = NULL;
3001 switch (srcFormat) {
3002 case PIX_FMT_YUYV422 :
3003 case PIX_FMT_YUV420P16BE:
3004 case PIX_FMT_YUV422P16BE:
3005 case PIX_FMT_YUV444P16BE:
3006 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
3007 case PIX_FMT_UYVY422 :
3008 case PIX_FMT_YUV420P16LE:
3009 case PIX_FMT_YUV422P16LE:
3010 case PIX_FMT_YUV444P16LE:
3011 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
3012 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
3013 case PIX_FMT_BGR565 : c->lumToYV12 = bgr16ToY; break;
3014 case PIX_FMT_BGR555 : c->lumToYV12 = bgr15ToY; break;
3015 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
3016 case PIX_FMT_RGB565 : c->lumToYV12 = rgb16ToY; break;
3017 case PIX_FMT_RGB555 : c->lumToYV12 = rgb15ToY; break;
3021 case PIX_FMT_BGR4_BYTE:
3022 case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
3023 case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
3024 case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
3025 case PIX_FMT_RGB32 :
3026 case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
3027 case PIX_FMT_BGR32 :
3028 case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
3029 case PIX_FMT_RGB48BE:
3030 case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
3033 switch (srcFormat) {
3034 case PIX_FMT_RGB32 :
3035 case PIX_FMT_RGB32_1:
3036 case PIX_FMT_BGR32 :
3037 case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
3041 switch (srcFormat) {
3042 case PIX_FMT_RGB32 :
3043 case PIX_FMT_BGR32 :
3044 c->alpSrcOffset = 3;
3046 case PIX_FMT_RGB32_1:
3047 case PIX_FMT_BGR32_1:
3048 c->lumSrcOffset = ALT32_CORR;
3049 c->chrSrcOffset = ALT32_CORR;
3051 case PIX_FMT_RGB48LE:
3052 c->lumSrcOffset = 1;
3053 c->chrSrcOffset = 1;
3054 c->alpSrcOffset = 1;
3058 if (c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
3060 c->lumConvertRange = RENAME(lumRangeFromJpeg);
3061 c->chrConvertRange = RENAME(chrRangeFromJpeg);
3063 c->lumConvertRange = RENAME(lumRangeToJpeg);
3064 c->chrConvertRange = RENAME(chrRangeToJpeg);
3068 if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
3069 srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
3070 c->needs_hcscale = 1;