2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
29 #if COMPILE_TEMPLATE_AMD3DNOW
30 #define PREFETCH "prefetch"
31 #elif COMPILE_TEMPLATE_MMX2
32 #define PREFETCH "prefetchnta"
34 #define PREFETCH " # nop"
37 #if COMPILE_TEMPLATE_MMX2
38 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
39 #elif COMPILE_TEMPLATE_AMD3DNOW
40 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
43 #if COMPILE_TEMPLATE_MMX2
44 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
46 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
48 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
50 #if COMPILE_TEMPLATE_ALTIVEC
51 #include "ppc/swscale_altivec_template.c"
54 #define YSCALEYUV2YV12X(x, offset, dest, width) \
56 "xor %%"REG_a", %%"REG_a" \n\t"\
57 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
58 "movq %%mm3, %%mm4 \n\t"\
59 "lea " offset "(%0), %%"REG_d" \n\t"\
60 "mov (%%"REG_d"), %%"REG_S" \n\t"\
61 ASMALIGN(4) /* FIXME Unroll? */\
63 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
64 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
65 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
66 "add $16, %%"REG_d" \n\t"\
67 "mov (%%"REG_d"), %%"REG_S" \n\t"\
68 "test %%"REG_S", %%"REG_S" \n\t"\
69 "pmulhw %%mm0, %%mm2 \n\t"\
70 "pmulhw %%mm0, %%mm5 \n\t"\
71 "paddw %%mm2, %%mm3 \n\t"\
72 "paddw %%mm5, %%mm4 \n\t"\
74 "psraw $3, %%mm3 \n\t"\
75 "psraw $3, %%mm4 \n\t"\
76 "packuswb %%mm4, %%mm3 \n\t"\
77 MOVNTQ(%%mm3, (%1, %%REGa))\
78 "add $8, %%"REG_a" \n\t"\
79 "cmp %2, %%"REG_a" \n\t"\
80 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
81 "movq %%mm3, %%mm4 \n\t"\
82 "lea " offset "(%0), %%"REG_d" \n\t"\
83 "mov (%%"REG_d"), %%"REG_S" \n\t"\
85 :: "r" (&c->redDither),\
86 "r" (dest), "g" (width)\
87 : "%"REG_a, "%"REG_d, "%"REG_S\
90 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
92 "lea " offset "(%0), %%"REG_d" \n\t"\
93 "xor %%"REG_a", %%"REG_a" \n\t"\
94 "pxor %%mm4, %%mm4 \n\t"\
95 "pxor %%mm5, %%mm5 \n\t"\
96 "pxor %%mm6, %%mm6 \n\t"\
97 "pxor %%mm7, %%mm7 \n\t"\
98 "mov (%%"REG_d"), %%"REG_S" \n\t"\
101 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
102 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
103 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
104 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
105 "movq %%mm0, %%mm3 \n\t"\
106 "punpcklwd %%mm1, %%mm0 \n\t"\
107 "punpckhwd %%mm1, %%mm3 \n\t"\
108 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
109 "pmaddwd %%mm1, %%mm0 \n\t"\
110 "pmaddwd %%mm1, %%mm3 \n\t"\
111 "paddd %%mm0, %%mm4 \n\t"\
112 "paddd %%mm3, %%mm5 \n\t"\
113 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
114 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
115 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
116 "test %%"REG_S", %%"REG_S" \n\t"\
117 "movq %%mm2, %%mm0 \n\t"\
118 "punpcklwd %%mm3, %%mm2 \n\t"\
119 "punpckhwd %%mm3, %%mm0 \n\t"\
120 "pmaddwd %%mm1, %%mm2 \n\t"\
121 "pmaddwd %%mm1, %%mm0 \n\t"\
122 "paddd %%mm2, %%mm6 \n\t"\
123 "paddd %%mm0, %%mm7 \n\t"\
125 "psrad $16, %%mm4 \n\t"\
126 "psrad $16, %%mm5 \n\t"\
127 "psrad $16, %%mm6 \n\t"\
128 "psrad $16, %%mm7 \n\t"\
129 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
130 "packssdw %%mm5, %%mm4 \n\t"\
131 "packssdw %%mm7, %%mm6 \n\t"\
132 "paddw %%mm0, %%mm4 \n\t"\
133 "paddw %%mm0, %%mm6 \n\t"\
134 "psraw $3, %%mm4 \n\t"\
135 "psraw $3, %%mm6 \n\t"\
136 "packuswb %%mm6, %%mm4 \n\t"\
137 MOVNTQ(%%mm4, (%1, %%REGa))\
138 "add $8, %%"REG_a" \n\t"\
139 "cmp %2, %%"REG_a" \n\t"\
140 "lea " offset "(%0), %%"REG_d" \n\t"\
141 "pxor %%mm4, %%mm4 \n\t"\
142 "pxor %%mm5, %%mm5 \n\t"\
143 "pxor %%mm6, %%mm6 \n\t"\
144 "pxor %%mm7, %%mm7 \n\t"\
145 "mov (%%"REG_d"), %%"REG_S" \n\t"\
147 :: "r" (&c->redDither),\
148 "r" (dest), "g" (width)\
149 : "%"REG_a, "%"REG_d, "%"REG_S\
152 #define YSCALEYUV2YV121 \
153 "mov %2, %%"REG_a" \n\t"\
154 ASMALIGN(4) /* FIXME Unroll? */\
156 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
157 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
158 "psraw $7, %%mm0 \n\t"\
159 "psraw $7, %%mm1 \n\t"\
160 "packuswb %%mm1, %%mm0 \n\t"\
161 MOVNTQ(%%mm0, (%1, %%REGa))\
162 "add $8, %%"REG_a" \n\t"\
165 #define YSCALEYUV2YV121_ACCURATE \
166 "mov %2, %%"REG_a" \n\t"\
167 "pcmpeqw %%mm7, %%mm7 \n\t"\
168 "psrlw $15, %%mm7 \n\t"\
169 "psllw $6, %%mm7 \n\t"\
170 ASMALIGN(4) /* FIXME Unroll? */\
172 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
173 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
174 "paddsw %%mm7, %%mm0 \n\t"\
175 "paddsw %%mm7, %%mm1 \n\t"\
176 "psraw $7, %%mm0 \n\t"\
177 "psraw $7, %%mm1 \n\t"\
178 "packuswb %%mm1, %%mm0 \n\t"\
179 MOVNTQ(%%mm0, (%1, %%REGa))\
180 "add $8, %%"REG_a" \n\t"\
184 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
185 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
186 "r" (dest), "m" (dstW),
187 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
188 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
190 #define YSCALEYUV2PACKEDX_UV \
192 "xor %%"REG_a", %%"REG_a" \n\t"\
196 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
197 "mov (%%"REG_d"), %%"REG_S" \n\t"\
198 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
199 "movq %%mm3, %%mm4 \n\t"\
202 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
203 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
204 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
205 "add $16, %%"REG_d" \n\t"\
206 "mov (%%"REG_d"), %%"REG_S" \n\t"\
207 "pmulhw %%mm0, %%mm2 \n\t"\
208 "pmulhw %%mm0, %%mm5 \n\t"\
209 "paddw %%mm2, %%mm3 \n\t"\
210 "paddw %%mm5, %%mm4 \n\t"\
211 "test %%"REG_S", %%"REG_S" \n\t"\
214 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
215 "lea "offset"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
218 "movq "#dst1", "#dst2" \n\t"\
221 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
223 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw "#coeff", "#src1" \n\t"\
227 "pmulhw "#coeff", "#src2" \n\t"\
228 "paddw "#src1", "#dst1" \n\t"\
229 "paddw "#src2", "#dst2" \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
233 #define YSCALEYUV2PACKEDX \
234 YSCALEYUV2PACKEDX_UV \
235 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
237 #define YSCALEYUV2PACKEDX_END \
238 :: "r" (&c->redDither), \
239 "m" (dummy), "m" (dummy), "m" (dummy),\
240 "r" (dest), "m" (dstW) \
241 : "%"REG_a, "%"REG_d, "%"REG_S \
244 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
246 "xor %%"REG_a", %%"REG_a" \n\t"\
250 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
251 "mov (%%"REG_d"), %%"REG_S" \n\t"\
252 "pxor %%mm4, %%mm4 \n\t"\
253 "pxor %%mm5, %%mm5 \n\t"\
254 "pxor %%mm6, %%mm6 \n\t"\
255 "pxor %%mm7, %%mm7 \n\t"\
258 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
259 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
260 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
261 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
262 "movq %%mm0, %%mm3 \n\t"\
263 "punpcklwd %%mm1, %%mm0 \n\t"\
264 "punpckhwd %%mm1, %%mm3 \n\t"\
265 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
266 "pmaddwd %%mm1, %%mm0 \n\t"\
267 "pmaddwd %%mm1, %%mm3 \n\t"\
268 "paddd %%mm0, %%mm4 \n\t"\
269 "paddd %%mm3, %%mm5 \n\t"\
270 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
271 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
272 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
273 "test %%"REG_S", %%"REG_S" \n\t"\
274 "movq %%mm2, %%mm0 \n\t"\
275 "punpcklwd %%mm3, %%mm2 \n\t"\
276 "punpckhwd %%mm3, %%mm0 \n\t"\
277 "pmaddwd %%mm1, %%mm2 \n\t"\
278 "pmaddwd %%mm1, %%mm0 \n\t"\
279 "paddd %%mm2, %%mm6 \n\t"\
280 "paddd %%mm0, %%mm7 \n\t"\
282 "psrad $16, %%mm4 \n\t"\
283 "psrad $16, %%mm5 \n\t"\
284 "psrad $16, %%mm6 \n\t"\
285 "psrad $16, %%mm7 \n\t"\
286 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
287 "packssdw %%mm5, %%mm4 \n\t"\
288 "packssdw %%mm7, %%mm6 \n\t"\
289 "paddw %%mm0, %%mm4 \n\t"\
290 "paddw %%mm0, %%mm6 \n\t"\
291 "movq %%mm4, "U_TEMP"(%0) \n\t"\
292 "movq %%mm6, "V_TEMP"(%0) \n\t"\
294 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
295 "lea "offset"(%0), %%"REG_d" \n\t"\
296 "mov (%%"REG_d"), %%"REG_S" \n\t"\
297 "pxor %%mm1, %%mm1 \n\t"\
298 "pxor %%mm5, %%mm5 \n\t"\
299 "pxor %%mm7, %%mm7 \n\t"\
300 "pxor %%mm6, %%mm6 \n\t"\
303 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
304 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
305 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
306 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
307 "movq %%mm0, %%mm3 \n\t"\
308 "punpcklwd %%mm4, %%mm0 \n\t"\
309 "punpckhwd %%mm4, %%mm3 \n\t"\
310 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
311 "pmaddwd %%mm4, %%mm0 \n\t"\
312 "pmaddwd %%mm4, %%mm3 \n\t"\
313 "paddd %%mm0, %%mm1 \n\t"\
314 "paddd %%mm3, %%mm5 \n\t"\
315 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
316 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
317 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
318 "test %%"REG_S", %%"REG_S" \n\t"\
319 "movq %%mm2, %%mm0 \n\t"\
320 "punpcklwd %%mm3, %%mm2 \n\t"\
321 "punpckhwd %%mm3, %%mm0 \n\t"\
322 "pmaddwd %%mm4, %%mm2 \n\t"\
323 "pmaddwd %%mm4, %%mm0 \n\t"\
324 "paddd %%mm2, %%mm7 \n\t"\
325 "paddd %%mm0, %%mm6 \n\t"\
327 "psrad $16, %%mm1 \n\t"\
328 "psrad $16, %%mm5 \n\t"\
329 "psrad $16, %%mm7 \n\t"\
330 "psrad $16, %%mm6 \n\t"\
331 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
332 "packssdw %%mm5, %%mm1 \n\t"\
333 "packssdw %%mm6, %%mm7 \n\t"\
334 "paddw %%mm0, %%mm1 \n\t"\
335 "paddw %%mm0, %%mm7 \n\t"\
336 "movq "U_TEMP"(%0), %%mm3 \n\t"\
337 "movq "V_TEMP"(%0), %%mm4 \n\t"\
339 #define YSCALEYUV2PACKEDX_ACCURATE \
340 YSCALEYUV2PACKEDX_ACCURATE_UV \
341 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
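/* YSCALEYUV2RGBX: converts the filtered Y/U/V words to R, G and B. It
 * subtracts the U/V/Y offsets, multiplies by the per-context coefficients
 * (UB/UG/VG/VR/Y) with pmulhw, sums the two green contributions and
 * interleaves/packs the results into bytes (register layout in the comments
 * below). */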
343 #define YSCALEYUV2RGBX \
344 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
345 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
346 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
347 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
348 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
349 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
350 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
351 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
352 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
353 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
354 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
355 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
356 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
357 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
358 "paddw %%mm3, %%mm4 \n\t"\
359 "movq %%mm2, %%mm0 \n\t"\
360 "movq %%mm5, %%mm6 \n\t"\
361 "movq %%mm4, %%mm3 \n\t"\
362 "punpcklwd %%mm2, %%mm2 \n\t"\
363 "punpcklwd %%mm5, %%mm5 \n\t"\
364 "punpcklwd %%mm4, %%mm4 \n\t"\
365 "paddw %%mm1, %%mm2 \n\t"\
366 "paddw %%mm1, %%mm5 \n\t"\
367 "paddw %%mm1, %%mm4 \n\t"\
368 "punpckhwd %%mm0, %%mm0 \n\t"\
369 "punpckhwd %%mm6, %%mm6 \n\t"\
370 "punpckhwd %%mm3, %%mm3 \n\t"\
371 "paddw %%mm7, %%mm0 \n\t"\
372 "paddw %%mm7, %%mm6 \n\t"\
373 "paddw %%mm7, %%mm3 \n\t"\
374 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
375 "packuswb %%mm0, %%mm2 \n\t"\
376 "packuswb %%mm6, %%mm5 \n\t"\
377 "packuswb %%mm3, %%mm4 \n\t"\
379 #define REAL_YSCALEYUV2PACKED(index, c) \
380 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
381 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
382 "psraw $3, %%mm0 \n\t"\
383 "psraw $3, %%mm1 \n\t"\
384 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
385 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
386 "xor "#index", "#index" \n\t"\
389 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
390 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
391 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
392 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
393 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
394 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
395 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
396 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
397 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
398 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
399 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
400 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
401 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
402 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
403 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
404 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
405 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
406 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
407 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
408 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
409 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
410 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
411 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
412 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
413 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
415 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
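/* Two-row (bilinear) RGB path, split into _UV / _YA / _COEFF pieces so the
 * luma part can be reused with other source buffers (e.g. the alpha planes)
 * before the shared coefficient stage. */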
417 #define REAL_YSCALEYUV2RGB_UV(index, c) \
418 "xor "#index", "#index" \n\t"\
421 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
422 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
423 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
424 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
425 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
426 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
427 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
428 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
429 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
430 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
431 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
432 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
433 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
434 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
435 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
436 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
437 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
438 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
439 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
440 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
442 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
443 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
444 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
445 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
446 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
447 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
448 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
449 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
450 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
451 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
452 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
453 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
454 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
456 #define REAL_YSCALEYUV2RGB_COEFF(c) \
457 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
458 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
459 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
460 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
461 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
462 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
463 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
464 "paddw %%mm3, %%mm4 \n\t"\
465 "movq %%mm2, %%mm0 \n\t"\
466 "movq %%mm5, %%mm6 \n\t"\
467 "movq %%mm4, %%mm3 \n\t"\
468 "punpcklwd %%mm2, %%mm2 \n\t"\
469 "punpcklwd %%mm5, %%mm5 \n\t"\
470 "punpcklwd %%mm4, %%mm4 \n\t"\
471 "paddw %%mm1, %%mm2 \n\t"\
472 "paddw %%mm1, %%mm5 \n\t"\
473 "paddw %%mm1, %%mm4 \n\t"\
474 "punpckhwd %%mm0, %%mm0 \n\t"\
475 "punpckhwd %%mm6, %%mm6 \n\t"\
476 "punpckhwd %%mm3, %%mm3 \n\t"\
477 "paddw %%mm7, %%mm0 \n\t"\
478 "paddw %%mm7, %%mm6 \n\t"\
479 "paddw %%mm7, %%mm3 \n\t"\
480 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
481 "packuswb %%mm0, %%mm2 \n\t"\
482 "packuswb %%mm6, %%mm5 \n\t"\
483 "packuswb %%mm3, %%mm4 \n\t"\
485 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
487 #define YSCALEYUV2RGB(index, c) \
488 REAL_YSCALEYUV2RGB_UV(index, c) \
489 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
490 REAL_YSCALEYUV2RGB_COEFF(c)
492 #define REAL_YSCALEYUV2PACKED1(index, c) \
493 "xor "#index", "#index" \n\t"\
496 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
497 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
498 "psraw $7, %%mm3 \n\t" \
499 "psraw $7, %%mm4 \n\t" \
500 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
501 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
502 "psraw $7, %%mm1 \n\t" \
503 "psraw $7, %%mm7 \n\t" \
505 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
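/* The "1" variants below read a single source row (no vertical
 * interpolation); the "1b" variants additionally average two chroma rows. */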
507 #define REAL_YSCALEYUV2RGB1(index, c) \
508 "xor "#index", "#index" \n\t"\
511 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
512 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
513 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
514 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
515 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
516 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
517 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
518 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
519 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
520 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
521 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
522 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
523 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
524 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
525 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
526 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
527 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
528 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
529 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
530 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
531 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
532 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
533 "paddw %%mm3, %%mm4 \n\t"\
534 "movq %%mm2, %%mm0 \n\t"\
535 "movq %%mm5, %%mm6 \n\t"\
536 "movq %%mm4, %%mm3 \n\t"\
537 "punpcklwd %%mm2, %%mm2 \n\t"\
538 "punpcklwd %%mm5, %%mm5 \n\t"\
539 "punpcklwd %%mm4, %%mm4 \n\t"\
540 "paddw %%mm1, %%mm2 \n\t"\
541 "paddw %%mm1, %%mm5 \n\t"\
542 "paddw %%mm1, %%mm4 \n\t"\
543 "punpckhwd %%mm0, %%mm0 \n\t"\
544 "punpckhwd %%mm6, %%mm6 \n\t"\
545 "punpckhwd %%mm3, %%mm3 \n\t"\
546 "paddw %%mm7, %%mm0 \n\t"\
547 "paddw %%mm7, %%mm6 \n\t"\
548 "paddw %%mm7, %%mm3 \n\t"\
549 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
550 "packuswb %%mm0, %%mm2 \n\t"\
551 "packuswb %%mm6, %%mm5 \n\t"\
552 "packuswb %%mm3, %%mm4 \n\t"\
554 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
556 #define REAL_YSCALEYUV2PACKED1b(index, c) \
557 "xor "#index", "#index" \n\t"\
560 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
561 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
562 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
563 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
564 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
565 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
566 "psrlw $8, %%mm3 \n\t" \
567 "psrlw $8, %%mm4 \n\t" \
568 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
569 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
570 "psraw $7, %%mm1 \n\t" \
571 "psraw $7, %%mm7 \n\t"
572 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
574 // do vertical chrominance interpolation
575 #define REAL_YSCALEYUV2RGB1b(index, c) \
576 "xor "#index", "#index" \n\t"\
579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
585 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
586 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
587 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
588 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
589 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
590 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
591 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
592 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
593 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
594 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
595 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
596 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
597 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
598 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
599 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
600 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
601 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
602 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
603 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
604 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
605 "paddw %%mm3, %%mm4 \n\t"\
606 "movq %%mm2, %%mm0 \n\t"\
607 "movq %%mm5, %%mm6 \n\t"\
608 "movq %%mm4, %%mm3 \n\t"\
609 "punpcklwd %%mm2, %%mm2 \n\t"\
610 "punpcklwd %%mm5, %%mm5 \n\t"\
611 "punpcklwd %%mm4, %%mm4 \n\t"\
612 "paddw %%mm1, %%mm2 \n\t"\
613 "paddw %%mm1, %%mm5 \n\t"\
614 "paddw %%mm1, %%mm4 \n\t"\
615 "punpckhwd %%mm0, %%mm0 \n\t"\
616 "punpckhwd %%mm6, %%mm6 \n\t"\
617 "punpckhwd %%mm3, %%mm3 \n\t"\
618 "paddw %%mm7, %%mm0 \n\t"\
619 "paddw %%mm7, %%mm6 \n\t"\
620 "paddw %%mm7, %%mm3 \n\t"\
621 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
622 "packuswb %%mm0, %%mm2 \n\t"\
623 "packuswb %%mm6, %%mm5 \n\t"\
624 "packuswb %%mm3, %%mm4 \n\t"\
626 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
628 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
629 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
630 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
631 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
632 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
633 "packuswb %%mm1, %%mm7 \n\t"
634 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
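/* WRITE* output macros: interleave the byte-packed B/G/R (and A) values
 * produced above and store them in the destination pixel format with MOVNTQ,
 * then advance the output index by 8 pixels. */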
636 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
637 "movq "#b", "#q2" \n\t" /* B */\
638 "movq "#r", "#t" \n\t" /* R */\
639 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
640 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
641 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
642 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
643 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
644 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
645 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
646 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
647 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
648 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
650 MOVNTQ( q0, (dst, index, 4))\
651 MOVNTQ( b, 8(dst, index, 4))\
652 MOVNTQ( q2, 16(dst, index, 4))\
653 MOVNTQ( q3, 24(dst, index, 4))\
655 "add $8, "#index" \n\t"\
656 "cmp "#dstw", "#index" \n\t"\
658 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
660 #define REAL_WRITERGB16(dst, dstw, index) \
661 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
662 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
663 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
664 "psrlq $3, %%mm2 \n\t"\
666 "movq %%mm2, %%mm1 \n\t"\
667 "movq %%mm4, %%mm3 \n\t"\
669 "punpcklbw %%mm7, %%mm3 \n\t"\
670 "punpcklbw %%mm5, %%mm2 \n\t"\
671 "punpckhbw %%mm7, %%mm4 \n\t"\
672 "punpckhbw %%mm5, %%mm1 \n\t"\
674 "psllq $3, %%mm3 \n\t"\
675 "psllq $3, %%mm4 \n\t"\
677 "por %%mm3, %%mm2 \n\t"\
678 "por %%mm4, %%mm1 \n\t"\
680 MOVNTQ(%%mm2, (dst, index, 2))\
681 MOVNTQ(%%mm1, 8(dst, index, 2))\
683 "add $8, "#index" \n\t"\
684 "cmp "#dstw", "#index" \n\t"\
686 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
688 #define REAL_WRITERGB15(dst, dstw, index) \
689 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
690 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
691 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
692 "psrlq $3, %%mm2 \n\t"\
693 "psrlq $1, %%mm5 \n\t"\
695 "movq %%mm2, %%mm1 \n\t"\
696 "movq %%mm4, %%mm3 \n\t"\
698 "punpcklbw %%mm7, %%mm3 \n\t"\
699 "punpcklbw %%mm5, %%mm2 \n\t"\
700 "punpckhbw %%mm7, %%mm4 \n\t"\
701 "punpckhbw %%mm5, %%mm1 \n\t"\
703 "psllq $2, %%mm3 \n\t"\
704 "psllq $2, %%mm4 \n\t"\
706 "por %%mm3, %%mm2 \n\t"\
707 "por %%mm4, %%mm1 \n\t"\
709 MOVNTQ(%%mm2, (dst, index, 2))\
710 MOVNTQ(%%mm1, 8(dst, index, 2))\
712 "add $8, "#index" \n\t"\
713 "cmp "#dstw", "#index" \n\t"\
715 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
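/* Three 24-bit writers follow: the OLD shift/mask version, a plain MMX
 * version and an MMX2 version built on pshufw; WRITEBGR24 selects one of
 * them further down depending on the compile target. */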
717 #define WRITEBGR24OLD(dst, dstw, index) \
718 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
719 "movq %%mm2, %%mm1 \n\t" /* B */\
720 "movq %%mm5, %%mm6 \n\t" /* R */\
721 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
722 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
723 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
724 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
725 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
726 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
727 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
728 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
729 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
730 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
732 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
733 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
734 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
735 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
736 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
737 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
738 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
739 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
741 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
742 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
743 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
744 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
745 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
746 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
747 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
748 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
749 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
750 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
751 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
752 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
753 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
755 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
756 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
757 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
758 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
759 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
760 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
761 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
762 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
764 MOVNTQ(%%mm0, (dst))\
765 MOVNTQ(%%mm2, 8(dst))\
766 MOVNTQ(%%mm3, 16(dst))\
767 "add $24, "#dst" \n\t"\
769 "add $8, "#index" \n\t"\
770 "cmp "#dstw", "#index" \n\t"\
773 #define WRITEBGR24MMX(dst, dstw, index) \
774 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
775 "movq %%mm2, %%mm1 \n\t" /* B */\
776 "movq %%mm5, %%mm6 \n\t" /* R */\
777 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
778 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
779 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
780 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
781 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
782 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
783 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
784 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
785 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
786 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
788 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
789 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
790 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
791 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
793 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
794 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
795 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
796 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
798 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
799 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
800 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
801 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
803 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
804 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
805 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
806 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
807 MOVNTQ(%%mm0, (dst))\
809 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
810 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
811 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
812 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
813 MOVNTQ(%%mm6, 8(dst))\
815 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
816 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
817 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
818 MOVNTQ(%%mm5, 16(dst))\
820 "add $24, "#dst" \n\t"\
822 "add $8, "#index" \n\t"\
823 "cmp "#dstw", "#index" \n\t"\
826 #define WRITEBGR24MMX2(dst, dstw, index) \
827 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
828 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
829 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
830 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
831 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
832 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
834 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
835 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
836 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
838 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
839 "por %%mm1, %%mm6 \n\t"\
840 "por %%mm3, %%mm6 \n\t"\
841 MOVNTQ(%%mm6, (dst))\
843 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
844 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
845 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
846 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
848 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
849 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
850 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
852 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
853 "por %%mm3, %%mm6 \n\t"\
854 MOVNTQ(%%mm6, 8(dst))\
856 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
857 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
858 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
860 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
861 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
862 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
864 "por %%mm1, %%mm3 \n\t"\
865 "por %%mm3, %%mm6 \n\t"\
866 MOVNTQ(%%mm6, 16(dst))\
868 "add $24, "#dst" \n\t"\
870 "add $8, "#index" \n\t"\
871 "cmp "#dstw", "#index" \n\t"\
874 #if COMPILE_TEMPLATE_MMX2
876 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
879 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
882 #define REAL_WRITEYUY2(dst, dstw, index) \
883 "packuswb %%mm3, %%mm3 \n\t"\
884 "packuswb %%mm4, %%mm4 \n\t"\
885 "packuswb %%mm7, %%mm1 \n\t"\
886 "punpcklbw %%mm4, %%mm3 \n\t"\
887 "movq %%mm1, %%mm7 \n\t"\
888 "punpcklbw %%mm3, %%mm1 \n\t"\
889 "punpckhbw %%mm3, %%mm7 \n\t"\
891 MOVNTQ(%%mm1, (dst, index, 2))\
892 MOVNTQ(%%mm7, 8(dst, index, 2))\
894 "add $8, "#index" \n\t"\
895 "cmp "#dstw", "#index" \n\t"\
897 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
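/* yuv2yuvX: vertical scaling to planar YV12 (plus an optional alpha plane),
 * using the MMX macros above unless bit-exact output is requested, otherwise
 * falling back to the AltiVec or C implementation. */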
900 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
901 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
902 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
904 #if COMPILE_TEMPLATE_MMX
905 if(!(c->flags & SWS_BITEXACT)) {
906 if (c->flags & SWS_ACCURATE_RND) {
908 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
909 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
911 if (CONFIG_SWSCALE_ALPHA && aDest) {
912 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
915 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
918 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
919 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
921 if (CONFIG_SWSCALE_ALPHA && aDest) {
922 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
925 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
930 #if COMPILE_TEMPLATE_ALTIVEC
931 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
932 chrFilter, chrSrc, chrFilterSize,
933 dest, uDest, vDest, dstW, chrDstW);
934 #else //COMPILE_TEMPLATE_ALTIVEC
935 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
936 chrFilter, chrSrc, chrFilterSize,
937 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
938 #endif //!COMPILE_TEMPLATE_ALTIVEC
941 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
942 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
943 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
945 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
946 chrFilter, chrSrc, chrFilterSize,
947 dest, uDest, dstW, chrDstW, dstFormat);
950 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
951 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
954 #if COMPILE_TEMPLATE_MMX
955 if(!(c->flags & SWS_BITEXACT)) {
957 const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
958 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
959 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
961 if (c->flags & SWS_ACCURATE_RND) {
965 YSCALEYUV2YV121_ACCURATE
966 :: "r" (src[p]), "r" (dst[p] + counter[p]),
977 :: "r" (src[p]), "r" (dst[p] + counter[p]),
987 for (i=0; i<dstW; i++) {
988 int val= (lumSrc[i]+64)>>7;
999 for (i=0; i<chrDstW; i++) {
1000 int u=(chrSrc[i ]+64)>>7;
1001 int v=(chrSrc[i + VOFW]+64)>>7;
1005 else if (u>255) u=255;
1007 else if (v>255) v=255;
1014 if (CONFIG_SWSCALE_ALPHA && aDest)
1015 for (i=0; i<dstW; i++) {
1016 int val= (alpSrc[i]+64)>>7;
1017 aDest[i]= av_clip_uint8(val);
1023 * vertical scale YV12 to RGB
1025 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1026 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1027 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1029 #if COMPILE_TEMPLATE_MMX
1031 if(!(c->flags & SWS_BITEXACT)) {
1032 if (c->flags & SWS_ACCURATE_RND) {
1033 switch(c->dstFormat) {
1035 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1036 YSCALEYUV2PACKEDX_ACCURATE
1038 "movq %%mm2, "U_TEMP"(%0) \n\t"
1039 "movq %%mm4, "V_TEMP"(%0) \n\t"
1040 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1041 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1042 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1043 "psraw $3, %%mm1 \n\t"
1044 "psraw $3, %%mm7 \n\t"
1045 "packuswb %%mm7, %%mm1 \n\t"
1046 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1048 YSCALEYUV2PACKEDX_END
1050 YSCALEYUV2PACKEDX_ACCURATE
1052 "pcmpeqd %%mm7, %%mm7 \n\t"
1053 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1055 YSCALEYUV2PACKEDX_END
1059 YSCALEYUV2PACKEDX_ACCURATE
1061 "pxor %%mm7, %%mm7 \n\t"
1062 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1063 "add %4, %%"REG_c" \n\t"
1064 WRITEBGR24(%%REGc, %5, %%REGa)
1067 :: "r" (&c->redDither),
1068 "m" (dummy), "m" (dummy), "m" (dummy),
1069 "r" (dest), "m" (dstW)
1070 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1073 case PIX_FMT_RGB555:
1074 YSCALEYUV2PACKEDX_ACCURATE
1076 "pxor %%mm7, %%mm7 \n\t"
1077 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1079 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1080 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1081 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1084 WRITERGB15(%4, %5, %%REGa)
1085 YSCALEYUV2PACKEDX_END
1087 case PIX_FMT_RGB565:
1088 YSCALEYUV2PACKEDX_ACCURATE
1090 "pxor %%mm7, %%mm7 \n\t"
1091 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1093 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1094 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1095 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1098 WRITERGB16(%4, %5, %%REGa)
1099 YSCALEYUV2PACKEDX_END
1101 case PIX_FMT_YUYV422:
1102 YSCALEYUV2PACKEDX_ACCURATE
1103 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1105 "psraw $3, %%mm3 \n\t"
1106 "psraw $3, %%mm4 \n\t"
1107 "psraw $3, %%mm1 \n\t"
1108 "psraw $3, %%mm7 \n\t"
1109 WRITEYUY2(%4, %5, %%REGa)
1110 YSCALEYUV2PACKEDX_END
1114 switch(c->dstFormat) {
1116 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1119 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1120 "psraw $3, %%mm1 \n\t"
1121 "psraw $3, %%mm7 \n\t"
1122 "packuswb %%mm7, %%mm1 \n\t"
1123 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1124 YSCALEYUV2PACKEDX_END
1128 "pcmpeqd %%mm7, %%mm7 \n\t"
1129 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1130 YSCALEYUV2PACKEDX_END
1136 "pxor %%mm7, %%mm7 \n\t"
1137 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1138 "add %4, %%"REG_c" \n\t"
1139 WRITEBGR24(%%REGc, %5, %%REGa)
1141 :: "r" (&c->redDither),
1142 "m" (dummy), "m" (dummy), "m" (dummy),
1143 "r" (dest), "m" (dstW)
1144 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1147 case PIX_FMT_RGB555:
1150 "pxor %%mm7, %%mm7 \n\t"
1151 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1153 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1154 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1155 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1158 WRITERGB15(%4, %5, %%REGa)
1159 YSCALEYUV2PACKEDX_END
1161 case PIX_FMT_RGB565:
1164 "pxor %%mm7, %%mm7 \n\t"
1165 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1167 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1168 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1169 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1172 WRITERGB16(%4, %5, %%REGa)
1173 YSCALEYUV2PACKEDX_END
1175 case PIX_FMT_YUYV422:
1177 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1179 "psraw $3, %%mm3 \n\t"
1180 "psraw $3, %%mm4 \n\t"
1181 "psraw $3, %%mm1 \n\t"
1182 "psraw $3, %%mm7 \n\t"
1183 WRITEYUY2(%4, %5, %%REGa)
1184 YSCALEYUV2PACKEDX_END
1189 #endif /* COMPILE_TEMPLATE_MMX */
1190 #if COMPILE_TEMPLATE_ALTIVEC
1191 /* The following list of supported dstFormat values should
1192 match what's found in the body of ff_yuv2packedX_altivec() */
1193 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1194 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1195 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1196 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1197 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1198 chrFilter, chrSrc, chrFilterSize,
1202 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1203 chrFilter, chrSrc, chrFilterSize,
1204 alpSrc, dest, dstW, dstY);
1208 * vertical bilinear scale YV12 to RGB
1210 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1211 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1213 int yalpha1=4095- yalpha;
1214 int uvalpha1=4095-uvalpha;
1217 #if COMPILE_TEMPLATE_MMX
1218 if(!(c->flags & SWS_BITEXACT)) {
1219 switch(c->dstFormat) {
1220 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
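/* Most of the blocks below first spill %ebx/%rbx into the context
 * (ESP_OFFSET) and push %ebp so both registers can serve as destination
 * pointer and loop counter inside the asm; both are restored before each
 * block ends. */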
1222 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1225 YSCALEYUV2RGB(%%r8, %5)
1226 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1227 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1228 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1229 "packuswb %%mm7, %%mm1 \n\t"
1230 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1232 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1234 ,"r" (abuf0), "r" (abuf1)
1238 *(const uint16_t **)(&c->u_temp)=abuf0;
1239 *(const uint16_t **)(&c->v_temp)=abuf1;
1241 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1242 "mov %4, %%"REG_b" \n\t"
1243 "push %%"REG_BP" \n\t"
1244 YSCALEYUV2RGB(%%REGBP, %5)
1247 "mov "U_TEMP"(%5), %0 \n\t"
1248 "mov "V_TEMP"(%5), %1 \n\t"
1249 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1250 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1251 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1252 "packuswb %%mm7, %%mm1 \n\t"
1255 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1256 "pop %%"REG_BP" \n\t"
1257 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1259 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1265 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1266 "mov %4, %%"REG_b" \n\t"
1267 "push %%"REG_BP" \n\t"
1268 YSCALEYUV2RGB(%%REGBP, %5)
1269 "pcmpeqd %%mm7, %%mm7 \n\t"
1270 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1271 "pop %%"REG_BP" \n\t"
1272 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1274 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1281 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1282 "mov %4, %%"REG_b" \n\t"
1283 "push %%"REG_BP" \n\t"
1284 YSCALEYUV2RGB(%%REGBP, %5)
1285 "pxor %%mm7, %%mm7 \n\t"
1286 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1287 "pop %%"REG_BP" \n\t"
1288 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1289 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1293 case PIX_FMT_RGB555:
1295 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1296 "mov %4, %%"REG_b" \n\t"
1297 "push %%"REG_BP" \n\t"
1298 YSCALEYUV2RGB(%%REGBP, %5)
1299 "pxor %%mm7, %%mm7 \n\t"
1300 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1302 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1303 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1304 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1307 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1308 "pop %%"REG_BP" \n\t"
1309 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1311 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1315 case PIX_FMT_RGB565:
1317 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1318 "mov %4, %%"REG_b" \n\t"
1319 "push %%"REG_BP" \n\t"
1320 YSCALEYUV2RGB(%%REGBP, %5)
1321 "pxor %%mm7, %%mm7 \n\t"
1322 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1324 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1325 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1326 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1329 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1330 "pop %%"REG_BP" \n\t"
1331 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1332 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1336 case PIX_FMT_YUYV422:
1338 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1339 "mov %4, %%"REG_b" \n\t"
1340 "push %%"REG_BP" \n\t"
1341 YSCALEYUV2PACKED(%%REGBP, %5)
1342 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1343 "pop %%"REG_BP" \n\t"
1344 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1345 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1352 #endif //COMPILE_TEMPLATE_MMX
1353 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1357 * YV12 to RGB without scaling or interpolating
1359 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1360 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1362 const int yalpha1=0;
1365 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1366 const int yalpha= 4096; //FIXME ...
1368 if (flags&SWS_FULL_CHR_H_INT) {
1369 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1373 #if COMPILE_TEMPLATE_MMX
1374 if(!(flags & SWS_BITEXACT)) {
1375 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1378 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1380 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1381 "mov %4, %%"REG_b" \n\t"
1382 "push %%"REG_BP" \n\t"
1383 YSCALEYUV2RGB1(%%REGBP, %5)
1384 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1385 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1386 "pop %%"REG_BP" \n\t"
1387 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1389 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1394 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1395 "mov %4, %%"REG_b" \n\t"
1396 "push %%"REG_BP" \n\t"
1397 YSCALEYUV2RGB1(%%REGBP, %5)
1398 "pcmpeqd %%mm7, %%mm7 \n\t"
1399 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1400 "pop %%"REG_BP" \n\t"
1401 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1403 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1410 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1411 "mov %4, %%"REG_b" \n\t"
1412 "push %%"REG_BP" \n\t"
1413 YSCALEYUV2RGB1(%%REGBP, %5)
1414 "pxor %%mm7, %%mm7 \n\t"
1415 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1416 "pop %%"REG_BP" \n\t"
1417 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1419 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1423 case PIX_FMT_RGB555:
1425 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1426 "mov %4, %%"REG_b" \n\t"
1427 "push %%"REG_BP" \n\t"
1428 YSCALEYUV2RGB1(%%REGBP, %5)
1429 "pxor %%mm7, %%mm7 \n\t"
1430 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1432 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1433 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1434 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1436 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1437 "pop %%"REG_BP" \n\t"
1438 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1440 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1444 case PIX_FMT_RGB565:
1446 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1447 "mov %4, %%"REG_b" \n\t"
1448 "push %%"REG_BP" \n\t"
1449 YSCALEYUV2RGB1(%%REGBP, %5)
1450 "pxor %%mm7, %%mm7 \n\t"
1451 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1453 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1454 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1455 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1458 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1459 "pop %%"REG_BP" \n\t"
1460 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1462 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1466 case PIX_FMT_YUYV422:
1468 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1469 "mov %4, %%"REG_b" \n\t"
1470 "push %%"REG_BP" \n\t"
1471 YSCALEYUV2PACKED1(%%REGBP, %5)
1472 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1473 "pop %%"REG_BP" \n\t"
1474 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1476 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1484 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1486 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1487 "mov %4, %%"REG_b" \n\t"
1488 "push %%"REG_BP" \n\t"
1489 YSCALEYUV2RGB1b(%%REGBP, %5)
1490 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1491 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1492 "pop %%"REG_BP" \n\t"
1493 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1495 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1500 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501 "mov %4, %%"REG_b" \n\t"
1502 "push %%"REG_BP" \n\t"
1503 YSCALEYUV2RGB1b(%%REGBP, %5)
1504 "pcmpeqd %%mm7, %%mm7 \n\t"
1505 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1506 "pop %%"REG_BP" \n\t"
1507 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1509 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1516 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1517 "mov %4, %%"REG_b" \n\t"
1518 "push %%"REG_BP" \n\t"
1519 YSCALEYUV2RGB1b(%%REGBP, %5)
1520 "pxor %%mm7, %%mm7 \n\t"
1521 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1522 "pop %%"REG_BP" \n\t"
1523 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1525 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1529 case PIX_FMT_RGB555:
1531 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1532 "mov %4, %%"REG_b" \n\t"
1533 "push %%"REG_BP" \n\t"
1534 YSCALEYUV2RGB1b(%%REGBP, %5)
1535 "pxor %%mm7, %%mm7 \n\t"
1536 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1538 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1539 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1540 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1542 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1543 "pop %%"REG_BP" \n\t"
1544 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1546 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1550 case PIX_FMT_RGB565:
1552 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1553 "mov %4, %%"REG_b" \n\t"
1554 "push %%"REG_BP" \n\t"
1555 YSCALEYUV2RGB1b(%%REGBP, %5)
1556 "pxor %%mm7, %%mm7 \n\t"
1557 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1559 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1560 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1561 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1564 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1565 "pop %%"REG_BP" \n\t"
1566 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1568 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1572 case PIX_FMT_YUYV422:
1574 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1575 "mov %4, %%"REG_b" \n\t"
1576 "push %%"REG_BP" \n\t"
1577 YSCALEYUV2PACKED1b(%%REGBP, %5)
1578 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1579 "pop %%"REG_BP" \n\t"
1580 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1582 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1589 #endif /* COMPILE_TEMPLATE_MMX */
1590 if (uvalpha < 2048) {
1591 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1593 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1597 //FIXME yuy2* can read up to 7 samples too many
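/* Packed -> planar input unpackers: yuy2ToY/yuy2ToUV and uyvyToY/uyvyToUV
 * extract the luma and chroma bytes from YUYV/UYVY input (masking with
 * bm01010101 or shifting by 8, then packing), while nvXXtoUV splits the
 * interleaved NV12/NV21 chroma plane. */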
1599 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1601 #if COMPILE_TEMPLATE_MMX
1603 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1604 "mov %0, %%"REG_a" \n\t"
1606 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1607 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1608 "pand %%mm2, %%mm0 \n\t"
1609 "pand %%mm2, %%mm1 \n\t"
1610 "packuswb %%mm1, %%mm0 \n\t"
1611 "movq %%mm0, (%2, %%"REG_a") \n\t"
1612 "add $8, %%"REG_a" \n\t"
1614 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1619 for (i=0; i<width; i++)
1624 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1626 #if COMPILE_TEMPLATE_MMX
1628 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1629 "mov %0, %%"REG_a" \n\t"
1631 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1632 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1633 "psrlw $8, %%mm0 \n\t"
1634 "psrlw $8, %%mm1 \n\t"
1635 "packuswb %%mm1, %%mm0 \n\t"
1636 "movq %%mm0, %%mm1 \n\t"
1637 "psrlw $8, %%mm0 \n\t"
1638 "pand %%mm4, %%mm1 \n\t"
1639 "packuswb %%mm0, %%mm0 \n\t"
1640 "packuswb %%mm1, %%mm1 \n\t"
1641 "movd %%mm0, (%3, %%"REG_a") \n\t"
1642 "movd %%mm1, (%2, %%"REG_a") \n\t"
1643 "add $4, %%"REG_a" \n\t"
1645 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1650 for (i=0; i<width; i++) {
1651 dstU[i]= src1[4*i + 1];
1652 dstV[i]= src1[4*i + 3];
1655 assert(src1 == src2);
1658 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1660 #if COMPILE_TEMPLATE_MMX
1662 "mov %0, %%"REG_a" \n\t"
1664 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1665 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1666 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1667 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1668 "psrlw $8, %%mm0 \n\t"
1669 "psrlw $8, %%mm1 \n\t"
1670 "psrlw $8, %%mm2 \n\t"
1671 "psrlw $8, %%mm3 \n\t"
1672 "packuswb %%mm1, %%mm0 \n\t"
1673 "packuswb %%mm3, %%mm2 \n\t"
1674 "movq %%mm0, (%3, %%"REG_a") \n\t"
1675 "movq %%mm2, (%4, %%"REG_a") \n\t"
1676 "add $8, %%"REG_a" \n\t"
1678 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1683 for (i=0; i<width; i++) {
1684 dstU[i]= src1[2*i + 1];
1685 dstV[i]= src2[2*i + 1];
1690 /* This is almost identical to the previous, and exists only because
1691 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1692 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1694 #if COMPILE_TEMPLATE_MMX
1696 "mov %0, %%"REG_a" \n\t"
1698 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1699 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1700 "psrlw $8, %%mm0 \n\t"
1701 "psrlw $8, %%mm1 \n\t"
1702 "packuswb %%mm1, %%mm0 \n\t"
1703 "movq %%mm0, (%2, %%"REG_a") \n\t"
1704 "add $8, %%"REG_a" \n\t"
1706 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1711 for (i=0; i<width; i++)
1716 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1718 #if COMPILE_TEMPLATE_MMX
1720 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1721 "mov %0, %%"REG_a" \n\t"
1723 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1724 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1725 "pand %%mm4, %%mm0 \n\t"
1726 "pand %%mm4, %%mm1 \n\t"
1727 "packuswb %%mm1, %%mm0 \n\t"
1728 "movq %%mm0, %%mm1 \n\t"
1729 "psrlw $8, %%mm0 \n\t"
1730 "pand %%mm4, %%mm1 \n\t"
1731 "packuswb %%mm0, %%mm0 \n\t"
1732 "packuswb %%mm1, %%mm1 \n\t"
1733 "movd %%mm0, (%3, %%"REG_a") \n\t"
1734 "movd %%mm1, (%2, %%"REG_a") \n\t"
1735 "add $4, %%"REG_a" \n\t"
1737 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1742 for (i=0; i<width; i++) {
1743 dstU[i]= src1[4*i + 0];
1744 dstV[i]= src1[4*i + 2];
1747 assert(src1 == src2);
1750 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1752 #if COMPILE_TEMPLATE_MMX
1754 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1755 "mov %0, %%"REG_a" \n\t"
1757 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1758 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1759 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1760 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1761 "pand %%mm4, %%mm0 \n\t"
1762 "pand %%mm4, %%mm1 \n\t"
1763 "pand %%mm4, %%mm2 \n\t"
1764 "pand %%mm4, %%mm3 \n\t"
1765 "packuswb %%mm1, %%mm0 \n\t"
1766 "packuswb %%mm3, %%mm2 \n\t"
1767 "movq %%mm0, (%3, %%"REG_a") \n\t"
1768 "movq %%mm2, (%4, %%"REG_a") \n\t"
1769 "add $8, %%"REG_a" \n\t"
1771 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1776 for (i=0; i<width; i++) {
1783 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1784 const uint8_t *src, long width)
1786 #if COMPILE_TEMPLATE_MMX
1788 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1789 "mov %0, %%"REG_a" \n\t"
1791 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1792 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1793 "movq %%mm0, %%mm2 \n\t"
1794 "movq %%mm1, %%mm3 \n\t"
1795 "pand %%mm4, %%mm0 \n\t"
1796 "pand %%mm4, %%mm1 \n\t"
1797 "psrlw $8, %%mm2 \n\t"
1798 "psrlw $8, %%mm3 \n\t"
1799 "packuswb %%mm1, %%mm0 \n\t"
1800 "packuswb %%mm3, %%mm2 \n\t"
1801 "movq %%mm0, (%2, %%"REG_a") \n\t"
1802 "movq %%mm2, (%3, %%"REG_a") \n\t"
1803 "add $8, %%"REG_a" \n\t"
1805 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1810 for (i = 0; i < width; i++) {
1811 dst1[i] = src[2*i+0];
1812 dst2[i] = src[2*i+1];
1817 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1818 const uint8_t *src1, const uint8_t *src2,
1819 long width, uint32_t *unused)
1821 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1824 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1825 const uint8_t *src1, const uint8_t *src2,
1826 long width, uint32_t *unused)
1828 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
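/* NV12 interleaves chroma as U,V pairs while NV21 stores V,U, so the two wrappers
 * above differ only in the order of the destination planes they pass to nvXXtoUV(). */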
1831 #if COMPILE_TEMPLATE_MMX
1832 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1835 if(srcFormat == PIX_FMT_BGR24) {
1837 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1838 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1843 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1844 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1850 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1851 "mov %2, %%"REG_a" \n\t"
1852 "pxor %%mm7, %%mm7 \n\t"
1854 PREFETCH" 64(%0) \n\t"
1855 "movd (%0), %%mm0 \n\t"
1856 "movd 2(%0), %%mm1 \n\t"
1857 "movd 6(%0), %%mm2 \n\t"
1858 "movd 8(%0), %%mm3 \n\t"
1860 "punpcklbw %%mm7, %%mm0 \n\t"
1861 "punpcklbw %%mm7, %%mm1 \n\t"
1862 "punpcklbw %%mm7, %%mm2 \n\t"
1863 "punpcklbw %%mm7, %%mm3 \n\t"
1864 "pmaddwd %%mm5, %%mm0 \n\t"
1865 "pmaddwd %%mm6, %%mm1 \n\t"
1866 "pmaddwd %%mm5, %%mm2 \n\t"
1867 "pmaddwd %%mm6, %%mm3 \n\t"
1868 "paddd %%mm1, %%mm0 \n\t"
1869 "paddd %%mm3, %%mm2 \n\t"
1870 "paddd %%mm4, %%mm0 \n\t"
1871 "paddd %%mm4, %%mm2 \n\t"
1872 "psrad $15, %%mm0 \n\t"
1873 "psrad $15, %%mm2 \n\t"
1874 "packssdw %%mm2, %%mm0 \n\t"
1875 "packuswb %%mm0, %%mm0 \n\t"
1876 "movd %%mm0, (%1, %%"REG_a") \n\t"
1877 "add $4, %%"REG_a" \n\t"
1880 : "r" (dst+width), "g" ((x86_reg)-width)
1885 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1888 "movq 24+%4, %%mm6 \n\t"
1889 "mov %3, %%"REG_a" \n\t"
1890 "pxor %%mm7, %%mm7 \n\t"
1892 PREFETCH" 64(%0) \n\t"
1893 "movd (%0), %%mm0 \n\t"
1894 "movd 2(%0), %%mm1 \n\t"
1895 "punpcklbw %%mm7, %%mm0 \n\t"
1896 "punpcklbw %%mm7, %%mm1 \n\t"
1897 "movq %%mm0, %%mm2 \n\t"
1898 "movq %%mm1, %%mm3 \n\t"
1899 "pmaddwd %4, %%mm0 \n\t"
1900 "pmaddwd 8+%4, %%mm1 \n\t"
1901 "pmaddwd 16+%4, %%mm2 \n\t"
1902 "pmaddwd %%mm6, %%mm3 \n\t"
1903 "paddd %%mm1, %%mm0 \n\t"
1904 "paddd %%mm3, %%mm2 \n\t"
1906 "movd 6(%0), %%mm1 \n\t"
1907 "movd 8(%0), %%mm3 \n\t"
1909 "punpcklbw %%mm7, %%mm1 \n\t"
1910 "punpcklbw %%mm7, %%mm3 \n\t"
1911 "movq %%mm1, %%mm4 \n\t"
1912 "movq %%mm3, %%mm5 \n\t"
1913 "pmaddwd %4, %%mm1 \n\t"
1914 "pmaddwd 8+%4, %%mm3 \n\t"
1915 "pmaddwd 16+%4, %%mm4 \n\t"
1916 "pmaddwd %%mm6, %%mm5 \n\t"
1917 "paddd %%mm3, %%mm1 \n\t"
1918 "paddd %%mm5, %%mm4 \n\t"
1920 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1921 "paddd %%mm3, %%mm0 \n\t"
1922 "paddd %%mm3, %%mm2 \n\t"
1923 "paddd %%mm3, %%mm1 \n\t"
1924 "paddd %%mm3, %%mm4 \n\t"
1925 "psrad $15, %%mm0 \n\t"
1926 "psrad $15, %%mm2 \n\t"
1927 "psrad $15, %%mm1 \n\t"
1928 "psrad $15, %%mm4 \n\t"
1929 "packssdw %%mm1, %%mm0 \n\t"
1930 "packssdw %%mm4, %%mm2 \n\t"
1931 "packuswb %%mm0, %%mm0 \n\t"
1932 "packuswb %%mm2, %%mm2 \n\t"
1933 "movd %%mm0, (%1, %%"REG_a") \n\t"
1934 "movd %%mm2, (%2, %%"REG_a") \n\t"
1935 "add $4, %%"REG_a" \n\t"
1938 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1944 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1946 #if COMPILE_TEMPLATE_MMX
1947 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1950 for (i=0; i<width; i++) {
1955 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1957 #endif /* COMPILE_TEMPLATE_MMX */
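/* In the scalar fallback above, the constant 33<<(RGB2YUV_SHIFT-1) equals
 * (16 + 0.5)<<RGB2YUV_SHIFT, i.e. it folds the +16 luma offset and a 0.5 rounding term
 * into a single addend before the final right shift; likewise 257<<(RGB2YUV_SHIFT-1)
 * equals (128 + 0.5)<<RGB2YUV_SHIFT in the chroma functions below. */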
1960 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1962 #if COMPILE_TEMPLATE_MMX
1963 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1966 for (i=0; i<width; i++) {
1967 int b= src1[3*i + 0];
1968 int g= src1[3*i + 1];
1969 int r= src1[3*i + 2];
1971 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1972 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1974 #endif /* COMPILE_TEMPLATE_MMX */
1975 assert(src1 == src2);
1978 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1981 for (i=0; i<width; i++) {
1982 int b= src1[6*i + 0] + src1[6*i + 3];
1983 int g= src1[6*i + 1] + src1[6*i + 4];
1984 int r= src1[6*i + 2] + src1[6*i + 5];
1986 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1987 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1989 assert(src1 == src2);
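/* The *_half variants sum two horizontally adjacent pixels, so their constant
 * 257<<RGB2YUV_SHIFT corresponds to (128 + 0.5)<<(RGB2YUV_SHIFT+1): the +128 chroma
 * offset plus rounding, matching the extra >>1 that averages the pixel pair. */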
1992 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1994 #if COMPILE_TEMPLATE_MMX
1995 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1998 for (i=0; i<width; i++) {
2003 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2008 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2010 #if COMPILE_TEMPLATE_MMX
2012 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2016 for (i=0; i<width; i++) {
2017 int r= src1[3*i + 0];
2018 int g= src1[3*i + 1];
2019 int b= src1[3*i + 2];
2021 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2022 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2027 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2031 for (i=0; i<width; i++) {
2032 int r= src1[6*i + 0] + src1[6*i + 3];
2033 int g= src1[6*i + 1] + src1[6*i + 4];
2034 int b= src1[6*i + 2] + src1[6*i + 5];
2036 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2037 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2042 // bilinear / bicubic scaling
2043 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2044 const int16_t *filter, const int16_t *filterPos, long filterSize)
2046 #if COMPILE_TEMPLATE_MMX
2047 assert(filterSize % 4 == 0 && filterSize>0);
2048 if (filterSize==4) { // Always true for upscaling, sometimes for downscaling, too.
2049 x86_reg counter= -2*dstW;
2051 filterPos-= counter/2;
2055 "push %%"REG_b" \n\t"
2057 "pxor %%mm7, %%mm7 \n\t"
2058 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2059 "mov %%"REG_a", %%"REG_BP" \n\t"
2062 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2063 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2064 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2065 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2066 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2067 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2068 "punpcklbw %%mm7, %%mm0 \n\t"
2069 "punpcklbw %%mm7, %%mm2 \n\t"
2070 "pmaddwd %%mm1, %%mm0 \n\t"
2071 "pmaddwd %%mm2, %%mm3 \n\t"
2072 "movq %%mm0, %%mm4 \n\t"
2073 "punpckldq %%mm3, %%mm0 \n\t"
2074 "punpckhdq %%mm3, %%mm4 \n\t"
2075 "paddd %%mm4, %%mm0 \n\t"
2076 "psrad $7, %%mm0 \n\t"
2077 "packssdw %%mm0, %%mm0 \n\t"
2078 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2079 "add $4, %%"REG_BP" \n\t"
2082 "pop %%"REG_BP" \n\t"
2084 "pop %%"REG_b" \n\t"
2087 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2092 } else if (filterSize==8) {
2093 x86_reg counter= -2*dstW;
2095 filterPos-= counter/2;
2099 "push %%"REG_b" \n\t"
2101 "pxor %%mm7, %%mm7 \n\t"
2102 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2103 "mov %%"REG_a", %%"REG_BP" \n\t"
2106 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2107 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2108 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2109 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2110 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2111 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2112 "punpcklbw %%mm7, %%mm0 \n\t"
2113 "punpcklbw %%mm7, %%mm2 \n\t"
2114 "pmaddwd %%mm1, %%mm0 \n\t"
2115 "pmaddwd %%mm2, %%mm3 \n\t"
2117 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2118 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2119 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2120 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2121 "punpcklbw %%mm7, %%mm4 \n\t"
2122 "punpcklbw %%mm7, %%mm2 \n\t"
2123 "pmaddwd %%mm1, %%mm4 \n\t"
2124 "pmaddwd %%mm2, %%mm5 \n\t"
2125 "paddd %%mm4, %%mm0 \n\t"
2126 "paddd %%mm5, %%mm3 \n\t"
2127 "movq %%mm0, %%mm4 \n\t"
2128 "punpckldq %%mm3, %%mm0 \n\t"
2129 "punpckhdq %%mm3, %%mm4 \n\t"
2130 "paddd %%mm4, %%mm0 \n\t"
2131 "psrad $7, %%mm0 \n\t"
2132 "packssdw %%mm0, %%mm0 \n\t"
2133 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2134 "add $4, %%"REG_BP" \n\t"
2137 "pop %%"REG_BP" \n\t"
2139 "pop %%"REG_b" \n\t"
2142 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2148 const uint8_t *offset = src+filterSize;
2149 x86_reg counter= -2*dstW;
2150 //filter-= counter*filterSize/2;
2151 filterPos-= counter/2;
2154 "pxor %%mm7, %%mm7 \n\t"
2157 "mov %2, %%"REG_c" \n\t"
2158 "movzwl (%%"REG_c", %0), %%eax \n\t"
2159 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2160 "mov %5, %%"REG_c" \n\t"
2161 "pxor %%mm4, %%mm4 \n\t"
2162 "pxor %%mm5, %%mm5 \n\t"
2164 "movq (%1), %%mm1 \n\t"
2165 "movq (%1, %6), %%mm3 \n\t"
2166 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2167 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2168 "punpcklbw %%mm7, %%mm0 \n\t"
2169 "punpcklbw %%mm7, %%mm2 \n\t"
2170 "pmaddwd %%mm1, %%mm0 \n\t"
2171 "pmaddwd %%mm2, %%mm3 \n\t"
2172 "paddd %%mm3, %%mm5 \n\t"
2173 "paddd %%mm0, %%mm4 \n\t"
2175 "add $4, %%"REG_c" \n\t"
2176 "cmp %4, %%"REG_c" \n\t"
2179 "movq %%mm4, %%mm0 \n\t"
2180 "punpckldq %%mm5, %%mm4 \n\t"
2181 "punpckhdq %%mm5, %%mm0 \n\t"
2182 "paddd %%mm0, %%mm4 \n\t"
2183 "psrad $7, %%mm4 \n\t"
2184 "packssdw %%mm4, %%mm4 \n\t"
2185 "mov %3, %%"REG_a" \n\t"
2186 "movd %%mm4, (%%"REG_a", %0) \n\t"
2190 : "+r" (counter), "+r" (filter)
2191 : "m" (filterPos), "m" (dst), "m"(offset),
2192 "m" (src), "r" ((x86_reg)filterSize*2)
2193 : "%"REG_a, "%"REG_c, "%"REG_d
2197 #if COMPILE_TEMPLATE_ALTIVEC
2198 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2201 for (i=0; i<dstW; i++) {
2203 int srcPos= filterPos[i];
2205 //printf("filterPos: %d\n", filterPos[i]);
2206 for (j=0; j<filterSize; j++) {
2207 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2208 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2210 //filter += hFilterSize;
2211 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2214 #endif /* COMPILE_TEMPLATE_ALTIVEC */
2215 #endif /* COMPILE_TEMPLATE_MMX */
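/* All three code paths above (MMX, AltiVec, plain C) compute the same horizontal
 * convolution as the C reference loop:
 *     dst[i] = FFMIN((sum over j of src[filterPos[i]+j] * filter[filterSize*i+j]) >> 7, 32767)
 * i.e. 8-bit input samples filtered with signed 16-bit coefficients and stored as a
 * clipped 15-bit intermediate line for the vertical scaler. */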
2218 //FIXME all pal and rgb srcFormats could do this conversion as well
2219 //FIXME all scalers more complex than bilinear could do half of this transform
2220 static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2223 for (i = 0; i < width; i++) {
2224 dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2225 dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2228 static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2231 for (i = 0; i < width; i++) {
2232 dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
2233 dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2236 static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2239 for (i = 0; i < width; i++)
2240 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2242 static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2245 for (i = 0; i < width; i++)
2246 dst[i] = (dst[i]*14071 + 33561947)>>14;
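/* These range-conversion helpers work on 15-bit samples (pixel value <<7); the chroma
 * variants touch both the U and V rows, stored VOFW samples apart.  Roughly:
 *   lumRangeToJpeg:   y_full = (y - 16*128) * 255/219,  with 19077 ~= (255/219)<<14
 *   lumRangeFromJpeg: y_lim  =  y * 219/255 + 16*128,   with 14071 ~= (219/255)<<14
 * and analogously for chroma with 224 instead of 219 (4663 ~= (255/224)<<12,
 * 1799 ~= (224/255)<<11).  The FFMIN() clamps keep the fixed-point products from
 * exceeding the int16_t range of the destination buffer. */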
2249 #define FAST_BILINEAR_X86 \
2250 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2251 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2252 "shll $16, %%edi \n\t" \
2253 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2254 "mov %1, %%"REG_D"\n\t" \
2255 "shrl $9, %%esi \n\t" \
2257 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2258 long dstWidth, const uint8_t *src, int srcW,
2261 #if ARCH_X86 && CONFIG_GPL
2262 #if COMPILE_TEMPLATE_MMX2
2263 int32_t *filterPos = c->hLumFilterPos;
2264 int16_t *filter = c->hLumFilter;
2265 int canMMX2BeUsed = c->canMMX2BeUsed;
2266 void *mmx2FilterCode= c->lumMmx2FilterCode;
2269 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2271 if (canMMX2BeUsed) {
2274 "mov %%"REG_b", %5 \n\t"
2276 "pxor %%mm7, %%mm7 \n\t"
2277 "mov %0, %%"REG_c" \n\t"
2278 "mov %1, %%"REG_D" \n\t"
2279 "mov %2, %%"REG_d" \n\t"
2280 "mov %3, %%"REG_b" \n\t"
2281 "xor %%"REG_a", %%"REG_a" \n\t" // i
2282 PREFETCH" (%%"REG_c") \n\t"
2283 PREFETCH" 32(%%"REG_c") \n\t"
2284 PREFETCH" 64(%%"REG_c") \n\t"
2288 #define CALL_MMX2_FILTER_CODE \
2289 "movl (%%"REG_b"), %%esi \n\t"\
2291 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2292 "add %%"REG_S", %%"REG_c" \n\t"\
2293 "add %%"REG_a", %%"REG_D" \n\t"\
2294 "xor %%"REG_a", %%"REG_a" \n\t"\
2298 #define CALL_MMX2_FILTER_CODE \
2299 "movl (%%"REG_b"), %%esi \n\t"\
2301 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2302 "add %%"REG_a", %%"REG_D" \n\t"\
2303 "xor %%"REG_a", %%"REG_a" \n\t"\
2305 #endif /* ARCH_X86_64 */
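/* CALL_MMX2_FILTER_CODE runs one chunk of the run-time generated horizontal filter
 * (c->lumMmx2FilterCode / c->chrMmx2FilterCode, passed in as the mmx2FilterCode
 * operand of the asm blocks below): it loads the next source offset from the
 * filterPos table held in REG_b, executes the generated code, then advances the
 * source and destination pointers.  The two variants differ only in how the offset
 * update is expressed on x86-64 vs. x86-32. */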
2307 CALL_MMX2_FILTER_CODE
2308 CALL_MMX2_FILTER_CODE
2309 CALL_MMX2_FILTER_CODE
2310 CALL_MMX2_FILTER_CODE
2311 CALL_MMX2_FILTER_CODE
2312 CALL_MMX2_FILTER_CODE
2313 CALL_MMX2_FILTER_CODE
2314 CALL_MMX2_FILTER_CODE
2317 "mov %5, %%"REG_b" \n\t"
2319 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
2320 "m" (mmx2FilterCode)
2324 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2329 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2331 #endif /* COMPILE_TEMPLATE_MMX2 */
2332 x86_reg xInc_shr16 = xInc >> 16;
2333 uint16_t xInc_mask = xInc & 0xffff;
2334 // no MMX, just plain x86 asm ...
2336 "xor %%"REG_a", %%"REG_a" \n\t" // i
2337 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2338 "xorl %%ecx, %%ecx \n\t" // xalpha
2341 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2342 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2344 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2345 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2346 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2348 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2349 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2351 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2352 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2353 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2356 "add $2, %%"REG_a" \n\t"
2357 "cmp %2, %%"REG_a" \n\t"
2361 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2362 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2364 #if COMPILE_TEMPLATE_MMX2
2365 } //if MMX2 can't be used
2369 unsigned int xpos=0;
2370 for (i=0;i<dstWidth;i++) {
2371 register unsigned int xx=xpos>>16;
2372 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2373 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2376 #endif /* ARCH_X86 */
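/* Both the asm and the C fallback above implement the same fast bilinear scheme:
 * xpos is a 16.16 fixed-point source position, and each output sample is
 * src[xx]*128 + (src[xx+1]-src[xx])*xalpha, a 15-bit interpolated value.  The C code
 * uses the top 7 bits of the fractional part as xalpha; the asm keeps the full 16-bit
 * fraction and shifts the product instead, which is equivalent up to rounding. */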
2379 // *** horizontal scale Y line to temp buffer
2380 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2381 const int16_t *hLumFilter,
2382 const int16_t *hLumFilterPos, int hLumFilterSize,
2383 uint8_t *formatConvBuffer,
2384 uint32_t *pal, int isAlpha)
2386 void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2387 void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2389 src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2392 toYV12(formatConvBuffer, src, srcW, pal);
2393 src= formatConvBuffer;
2396 if (!c->hyscale_fast) {
2397 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2398 } else { // fast bilinear upscale / crap downscale
2399 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2403 convertRange(dst, dstWidth);
2406 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2407 long dstWidth, const uint8_t *src1,
2408 const uint8_t *src2, int srcW, int xInc)
2410 #if ARCH_X86 && CONFIG_GPL
2411 #if COMPILE_TEMPLATE_MMX2
2412 int32_t *filterPos = c->hChrFilterPos;
2413 int16_t *filter = c->hChrFilter;
2414 int canMMX2BeUsed = c->canMMX2BeUsed;
2415 void *mmx2FilterCode= c->chrMmx2FilterCode;
2418 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2420 if (canMMX2BeUsed) {
2423 "mov %%"REG_b", %6 \n\t"
2425 "pxor %%mm7, %%mm7 \n\t"
2426 "mov %0, %%"REG_c" \n\t"
2427 "mov %1, %%"REG_D" \n\t"
2428 "mov %2, %%"REG_d" \n\t"
2429 "mov %3, %%"REG_b" \n\t"
2430 "xor %%"REG_a", %%"REG_a" \n\t" // i
2431 PREFETCH" (%%"REG_c") \n\t"
2432 PREFETCH" 32(%%"REG_c") \n\t"
2433 PREFETCH" 64(%%"REG_c") \n\t"
2435 CALL_MMX2_FILTER_CODE
2436 CALL_MMX2_FILTER_CODE
2437 CALL_MMX2_FILTER_CODE
2438 CALL_MMX2_FILTER_CODE
2439 "xor %%"REG_a", %%"REG_a" \n\t" // i
2440 "mov %5, %%"REG_c" \n\t" // src
2441 "mov %1, %%"REG_D" \n\t" // buf1
2442 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2443 PREFETCH" (%%"REG_c") \n\t"
2444 PREFETCH" 32(%%"REG_c") \n\t"
2445 PREFETCH" 64(%%"REG_c") \n\t"
2447 CALL_MMX2_FILTER_CODE
2448 CALL_MMX2_FILTER_CODE
2449 CALL_MMX2_FILTER_CODE
2450 CALL_MMX2_FILTER_CODE
2453 "mov %6, %%"REG_b" \n\t"
2455 :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
2456 "m" (mmx2FilterCode), "m" (src2)
2460 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2465 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2466 //printf("%d %d %d\n", dstWidth, i, srcW);
2467 dst[i] = src1[srcW-1]*128;
2468 dst[i+VOFW] = src2[srcW-1]*128;
2471 #endif /* COMPILE_TEMPLATE_MMX2 */
2472 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2473 uint16_t xInc_mask = xInc & 0xffff;
2475 "xor %%"REG_a", %%"REG_a" \n\t" // i
2476 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2477 "xorl %%ecx, %%ecx \n\t" // xalpha
2480 "mov %0, %%"REG_S" \n\t"
2481 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2482 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2484 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2486 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2487 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2489 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2491 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2492 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2493 "add $1, %%"REG_a" \n\t"
2494 "cmp %2, %%"REG_a" \n\t"
2497 /* The "g" operand used here, which is needed to support GCC 4.0, makes MPlayer
2498 crash on IA-32 machines when compiled with GCC 3.3. */
2499 #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2500 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2502 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2505 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2507 #if COMPILE_TEMPLATE_MMX2
2508 } //if MMX2 can't be used
2512 unsigned int xpos=0;
2513 for (i=0;i<dstWidth;i++) {
2514 register unsigned int xx=xpos>>16;
2515 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2516 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2517 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2519 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2520 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2524 #endif /* ARCH_X86 */
2527 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2528 int srcW, int xInc, const int16_t *hChrFilter,
2529 const int16_t *hChrFilterPos, int hChrFilterSize,
2530 uint8_t *formatConvBuffer,
2534 src1 += c->chrSrcOffset;
2535 src2 += c->chrSrcOffset;
2538 c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2539 src1= formatConvBuffer;
2540 src2= formatConvBuffer+VOFW;
2543 if (!c->hcscale_fast) {
2544 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2545 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2546 } else { // fast bilinear upscale / crap downscale
2547 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2550 if (c->chrConvertRange)
2551 c->chrConvertRange(dst, dstWidth);
2554 #define DEBUG_SWSCALE_BUFFERS 0
2555 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2557 static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2558 int srcSliceH, uint8_t* dst[], int dstStride[])
2560 /* load a few things into local vars to make the code more readable and faster */
2561 const int srcW= c->srcW;
2562 const int dstW= c->dstW;
2563 const int dstH= c->dstH;
2564 const int chrDstW= c->chrDstW;
2565 const int chrSrcW= c->chrSrcW;
2566 const int lumXInc= c->lumXInc;
2567 const int chrXInc= c->chrXInc;
2568 const enum PixelFormat dstFormat= c->dstFormat;
2569 const int flags= c->flags;
2570 int16_t *vLumFilterPos= c->vLumFilterPos;
2571 int16_t *vChrFilterPos= c->vChrFilterPos;
2572 int16_t *hLumFilterPos= c->hLumFilterPos;
2573 int16_t *hChrFilterPos= c->hChrFilterPos;
2574 int16_t *vLumFilter= c->vLumFilter;
2575 int16_t *vChrFilter= c->vChrFilter;
2576 int16_t *hLumFilter= c->hLumFilter;
2577 int16_t *hChrFilter= c->hChrFilter;
2578 int32_t *lumMmxFilter= c->lumMmxFilter;
2579 int32_t *chrMmxFilter= c->chrMmxFilter;
2580 int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2581 const int vLumFilterSize= c->vLumFilterSize;
2582 const int vChrFilterSize= c->vChrFilterSize;
2583 const int hLumFilterSize= c->hLumFilterSize;
2584 const int hChrFilterSize= c->hChrFilterSize;
2585 int16_t **lumPixBuf= c->lumPixBuf;
2586 int16_t **chrPixBuf= c->chrPixBuf;
2587 int16_t **alpPixBuf= c->alpPixBuf;
2588 const int vLumBufSize= c->vLumBufSize;
2589 const int vChrBufSize= c->vChrBufSize;
2590 uint8_t *formatConvBuffer= c->formatConvBuffer;
2591 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2592 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); // == ceil(srcSliceH / (1<<chrSrcVSubSample))
2594 uint32_t *pal=c->pal_yuv;
2596 /* vars which will change and which we need to store back in the context */
2598 int lumBufIndex= c->lumBufIndex;
2599 int chrBufIndex= c->chrBufIndex;
2600 int lastInLumBuf= c->lastInLumBuf;
2601 int lastInChrBuf= c->lastInChrBuf;
2603 if (isPacked(c->srcFormat)) {
2611 srcStride[3]= srcStride[0];
2613 srcStride[1]<<= c->vChrDrop;
2614 srcStride[2]<<= c->vChrDrop;
2616 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2617 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2618 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2619 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2620 srcSliceY, srcSliceH, dstY, dstH);
2621 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2622 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
2624 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2625 static int warnedAlready=0; //FIXME move this into the context perhaps
2626 if (flags & SWS_PRINT_INFO && !warnedAlready) {
2627 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2628 " ->cannot do aligned memory accesses anymore\n");
2633 /* Note that the user might start scaling the picture in the middle, so this
2634 will not get executed. This is not really intended, but it currently works,
2635 so people might rely on it. */
2636 if (srcSliceY ==0) {
2646 for (;dstY < dstH; dstY++) {
2647 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2648 const int chrDstY= dstY>>c->chrDstVSubSample;
2649 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2650 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2651 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2653 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2654 const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2655 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2656 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2657 int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2658 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2661 //handle holes (FAST_BILINEAR & weird filters)
2662 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2663 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2664 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2665 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2667 DEBUG_BUFFERS("dstY: %d\n", dstY);
2668 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2669 firstLumSrcY, lastLumSrcY, lastInLumBuf);
2670 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2671 firstChrSrcY, lastChrSrcY, lastInChrBuf);
2673 // Do we have enough lines in this slice to output the dstY line?
2674 enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2676 if (!enough_lines) {
2677 lastLumSrcY = srcSliceY + srcSliceH - 1;
2678 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2679 DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2680 lastLumSrcY, lastChrSrcY);
2683 //Do horizontal scaling
2684 while(lastInLumBuf < lastLumSrcY) {
2685 const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2686 const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2688 assert(lumBufIndex < 2*vLumBufSize);
2689 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2690 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2691 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2692 hLumFilter, hLumFilterPos, hLumFilterSize,
2695 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2696 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2697 hLumFilter, hLumFilterPos, hLumFilterSize,
2701 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2702 lumBufIndex, lastInLumBuf);
2704 while(lastInChrBuf < lastChrSrcY) {
2705 const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2706 const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2708 assert(chrBufIndex < 2*vChrBufSize);
2709 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2710 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2711 //FIXME pass the parameters through the context struct (at least some of them)
2713 if (c->needs_hcscale)
2714 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2715 hChrFilter, hChrFilterPos, hChrFilterSize,
2719 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2720 chrBufIndex, lastInChrBuf);
2722 //wrap buf index around to stay inside the ring buffer
2723 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2724 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
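/* lumPixBuf/chrPixBuf/alpPixBuf are ring buffers of line pointers with 2*vLumBufSize
 * (resp. 2*vChrBufSize) entries, the second half mirroring the first (set up at
 * context initialization, outside this template), so the lumSrcPtr/chrSrcPtr bases
 * computed below can index past vLumBufSize without an explicit wrap; the pointer
 * range asserts further down rely on exactly that layout. */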
2726 break; //we can't output a dstY line so let's try with the next slice
2728 #if COMPILE_TEMPLATE_MMX
2729 c->blueDither= ff_dither8[dstY&1];
2730 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2731 c->greenDither= ff_dither8[dstY&1];
2733 c->greenDither= ff_dither4[dstY&1];
2734 c->redDither= ff_dither8[(dstY+1)&1];
2736 if (dstY < dstH-2) {
2737 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2738 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2739 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2740 #if COMPILE_TEMPLATE_MMX
2742 if (flags & SWS_ACCURATE_RND) {
2743 int s= APCK_SIZE / 8;
2744 for (i=0; i<vLumFilterSize; i+=2) {
2745 *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2746 *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2747 lumMmxFilter[s*i+APCK_COEF/4 ]=
2748 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2749 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2750 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2751 *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2752 *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2753 alpMmxFilter[s*i+APCK_COEF/4 ]=
2754 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2757 for (i=0; i<vChrFilterSize; i+=2) {
2758 *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2759 *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2760 chrMmxFilter[s*i+APCK_COEF/4 ]=
2761 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2762 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2765 for (i=0; i<vLumFilterSize; i++) {
2766 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2767 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2768 lumMmxFilter[4*i+2]=
2769 lumMmxFilter[4*i+3]=
2770 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2771 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2772 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2773 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2774 alpMmxFilter[4*i+2]=
2775 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2778 for (i=0; i<vChrFilterSize; i++) {
2779 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2780 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2781 chrMmxFilter[4*i+2]=
2782 chrMmxFilter[4*i+3]=
2783 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
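/* Both branches above only marshal data for the MMX vertical scaler: each filter
 * entry packs a source-line pointer (split into two 32-bit halves) together with the
 * vertical coefficient replicated across 16-bit lanes, laid out as the
 * YSCALEYUV2YV12X* asm expects (APCK_* offsets for the accurate-rounding path, a
 * fixed stride of four 32-bit words otherwise). */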
2787 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2788 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2789 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions into luma / chroma
2791 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2792 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2793 dest, uDest, dstW, chrDstW, dstFormat);
2794 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2795 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2796 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions into luma / chroma
2797 if (is16BPS(dstFormat)) {
2799 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2800 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2801 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2803 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2804 const int16_t *lumBuf = lumSrcPtr[0];
2805 const int16_t *chrBuf= chrSrcPtr[0];
2806 const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2807 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2808 } else { //General YV12
2810 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2811 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2812 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2815 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2816 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2817 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2818 int chrAlpha= vChrFilter[2*dstY+1];
2819 if(flags & SWS_FULL_CHR_H_INT) {
2820 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2821 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2822 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2823 alpSrcPtr, dest, dstW, dstY);
2825 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2826 alpPixBuf ? *alpSrcPtr : NULL,
2827 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2829 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2830 int lumAlpha= vLumFilter[2*dstY+1];
2831 int chrAlpha= vChrFilter[2*dstY+1];
2833 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2835 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2836 if(flags & SWS_FULL_CHR_H_INT) {
2837 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2838 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2839 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2840 alpSrcPtr, dest, dstW, dstY);
2842 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2843 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2844 dest, dstW, lumAlpha, chrAlpha, dstY);
2846 } else { //general RGB
2847 if(flags & SWS_FULL_CHR_H_INT) {
2849 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2850 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2851 alpSrcPtr, dest, dstW, dstY);
2854 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2855 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2856 alpSrcPtr, dest, dstW, dstY);
2860 } else { // it seems we cannot use MMX here without overwriting this array's tail
2861 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2862 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2863 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2864 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2865 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2866 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions into luma / chroma
2868 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2869 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2870 dest, uDest, dstW, chrDstW, dstFormat);
2871 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2872 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2873 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions into luma / chroma
2874 if (is16BPS(dstFormat)) {
2876 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2877 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2878 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2882 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2883 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2884 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2887 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2888 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2889 if(flags & SWS_FULL_CHR_H_INT) {
2891 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2892 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2893 alpSrcPtr, dest, dstW, dstY);
2896 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2897 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2898 alpSrcPtr, dest, dstW, dstY);
2904 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2905 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2907 #if COMPILE_TEMPLATE_MMX
2908 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
2909 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2910 if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
2911 else __asm__ volatile("emms" :::"memory");
2913 /* store changed local vars back in the context */
2915 c->lumBufIndex= lumBufIndex;
2916 c->chrBufIndex= chrBufIndex;
2917 c->lastInLumBuf= lastInLumBuf;
2918 c->lastInChrBuf= lastInChrBuf;
2920 return dstY - lastDstY;
2923 static void RENAME(sws_init_swScale)(SwsContext *c)
2925 enum PixelFormat srcFormat = c->srcFormat;
2927 c->yuv2nv12X = RENAME(yuv2nv12X );
2928 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2929 c->yuv2yuvX = RENAME(yuv2yuvX );
2930 c->yuv2packed1 = RENAME(yuv2packed1 );
2931 c->yuv2packed2 = RENAME(yuv2packed2 );
2932 c->yuv2packedX = RENAME(yuv2packedX );
2934 c->hScale = RENAME(hScale );
2936 #if COMPILE_TEMPLATE_MMX
2937 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2938 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2940 if (c->flags & SWS_FAST_BILINEAR)
2943 c->hyscale_fast = RENAME(hyscale_fast);
2944 c->hcscale_fast = RENAME(hcscale_fast);
2947 c->chrToYV12 = NULL;
2949 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
2950 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
2951 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
2952 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
2956 case PIX_FMT_BGR4_BYTE:
2957 case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
2958 case PIX_FMT_YUV420P16BE:
2959 case PIX_FMT_YUV422P16BE:
2960 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2961 case PIX_FMT_YUV420P16LE:
2962 case PIX_FMT_YUV422P16LE:
2963 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2965 if (c->chrSrcHSubSample) {
2967 case PIX_FMT_RGB48BE:
2968 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
2969 case PIX_FMT_RGB32 :
2970 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
2971 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
2972 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
2973 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
2974 case PIX_FMT_BGR32 :
2975 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
2976 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
2977 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
2978 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
2982 case PIX_FMT_RGB48BE:
2983 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
2984 case PIX_FMT_RGB32 :
2985 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
2986 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
2987 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
2988 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
2989 case PIX_FMT_BGR32 :
2990 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
2991 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
2992 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
2993 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
2997 c->lumToYV12 = NULL;
2998 c->alpToYV12 = NULL;
2999 switch (srcFormat) {
3000 case PIX_FMT_YUYV422 :
3001 case PIX_FMT_YUV420P16BE:
3002 case PIX_FMT_YUV422P16BE:
3003 case PIX_FMT_YUV444P16BE:
3004 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
3005 case PIX_FMT_UYVY422 :
3006 case PIX_FMT_YUV420P16LE:
3007 case PIX_FMT_YUV422P16LE:
3008 case PIX_FMT_YUV444P16LE:
3009 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
3010 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
3011 case PIX_FMT_BGR565 : c->lumToYV12 = bgr16ToY; break;
3012 case PIX_FMT_BGR555 : c->lumToYV12 = bgr15ToY; break;
3013 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
3014 case PIX_FMT_RGB565 : c->lumToYV12 = rgb16ToY; break;
3015 case PIX_FMT_RGB555 : c->lumToYV12 = rgb15ToY; break;
3019 case PIX_FMT_BGR4_BYTE:
3020 case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
3021 case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
3022 case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
3023 case PIX_FMT_RGB32 :
3024 case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
3025 case PIX_FMT_BGR32 :
3026 case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
3027 case PIX_FMT_RGB48BE:
3028 case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
3031 switch (srcFormat) {
3032 case PIX_FMT_RGB32 :
3033 case PIX_FMT_RGB32_1:
3034 case PIX_FMT_BGR32 :
3035 case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
3039 switch (srcFormat) {
3040 case PIX_FMT_RGB32 :
3041 case PIX_FMT_BGR32 :
3042 c->alpSrcOffset = 3;
3044 case PIX_FMT_RGB32_1:
3045 case PIX_FMT_BGR32_1:
3046 c->lumSrcOffset = ALT32_CORR;
3047 c->chrSrcOffset = ALT32_CORR;
3049 case PIX_FMT_RGB48LE:
3050 c->lumSrcOffset = 1;
3051 c->chrSrcOffset = 1;
3052 c->alpSrcOffset = 1;
3056 if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
3058 c->lumConvertRange = RENAME(lumRangeFromJpeg);
3059 c->chrConvertRange = RENAME(chrRangeFromJpeg);
3061 c->lumConvertRange = RENAME(lumRangeToJpeg);
3062 c->chrConvertRange = RENAME(chrRangeToJpeg);
3066 if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
3067 srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
3068 c->needs_hcscale = 1;