2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/*
 * Per-target instruction selection for the template: PREFETCH, PAVGB
 * (byte average) and MOVNTQ (non-temporal store) expand to the best
 * instruction for the compile target (MMX2 or 3DNow!), with plain-MMX
 * fallbacks. NOTE(review): the #else/#endif lines of these conditionals
 * are not visible in this extract — confirm against the complete file.
 */
26 #if COMPILE_TEMPLATE_AMD3DNOW
27 #define PREFETCH "prefetch"
28 #elif COMPILE_TEMPLATE_MMX2
29 #define PREFETCH "prefetchnta"
31 #define PREFETCH " # nop"
34 #if COMPILE_TEMPLATE_MMX2
35 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
36 #elif COMPILE_TEMPLATE_AMD3DNOW
37 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
40 #if COMPILE_TEMPLATE_MMX2
41 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
43 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
45 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
47 #if COMPILE_TEMPLATE_ALTIVEC
48 #include "ppc/swscale_altivec_template.c"
/*
 * Vertical scale to a planar 8-bit row: walks the filter-tap list at
 * context offset "offset", accumulating pmulhw-weighted source words into
 * mm3/mm4 (seeded with the rounder from VROUNDER_OFFSET), then >>3,
 * packs to unsigned bytes and stores via MOVNTQ.
 * %0 = &c->redDither (base for context offsets), %1 = dest, %2 = width.
 * NOTE(review): this extract is missing lines of the original macro
 * (the __asm__ volatile( opener, loop labels and branch instructions) —
 * compare with the complete file before modifying.
 */
51 #define YSCALEYUV2YV12X(x, offset, dest, width) \
53 "xor %%"REG_a", %%"REG_a" \n\t"\
54 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
55 "movq %%mm3, %%mm4 \n\t"\
56 "lea " offset "(%0), %%"REG_d" \n\t"\
57 "mov (%%"REG_d"), %%"REG_S" \n\t"\
58 ".p2align 4 \n\t" /* FIXME Unroll? */\
60 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
61 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
62 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
63 "add $16, %%"REG_d" \n\t"\
64 "mov (%%"REG_d"), %%"REG_S" \n\t"\
65 "test %%"REG_S", %%"REG_S" \n\t"\
66 "pmulhw %%mm0, %%mm2 \n\t"\
67 "pmulhw %%mm0, %%mm5 \n\t"\
68 "paddw %%mm2, %%mm3 \n\t"\
69 "paddw %%mm5, %%mm4 \n\t"\
71 "psraw $3, %%mm3 \n\t"\
72 "psraw $3, %%mm4 \n\t"\
73 "packuswb %%mm4, %%mm3 \n\t"\
74 MOVNTQ(%%mm3, (%1, %%REGa))\
75 "add $8, %%"REG_a" \n\t"\
76 "cmp %2, %%"REG_a" \n\t"\
77 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
78 "movq %%mm3, %%mm4 \n\t"\
79 "lea " offset "(%0), %%"REG_d" \n\t"\
80 "mov (%%"REG_d"), %%"REG_S" \n\t"\
82 :: "r" (&c->redDither),\
83 "r" (dest), "g" ((x86_reg)width)\
84 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * Higher-precision variant of YSCALEYUV2YV12X: interleaves word pairs
 * from two tap pointers (current and APCK_PTR2) with punpck and uses
 * pmaddwd to accumulate 32-bit sums in mm4-mm7, then >>16, packs back
 * to words, adds the rounder, >>3, packs to bytes and stores.
 * Operand/clobber layout matches YSCALEYUV2YV12X.
 * NOTE(review): asm opener, loop labels and branches are missing from
 * this extract — confirm against the complete file.
 */
87 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
89 "lea " offset "(%0), %%"REG_d" \n\t"\
90 "xor %%"REG_a", %%"REG_a" \n\t"\
91 "pxor %%mm4, %%mm4 \n\t"\
92 "pxor %%mm5, %%mm5 \n\t"\
93 "pxor %%mm6, %%mm6 \n\t"\
94 "pxor %%mm7, %%mm7 \n\t"\
95 "mov (%%"REG_d"), %%"REG_S" \n\t"\
98 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
99 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
100 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
101 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
102 "movq %%mm0, %%mm3 \n\t"\
103 "punpcklwd %%mm1, %%mm0 \n\t"\
104 "punpckhwd %%mm1, %%mm3 \n\t"\
105 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
106 "pmaddwd %%mm1, %%mm0 \n\t"\
107 "pmaddwd %%mm1, %%mm3 \n\t"\
108 "paddd %%mm0, %%mm4 \n\t"\
109 "paddd %%mm3, %%mm5 \n\t"\
110 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
111 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
112 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
113 "test %%"REG_S", %%"REG_S" \n\t"\
114 "movq %%mm2, %%mm0 \n\t"\
115 "punpcklwd %%mm3, %%mm2 \n\t"\
116 "punpckhwd %%mm3, %%mm0 \n\t"\
117 "pmaddwd %%mm1, %%mm2 \n\t"\
118 "pmaddwd %%mm1, %%mm0 \n\t"\
119 "paddd %%mm2, %%mm6 \n\t"\
120 "paddd %%mm0, %%mm7 \n\t"\
122 "psrad $16, %%mm4 \n\t"\
123 "psrad $16, %%mm5 \n\t"\
124 "psrad $16, %%mm6 \n\t"\
125 "psrad $16, %%mm7 \n\t"\
126 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
127 "packssdw %%mm5, %%mm4 \n\t"\
128 "packssdw %%mm7, %%mm6 \n\t"\
129 "paddw %%mm0, %%mm4 \n\t"\
130 "paddw %%mm0, %%mm6 \n\t"\
131 "psraw $3, %%mm4 \n\t"\
132 "psraw $3, %%mm6 \n\t"\
133 "packuswb %%mm6, %%mm4 \n\t"\
134 MOVNTQ(%%mm4, (%1, %%REGa))\
135 "add $8, %%"REG_a" \n\t"\
136 "cmp %2, %%"REG_a" \n\t"\
137 "lea " offset "(%0), %%"REG_d" \n\t"\
138 "pxor %%mm4, %%mm4 \n\t"\
139 "pxor %%mm5, %%mm5 \n\t"\
140 "pxor %%mm6, %%mm6 \n\t"\
141 "pxor %%mm7, %%mm7 \n\t"\
142 "mov (%%"REG_d"), %%"REG_S" \n\t"\
144 :: "r" (&c->redDither),\
145 "r" (dest), "g" ((x86_reg)width)\
146 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * 1-tap "copy" vertical scale: reads 15-bit intermediate words from
 * (%0 + 2*index), shifts >>7 (truncating) to 8 bits, packs and stores
 * 8 output pixels per iteration. %2 holds the (negative) start index.
 * NOTE(review): loop label/branch lines are missing from this extract.
 */
149 #define YSCALEYUV2YV121 \
150 "mov %2, %%"REG_a" \n\t"\
151 ".p2align 4 \n\t" /* FIXME Unroll? */\
153 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
154 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
155 "psraw $7, %%mm0 \n\t"\
156 "psraw $7, %%mm1 \n\t"\
157 "packuswb %%mm1, %%mm0 \n\t"\
158 MOVNTQ(%%mm0, (%1, %%REGa))\
159 "add $8, %%"REG_a" \n\t"\
/*
 * Rounding variant of YSCALEYUV2YV121: builds the constant 64 (0x0040)
 * in every word of mm7 via pcmpeqw/psrlw/psllw and adds it (saturating)
 * before the >>7, so the conversion rounds to nearest instead of
 * truncating. NOTE(review): loop label/branch lines are missing here.
 */
162 #define YSCALEYUV2YV121_ACCURATE \
163 "mov %2, %%"REG_a" \n\t"\
164 "pcmpeqw %%mm7, %%mm7 \n\t"\
165 "psrlw $15, %%mm7 \n\t"\
166 "psllw $6, %%mm7 \n\t"\
167 ".p2align 4 \n\t" /* FIXME Unroll? */\
169 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
170 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
171 "paddsw %%mm7, %%mm0 \n\t"\
172 "paddsw %%mm7, %%mm1 \n\t"\
173 "psraw $7, %%mm0 \n\t"\
174 "psraw $7, %%mm1 \n\t"\
175 "packuswb %%mm1, %%mm0 \n\t"\
176 MOVNTQ(%%mm0, (%1, %%REGa))\
177 "add $8, %%"REG_a" \n\t"\
181 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
182 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
183 "r" (dest), "m" (dstW_reg),
184 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
185 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/*
 * Chroma half of the packed-output vertical scaler: accumulates
 * pmulhw-weighted U (offset 0) and V (offset VOF) taps into mm3/mm4,
 * seeded with the rounder. Leaves the test of the next tap pointer in
 * flags for the (not visible here) loop branch.
 * NOTE(review): asm opener and loop labels are missing in this extract.
 */
187 #define YSCALEYUV2PACKEDX_UV \
189 "xor %%"REG_a", %%"REG_a" \n\t"\
193 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
194 "mov (%%"REG_d"), %%"REG_S" \n\t"\
195 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
196 "movq %%mm3, %%mm4 \n\t"\
199 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
200 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
201 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
202 "add $16, %%"REG_d" \n\t"\
203 "mov (%%"REG_d"), %%"REG_S" \n\t"\
204 "pmulhw %%mm0, %%mm2 \n\t"\
205 "pmulhw %%mm0, %%mm5 \n\t"\
206 "paddw %%mm2, %%mm3 \n\t"\
207 "paddw %%mm5, %%mm4 \n\t"\
208 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * Luma (or alpha) half of the packed-output vertical scaler, with the
 * MMX registers used for coefficient, sources and accumulators passed
 * as parameters so the same body serves Y and A planes. Accumulates
 * pmulhw-weighted taps from the list at "offset" into dst1/dst2.
 * NOTE(review): loop labels/branches are missing in this extract.
 */
211 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
212 "lea "offset"(%0), %%"REG_d" \n\t"\
213 "mov (%%"REG_d"), %%"REG_S" \n\t"\
214 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
215 "movq "#dst1", "#dst2" \n\t"\
218 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
219 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
220 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
221 "add $16, %%"REG_d" \n\t"\
222 "mov (%%"REG_d"), %%"REG_S" \n\t"\
223 "pmulhw "#coeff", "#src1" \n\t"\
224 "pmulhw "#coeff", "#src2" \n\t"\
225 "paddw "#src1", "#dst1" \n\t"\
226 "paddw "#src2", "#dst2" \n\t"\
227 "test %%"REG_S", %%"REG_S" \n\t"\
/* Full packed-output vertical scale: chroma pass, then luma pass with
 * Y1/Y2 accumulated in mm1/mm7 (the layout YSCALEYUV2RGBX expects). */
230 #define YSCALEYUV2PACKEDX \
231 YSCALEYUV2PACKEDX_UV \
232 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
/* Closing operand/clobber list shared by the YSCALEYUV2PACKEDX asm
 * statements: %0 = context base, %4 = dest, %5 = dstW. The "dummy"
 * inputs keep operand numbering stable across variants. */
234 #define YSCALEYUV2PACKEDX_END \
235 :: "r" (&c->redDither), \
236 "m" (dummy), "m" (dummy), "m" (dummy),\
237 "r" (dest), "m" (dstW_reg) \
238 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * High-precision chroma pass for packed output: pmaddwd over interleaved
 * tap pairs (current pointer and APCK_PTR2) accumulating 32-bit sums in
 * mm4-mm7; afterwards >>16, pack to words, add rounder and spill the
 * U/V results to the context scratch slots U_TEMP/V_TEMP so the luma
 * pass can reuse all registers.
 * NOTE(review): asm opener and loop labels are missing in this extract.
 */
241 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
243 "xor %%"REG_a", %%"REG_a" \n\t"\
247 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
248 "mov (%%"REG_d"), %%"REG_S" \n\t"\
249 "pxor %%mm4, %%mm4 \n\t"\
250 "pxor %%mm5, %%mm5 \n\t"\
251 "pxor %%mm6, %%mm6 \n\t"\
252 "pxor %%mm7, %%mm7 \n\t"\
255 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
256 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
257 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
258 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
259 "movq %%mm0, %%mm3 \n\t"\
260 "punpcklwd %%mm1, %%mm0 \n\t"\
261 "punpckhwd %%mm1, %%mm3 \n\t"\
262 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
263 "pmaddwd %%mm1, %%mm0 \n\t"\
264 "pmaddwd %%mm1, %%mm3 \n\t"\
265 "paddd %%mm0, %%mm4 \n\t"\
266 "paddd %%mm3, %%mm5 \n\t"\
267 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
268 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
269 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
270 "test %%"REG_S", %%"REG_S" \n\t"\
271 "movq %%mm2, %%mm0 \n\t"\
272 "punpcklwd %%mm3, %%mm2 \n\t"\
273 "punpckhwd %%mm3, %%mm0 \n\t"\
274 "pmaddwd %%mm1, %%mm2 \n\t"\
275 "pmaddwd %%mm1, %%mm0 \n\t"\
276 "paddd %%mm2, %%mm6 \n\t"\
277 "paddd %%mm0, %%mm7 \n\t"\
279 "psrad $16, %%mm4 \n\t"\
280 "psrad $16, %%mm5 \n\t"\
281 "psrad $16, %%mm6 \n\t"\
282 "psrad $16, %%mm7 \n\t"\
283 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
284 "packssdw %%mm5, %%mm4 \n\t"\
285 "packssdw %%mm7, %%mm6 \n\t"\
286 "paddw %%mm0, %%mm4 \n\t"\
287 "paddw %%mm0, %%mm6 \n\t"\
288 "movq %%mm4, "U_TEMP"(%0) \n\t"\
289 "movq %%mm6, "V_TEMP"(%0) \n\t"\
/*
 * High-precision luma/alpha pass for packed output: same pmaddwd
 * pair-interleave scheme as the UV pass, accumulating in mm1/mm5/mm7/mm6,
 * then >>16, pack, add rounder; finally reloads the spilled chroma from
 * U_TEMP/V_TEMP into mm3/mm4 for the RGB conversion step.
 * NOTE(review): loop labels/branches are missing in this extract.
 */
291 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
292 "lea "offset"(%0), %%"REG_d" \n\t"\
293 "mov (%%"REG_d"), %%"REG_S" \n\t"\
294 "pxor %%mm1, %%mm1 \n\t"\
295 "pxor %%mm5, %%mm5 \n\t"\
296 "pxor %%mm7, %%mm7 \n\t"\
297 "pxor %%mm6, %%mm6 \n\t"\
300 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
301 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
302 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
303 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
304 "movq %%mm0, %%mm3 \n\t"\
305 "punpcklwd %%mm4, %%mm0 \n\t"\
306 "punpckhwd %%mm4, %%mm3 \n\t"\
307 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
308 "pmaddwd %%mm4, %%mm0 \n\t"\
309 "pmaddwd %%mm4, %%mm3 \n\t"\
310 "paddd %%mm0, %%mm1 \n\t"\
311 "paddd %%mm3, %%mm5 \n\t"\
312 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
313 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
314 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
315 "test %%"REG_S", %%"REG_S" \n\t"\
316 "movq %%mm2, %%mm0 \n\t"\
317 "punpcklwd %%mm3, %%mm2 \n\t"\
318 "punpckhwd %%mm3, %%mm0 \n\t"\
319 "pmaddwd %%mm4, %%mm2 \n\t"\
320 "pmaddwd %%mm4, %%mm0 \n\t"\
321 "paddd %%mm2, %%mm7 \n\t"\
322 "paddd %%mm0, %%mm6 \n\t"\
324 "psrad $16, %%mm1 \n\t"\
325 "psrad $16, %%mm5 \n\t"\
326 "psrad $16, %%mm7 \n\t"\
327 "psrad $16, %%mm6 \n\t"\
328 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
329 "packssdw %%mm5, %%mm1 \n\t"\
330 "packssdw %%mm6, %%mm7 \n\t"\
331 "paddw %%mm0, %%mm1 \n\t"\
332 "paddw %%mm0, %%mm7 \n\t"\
333 "movq "U_TEMP"(%0), %%mm3 \n\t"\
334 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* High-precision packed-output scale: accurate chroma pass followed by
 * the accurate luma pass; ends with Y1/Y2 in mm1/mm7 and U/V in mm3/mm4. */
336 #define YSCALEYUV2PACKEDX_ACCURATE \
337 YSCALEYUV2PACKEDX_ACCURATE_UV \
338 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/*
 * YUV -> RGB matrix step using coefficients stored in the context
 * (offsets relative to %0). Input: Y1/Y2 in mm1/mm7, U/V in mm3/mm4.
 * Computes per-channel word results for both pixel quads, then packs
 * to bytes: B in mm2, R in mm5, G in mm4 (layout consumed by the
 * WRITE* macros below).
 */
340 #define YSCALEYUV2RGBX \
341 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
342 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
343 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
344 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
345 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
346 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
347 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
348 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
349 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
350 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
351 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
352 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
353 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
354 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
355 "paddw %%mm3, %%mm4 \n\t"\
356 "movq %%mm2, %%mm0 \n\t"\
357 "movq %%mm5, %%mm6 \n\t"\
358 "movq %%mm4, %%mm3 \n\t"\
359 "punpcklwd %%mm2, %%mm2 \n\t"\
360 "punpcklwd %%mm5, %%mm5 \n\t"\
361 "punpcklwd %%mm4, %%mm4 \n\t"\
362 "paddw %%mm1, %%mm2 \n\t"\
363 "paddw %%mm1, %%mm5 \n\t"\
364 "paddw %%mm1, %%mm4 \n\t"\
365 "punpckhwd %%mm0, %%mm0 \n\t"\
366 "punpckhwd %%mm6, %%mm6 \n\t"\
367 "punpckhwd %%mm3, %%mm3 \n\t"\
368 "paddw %%mm7, %%mm0 \n\t"\
369 "paddw %%mm7, %%mm6 \n\t"\
370 "paddw %%mm7, %%mm3 \n\t"\
371 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
372 "packuswb %%mm0, %%mm2 \n\t"\
373 "packuswb %%mm6, %%mm5 \n\t"\
374 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * 2-tap vertical interpolation between two source rows for packed YUV
 * output: pre-shifts the stored filter weights by 3 (done once, before
 * the loop), then blends uvbuf0/uvbuf1 chroma and buf0/buf1 luma with
 * pmulhw on the differences. Leaves U/V in mm3/mm4 and Y1/Y2 in mm1/mm7.
 * NOTE(review): loop label lines are missing in this extract.
 */
376 #define REAL_YSCALEYUV2PACKED(index, c) \
377 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
378 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
379 "psraw $3, %%mm0 \n\t"\
380 "psraw $3, %%mm1 \n\t"\
381 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
382 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
383 "xor "#index", "#index" \n\t"\
386 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
387 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
388 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
389 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
390 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
391 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
392 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
393 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
394 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
395 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
396 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
397 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
398 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
399 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
400 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
401 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
402 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
403 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
404 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
405 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
406 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
408 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
409 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
410 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
412 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/*
 * Chroma part of the 2-row RGB path: blends uvbuf0/uvbuf1 (weights from
 * CHR_MMX_FILTER_OFFSET+8), subtracts the U/V offsets and premultiplies
 * the green coefficients, leaving mm2=(U-128)8, mm5=(V-128)8, mm3=ug,
 * mm4=vg. NOTE(review): loop label lines are missing in this extract.
 */
414 #define REAL_YSCALEYUV2RGB_UV(index, c) \
415 "xor "#index", "#index" \n\t"\
418 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
419 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
420 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
421 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
422 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
423 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
424 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
425 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
426 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
427 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
428 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
431 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
432 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
433 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
434 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
435 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
436 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
437 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
/*
 * Luma part of the 2-row RGB path: blends rows b1/b2 with the weight at
 * LUM_MMX_FILTER_OFFSET+8, leaving Y1 in mm1 and Y2 in mm7 (15-bit
 * intermediate, >>4 applied to the second row before blending).
 */
439 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
440 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
441 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
442 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
443 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
444 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
445 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
446 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
447 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
448 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
449 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
450 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
451 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/*
 * Final coefficient step of the 2-row RGB path: applies the blue/red and
 * luma coefficients, expands the per-quad channel words with punpck,
 * adds the luma terms and packs to bytes — B in mm2, R in mm5, G in mm4
 * (same output layout as YSCALEYUV2RGBX, consumed by the WRITE* macros).
 */
453 #define REAL_YSCALEYUV2RGB_COEFF(c) \
454 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
455 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
456 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
457 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
458 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
459 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
460 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
461 "paddw %%mm3, %%mm4 \n\t"\
462 "movq %%mm2, %%mm0 \n\t"\
463 "movq %%mm5, %%mm6 \n\t"\
464 "movq %%mm4, %%mm3 \n\t"\
465 "punpcklwd %%mm2, %%mm2 \n\t"\
466 "punpcklwd %%mm5, %%mm5 \n\t"\
467 "punpcklwd %%mm4, %%mm4 \n\t"\
468 "paddw %%mm1, %%mm2 \n\t"\
469 "paddw %%mm1, %%mm5 \n\t"\
470 "paddw %%mm1, %%mm4 \n\t"\
471 "punpckhwd %%mm0, %%mm0 \n\t"\
472 "punpckhwd %%mm6, %%mm6 \n\t"\
473 "punpckhwd %%mm3, %%mm3 \n\t"\
474 "paddw %%mm7, %%mm0 \n\t"\
475 "paddw %%mm7, %%mm6 \n\t"\
476 "paddw %%mm7, %%mm3 \n\t"\
477 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
478 "packuswb %%mm0, %%mm2 \n\t"\
479 "packuswb %%mm6, %%mm5 \n\t"\
480 "packuswb %%mm3, %%mm4 \n\t"\
482 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
/* Complete 2-row YUV->RGB conversion: chroma blend, luma blend from
 * buffers %0/%1, then the coefficient/packing step. */
484 #define YSCALEYUV2RGB(index, c) \
485 REAL_YSCALEYUV2RGB_UV(index, c) \
486 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
487 REAL_YSCALEYUV2RGB_COEFF(c)
/*
 * Single-source (no vertical blend) packed-YUV path: reads one chroma
 * buffer and one luma buffer and shifts the 15-bit intermediates >>7
 * down to 8-bit range. U/V end in mm3/mm4, Y1/Y2 in mm1/mm7.
 * NOTE(review): loop label lines are missing in this extract.
 */
489 #define REAL_YSCALEYUV2PACKED1(index, c) \
490 "xor "#index", "#index" \n\t"\
493 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
494 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
495 "psraw $7, %%mm3 \n\t" \
496 "psraw $7, %%mm4 \n\t" \
497 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
498 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
499 "psraw $7, %%mm1 \n\t" \
500 "psraw $7, %%mm7 \n\t" \
502 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * Single-source RGB path (no vertical blend): chroma and luma are taken
 * from one buffer each (>>4 to the 12-bit range the coefficients
 * expect), then the same offset/coefficient/pack sequence as
 * YSCALEYUV2RGB_COEFF. Output: B=mm2, R=mm5, G=mm4 packed bytes.
 * NOTE(review): loop label lines are missing in this extract.
 */
504 #define REAL_YSCALEYUV2RGB1(index, c) \
505 "xor "#index", "#index" \n\t"\
508 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
509 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
510 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
511 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
512 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
513 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
514 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
515 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
516 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
517 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
518 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
521 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
522 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
523 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
524 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
525 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
526 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
527 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
528 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
529 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
530 "paddw %%mm3, %%mm4 \n\t"\
531 "movq %%mm2, %%mm0 \n\t"\
532 "movq %%mm5, %%mm6 \n\t"\
533 "movq %%mm4, %%mm3 \n\t"\
534 "punpcklwd %%mm2, %%mm2 \n\t"\
535 "punpcklwd %%mm5, %%mm5 \n\t"\
536 "punpcklwd %%mm4, %%mm4 \n\t"\
537 "paddw %%mm1, %%mm2 \n\t"\
538 "paddw %%mm1, %%mm5 \n\t"\
539 "paddw %%mm1, %%mm4 \n\t"\
540 "punpckhwd %%mm0, %%mm0 \n\t"\
541 "punpckhwd %%mm6, %%mm6 \n\t"\
542 "punpckhwd %%mm3, %%mm3 \n\t"\
543 "paddw %%mm7, %%mm0 \n\t"\
544 "paddw %%mm7, %%mm6 \n\t"\
545 "paddw %%mm7, %%mm3 \n\t"\
546 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
547 "packuswb %%mm0, %%mm2 \n\t"\
548 "packuswb %%mm6, %%mm5 \n\t"\
549 "packuswb %%mm3, %%mm4 \n\t"\
551 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * Packed-YUV path averaging two chroma buffers (used when chroma is
 * vertically halfway between rows): (uvbuf0+uvbuf1)>>8 for U/V, luma
 * from a single buffer >>7.
 * NOTE(review): loop label lines are missing in this extract.
 */
553 #define REAL_YSCALEYUV2PACKED1b(index, c) \
554 "xor "#index", "#index" \n\t"\
557 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
558 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
559 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
560 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
561 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
562 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
563 "psrlw $8, %%mm3 \n\t" \
564 "psrlw $8, %%mm4 \n\t" \
565 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
566 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
567 "psraw $7, %%mm1 \n\t" \
568 "psraw $7, %%mm7 \n\t"
569 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
571 // do vertical chrominance interpolation
/*
 * RGB path averaging two chroma buffers: (uvbuf0+uvbuf1)>>5 for U/V
 * (note the in-code FIXME about possible overflow), luma from a single
 * buffer >>4, then the standard coefficient/pack sequence producing
 * B=mm2, R=mm5, G=mm4 packed bytes.
 * NOTE(review): loop label lines are missing in this extract.
 */
572 #define REAL_YSCALEYUV2RGB1b(index, c) \
573 "xor "#index", "#index" \n\t"\
576 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
577 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
578 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
579 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
580 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
581 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
582 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
583 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
584 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
585 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
586 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
587 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
588 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
589 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
590 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
591 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
592 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
593 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
594 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
595 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
596 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
597 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
598 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
599 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
600 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
601 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
602 "paddw %%mm3, %%mm4 \n\t"\
603 "movq %%mm2, %%mm0 \n\t"\
604 "movq %%mm5, %%mm6 \n\t"\
605 "movq %%mm4, %%mm3 \n\t"\
606 "punpcklwd %%mm2, %%mm2 \n\t"\
607 "punpcklwd %%mm5, %%mm5 \n\t"\
608 "punpcklwd %%mm4, %%mm4 \n\t"\
609 "paddw %%mm1, %%mm2 \n\t"\
610 "paddw %%mm1, %%mm5 \n\t"\
611 "paddw %%mm1, %%mm4 \n\t"\
612 "punpckhwd %%mm0, %%mm0 \n\t"\
613 "punpckhwd %%mm6, %%mm6 \n\t"\
614 "punpckhwd %%mm3, %%mm3 \n\t"\
615 "paddw %%mm7, %%mm0 \n\t"\
616 "paddw %%mm7, %%mm6 \n\t"\
617 "paddw %%mm7, %%mm3 \n\t"\
618 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
619 "packuswb %%mm0, %%mm2 \n\t"\
620 "packuswb %%mm6, %%mm5 \n\t"\
621 "packuswb %%mm3, %%mm4 \n\t"\
623 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* Load 8 alpha values from abuf0 (%1), >>7 to 8-bit range and pack the
 * result into mm7 for the 32-bit RGBA writer. */
625 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
626 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
627 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
628 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
629 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
630 "packuswb %%mm1, %%mm7 \n\t"
631 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
/*
 * Interleave packed b/g/r/a byte registers into four ARGB quadwords
 * (punpcklbw/punpckhbw then punpcklwd/punpckhwd) and store 8 pixels
 * (32 bytes) via MOVNTQ; advances index by 8 and compares against dstw
 * for the (not visible here) loop branch.
 */
633 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
634 "movq "#b", "#q2" \n\t" /* B */\
635 "movq "#r", "#t" \n\t" /* R */\
636 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
637 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
638 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
639 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
640 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
641 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
642 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
643 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
644 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
645 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
647 MOVNTQ( q0, (dst, index, 4))\
648 MOVNTQ( b, 8(dst, index, 4))\
649 MOVNTQ( q2, 16(dst, index, 4))\
650 MOVNTQ( q3, 24(dst, index, 4))\
652 "add $8, "#index" \n\t"\
653 "cmp "#dstw", "#index" \n\t"\
655 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
/*
 * Pack B(mm2)/G(mm4)/R(mm5) bytes into RGB565: mask to 5/6/5 bits,
 * shift into position, interleave and OR the halves, then store
 * 8 pixels (16 bytes). mm7 is expected to be zero for the punpck steps.
 */
657 #define REAL_WRITERGB16(dst, dstw, index) \
658 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
659 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
660 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
661 "psrlq $3, %%mm2 \n\t"\
663 "movq %%mm2, %%mm1 \n\t"\
664 "movq %%mm4, %%mm3 \n\t"\
666 "punpcklbw %%mm7, %%mm3 \n\t"\
667 "punpcklbw %%mm5, %%mm2 \n\t"\
668 "punpckhbw %%mm7, %%mm4 \n\t"\
669 "punpckhbw %%mm5, %%mm1 \n\t"\
671 "psllq $3, %%mm3 \n\t"\
672 "psllq $3, %%mm4 \n\t"\
674 "por %%mm3, %%mm2 \n\t"\
675 "por %%mm4, %%mm1 \n\t"\
677 MOVNTQ(%%mm2, (dst, index, 2))\
678 MOVNTQ(%%mm1, 8(dst, index, 2))\
680 "add $8, "#index" \n\t"\
681 "cmp "#dstw", "#index" \n\t"\
683 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/*
 * Pack B(mm2)/G(mm4)/R(mm5) bytes into RGB555: same structure as
 * WRITERGB16 but all channels masked to 5 bits (R additionally >>1)
 * and the green/high shifts reduced to 2. Stores 8 pixels.
 */
685 #define REAL_WRITERGB15(dst, dstw, index) \
686 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
687 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
688 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
689 "psrlq $3, %%mm2 \n\t"\
690 "psrlq $1, %%mm5 \n\t"\
692 "movq %%mm2, %%mm1 \n\t"\
693 "movq %%mm4, %%mm3 \n\t"\
695 "punpcklbw %%mm7, %%mm3 \n\t"\
696 "punpcklbw %%mm5, %%mm2 \n\t"\
697 "punpckhbw %%mm7, %%mm4 \n\t"\
698 "punpckhbw %%mm5, %%mm1 \n\t"\
700 "psllq $2, %%mm3 \n\t"\
701 "psllq $2, %%mm4 \n\t"\
703 "por %%mm3, %%mm2 \n\t"\
704 "por %%mm4, %%mm1 \n\t"\
706 MOVNTQ(%%mm2, (dst, index, 2))\
707 MOVNTQ(%%mm1, 8(dst, index, 2))\
709 "add $8, "#index" \n\t"\
710 "cmp "#dstw", "#index" \n\t"\
712 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/*
 * Legacy 24-bit RGB writer: expands B/G/R into four 0RGB quadwords,
 * then squeezes out the zero bytes with shift/mask/or sequences to emit
 * three packed quadwords (24 bytes = 8 pixels). Kept for reference;
 * WRITEBGR24MMX/MMX2 below are the maintained variants.
 */
714 #define WRITEBGR24OLD(dst, dstw, index) \
715 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
716 "movq %%mm2, %%mm1 \n\t" /* B */\
717 "movq %%mm5, %%mm6 \n\t" /* R */\
718 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
719 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
720 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
721 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
722 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
723 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
724 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
725 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
726 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
727 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
729 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
730 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
731 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
732 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
733 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
734 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
735 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
736 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
738 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
739 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
740 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
741 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
742 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
743 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
744 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
745 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
746 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
747 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
748 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
749 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
750 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
752 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
753 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
754 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
755 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
756 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
757 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
758 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
759 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
761 MOVNTQ(%%mm0, (dst))\
762 MOVNTQ(%%mm2, 8(dst))\
763 MOVNTQ(%%mm3, 16(dst))\
764 "add $24, "#dst" \n\t"\
766 "add $8, "#index" \n\t"\
767 "cmp "#dstw", "#index" \n\t"\
/*
 * Plain-MMX 24-bit RGB writer: builds four 0RGBRGB0 quadwords with
 * punpck + psllq/punpckhdq, then shifts/ORs them into three contiguous
 * quadwords (24 bytes = 8 pixels) stored with MOVNTQ. dst advances by
 * 24, index by 8; comparison against dstw feeds the loop branch.
 */
770 #define WRITEBGR24MMX(dst, dstw, index) \
771 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
772 "movq %%mm2, %%mm1 \n\t" /* B */\
773 "movq %%mm5, %%mm6 \n\t" /* R */\
774 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
775 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
776 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
777 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
778 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
779 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
780 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
781 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
782 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
783 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
785 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
786 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
787 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
788 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
790 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
791 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
792 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
793 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
795 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
796 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
797 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
798 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
800 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
801 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
802 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
803 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
804 MOVNTQ(%%mm0, (dst))\
806 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
807 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
808 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
809 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
810 MOVNTQ(%%mm6, 8(dst))\
812 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
813 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
814 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
815 MOVNTQ(%%mm5, 16(dst))\
817 "add $24, "#dst" \n\t"\
819 "add $8, "#index" \n\t"\
820 "cmp "#dstw", "#index" \n\t"\
/*
 * MMX2 24-bit RGB writer: uses pshufw plus the ff_M24A/B/C byte masks
 * to gather each output quadword's B/G/R lanes directly, producing the
 * three 8-byte stores (24 bytes = 8 pixels) with far fewer shift/or
 * steps than the plain-MMX version. Same dst/index bookkeeping.
 */
823 #define WRITEBGR24MMX2(dst, dstw, index) \
824 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
825 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
826 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
827 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
828 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
829 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
831 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
832 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
833 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
835 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
836 "por %%mm1, %%mm6 \n\t"\
837 "por %%mm3, %%mm6 \n\t"\
838 MOVNTQ(%%mm6, (dst))\
840 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
841 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
842 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
843 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
845 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
846 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
847 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
849 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
850 "por %%mm3, %%mm6 \n\t"\
851 MOVNTQ(%%mm6, 8(dst))\
853 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
854 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
855 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
857 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
858 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
859 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
861 "por %%mm1, %%mm3 \n\t"\
862 "por %%mm3, %%mm6 \n\t"\
863 MOVNTQ(%%mm6, 16(dst))\
865 "add $24, "#dst" \n\t"\
867 "add $8, "#index" \n\t"\
868 "cmp "#dstw", "#index" \n\t"\
871 #if COMPILE_TEMPLATE_MMX2
873 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
876 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
879 #define REAL_WRITEYUY2(dst, dstw, index) \
880 "packuswb %%mm3, %%mm3 \n\t"\
881 "packuswb %%mm4, %%mm4 \n\t"\
882 "packuswb %%mm7, %%mm1 \n\t"\
883 "punpcklbw %%mm4, %%mm3 \n\t"\
884 "movq %%mm1, %%mm7 \n\t"\
885 "punpcklbw %%mm3, %%mm1 \n\t"\
886 "punpckhbw %%mm3, %%mm7 \n\t"\
888 MOVNTQ(%%mm1, (dst, index, 2))\
889 MOVNTQ(%%mm7, 8(dst, index, 2))\
891 "add $8, "#index" \n\t"\
892 "cmp "#dstw", "#index" \n\t"\
894 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
897 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
898 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
899 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
901 #if COMPILE_TEMPLATE_MMX
902 if(!(c->flags & SWS_BITEXACT)) {
903 if (c->flags & SWS_ACCURATE_RND) {
905 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
906 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
908 if (CONFIG_SWSCALE_ALPHA && aDest) {
909 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
912 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
915 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
916 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
918 if (CONFIG_SWSCALE_ALPHA && aDest) {
919 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
922 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
927 #if COMPILE_TEMPLATE_ALTIVEC
928 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
929 chrFilter, chrSrc, chrFilterSize,
930 dest, uDest, vDest, dstW, chrDstW);
931 #else //COMPILE_TEMPLATE_ALTIVEC
932 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
933 chrFilter, chrSrc, chrFilterSize,
934 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
935 #endif //!COMPILE_TEMPLATE_ALTIVEC
938 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
939 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
940 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
942 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
943 chrFilter, chrSrc, chrFilterSize,
944 dest, uDest, dstW, chrDstW, dstFormat);
947 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
948 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
951 #if COMPILE_TEMPLATE_MMX
952 if(!(c->flags & SWS_BITEXACT)) {
954 const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
955 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
956 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
958 if (c->flags & SWS_ACCURATE_RND) {
962 YSCALEYUV2YV121_ACCURATE
963 :: "r" (src[p]), "r" (dst[p] + counter[p]),
974 :: "r" (src[p]), "r" (dst[p] + counter[p]),
984 for (i=0; i<dstW; i++) {
985 int val= (lumSrc[i]+64)>>7;
996 for (i=0; i<chrDstW; i++) {
997 int u=(chrSrc[i ]+64)>>7;
998 int v=(chrSrc[i + VOFW]+64)>>7;
1002 else if (u>255) u=255;
1004 else if (v>255) v=255;
1011 if (CONFIG_SWSCALE_ALPHA && aDest)
1012 for (i=0; i<dstW; i++) {
1013 int val= (alpSrc[i]+64)>>7;
1014 aDest[i]= av_clip_uint8(val);
1020 * vertical scale YV12 to RGB
1022 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1023 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1024 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1026 #if COMPILE_TEMPLATE_MMX
1028 x86_reg dstW_reg = dstW;
1029 if(!(c->flags & SWS_BITEXACT)) {
1030 if (c->flags & SWS_ACCURATE_RND) {
1031 switch(c->dstFormat) {
1033 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1034 YSCALEYUV2PACKEDX_ACCURATE
1036 "movq %%mm2, "U_TEMP"(%0) \n\t"
1037 "movq %%mm4, "V_TEMP"(%0) \n\t"
1038 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1039 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1040 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1041 "psraw $3, %%mm1 \n\t"
1042 "psraw $3, %%mm7 \n\t"
1043 "packuswb %%mm7, %%mm1 \n\t"
1044 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1046 YSCALEYUV2PACKEDX_END
1048 YSCALEYUV2PACKEDX_ACCURATE
1050 "pcmpeqd %%mm7, %%mm7 \n\t"
1051 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1053 YSCALEYUV2PACKEDX_END
1057 YSCALEYUV2PACKEDX_ACCURATE
1059 "pxor %%mm7, %%mm7 \n\t"
1060 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1061 "add %4, %%"REG_c" \n\t"
1062 WRITEBGR24(%%REGc, %5, %%REGa)
1065 :: "r" (&c->redDither),
1066 "m" (dummy), "m" (dummy), "m" (dummy),
1067 "r" (dest), "m" (dstW_reg)
1068 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1071 case PIX_FMT_RGB555:
1072 YSCALEYUV2PACKEDX_ACCURATE
1074 "pxor %%mm7, %%mm7 \n\t"
1075 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1077 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1078 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1079 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1082 WRITERGB15(%4, %5, %%REGa)
1083 YSCALEYUV2PACKEDX_END
1085 case PIX_FMT_RGB565:
1086 YSCALEYUV2PACKEDX_ACCURATE
1088 "pxor %%mm7, %%mm7 \n\t"
1089 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1091 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1092 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1093 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1096 WRITERGB16(%4, %5, %%REGa)
1097 YSCALEYUV2PACKEDX_END
1099 case PIX_FMT_YUYV422:
1100 YSCALEYUV2PACKEDX_ACCURATE
1101 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1103 "psraw $3, %%mm3 \n\t"
1104 "psraw $3, %%mm4 \n\t"
1105 "psraw $3, %%mm1 \n\t"
1106 "psraw $3, %%mm7 \n\t"
1107 WRITEYUY2(%4, %5, %%REGa)
1108 YSCALEYUV2PACKEDX_END
1112 switch(c->dstFormat) {
1114 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1117 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1118 "psraw $3, %%mm1 \n\t"
1119 "psraw $3, %%mm7 \n\t"
1120 "packuswb %%mm7, %%mm1 \n\t"
1121 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1122 YSCALEYUV2PACKEDX_END
1126 "pcmpeqd %%mm7, %%mm7 \n\t"
1127 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1128 YSCALEYUV2PACKEDX_END
1134 "pxor %%mm7, %%mm7 \n\t"
1135 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1136 "add %4, %%"REG_c" \n\t"
1137 WRITEBGR24(%%REGc, %5, %%REGa)
1139 :: "r" (&c->redDither),
1140 "m" (dummy), "m" (dummy), "m" (dummy),
1141 "r" (dest), "m" (dstW_reg)
1142 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1145 case PIX_FMT_RGB555:
1148 "pxor %%mm7, %%mm7 \n\t"
1149 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1151 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1152 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1153 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1156 WRITERGB15(%4, %5, %%REGa)
1157 YSCALEYUV2PACKEDX_END
1159 case PIX_FMT_RGB565:
1162 "pxor %%mm7, %%mm7 \n\t"
1163 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1165 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1166 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1167 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1170 WRITERGB16(%4, %5, %%REGa)
1171 YSCALEYUV2PACKEDX_END
1173 case PIX_FMT_YUYV422:
1175 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1177 "psraw $3, %%mm3 \n\t"
1178 "psraw $3, %%mm4 \n\t"
1179 "psraw $3, %%mm1 \n\t"
1180 "psraw $3, %%mm7 \n\t"
1181 WRITEYUY2(%4, %5, %%REGa)
1182 YSCALEYUV2PACKEDX_END
1187 #endif /* COMPILE_TEMPLATE_MMX */
1188 #if COMPILE_TEMPLATE_ALTIVEC
1189 /* The following list of supported dstFormat values should
1190 match what's found in the body of ff_yuv2packedX_altivec() */
1191 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1192 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1193 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1194 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1195 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1196 chrFilter, chrSrc, chrFilterSize,
1200 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1201 chrFilter, chrSrc, chrFilterSize,
1202 alpSrc, dest, dstW, dstY);
1206 * vertical bilinear scale YV12 to RGB
1208 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1209 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1211 int yalpha1=4095- yalpha;
1212 int uvalpha1=4095-uvalpha;
1215 #if COMPILE_TEMPLATE_MMX
1216 if(!(c->flags & SWS_BITEXACT)) {
1217 switch(c->dstFormat) {
1218 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1220 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1223 YSCALEYUV2RGB(%%r8, %5)
1224 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1225 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1226 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1227 "packuswb %%mm7, %%mm1 \n\t"
1228 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1230 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1232 ,"r" (abuf0), "r" (abuf1)
1236 c->u_temp=(intptr_t)abuf0;
1237 c->v_temp=(intptr_t)abuf1;
1239 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1240 "mov %4, %%"REG_b" \n\t"
1241 "push %%"REG_BP" \n\t"
1242 YSCALEYUV2RGB(%%REGBP, %5)
1245 "mov "U_TEMP"(%5), %0 \n\t"
1246 "mov "V_TEMP"(%5), %1 \n\t"
1247 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1248 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1249 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1250 "packuswb %%mm7, %%mm1 \n\t"
1253 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1254 "pop %%"REG_BP" \n\t"
1255 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1257 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1263 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1264 "mov %4, %%"REG_b" \n\t"
1265 "push %%"REG_BP" \n\t"
1266 YSCALEYUV2RGB(%%REGBP, %5)
1267 "pcmpeqd %%mm7, %%mm7 \n\t"
1268 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1269 "pop %%"REG_BP" \n\t"
1270 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1272 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1279 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1280 "mov %4, %%"REG_b" \n\t"
1281 "push %%"REG_BP" \n\t"
1282 YSCALEYUV2RGB(%%REGBP, %5)
1283 "pxor %%mm7, %%mm7 \n\t"
1284 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1285 "pop %%"REG_BP" \n\t"
1286 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1287 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1291 case PIX_FMT_RGB555:
1293 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1294 "mov %4, %%"REG_b" \n\t"
1295 "push %%"REG_BP" \n\t"
1296 YSCALEYUV2RGB(%%REGBP, %5)
1297 "pxor %%mm7, %%mm7 \n\t"
1298 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1300 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1301 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1302 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1305 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1306 "pop %%"REG_BP" \n\t"
1307 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1309 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1313 case PIX_FMT_RGB565:
1315 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1316 "mov %4, %%"REG_b" \n\t"
1317 "push %%"REG_BP" \n\t"
1318 YSCALEYUV2RGB(%%REGBP, %5)
1319 "pxor %%mm7, %%mm7 \n\t"
1320 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1322 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1323 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1324 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1327 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1328 "pop %%"REG_BP" \n\t"
1329 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1330 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1334 case PIX_FMT_YUYV422:
1336 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1337 "mov %4, %%"REG_b" \n\t"
1338 "push %%"REG_BP" \n\t"
1339 YSCALEYUV2PACKED(%%REGBP, %5)
1340 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1341 "pop %%"REG_BP" \n\t"
1342 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1343 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1350 #endif //COMPILE_TEMPLATE_MMX
1351 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1355 * YV12 to RGB without scaling or interpolating
1357 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1358 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1360 const int yalpha1=0;
1363 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1364 const int yalpha= 4096; //FIXME ...
1366 if (flags&SWS_FULL_CHR_H_INT) {
1367 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1371 #if COMPILE_TEMPLATE_MMX
1372 if(!(flags & SWS_BITEXACT)) {
1373 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1376 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1378 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1379 "mov %4, %%"REG_b" \n\t"
1380 "push %%"REG_BP" \n\t"
1381 YSCALEYUV2RGB1(%%REGBP, %5)
1382 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1383 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1384 "pop %%"REG_BP" \n\t"
1385 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1387 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1392 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1393 "mov %4, %%"REG_b" \n\t"
1394 "push %%"REG_BP" \n\t"
1395 YSCALEYUV2RGB1(%%REGBP, %5)
1396 "pcmpeqd %%mm7, %%mm7 \n\t"
1397 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1398 "pop %%"REG_BP" \n\t"
1399 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1401 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1408 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1409 "mov %4, %%"REG_b" \n\t"
1410 "push %%"REG_BP" \n\t"
1411 YSCALEYUV2RGB1(%%REGBP, %5)
1412 "pxor %%mm7, %%mm7 \n\t"
1413 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1414 "pop %%"REG_BP" \n\t"
1415 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1417 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1421 case PIX_FMT_RGB555:
1423 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1424 "mov %4, %%"REG_b" \n\t"
1425 "push %%"REG_BP" \n\t"
1426 YSCALEYUV2RGB1(%%REGBP, %5)
1427 "pxor %%mm7, %%mm7 \n\t"
1428 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1430 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1431 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1432 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1434 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1435 "pop %%"REG_BP" \n\t"
1436 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1438 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1442 case PIX_FMT_RGB565:
1444 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1445 "mov %4, %%"REG_b" \n\t"
1446 "push %%"REG_BP" \n\t"
1447 YSCALEYUV2RGB1(%%REGBP, %5)
1448 "pxor %%mm7, %%mm7 \n\t"
1449 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1451 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1452 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1453 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1456 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1457 "pop %%"REG_BP" \n\t"
1458 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1460 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1464 case PIX_FMT_YUYV422:
1466 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1467 "mov %4, %%"REG_b" \n\t"
1468 "push %%"REG_BP" \n\t"
1469 YSCALEYUV2PACKED1(%%REGBP, %5)
1470 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1471 "pop %%"REG_BP" \n\t"
1472 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1474 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1482 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1484 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1485 "mov %4, %%"REG_b" \n\t"
1486 "push %%"REG_BP" \n\t"
1487 YSCALEYUV2RGB1b(%%REGBP, %5)
1488 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1489 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1490 "pop %%"REG_BP" \n\t"
1491 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1493 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1498 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1499 "mov %4, %%"REG_b" \n\t"
1500 "push %%"REG_BP" \n\t"
1501 YSCALEYUV2RGB1b(%%REGBP, %5)
1502 "pcmpeqd %%mm7, %%mm7 \n\t"
1503 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1504 "pop %%"REG_BP" \n\t"
1505 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1507 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1515 "mov %4, %%"REG_b" \n\t"
1516 "push %%"REG_BP" \n\t"
1517 YSCALEYUV2RGB1b(%%REGBP, %5)
1518 "pxor %%mm7, %%mm7 \n\t"
1519 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1520 "pop %%"REG_BP" \n\t"
1521 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1523 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1527 case PIX_FMT_RGB555:
1529 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1530 "mov %4, %%"REG_b" \n\t"
1531 "push %%"REG_BP" \n\t"
1532 YSCALEYUV2RGB1b(%%REGBP, %5)
1533 "pxor %%mm7, %%mm7 \n\t"
1534 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1536 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1537 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1538 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1540 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1541 "pop %%"REG_BP" \n\t"
1542 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1544 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1548 case PIX_FMT_RGB565:
1550 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1551 "mov %4, %%"REG_b" \n\t"
1552 "push %%"REG_BP" \n\t"
1553 YSCALEYUV2RGB1b(%%REGBP, %5)
1554 "pxor %%mm7, %%mm7 \n\t"
1555 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1557 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1558 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1559 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1562 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1563 "pop %%"REG_BP" \n\t"
1564 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1566 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1570 case PIX_FMT_YUYV422:
1572 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1573 "mov %4, %%"REG_b" \n\t"
1574 "push %%"REG_BP" \n\t"
1575 YSCALEYUV2PACKED1b(%%REGBP, %5)
1576 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1577 "pop %%"REG_BP" \n\t"
1578 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1580 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1587 #endif /* COMPILE_TEMPLATE_MMX */
1588 if (uvalpha < 2048) {
1589 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1591 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1595 //FIXME yuy2* can read up to 7 samples too much
1597 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1599 #if COMPILE_TEMPLATE_MMX
1601 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1602 "mov %0, %%"REG_a" \n\t"
1604 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1605 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1606 "pand %%mm2, %%mm0 \n\t"
1607 "pand %%mm2, %%mm1 \n\t"
1608 "packuswb %%mm1, %%mm0 \n\t"
1609 "movq %%mm0, (%2, %%"REG_a") \n\t"
1610 "add $8, %%"REG_a" \n\t"
1612 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1617 for (i=0; i<width; i++)
1622 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1624 #if COMPILE_TEMPLATE_MMX
1626 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1627 "mov %0, %%"REG_a" \n\t"
1629 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1630 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1631 "psrlw $8, %%mm0 \n\t"
1632 "psrlw $8, %%mm1 \n\t"
1633 "packuswb %%mm1, %%mm0 \n\t"
1634 "movq %%mm0, %%mm1 \n\t"
1635 "psrlw $8, %%mm0 \n\t"
1636 "pand %%mm4, %%mm1 \n\t"
1637 "packuswb %%mm0, %%mm0 \n\t"
1638 "packuswb %%mm1, %%mm1 \n\t"
1639 "movd %%mm0, (%3, %%"REG_a") \n\t"
1640 "movd %%mm1, (%2, %%"REG_a") \n\t"
1641 "add $4, %%"REG_a" \n\t"
1643 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1648 for (i=0; i<width; i++) {
1649 dstU[i]= src1[4*i + 1];
1650 dstV[i]= src1[4*i + 3];
1653 assert(src1 == src2);
1656 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1658 #if COMPILE_TEMPLATE_MMX
1660 "mov %0, %%"REG_a" \n\t"
1662 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1663 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1664 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1665 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1666 "psrlw $8, %%mm0 \n\t"
1667 "psrlw $8, %%mm1 \n\t"
1668 "psrlw $8, %%mm2 \n\t"
1669 "psrlw $8, %%mm3 \n\t"
1670 "packuswb %%mm1, %%mm0 \n\t"
1671 "packuswb %%mm3, %%mm2 \n\t"
1672 "movq %%mm0, (%3, %%"REG_a") \n\t"
1673 "movq %%mm2, (%4, %%"REG_a") \n\t"
1674 "add $8, %%"REG_a" \n\t"
1676 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1681 for (i=0; i<width; i++) {
1682 dstU[i]= src1[2*i + 1];
1683 dstV[i]= src2[2*i + 1];
1688 /* This is almost identical to the previous, end exists only because
1689 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1690 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1692 #if COMPILE_TEMPLATE_MMX
1694 "mov %0, %%"REG_a" \n\t"
1696 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1697 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1698 "psrlw $8, %%mm0 \n\t"
1699 "psrlw $8, %%mm1 \n\t"
1700 "packuswb %%mm1, %%mm0 \n\t"
1701 "movq %%mm0, (%2, %%"REG_a") \n\t"
1702 "add $8, %%"REG_a" \n\t"
1704 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1709 for (i=0; i<width; i++)
1714 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1716 #if COMPILE_TEMPLATE_MMX
1718 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1719 "mov %0, %%"REG_a" \n\t"
1721 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1722 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1723 "pand %%mm4, %%mm0 \n\t"
1724 "pand %%mm4, %%mm1 \n\t"
1725 "packuswb %%mm1, %%mm0 \n\t"
1726 "movq %%mm0, %%mm1 \n\t"
1727 "psrlw $8, %%mm0 \n\t"
1728 "pand %%mm4, %%mm1 \n\t"
1729 "packuswb %%mm0, %%mm0 \n\t"
1730 "packuswb %%mm1, %%mm1 \n\t"
1731 "movd %%mm0, (%3, %%"REG_a") \n\t"
1732 "movd %%mm1, (%2, %%"REG_a") \n\t"
1733 "add $4, %%"REG_a" \n\t"
1735 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1740 for (i=0; i<width; i++) {
1741 dstU[i]= src1[4*i + 0];
1742 dstV[i]= src1[4*i + 2];
1745 assert(src1 == src2);
1748 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1750 #if COMPILE_TEMPLATE_MMX
1752 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1753 "mov %0, %%"REG_a" \n\t"
1755 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1756 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1757 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1758 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1759 "pand %%mm4, %%mm0 \n\t"
1760 "pand %%mm4, %%mm1 \n\t"
1761 "pand %%mm4, %%mm2 \n\t"
1762 "pand %%mm4, %%mm3 \n\t"
1763 "packuswb %%mm1, %%mm0 \n\t"
1764 "packuswb %%mm3, %%mm2 \n\t"
1765 "movq %%mm0, (%3, %%"REG_a") \n\t"
1766 "movq %%mm2, (%4, %%"REG_a") \n\t"
1767 "add $8, %%"REG_a" \n\t"
1769 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1774 for (i=0; i<width; i++) {
1781 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1782 const uint8_t *src, long width)
1784 #if COMPILE_TEMPLATE_MMX
1786 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1787 "mov %0, %%"REG_a" \n\t"
1789 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1790 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1791 "movq %%mm0, %%mm2 \n\t"
1792 "movq %%mm1, %%mm3 \n\t"
1793 "pand %%mm4, %%mm0 \n\t"
1794 "pand %%mm4, %%mm1 \n\t"
1795 "psrlw $8, %%mm2 \n\t"
1796 "psrlw $8, %%mm3 \n\t"
1797 "packuswb %%mm1, %%mm0 \n\t"
1798 "packuswb %%mm3, %%mm2 \n\t"
1799 "movq %%mm0, (%2, %%"REG_a") \n\t"
1800 "movq %%mm2, (%3, %%"REG_a") \n\t"
1801 "add $8, %%"REG_a" \n\t"
1803 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1808 for (i = 0; i < width; i++) {
1809 dst1[i] = src[2*i+0];
1810 dst2[i] = src[2*i+1];
/* NV12 chroma reader: deinterleave the packed UV plane in src1 into planar
 * U (dstU) and V (dstV). Thin wrapper over nvXXtoUV with (dstU, dstV) order;
 * src2 and unused are not referenced. */
1815 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1816 const uint8_t *src1, const uint8_t *src2,
1817 long width, uint32_t *unused)
1819 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
/* NV21 chroma reader: same as NV12 but the plane is interleaved VU, so the
 * destination pointers are swapped in the nvXXtoUV call (dstV receives the
 * first byte of each pair). src2 and unused are not referenced. */
1822 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1823 const uint8_t *src1, const uint8_t *src2,
1824 long width, uint32_t *unused)
1826 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1829 // FIXME Maybe dither instead.
/* Generate input converters for >8-bit planar YUV: yuv<depth>ToUV and
 * yuv<depth>ToY reduce each 16-bit sample to 8 bits by discarding the
 * (depth-8) low bits with a right shift (plain truncation — see the
 * "Maybe dither instead" FIXME above). */
1830 #define YUV_NBPS(depth) \
1831 static inline void RENAME(yuv ## depth ## ToUV)(uint8_t *dstU, uint8_t *dstV, \
1832 const uint16_t *srcU, const uint16_t *srcV, \
1833 long width, uint32_t *unused) \
1836 for (i = 0; i < width; i++) { \
1837 dstU[i] = srcU[i]>>(depth-8); \
1838 dstV[i] = srcV[i]>>(depth-8); \
1842 static inline void RENAME(yuv ## depth ## ToY)(uint8_t *dstY, const uint16_t *srcY, long width, uint32_t *unused) \
1845 for (i = 0; i < width; i++) \
1846 dstY[i] = srcY[i]>>(depth-8); \
1852 #if COMPILE_TEMPLATE_MMX
1853 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1856 if(srcFormat == PIX_FMT_BGR24) {
1858 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1859 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1864 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1865 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1871 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1872 "mov %2, %%"REG_a" \n\t"
1873 "pxor %%mm7, %%mm7 \n\t"
1875 PREFETCH" 64(%0) \n\t"
1876 "movd (%0), %%mm0 \n\t"
1877 "movd 2(%0), %%mm1 \n\t"
1878 "movd 6(%0), %%mm2 \n\t"
1879 "movd 8(%0), %%mm3 \n\t"
1881 "punpcklbw %%mm7, %%mm0 \n\t"
1882 "punpcklbw %%mm7, %%mm1 \n\t"
1883 "punpcklbw %%mm7, %%mm2 \n\t"
1884 "punpcklbw %%mm7, %%mm3 \n\t"
1885 "pmaddwd %%mm5, %%mm0 \n\t"
1886 "pmaddwd %%mm6, %%mm1 \n\t"
1887 "pmaddwd %%mm5, %%mm2 \n\t"
1888 "pmaddwd %%mm6, %%mm3 \n\t"
1889 "paddd %%mm1, %%mm0 \n\t"
1890 "paddd %%mm3, %%mm2 \n\t"
1891 "paddd %%mm4, %%mm0 \n\t"
1892 "paddd %%mm4, %%mm2 \n\t"
1893 "psrad $15, %%mm0 \n\t"
1894 "psrad $15, %%mm2 \n\t"
1895 "packssdw %%mm2, %%mm0 \n\t"
1896 "packuswb %%mm0, %%mm0 \n\t"
1897 "movd %%mm0, (%1, %%"REG_a") \n\t"
1898 "add $4, %%"REG_a" \n\t"
1901 : "r" (dst+width), "g" ((x86_reg)-width)
1906 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1909 "movq 24(%4), %%mm6 \n\t"
1910 "mov %3, %%"REG_a" \n\t"
1911 "pxor %%mm7, %%mm7 \n\t"
1913 PREFETCH" 64(%0) \n\t"
1914 "movd (%0), %%mm0 \n\t"
1915 "movd 2(%0), %%mm1 \n\t"
1916 "punpcklbw %%mm7, %%mm0 \n\t"
1917 "punpcklbw %%mm7, %%mm1 \n\t"
1918 "movq %%mm0, %%mm2 \n\t"
1919 "movq %%mm1, %%mm3 \n\t"
1920 "pmaddwd (%4), %%mm0 \n\t"
1921 "pmaddwd 8(%4), %%mm1 \n\t"
1922 "pmaddwd 16(%4), %%mm2 \n\t"
1923 "pmaddwd %%mm6, %%mm3 \n\t"
1924 "paddd %%mm1, %%mm0 \n\t"
1925 "paddd %%mm3, %%mm2 \n\t"
1927 "movd 6(%0), %%mm1 \n\t"
1928 "movd 8(%0), %%mm3 \n\t"
1930 "punpcklbw %%mm7, %%mm1 \n\t"
1931 "punpcklbw %%mm7, %%mm3 \n\t"
1932 "movq %%mm1, %%mm4 \n\t"
1933 "movq %%mm3, %%mm5 \n\t"
1934 "pmaddwd (%4), %%mm1 \n\t"
1935 "pmaddwd 8(%4), %%mm3 \n\t"
1936 "pmaddwd 16(%4), %%mm4 \n\t"
1937 "pmaddwd %%mm6, %%mm5 \n\t"
1938 "paddd %%mm3, %%mm1 \n\t"
1939 "paddd %%mm5, %%mm4 \n\t"
1941 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1942 "paddd %%mm3, %%mm0 \n\t"
1943 "paddd %%mm3, %%mm2 \n\t"
1944 "paddd %%mm3, %%mm1 \n\t"
1945 "paddd %%mm3, %%mm4 \n\t"
1946 "psrad $15, %%mm0 \n\t"
1947 "psrad $15, %%mm2 \n\t"
1948 "psrad $15, %%mm1 \n\t"
1949 "psrad $15, %%mm4 \n\t"
1950 "packssdw %%mm1, %%mm0 \n\t"
1951 "packssdw %%mm4, %%mm2 \n\t"
1952 "packuswb %%mm0, %%mm0 \n\t"
1953 "packuswb %%mm2, %%mm2 \n\t"
1954 "movd %%mm0, (%1, %%"REG_a") \n\t"
1955 "movd %%mm2, (%2, %%"REG_a") \n\t"
1956 "add $4, %%"REG_a" \n\t"
1959 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
/* Convert packed BGR24 pixels to 8-bit luma. Uses the MMX path when this
 * template is compiled for MMX, otherwise a scalar RY/GY/BY weighted sum
 * with rounding offset, downshifted by RGB2YUV_SHIFT. */
1965 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1967 #if COMPILE_TEMPLATE_MMX
1968 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1971 for (i=0; i<width; i++) {
1976 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1978 #endif /* COMPILE_TEMPLATE_MMX */
/* Convert packed BGR24 pixels to 8-bit chroma (U into dstU, V into dstV),
 * one output sample per input pixel. MMX path when available; scalar
 * fallback applies RU/GU/BU and RV/GV/BV weights with a 257<<(SHIFT-1)
 * rounding/bias term. src1 and src2 must alias (asserted below). */
1981 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1983 #if COMPILE_TEMPLATE_MMX
1984 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1987 for (i=0; i<width; i++) {
1988 int b= src1[3*i + 0];
1989 int g= src1[3*i + 1];
1990 int r= src1[3*i + 2];
1992 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1993 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1995 #endif /* COMPILE_TEMPLATE_MMX */
1996 assert(src1 == src2);
/* Horizontally subsampled BGR24 -> chroma: each output U/V sample is built
 * from the sum of two adjacent input pixels (indices 6*i and 6*i+3), so the
 * final shift is RGB2YUV_SHIFT+1 to divide the doubled sum back down.
 * src1 and src2 must alias (asserted below). */
1999 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2002 for (i=0; i<width; i++) {
2003 int b= src1[6*i + 0] + src1[6*i + 3];
2004 int g= src1[6*i + 1] + src1[6*i + 4];
2005 int r= src1[6*i + 2] + src1[6*i + 5];
2007 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2008 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2010 assert(src1 == src2);
/* Convert packed RGB24 pixels to 8-bit luma. Shares the bgr24 MMX kernel,
 * selecting the RGB coefficient set via the PIX_FMT_RGB24 argument; the
 * scalar fallback mirrors bgr24ToY with the channel order reversed. */
2013 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
2015 #if COMPILE_TEMPLATE_MMX
2016 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2019 for (i=0; i<width; i++) {
2024 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
/* Convert packed RGB24 pixels to 8-bit chroma. Same structure as bgr24ToUV,
 * but the byte order is R,G,B (indices 0,1,2), and the shared MMX kernel is
 * invoked with PIX_FMT_RGB24 to pick the matching coefficient table. */
2029 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2031 #if COMPILE_TEMPLATE_MMX
2033 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2037 for (i=0; i<width; i++) {
2038 int r= src1[3*i + 0];
2039 int g= src1[3*i + 1];
2040 int b= src1[3*i + 2];
2042 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2043 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
/* Horizontally subsampled RGB24 -> chroma: sums two adjacent pixels per
 * output sample (indices 6*i and 6*i+3, R,G,B byte order) and compensates
 * with an extra +1 in the final shift, mirroring bgr24ToUV_half. */
2048 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2052 for (i=0; i<width; i++) {
2053 int r= src1[6*i + 0] + src1[6*i + 3];
2054 int g= src1[6*i + 1] + src1[6*i + 4];
2055 int b= src1[6*i + 2] + src1[6*i + 5];
2057 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2058 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2063 // bilinear / bicubic scaling
/* Generic horizontal scaler: for each output sample, dot-product filterSize
 * 8-bit source pixels (starting at filterPos[i]) with 16-bit filter
 * coefficients, then >>7 with saturation into int16_t dst.
 * MMX builds use hand-written inline asm with specialized loops for
 * filterSize 4 and 8 and a generic loop otherwise; AltiVec and plain C
 * fallbacks follow. */
2064 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2065 const int16_t *filter, const int16_t *filterPos, long filterSize)
2067 #if COMPILE_TEMPLATE_MMX
2068 assert(filterSize % 4 == 0 && filterSize>0);
/* filterSize == 4: two output samples per iteration, 4 taps each.
 * counter runs from -2*dstW up to 0 so the loop condition is a simple
 * sign test; filterPos is pre-biased so it can be indexed by counter. */
2069 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
2070 x86_reg counter= -2*dstW;
2072 filterPos-= counter/2;
/* ebx/rbx is callee-saved (and may be the PIC register) -> save it. */
2076 "push %%"REG_b" \n\t"
2078 "pxor %%mm7, %%mm7 \n\t"
2079 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2080 "mov %%"REG_a", %%"REG_BP" \n\t"
2083 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2084 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2085 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2086 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2087 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2088 "movd (%3, %%"REG_b"), %%mm2 \n\t"
/* zero-extend the 4 source bytes to words, then multiply-accumulate
 * against the coefficient words */
2089 "punpcklbw %%mm7, %%mm0 \n\t"
2090 "punpcklbw %%mm7, %%mm2 \n\t"
2091 "pmaddwd %%mm1, %%mm0 \n\t"
2092 "pmaddwd %%mm2, %%mm3 \n\t"
/* horizontal add of the two dword partial sums per output sample */
2093 "movq %%mm0, %%mm4 \n\t"
2094 "punpckldq %%mm3, %%mm0 \n\t"
2095 "punpckhdq %%mm3, %%mm4 \n\t"
2096 "paddd %%mm4, %%mm0 \n\t"
2097 "psrad $7, %%mm0 \n\t"
2098 "packssdw %%mm0, %%mm0 \n\t"
2099 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2100 "add $4, %%"REG_BP" \n\t"
2103 "pop %%"REG_BP" \n\t"
2105 "pop %%"REG_b" \n\t"
2108 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* filterSize == 8: same structure, two pmaddwd pairs per sample. */
2113 } else if (filterSize==8) {
2114 x86_reg counter= -2*dstW;
2116 filterPos-= counter/2;
2120 "push %%"REG_b" \n\t"
2122 "pxor %%mm7, %%mm7 \n\t"
2123 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2124 "mov %%"REG_a", %%"REG_BP" \n\t"
2127 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2128 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2129 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2130 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2131 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2132 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2133 "punpcklbw %%mm7, %%mm0 \n\t"
2134 "punpcklbw %%mm7, %%mm2 \n\t"
2135 "pmaddwd %%mm1, %%mm0 \n\t"
2136 "pmaddwd %%mm2, %%mm3 \n\t"
/* second group of 4 taps for each of the two output samples */
2138 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2139 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2140 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2141 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2142 "punpcklbw %%mm7, %%mm4 \n\t"
2143 "punpcklbw %%mm7, %%mm2 \n\t"
2144 "pmaddwd %%mm1, %%mm4 \n\t"
2145 "pmaddwd %%mm2, %%mm5 \n\t"
2146 "paddd %%mm4, %%mm0 \n\t"
2147 "paddd %%mm5, %%mm3 \n\t"
2148 "movq %%mm0, %%mm4 \n\t"
2149 "punpckldq %%mm3, %%mm0 \n\t"
2150 "punpckhdq %%mm3, %%mm4 \n\t"
2151 "paddd %%mm4, %%mm0 \n\t"
2152 "psrad $7, %%mm0 \n\t"
2153 "packssdw %%mm0, %%mm0 \n\t"
2154 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2155 "add $4, %%"REG_BP" \n\t"
2158 "pop %%"REG_BP" \n\t"
2160 "pop %%"REG_b" \n\t"
2163 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* Generic filterSize (multiple of 4): inner loop over the coefficient
 * row, compared against 'offset' (src+filterSize) as the end marker;
 * accumulates two output samples in mm4/mm5. */
2169 const uint8_t *offset = src+filterSize;
2170 x86_reg counter= -2*dstW;
2171 //filter-= counter*filterSize/2;
2172 filterPos-= counter/2;
2175 "pxor %%mm7, %%mm7 \n\t"
2178 "mov %2, %%"REG_c" \n\t"
2179 "movzwl (%%"REG_c", %0), %%eax \n\t"
2180 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2181 "mov %5, %%"REG_c" \n\t"
2182 "pxor %%mm4, %%mm4 \n\t"
2183 "pxor %%mm5, %%mm5 \n\t"
2185 "movq (%1), %%mm1 \n\t"
2186 "movq (%1, %6), %%mm3 \n\t"
2187 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2188 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2189 "punpcklbw %%mm7, %%mm0 \n\t"
2190 "punpcklbw %%mm7, %%mm2 \n\t"
2191 "pmaddwd %%mm1, %%mm0 \n\t"
2192 "pmaddwd %%mm2, %%mm3 \n\t"
2193 "paddd %%mm3, %%mm5 \n\t"
2194 "paddd %%mm0, %%mm4 \n\t"
2196 "add $4, %%"REG_c" \n\t"
2197 "cmp %4, %%"REG_c" \n\t"
2200 "movq %%mm4, %%mm0 \n\t"
2201 "punpckldq %%mm5, %%mm4 \n\t"
2202 "punpckhdq %%mm5, %%mm0 \n\t"
2203 "paddd %%mm0, %%mm4 \n\t"
2204 "psrad $7, %%mm4 \n\t"
2205 "packssdw %%mm4, %%mm4 \n\t"
2206 "mov %3, %%"REG_a" \n\t"
2207 "movd %%mm4, (%%"REG_a", %0) \n\t"
2211 : "+r" (counter), "+r" (filter)
2212 : "m" (filterPos), "m" (dst), "m"(offset),
2213 "m" (src), "r" ((x86_reg)filterSize*2)
2214 : "%"REG_a, "%"REG_c, "%"REG_d
/* Non-MMX builds: AltiVec helper if available, else portable C. */
2218 #if COMPILE_TEMPLATE_ALTIVEC
2219 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
/* Plain C reference loop: clamp to int16 max since the bicubic filter
 * can overflow (see comment below). */
2222 for (i=0; i<dstW; i++) {
2224 int srcPos= filterPos[i];
2226 //printf("filterPos: %d\n", filterPos[i]);
2227 for (j=0; j<filterSize; j++) {
2228 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2229 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2231 //filter += hFilterSize;
2232 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2235 #endif /* COMPILE_TEMPLATE_ALTIVEC */
2236 #endif /* COMPILE_MMX */
2239 //FIXME all pal and rgb srcFormats could do this convertion as well
2240 //FIXME all scalers more complex than bilinear could do half of this transform
/* Expand limited-range (MPEG) chroma to full-range (JPEG) in place, using
 * integer fixed-point arithmetic on the 15-bit intermediate samples.
 * U lives at dst[0..width) and V at dst[VOFW..VOFW+width) (the horizontal
 * scaler's plane layout). FFMIN clamps the input so the multiply cannot
 * overflow int range. */
2241 static void RENAME(chrRangeToJpeg)(int16_t *dst, int width)
2244 for (i = 0; i < width; i++) {
2245 dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2246 dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
/* Inverse of chrRangeToJpeg: compress full-range (JPEG) chroma back to
 * limited (MPEG) range, in place, on both the U (dst) and V (dst+VOFW)
 * halves of the intermediate buffer. */
2249 static void RENAME(chrRangeFromJpeg)(int16_t *dst, int width)
2252 for (i = 0; i < width; i++) {
2253 dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
2254 dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
/* Expand limited-range (MPEG) luma to full-range (JPEG) in place; the
 * FFMIN clamp bounds the fixed-point multiply. */
2257 static void RENAME(lumRangeToJpeg)(int16_t *dst, int width)
2260 for (i = 0; i < width; i++)
2261 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
/* Inverse of lumRangeToJpeg: compress full-range (JPEG) luma back to
 * limited (MPEG) range, in place. */
2263 static void RENAME(lumRangeFromJpeg)(int16_t *dst, int width)
2266 for (i = 0; i < width; i++)
2267 dst[i] = (dst[i]*14071 + 33561947)>>14;
/* Inline-asm fragment for one fast-bilinear interpolation step:
 * given edi = src[xx] and esi = src[xx+1] with ecx = xalpha (16.16
 * fraction), computes the 7-bit-fraction interpolated sample in esi.
 * Used by the hyscale_fast/hcscale_fast asm loops below. */
2270 #define FAST_BILINEAR_X86 \
2271 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2272 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2273 "shll $16, %%edi \n\t" \
2274 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2275 "mov %1, %%"REG_D"\n\t" \
2276 "shrl $9, %%esi \n\t" \
/* Fast bilinear horizontal scale of one luma line into the 16-bit
 * intermediate buffer (values scaled by 128, i.e. 7 fractional bits).
 * MMX2 builds jump into runtime-generated filter code (c->lumMmx2FilterCode)
 * via CALL_MMX2_FILTER_CODE; otherwise a plain x86 asm loop (or the C loop
 * at the bottom on non-x86) steps a 16.16 fixed-point position xpos by
 * xInc per output sample. */
2278 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2279 long dstWidth, const uint8_t *src, int srcW,
2283 #if COMPILE_TEMPLATE_MMX2
2284 int32_t *filterPos = c->hLumFilterPos;
2285 int16_t *filter = c->hLumFilter;
2286 int canMMX2BeUsed = c->canMMX2BeUsed;
2287 void *mmx2FilterCode= c->lumMmx2FilterCode;
2290 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2292 if (canMMX2BeUsed) {
/* save ebx/rbx manually (callee-saved / PIC register) */
2295 "mov %%"REG_b", %5 \n\t"
2297 "pxor %%mm7, %%mm7 \n\t"
2298 "mov %0, %%"REG_c" \n\t"
2299 "mov %1, %%"REG_D" \n\t"
2300 "mov %2, %%"REG_d" \n\t"
2301 "mov %3, %%"REG_b" \n\t"
2302 "xor %%"REG_a", %%"REG_a" \n\t" // i
2303 PREFETCH" (%%"REG_c") \n\t"
2304 PREFETCH" 32(%%"REG_c") \n\t"
2305 PREFETCH" 64(%%"REG_c") \n\t"
/* Call one chunk of the generated scaler code, then advance the source
 * pointer / dest pointer by the amounts the generated code reports.
 * Two variants: 64-bit keeps the call target in a register (esi),
 * 32-bit adds the table entry to REG_c directly. */
2309 #define CALL_MMX2_FILTER_CODE \
2310 "movl (%%"REG_b"), %%esi \n\t"\
2312 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2313 "add %%"REG_S", %%"REG_c" \n\t"\
2314 "add %%"REG_a", %%"REG_D" \n\t"\
2315 "xor %%"REG_a", %%"REG_a" \n\t"\
2319 #define CALL_MMX2_FILTER_CODE \
2320 "movl (%%"REG_b"), %%esi \n\t"\
2322 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2323 "add %%"REG_a", %%"REG_D" \n\t"\
2324 "xor %%"REG_a", %%"REG_a" \n\t"\
2326 #endif /* ARCH_X86_64 */
/* eight chunks cover the whole output line */
2328 CALL_MMX2_FILTER_CODE
2329 CALL_MMX2_FILTER_CODE
2330 CALL_MMX2_FILTER_CODE
2331 CALL_MMX2_FILTER_CODE
2332 CALL_MMX2_FILTER_CODE
2333 CALL_MMX2_FILTER_CODE
2334 CALL_MMX2_FILTER_CODE
2335 CALL_MMX2_FILTER_CODE
2338 "mov %5, %%"REG_b" \n\t"
2340 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
2341 "m" (mmx2FilterCode)
2345 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* The generated code may read past srcW-1; re-fill the tail with the
 * replicated last source pixel (times 128 for the 7-bit fraction). */
2350 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2352 #endif /* COMPILE_TEMPLATE_MMX2 */
/* Non-MMX2 x86 path: integer/carry trick — adding the 16-bit fraction
 * xInc_mask to cx sets CF, and 'adc' folds that carry into the integer
 * source index xx. Two output samples per iteration. */
2353 x86_reg xInc_shr16 = xInc >> 16;
2354 uint16_t xInc_mask = xInc & 0xffff;
2355 x86_reg dstWidth_reg = dstWidth;
2356 //NO MMX just normal asm ...
2358 "xor %%"REG_a", %%"REG_a" \n\t" // i
2359 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2360 "xorl %%ecx, %%ecx \n\t" // xalpha
2363 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2364 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2366 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2367 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2368 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2370 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2371 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2373 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2374 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2375 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2378 "add $2, %%"REG_a" \n\t"
2379 "cmp %2, %%"REG_a" \n\t"
2383 :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
2384 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2386 #if COMPILE_TEMPLATE_MMX2
2387 } //if MMX2 can't be used
/* Portable C fallback: same 16.16 fixed-point walk, 7-bit blend. */
2391 unsigned int xpos=0;
2392 for (i=0;i<dstWidth;i++) {
2393 register unsigned int xx=xpos>>16;
2394 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2395 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2398 #endif /* ARCH_X86 */
2401 // *** horizontal scale Y line to temp buffer
/* Horizontally scale one luma (or alpha, when isAlpha) input line into the
 * 16-bit intermediate buffer 'dst':
 *   1. optionally convert the input to 8-bit Y via the per-format
 *      toYV12 callback (through formatConvBuffer),
 *   2. scale with either the exact hScale filter or the fast bilinear
 *      path, whichever the context selected at init time,
 *   3. optionally apply MPEG<->JPEG range conversion (luma only —
 *      convertRange is forced to NULL for alpha). */
2402 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2403 const int16_t *hLumFilter,
2404 const int16_t *hLumFilterPos, int hLumFilterSize,
2405 uint8_t *formatConvBuffer,
2406 uint32_t *pal, int isAlpha)
2408 void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2409 void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2411 src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2414 toYV12(formatConvBuffer, src, srcW, pal);
2415 src= formatConvBuffer;
2418 if (!c->hyscale_fast) {
2419 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2420 } else { // fast bilinear upscale / crap downscale
2421 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2425 convertRange(dst, dstWidth);
/* Fast bilinear horizontal scale of one chroma line pair: U from src1 into
 * dst[0..], V from src2 into dst[VOFW..]. Mirrors hyscale_fast: MMX2 builds
 * run the runtime-generated chroma filter code twice (once per plane, the
 * second pass offset by VOF in the destination); otherwise a plain x86 asm
 * loop or the C fallback does the 16.16 fixed-point walk. */
2428 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2429 long dstWidth, const uint8_t *src1,
2430 const uint8_t *src2, int srcW, int xInc)
2433 #if COMPILE_TEMPLATE_MMX2
2434 int32_t *filterPos = c->hChrFilterPos;
2435 int16_t *filter = c->hChrFilter;
2436 int canMMX2BeUsed = c->canMMX2BeUsed;
2437 void *mmx2FilterCode= c->chrMmx2FilterCode;
2440 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2442 if (canMMX2BeUsed) {
/* save ebx/rbx manually (callee-saved / PIC register) */
2445 "mov %%"REG_b", %6 \n\t"
2447 "pxor %%mm7, %%mm7 \n\t"
2448 "mov %0, %%"REG_c" \n\t"
2449 "mov %1, %%"REG_D" \n\t"
2450 "mov %2, %%"REG_d" \n\t"
2451 "mov %3, %%"REG_b" \n\t"
2452 "xor %%"REG_a", %%"REG_a" \n\t" // i
2453 PREFETCH" (%%"REG_c") \n\t"
2454 PREFETCH" 32(%%"REG_c") \n\t"
2455 PREFETCH" 64(%%"REG_c") \n\t"
/* first pass: U plane from src1 */
2457 CALL_MMX2_FILTER_CODE
2458 CALL_MMX2_FILTER_CODE
2459 CALL_MMX2_FILTER_CODE
2460 CALL_MMX2_FILTER_CODE
/* second pass: V plane from src2, destination offset by VOF */
2461 "xor %%"REG_a", %%"REG_a" \n\t" // i
2462 "mov %5, %%"REG_c" \n\t" // src
2463 "mov %1, %%"REG_D" \n\t" // buf1
2464 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2465 PREFETCH" (%%"REG_c") \n\t"
2466 PREFETCH" 32(%%"REG_c") \n\t"
2467 PREFETCH" 64(%%"REG_c") \n\t"
2469 CALL_MMX2_FILTER_CODE
2470 CALL_MMX2_FILTER_CODE
2471 CALL_MMX2_FILTER_CODE
2472 CALL_MMX2_FILTER_CODE
2475 "mov %6, %%"REG_b" \n\t"
2477 :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
2478 "m" (mmx2FilterCode), "m" (src2)
2482 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* re-fill tail samples the generated code may have read past srcW-1 */
2487 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2488 //printf("%d %d %d\n", dstWidth, i, srcW);
2489 dst[i] = src1[srcW-1]*128;
2490 dst[i+VOFW] = src2[srcW-1]*128;
2493 #endif /* COMPILE_TEMPLATE_MMX2 */
/* Non-MMX2 x86 path: same add/adc carry trick as hyscale_fast, but each
 * iteration interpolates one U sample and one V sample. */
2494 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2495 uint16_t xInc_mask = xInc & 0xffff;
2496 x86_reg dstWidth_reg = dstWidth;
2498 "xor %%"REG_a", %%"REG_a" \n\t" // i
2499 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2500 "xorl %%ecx, %%ecx \n\t" // xalpha
2503 "mov %0, %%"REG_S" \n\t"
2504 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2505 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2507 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2509 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2510 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2512 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2514 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2515 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2516 "add $1, %%"REG_a" \n\t"
2517 "cmp %2, %%"REG_a" \n\t"
2520 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2521 which is needed to support GCC 4.0. */
2522 #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2523 :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
2525 :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
2528 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2530 #if COMPILE_TEMPLATE_MMX2
2531 } //if MMX2 can't be used
/* Portable C fallback: two blend formulations are visible here —
 * NOTE(review): presumably selected by a preprocessor branch whose
 * directives are outside this excerpt. */
2535 unsigned int xpos=0;
2536 for (i=0;i<dstWidth;i++) {
2537 register unsigned int xx=xpos>>16;
2538 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2539 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2540 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2542 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2543 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2547 #endif /* ARCH_X86 */
/* Horizontally scale one pair of chroma input lines into the 16-bit
 * intermediate buffer: U at dst[0..], V at dst[VOFW..].
 *   1. optionally convert to planar 8-bit U/V via c->chrToYV12 (through
 *      formatConvBuffer, V half at +VOFW),
 *   2. scale with either the exact hScale filter (run once per plane) or
 *      the fast bilinear path,
 *   3. optionally apply MPEG<->JPEG chroma range conversion. */
2550 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2551 int srcW, int xInc, const int16_t *hChrFilter,
2552 const int16_t *hChrFilterPos, int hChrFilterSize,
2553 uint8_t *formatConvBuffer,
2557 src1 += c->chrSrcOffset;
2558 src2 += c->chrSrcOffset;
2561 c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2562 src1= formatConvBuffer;
2563 src2= formatConvBuffer+VOFW;
2566 if (!c->hcscale_fast) {
2567 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2568 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2569 } else { // fast bilinear upscale / crap downscale
2570 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2573 if (c->chrConvertRange)
2574 c->chrConvertRange(dst, dstWidth);
/* Compile-time switch for the ring-buffer debug traces in swScale():
 * with DEBUG_SWSCALE_BUFFERS == 0 the constant-false 'if' lets the
 * compiler drop every DEBUG_BUFFERS() call entirely. */
2577 #define DEBUG_SWSCALE_BUFFERS 0
2578 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
/* Main slice scaler. Consumes srcSliceH input lines starting at srcSliceY,
 * horizontally scales them into the lum/chr/alp ring buffers, then
 * vertically filters + converts as many complete output lines as the
 * buffered input allows. Returns the number of output lines produced
 * (dstY - lastDstY). Progress state (ring-buffer indices and the last
 * buffered input line per plane) is reloaded from and stored back into
 * the context so scaling can proceed slice by slice. */
2580 static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2581 int srcSliceH, uint8_t* dst[], int dstStride[])
2583 /* load a few things into local vars to make the code more readable? and faster */
2584 const int srcW= c->srcW;
2585 const int dstW= c->dstW;
2586 const int dstH= c->dstH;
2587 const int chrDstW= c->chrDstW;
2588 const int chrSrcW= c->chrSrcW;
2589 const int lumXInc= c->lumXInc;
2590 const int chrXInc= c->chrXInc;
2591 const enum PixelFormat dstFormat= c->dstFormat;
2592 const int flags= c->flags;
2593 int16_t *vLumFilterPos= c->vLumFilterPos;
2594 int16_t *vChrFilterPos= c->vChrFilterPos;
2595 int16_t *hLumFilterPos= c->hLumFilterPos;
2596 int16_t *hChrFilterPos= c->hChrFilterPos;
2597 int16_t *vLumFilter= c->vLumFilter;
2598 int16_t *vChrFilter= c->vChrFilter;
2599 int16_t *hLumFilter= c->hLumFilter;
2600 int16_t *hChrFilter= c->hChrFilter;
2601 int32_t *lumMmxFilter= c->lumMmxFilter;
2602 int32_t *chrMmxFilter= c->chrMmxFilter;
2603 int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2604 const int vLumFilterSize= c->vLumFilterSize;
2605 const int vChrFilterSize= c->vChrFilterSize;
2606 const int hLumFilterSize= c->hLumFilterSize;
2607 const int hChrFilterSize= c->hChrFilterSize;
2608 int16_t **lumPixBuf= c->lumPixBuf;
2609 int16_t **chrPixBuf= c->chrPixBuf;
2610 int16_t **alpPixBuf= c->alpPixBuf;
2611 const int vLumBufSize= c->vLumBufSize;
2612 const int vChrBufSize= c->vChrBufSize;
2613 uint8_t *formatConvBuffer= c->formatConvBuffer;
/* chroma slice bounds; the negated shift rounds the height up */
2614 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2615 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2617 uint32_t *pal=c->pal_yuv;
2619 /* vars which will change and which we need to store back in the context */
2621 int lumBufIndex= c->lumBufIndex;
2622 int chrBufIndex= c->chrBufIndex;
2623 int lastInLumBuf= c->lastInLumBuf;
2624 int lastInChrBuf= c->lastInChrBuf;
2626 if (isPacked(c->srcFormat)) {
2634 srcStride[3]= srcStride[0];
2636 srcStride[1]<<= c->vChrDrop;
2637 srcStride[2]<<= c->vChrDrop;
2639 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2640 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2641 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2642 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2643 srcSliceY, srcSliceH, dstY, dstH);
2644 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2645 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
2647 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2648 static int warnedAlready=0; //FIXME move this into the context perhaps
2649 if (flags & SWS_PRINT_INFO && !warnedAlready) {
2650 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2651 " ->cannot do aligned memory accesses anymore\n");
2656 /* Note the user might start scaling the picture in the middle so this
2657 will not get executed. This is not really intended but works
2658 currently, so people might do it. */
2659 if (srcSliceY ==0) {
/* Main loop: one output line per iteration until either the frame is
 * done or the current slice runs out of input lines. */
2669 for (;dstY < dstH; dstY++) {
2670 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2671 const int chrDstY= dstY>>c->chrDstVSubSample;
2672 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2673 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2674 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2676 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2677 const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2678 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2679 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2680 int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2681 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2684 //handle holes (FAST_BILINEAR & weird filters)
2685 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2686 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2687 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2688 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2690 DEBUG_BUFFERS("dstY: %d\n", dstY);
2691 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2692 firstLumSrcY, lastLumSrcY, lastInLumBuf);
2693 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2694 firstChrSrcY, lastChrSrcY, lastInChrBuf);
2696 // Do we have enough lines in this slice to output the dstY line
2697 enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
/* not enough input yet: just buffer everything this slice provides */
2699 if (!enough_lines) {
2700 lastLumSrcY = srcSliceY + srcSliceH - 1;
2701 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2702 DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2703 lastLumSrcY, lastChrSrcY);
2706 //Do horizontal scaling
2707 while(lastInLumBuf < lastLumSrcY) {
2708 const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2709 const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2711 assert(lumBufIndex < 2*vLumBufSize);
2712 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2713 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2714 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2715 hLumFilter, hLumFilterPos, hLumFilterSize,
2718 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2719 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2720 hLumFilter, hLumFilterPos, hLumFilterSize,
2724 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2725 lumBufIndex, lastInLumBuf);
2727 while(lastInChrBuf < lastChrSrcY) {
2728 const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2729 const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2731 assert(chrBufIndex < 2*vChrBufSize);
2732 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2733 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2734 //FIXME replace parameters through context struct (some at least)
2736 if (c->needs_hcscale)
2737 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2738 hChrFilter, hChrFilterPos, hChrFilterSize,
2742 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2743 chrBufIndex, lastInChrBuf);
2745 //wrap buf index around to stay inside the ring buffer
2746 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2747 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2749 break; //we can't output a dstY line so let's try with the next slice
/* per-line ordered-dither tables for the RGB output paths (15/16 bpp
 * use a different green dither than 555) */
2751 #if COMPILE_TEMPLATE_MMX
2752 c->blueDither= ff_dither8[dstY&1];
2753 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2754 c->greenDither= ff_dither8[dstY&1];
2756 c->greenDither= ff_dither4[dstY&1];
2757 c->redDither= ff_dither8[(dstY+1)&1];
/* Vertical filtering + output. MMX variants are used away from the
 * bottom two lines; the last lines fall through to the C path below
 * so the MMX code cannot read past the filter arrays' tails. */
2759 if (dstY < dstH-2) {
2760 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2761 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2762 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2763 #if COMPILE_TEMPLATE_MMX
/* Pack line pointers + coefficients into the layout the MMX vertical
 * filters expect: APCK_* interleaved records for SWS_ACCURATE_RND,
 * otherwise 4 int32 per tap (pointer split into low/high halves —
 * NOTE(review): the high half is only meaningful on 64-bit). */
2765 if (flags & SWS_ACCURATE_RND) {
2766 int s= APCK_SIZE / 8;
2767 for (i=0; i<vLumFilterSize; i+=2) {
2768 *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2769 *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2770 lumMmxFilter[s*i+APCK_COEF/4 ]=
2771 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2772 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2773 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2774 *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2775 *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2776 alpMmxFilter[s*i+APCK_COEF/4 ]=
2777 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2780 for (i=0; i<vChrFilterSize; i+=2) {
2781 *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2782 *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2783 chrMmxFilter[s*i+APCK_COEF/4 ]=
2784 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2785 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2788 for (i=0; i<vLumFilterSize; i++) {
2789 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2790 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2791 lumMmxFilter[4*i+2]=
2792 lumMmxFilter[4*i+3]=
2793 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2794 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2795 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2796 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2797 alpMmxFilter[4*i+2]=
2798 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2801 for (i=0; i<vChrFilterSize; i++) {
2802 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2803 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2804 chrMmxFilter[4*i+2]=
2805 chrMmxFilter[4*i+3]=
2806 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
/* dispatch by destination family: NV12/NV21, planar YUV/gray, packed */
2810 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2811 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2812 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2814 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2815 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2816 dest, uDest, dstW, chrDstW, dstFormat);
2817 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2818 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2819 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2820 if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
2822 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2823 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2824 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2826 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2827 const int16_t *lumBuf = lumSrcPtr[0];
2828 const int16_t *chrBuf= chrSrcPtr[0];
2829 const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2830 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2831 } else { //General YV12
2833 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2834 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2835 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2838 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2839 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
/* packed (RGB etc.) output: specialized 1-tap/2-tap paths, then the
 * general X-tap path */
2840 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2841 int chrAlpha= vChrFilter[2*dstY+1];
2842 if(flags & SWS_FULL_CHR_H_INT) {
2843 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2844 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2845 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2846 alpSrcPtr, dest, dstW, dstY);
2848 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2849 alpPixBuf ? *alpSrcPtr : NULL,
2850 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2852 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2853 int lumAlpha= vLumFilter[2*dstY+1];
2854 int chrAlpha= vChrFilter[2*dstY+1];
2856 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2858 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2859 if(flags & SWS_FULL_CHR_H_INT) {
2860 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2861 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2862 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2863 alpSrcPtr, dest, dstW, dstY);
2865 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2866 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2867 dest, dstW, lumAlpha, chrAlpha, dstY);
2869 } else { //general RGB
2870 if(flags & SWS_FULL_CHR_H_INT) {
2872 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2873 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2874 alpSrcPtr, dest, dstW, dstY);
2877 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2878 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2879 alpSrcPtr, dest, dstW, dstY);
/* bottom-of-frame path: same dispatch, C-only implementations */
2883 } else { // hmm looks like we can't use MMX here without overwriting this array's tail
2884 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2885 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2886 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2887 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2888 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2889 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2891 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2892 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2893 dest, uDest, dstW, chrDstW, dstFormat);
2894 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2895 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2896 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2897 if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
2899 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2900 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2901 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2905 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2906 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2907 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2910 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2911 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2912 if(flags & SWS_FULL_CHR_H_INT) {
2914 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2915 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2916 alpSrcPtr, dest, dstW, dstY);
2919 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2920 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2921 alpSrcPtr, dest, dstW, dstY);
/* destination wants an alpha plane but the source has none: fill
 * the produced rows with opaque (255) */
2927 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2928 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
/* flush non-temporal stores and leave MMX state clean for the FPU */
2930 #if COMPILE_TEMPLATE_MMX
2931 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
2932 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2933 if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
2934 else __asm__ volatile("emms" :::"memory");
2936 /* store changed local vars back in the context */
2938 c->lumBufIndex= lumBufIndex;
2939 c->chrBufIndex= chrBufIndex;
2940 c->lastInLumBuf= lastInLumBuf;
2941 c->lastInChrBuf= lastInChrBuf;
2943 return dstY - lastDstY;
2946 static void RENAME(sws_init_swScale)(SwsContext *c)
2948 enum PixelFormat srcFormat = c->srcFormat;
2950 c->yuv2nv12X = RENAME(yuv2nv12X );
2951 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2952 c->yuv2yuvX = RENAME(yuv2yuvX );
2953 c->yuv2packed1 = RENAME(yuv2packed1 );
2954 c->yuv2packed2 = RENAME(yuv2packed2 );
2955 c->yuv2packedX = RENAME(yuv2packedX );
2957 c->hScale = RENAME(hScale );
2959 #if COMPILE_TEMPLATE_MMX
2960 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2961 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2963 if (c->flags & SWS_FAST_BILINEAR)
2966 c->hyscale_fast = RENAME(hyscale_fast);
2967 c->hcscale_fast = RENAME(hcscale_fast);
2970 c->chrToYV12 = NULL;
2972 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
2973 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
2974 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
2975 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
2979 case PIX_FMT_BGR4_BYTE:
2980 case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
2981 case PIX_FMT_YUV420P9 : c->chrToYV12 = (void*)RENAME(yuv9ToUV ); break;
2982 case PIX_FMT_YUV422P10:
2983 case PIX_FMT_YUV420P10: c->chrToYV12 = (void*)RENAME(yuv10ToUV); break;
2984 case PIX_FMT_YUV420P16BE:
2985 case PIX_FMT_YUV422P16BE:
2986 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2987 case PIX_FMT_YUV420P16LE:
2988 case PIX_FMT_YUV422P16LE:
2989 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2991 if (c->chrSrcHSubSample) {
2993 case PIX_FMT_RGB48BE:
2994 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
2995 case PIX_FMT_BGR48BE:
2996 case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_half; break;
2997 case PIX_FMT_RGB32 : c->chrToYV12 = bgr32ToUV_half; break;
2998 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half; break;
2999 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
3000 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
3001 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
3002 case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV_half; break;
3003 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half; break;
3004 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
3005 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
3006 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
3010 case PIX_FMT_RGB48BE:
3011 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
3012 case PIX_FMT_BGR48BE:
3013 case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV; break;
3014 case PIX_FMT_RGB32 : c->chrToYV12 = bgr32ToUV; break;
3015 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV; break;
3016 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
3017 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
3018 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
3019 case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV; break;
3020 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV; break;
3021 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
3022 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
3023 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
3027 c->lumToYV12 = NULL;
3028 c->alpToYV12 = NULL;
3029 switch (srcFormat) {
3030 case PIX_FMT_YUV420P9 : c->lumToYV12 = (void*)RENAME(yuv9ToY ); break;
3031 case PIX_FMT_YUV422P10:
3032 case PIX_FMT_YUV420P10: c->lumToYV12 = (void*)RENAME(yuv10ToY); break;
3033 case PIX_FMT_YUYV422 :
3034 case PIX_FMT_YUV420P16BE:
3035 case PIX_FMT_YUV422P16BE:
3036 case PIX_FMT_YUV444P16BE:
3037 case PIX_FMT_GRAY8A :
3038 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
3039 case PIX_FMT_UYVY422 :
3040 case PIX_FMT_YUV420P16LE:
3041 case PIX_FMT_YUV422P16LE:
3042 case PIX_FMT_YUV444P16LE:
3043 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
3044 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
3045 case PIX_FMT_BGR565 : c->lumToYV12 = bgr16ToY; break;
3046 case PIX_FMT_BGR555 : c->lumToYV12 = bgr15ToY; break;
3047 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
3048 case PIX_FMT_RGB565 : c->lumToYV12 = rgb16ToY; break;
3049 case PIX_FMT_RGB555 : c->lumToYV12 = rgb15ToY; break;
3053 case PIX_FMT_BGR4_BYTE:
3054 case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
3055 case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
3056 case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
3057 case PIX_FMT_RGB32 : c->lumToYV12 = bgr32ToY; break;
3058 case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY; break;
3059 case PIX_FMT_BGR32 : c->lumToYV12 = rgb32ToY; break;
3060 case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY; break;
3061 case PIX_FMT_RGB48BE:
3062 case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
3063 case PIX_FMT_BGR48BE:
3064 case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48ToY; break;
3067 switch (srcFormat) {
3068 case PIX_FMT_RGB32 :
3069 case PIX_FMT_RGB32_1:
3070 case PIX_FMT_BGR32 :
3071 case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
3072 case PIX_FMT_GRAY8A : c->alpToYV12 = RENAME(yuy2ToY); break;
3073 case PIX_FMT_PAL8 : c->alpToYV12 = palToA; break;
3077 switch (srcFormat) {
3078 case PIX_FMT_GRAY8A :
3079 c->alpSrcOffset = 1;
3081 case PIX_FMT_RGB32 :
3082 case PIX_FMT_BGR32 :
3083 c->alpSrcOffset = 3;
3085 case PIX_FMT_RGB48LE:
3086 case PIX_FMT_BGR48LE:
3087 c->lumSrcOffset = 1;
3088 c->chrSrcOffset = 1;
3089 c->alpSrcOffset = 1;
3093 if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
3095 c->lumConvertRange = RENAME(lumRangeFromJpeg);
3096 c->chrConvertRange = RENAME(chrRangeFromJpeg);
3098 c->lumConvertRange = RENAME(lumRangeToJpeg);
3099 c->chrConvertRange = RENAME(chrRangeToJpeg);
3103 if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
3104 srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
3105 c->needs_hcscale = 1;