2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of Libav.
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "swscale_template.h"
27 #if COMPILE_TEMPLATE_MMX2
28 #define PREFETCH "prefetchnta"
30 #define PREFETCH " # nop"
33 #if COMPILE_TEMPLATE_MMX2
34 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
36 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
38 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
40 #define YSCALEYUV2YV12X(x, offset, dest, width) \
42 "xor %%"REG_a", %%"REG_a" \n\t"\
43 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
44 "movq %%mm3, %%mm4 \n\t"\
45 "lea " offset "(%0), %%"REG_d" \n\t"\
46 "mov (%%"REG_d"), %%"REG_S" \n\t"\
47 ".p2align 4 \n\t" /* FIXME Unroll? */\
49 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
50 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
51 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
52 "add $16, %%"REG_d" \n\t"\
53 "mov (%%"REG_d"), %%"REG_S" \n\t"\
54 "test %%"REG_S", %%"REG_S" \n\t"\
55 "pmulhw %%mm0, %%mm2 \n\t"\
56 "pmulhw %%mm0, %%mm5 \n\t"\
57 "paddw %%mm2, %%mm3 \n\t"\
58 "paddw %%mm5, %%mm4 \n\t"\
60 "psraw $3, %%mm3 \n\t"\
61 "psraw $3, %%mm4 \n\t"\
62 "packuswb %%mm4, %%mm3 \n\t"\
63 MOVNTQ(%%mm3, (%1, %%REGa))\
64 "add $8, %%"REG_a" \n\t"\
65 "cmp %2, %%"REG_a" \n\t"\
66 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
67 "movq %%mm3, %%mm4 \n\t"\
68 "lea " offset "(%0), %%"REG_d" \n\t"\
69 "mov (%%"REG_d"), %%"REG_S" \n\t"\
71 :: "r" (&c->redDither),\
72 "r" (dest), "g" ((x86_reg)width)\
73 : "%"REG_a, "%"REG_d, "%"REG_S\
76 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "xor %%"REG_a", %%"REG_a" \n\t"\
80 "pxor %%mm4, %%mm4 \n\t"\
81 "pxor %%mm5, %%mm5 \n\t"\
82 "pxor %%mm6, %%mm6 \n\t"\
83 "pxor %%mm7, %%mm7 \n\t"\
84 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
88 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
89 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
90 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
91 "movq %%mm0, %%mm3 \n\t"\
92 "punpcklwd %%mm1, %%mm0 \n\t"\
93 "punpckhwd %%mm1, %%mm3 \n\t"\
94 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
95 "pmaddwd %%mm1, %%mm0 \n\t"\
96 "pmaddwd %%mm1, %%mm3 \n\t"\
97 "paddd %%mm0, %%mm4 \n\t"\
98 "paddd %%mm3, %%mm5 \n\t"\
99 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
100 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
101 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
102 "test %%"REG_S", %%"REG_S" \n\t"\
103 "movq %%mm2, %%mm0 \n\t"\
104 "punpcklwd %%mm3, %%mm2 \n\t"\
105 "punpckhwd %%mm3, %%mm0 \n\t"\
106 "pmaddwd %%mm1, %%mm2 \n\t"\
107 "pmaddwd %%mm1, %%mm0 \n\t"\
108 "paddd %%mm2, %%mm6 \n\t"\
109 "paddd %%mm0, %%mm7 \n\t"\
111 "psrad $16, %%mm4 \n\t"\
112 "psrad $16, %%mm5 \n\t"\
113 "psrad $16, %%mm6 \n\t"\
114 "psrad $16, %%mm7 \n\t"\
115 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
116 "packssdw %%mm5, %%mm4 \n\t"\
117 "packssdw %%mm7, %%mm6 \n\t"\
118 "paddw %%mm0, %%mm4 \n\t"\
119 "paddw %%mm0, %%mm6 \n\t"\
120 "psraw $3, %%mm4 \n\t"\
121 "psraw $3, %%mm6 \n\t"\
122 "packuswb %%mm6, %%mm4 \n\t"\
123 MOVNTQ(%%mm4, (%1, %%REGa))\
124 "add $8, %%"REG_a" \n\t"\
125 "cmp %2, %%"REG_a" \n\t"\
126 "lea " offset "(%0), %%"REG_d" \n\t"\
127 "pxor %%mm4, %%mm4 \n\t"\
128 "pxor %%mm5, %%mm5 \n\t"\
129 "pxor %%mm6, %%mm6 \n\t"\
130 "pxor %%mm7, %%mm7 \n\t"\
131 "mov (%%"REG_d"), %%"REG_S" \n\t"\
133 :: "r" (&c->redDither),\
134 "r" (dest), "g" ((x86_reg)width)\
135 : "%"REG_a, "%"REG_d, "%"REG_S\
138 #define YSCALEYUV2YV121 \
139 "mov %2, %%"REG_a" \n\t"\
140 ".p2align 4 \n\t" /* FIXME Unroll? */\
142 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
143 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
144 "psraw $7, %%mm0 \n\t"\
145 "psraw $7, %%mm1 \n\t"\
146 "packuswb %%mm1, %%mm0 \n\t"\
147 MOVNTQ(%%mm0, (%1, %%REGa))\
148 "add $8, %%"REG_a" \n\t"\
151 #define YSCALEYUV2YV121_ACCURATE \
152 "mov %2, %%"REG_a" \n\t"\
153 "pcmpeqw %%mm7, %%mm7 \n\t"\
154 "psrlw $15, %%mm7 \n\t"\
155 "psllw $6, %%mm7 \n\t"\
156 ".p2align 4 \n\t" /* FIXME Unroll? */\
158 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
159 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
160 "paddsw %%mm7, %%mm0 \n\t"\
161 "paddsw %%mm7, %%mm1 \n\t"\
162 "psraw $7, %%mm0 \n\t"\
163 "psraw $7, %%mm1 \n\t"\
164 "packuswb %%mm1, %%mm0 \n\t"\
165 MOVNTQ(%%mm0, (%1, %%REGa))\
166 "add $8, %%"REG_a" \n\t"\
170 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
171 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
172 "r" (dest), "m" (dstW_reg),
173 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
174 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
176 #define YSCALEYUV2PACKEDX_UV \
178 "xor %%"REG_a", %%"REG_a" \n\t"\
182 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
183 "mov (%%"REG_d"), %%"REG_S" \n\t"\
184 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
185 "movq %%mm3, %%mm4 \n\t"\
188 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
189 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
190 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
191 "add $16, %%"REG_d" \n\t"\
192 "mov (%%"REG_d"), %%"REG_S" \n\t"\
193 "pmulhw %%mm0, %%mm2 \n\t"\
194 "pmulhw %%mm0, %%mm5 \n\t"\
195 "paddw %%mm2, %%mm3 \n\t"\
196 "paddw %%mm5, %%mm4 \n\t"\
197 "test %%"REG_S", %%"REG_S" \n\t"\
200 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
201 "lea "offset"(%0), %%"REG_d" \n\t"\
202 "mov (%%"REG_d"), %%"REG_S" \n\t"\
203 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
204 "movq "#dst1", "#dst2" \n\t"\
207 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
208 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
209 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
210 "add $16, %%"REG_d" \n\t"\
211 "mov (%%"REG_d"), %%"REG_S" \n\t"\
212 "pmulhw "#coeff", "#src1" \n\t"\
213 "pmulhw "#coeff", "#src2" \n\t"\
214 "paddw "#src1", "#dst1" \n\t"\
215 "paddw "#src2", "#dst2" \n\t"\
216 "test %%"REG_S", %%"REG_S" \n\t"\
219 #define YSCALEYUV2PACKEDX \
220 YSCALEYUV2PACKEDX_UV \
221 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
223 #define YSCALEYUV2PACKEDX_END \
224 :: "r" (&c->redDither), \
225 "m" (dummy), "m" (dummy), "m" (dummy),\
226 "r" (dest), "m" (dstW_reg) \
227 : "%"REG_a, "%"REG_d, "%"REG_S \
230 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
232 "xor %%"REG_a", %%"REG_a" \n\t"\
236 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
237 "mov (%%"REG_d"), %%"REG_S" \n\t"\
238 "pxor %%mm4, %%mm4 \n\t"\
239 "pxor %%mm5, %%mm5 \n\t"\
240 "pxor %%mm6, %%mm6 \n\t"\
241 "pxor %%mm7, %%mm7 \n\t"\
244 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
245 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
246 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
247 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
248 "movq %%mm0, %%mm3 \n\t"\
249 "punpcklwd %%mm1, %%mm0 \n\t"\
250 "punpckhwd %%mm1, %%mm3 \n\t"\
251 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
252 "pmaddwd %%mm1, %%mm0 \n\t"\
253 "pmaddwd %%mm1, %%mm3 \n\t"\
254 "paddd %%mm0, %%mm4 \n\t"\
255 "paddd %%mm3, %%mm5 \n\t"\
256 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
257 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
258 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
259 "test %%"REG_S", %%"REG_S" \n\t"\
260 "movq %%mm2, %%mm0 \n\t"\
261 "punpcklwd %%mm3, %%mm2 \n\t"\
262 "punpckhwd %%mm3, %%mm0 \n\t"\
263 "pmaddwd %%mm1, %%mm2 \n\t"\
264 "pmaddwd %%mm1, %%mm0 \n\t"\
265 "paddd %%mm2, %%mm6 \n\t"\
266 "paddd %%mm0, %%mm7 \n\t"\
268 "psrad $16, %%mm4 \n\t"\
269 "psrad $16, %%mm5 \n\t"\
270 "psrad $16, %%mm6 \n\t"\
271 "psrad $16, %%mm7 \n\t"\
272 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
273 "packssdw %%mm5, %%mm4 \n\t"\
274 "packssdw %%mm7, %%mm6 \n\t"\
275 "paddw %%mm0, %%mm4 \n\t"\
276 "paddw %%mm0, %%mm6 \n\t"\
277 "movq %%mm4, "U_TEMP"(%0) \n\t"\
278 "movq %%mm6, "V_TEMP"(%0) \n\t"\
280 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
281 "lea "offset"(%0), %%"REG_d" \n\t"\
282 "mov (%%"REG_d"), %%"REG_S" \n\t"\
283 "pxor %%mm1, %%mm1 \n\t"\
284 "pxor %%mm5, %%mm5 \n\t"\
285 "pxor %%mm7, %%mm7 \n\t"\
286 "pxor %%mm6, %%mm6 \n\t"\
289 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
290 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
291 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
292 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
293 "movq %%mm0, %%mm3 \n\t"\
294 "punpcklwd %%mm4, %%mm0 \n\t"\
295 "punpckhwd %%mm4, %%mm3 \n\t"\
296 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
297 "pmaddwd %%mm4, %%mm0 \n\t"\
298 "pmaddwd %%mm4, %%mm3 \n\t"\
299 "paddd %%mm0, %%mm1 \n\t"\
300 "paddd %%mm3, %%mm5 \n\t"\
301 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
302 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
303 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
304 "test %%"REG_S", %%"REG_S" \n\t"\
305 "movq %%mm2, %%mm0 \n\t"\
306 "punpcklwd %%mm3, %%mm2 \n\t"\
307 "punpckhwd %%mm3, %%mm0 \n\t"\
308 "pmaddwd %%mm4, %%mm2 \n\t"\
309 "pmaddwd %%mm4, %%mm0 \n\t"\
310 "paddd %%mm2, %%mm7 \n\t"\
311 "paddd %%mm0, %%mm6 \n\t"\
313 "psrad $16, %%mm1 \n\t"\
314 "psrad $16, %%mm5 \n\t"\
315 "psrad $16, %%mm7 \n\t"\
316 "psrad $16, %%mm6 \n\t"\
317 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
318 "packssdw %%mm5, %%mm1 \n\t"\
319 "packssdw %%mm6, %%mm7 \n\t"\
320 "paddw %%mm0, %%mm1 \n\t"\
321 "paddw %%mm0, %%mm7 \n\t"\
322 "movq "U_TEMP"(%0), %%mm3 \n\t"\
323 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* High-accuracy (pmaddwd-based) multi-tap vertical scaler for packed output:
 * runs the chroma (U/V) pass, then the luma pass using the luma filter
 * coefficient table at LUM_MMX_FILTER_OFFSET. */
325 #define YSCALEYUV2PACKEDX_ACCURATE \
326     YSCALEYUV2PACKEDX_ACCURATE_UV \
327     YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
329 #define YSCALEYUV2RGBX \
330 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
331 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
332 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
333 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
334 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
335 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
336 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
337 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
338 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
339 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
340 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
341 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
342 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
343 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
344 "paddw %%mm3, %%mm4 \n\t"\
345 "movq %%mm2, %%mm0 \n\t"\
346 "movq %%mm5, %%mm6 \n\t"\
347 "movq %%mm4, %%mm3 \n\t"\
348 "punpcklwd %%mm2, %%mm2 \n\t"\
349 "punpcklwd %%mm5, %%mm5 \n\t"\
350 "punpcklwd %%mm4, %%mm4 \n\t"\
351 "paddw %%mm1, %%mm2 \n\t"\
352 "paddw %%mm1, %%mm5 \n\t"\
353 "paddw %%mm1, %%mm4 \n\t"\
354 "punpckhwd %%mm0, %%mm0 \n\t"\
355 "punpckhwd %%mm6, %%mm6 \n\t"\
356 "punpckhwd %%mm3, %%mm3 \n\t"\
357 "paddw %%mm7, %%mm0 \n\t"\
358 "paddw %%mm7, %%mm6 \n\t"\
359 "paddw %%mm7, %%mm3 \n\t"\
360 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
361 "packuswb %%mm0, %%mm2 \n\t"\
362 "packuswb %%mm6, %%mm5 \n\t"\
363 "packuswb %%mm3, %%mm4 \n\t"\
/* Two-line (bilinear) vertical blend of luma (buf0/buf1) and chroma
 * (uvbuf0/uvbuf1) for packed YUV output.  The filter coefficients stored at
 * CHR/LUM_MMX_FILTER_OFFSET+8 are pre-shifted right by 3 on entry. */
365 #define REAL_YSCALEYUV2PACKED(index, c) \
366     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
367     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
368     "psraw                $3, %%mm0 \n\t"\
369     "psraw                $3, %%mm1 \n\t"\
370     "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
371     "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
372     "xor "#index", "#index" \n\t"\
375     "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
376     "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
377     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
378     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
379     "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
380     "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
381     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
382     "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
383     "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
384     "psraw                $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
385     "psraw                $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
386     "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
387     "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
388     "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
389     "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
390     "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
391     "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
392     "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
393     "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
394     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
395     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
396     "psraw                $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
397     "psraw                $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
398     "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
399     "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
401 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
403 #define REAL_YSCALEYUV2RGB_UV(index, c) \
404 "xor "#index", "#index" \n\t"\
407 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
408 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
409 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
410 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
411 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
412 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
413 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
414 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
415 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
416 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
417 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
418 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
419 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
420 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
421 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
422 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
423 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
424 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
425 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
426 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
428 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
429 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
430 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
431 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
432 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
433 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
434 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
435 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
436 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
437 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
438 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
439 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
440 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
442 #define REAL_YSCALEYUV2RGB_COEFF(c) \
443 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
444 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
445 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
446 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
447 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
448 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
449 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
450 "paddw %%mm3, %%mm4 \n\t"\
451 "movq %%mm2, %%mm0 \n\t"\
452 "movq %%mm5, %%mm6 \n\t"\
453 "movq %%mm4, %%mm3 \n\t"\
454 "punpcklwd %%mm2, %%mm2 \n\t"\
455 "punpcklwd %%mm5, %%mm5 \n\t"\
456 "punpcklwd %%mm4, %%mm4 \n\t"\
457 "paddw %%mm1, %%mm2 \n\t"\
458 "paddw %%mm1, %%mm5 \n\t"\
459 "paddw %%mm1, %%mm4 \n\t"\
460 "punpckhwd %%mm0, %%mm0 \n\t"\
461 "punpckhwd %%mm6, %%mm6 \n\t"\
462 "punpckhwd %%mm3, %%mm3 \n\t"\
463 "paddw %%mm7, %%mm0 \n\t"\
464 "paddw %%mm7, %%mm6 \n\t"\
465 "paddw %%mm7, %%mm3 \n\t"\
466 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
467 "packuswb %%mm0, %%mm2 \n\t"\
468 "packuswb %%mm6, %%mm5 \n\t"\
469 "packuswb %%mm3, %%mm4 \n\t"\
471 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
/* Full bilinear YUV->RGB vertical pass: blend chroma, blend luma from
 * operands %0/%1 (buf0/buf1), then apply the colorspace coefficients. */
473 #define YSCALEYUV2RGB(index, c) \
474     REAL_YSCALEYUV2RGB_UV(index, c) \
475     REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
476     REAL_YSCALEYUV2RGB_COEFF(c)
478 #define REAL_YSCALEYUV2PACKED1(index, c) \
479 "xor "#index", "#index" \n\t"\
482 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
483 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
484 "psraw $7, %%mm3 \n\t" \
485 "psraw $7, %%mm4 \n\t" \
486 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
487 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
488 "psraw $7, %%mm1 \n\t" \
489 "psraw $7, %%mm7 \n\t" \
491 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
493 #define REAL_YSCALEYUV2RGB1(index, c) \
494 "xor "#index", "#index" \n\t"\
497 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
498 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
499 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
500 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
501 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
502 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
503 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
504 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
505 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
506 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
507 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
508 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
509 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
510 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
511 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
512 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
513 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
514 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
515 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
516 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
517 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
518 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
519 "paddw %%mm3, %%mm4 \n\t"\
520 "movq %%mm2, %%mm0 \n\t"\
521 "movq %%mm5, %%mm6 \n\t"\
522 "movq %%mm4, %%mm3 \n\t"\
523 "punpcklwd %%mm2, %%mm2 \n\t"\
524 "punpcklwd %%mm5, %%mm5 \n\t"\
525 "punpcklwd %%mm4, %%mm4 \n\t"\
526 "paddw %%mm1, %%mm2 \n\t"\
527 "paddw %%mm1, %%mm5 \n\t"\
528 "paddw %%mm1, %%mm4 \n\t"\
529 "punpckhwd %%mm0, %%mm0 \n\t"\
530 "punpckhwd %%mm6, %%mm6 \n\t"\
531 "punpckhwd %%mm3, %%mm3 \n\t"\
532 "paddw %%mm7, %%mm0 \n\t"\
533 "paddw %%mm7, %%mm6 \n\t"\
534 "paddw %%mm7, %%mm3 \n\t"\
535 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
536 "packuswb %%mm0, %%mm2 \n\t"\
537 "packuswb %%mm6, %%mm5 \n\t"\
538 "packuswb %%mm3, %%mm4 \n\t"\
540 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
542 #define REAL_YSCALEYUV2PACKED1b(index, c) \
543 "xor "#index", "#index" \n\t"\
546 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
547 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
548 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
549 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
550 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
551 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
552 "psrlw $8, %%mm3 \n\t" \
553 "psrlw $8, %%mm4 \n\t" \
554 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
555 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
556 "psraw $7, %%mm1 \n\t" \
557 "psraw $7, %%mm7 \n\t"
558 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
560 // single-luma-line variant that still interpolates chroma vertically
560 // (sums uvbuf0 and uvbuf1, then scales with >>5)
561 #define REAL_YSCALEYUV2RGB1b(index, c) \
562 "xor "#index", "#index" \n\t"\
565 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
566 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
567 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
568 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
569 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
570 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
571 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
572 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
573 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
574 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
575 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
576 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
577 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
578 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
579 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
580 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
581 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
582 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
583 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
584 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
585 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
586 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
587 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
588 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
589 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
590 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
591 "paddw %%mm3, %%mm4 \n\t"\
592 "movq %%mm2, %%mm0 \n\t"\
593 "movq %%mm5, %%mm6 \n\t"\
594 "movq %%mm4, %%mm3 \n\t"\
595 "punpcklwd %%mm2, %%mm2 \n\t"\
596 "punpcklwd %%mm5, %%mm5 \n\t"\
597 "punpcklwd %%mm4, %%mm4 \n\t"\
598 "paddw %%mm1, %%mm2 \n\t"\
599 "paddw %%mm1, %%mm5 \n\t"\
600 "paddw %%mm1, %%mm4 \n\t"\
601 "punpckhwd %%mm0, %%mm0 \n\t"\
602 "punpckhwd %%mm6, %%mm6 \n\t"\
603 "punpckhwd %%mm3, %%mm3 \n\t"\
604 "paddw %%mm7, %%mm0 \n\t"\
605 "paddw %%mm7, %%mm6 \n\t"\
606 "paddw %%mm7, %%mm3 \n\t"\
607 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
608 "packuswb %%mm0, %%mm2 \n\t"\
609 "packuswb %%mm6, %%mm5 \n\t"\
610 "packuswb %%mm3, %%mm4 \n\t"\
612 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* Load 8 alpha samples from abuf0 (operand %1), scale each down by >>7 and
 * pack them to 8 unsigned bytes in %%mm7 for the RGBA writeout. */
614 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
615     "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index  ] */\
616     "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
617     "psraw            $7, %%mm7 \n\t" /* abuf0[index  ] >>7 */\
618     "psraw            $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
619     "packuswb %%mm1, %%mm7 \n\t"
620 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
622 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
623 "movq "#b", "#q2" \n\t" /* B */\
624 "movq "#r", "#t" \n\t" /* R */\
625 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
626 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
627 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
628 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
629 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
630 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
631 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
632 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
633 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
634 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
636 MOVNTQ( q0, (dst, index, 4))\
637 MOVNTQ( b, 8(dst, index, 4))\
638 MOVNTQ( q2, 16(dst, index, 4))\
639 MOVNTQ( q3, 24(dst, index, 4))\
641 "add $8, "#index" \n\t"\
642 "cmp "#dstw", "#index" \n\t"\
644 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
646 #define REAL_WRITERGB16(dst, dstw, index) \
647 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
648 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
649 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
650 "psrlq $3, %%mm2 \n\t"\
652 "movq %%mm2, %%mm1 \n\t"\
653 "movq %%mm4, %%mm3 \n\t"\
655 "punpcklbw %%mm7, %%mm3 \n\t"\
656 "punpcklbw %%mm5, %%mm2 \n\t"\
657 "punpckhbw %%mm7, %%mm4 \n\t"\
658 "punpckhbw %%mm5, %%mm1 \n\t"\
660 "psllq $3, %%mm3 \n\t"\
661 "psllq $3, %%mm4 \n\t"\
663 "por %%mm3, %%mm2 \n\t"\
664 "por %%mm4, %%mm1 \n\t"\
666 MOVNTQ(%%mm2, (dst, index, 2))\
667 MOVNTQ(%%mm1, 8(dst, index, 2))\
669 "add $8, "#index" \n\t"\
670 "cmp "#dstw", "#index" \n\t"\
672 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
674 #define REAL_WRITERGB15(dst, dstw, index) \
675 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
676 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
677 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
678 "psrlq $3, %%mm2 \n\t"\
679 "psrlq $1, %%mm5 \n\t"\
681 "movq %%mm2, %%mm1 \n\t"\
682 "movq %%mm4, %%mm3 \n\t"\
684 "punpcklbw %%mm7, %%mm3 \n\t"\
685 "punpcklbw %%mm5, %%mm2 \n\t"\
686 "punpckhbw %%mm7, %%mm4 \n\t"\
687 "punpckhbw %%mm5, %%mm1 \n\t"\
689 "psllq $2, %%mm3 \n\t"\
690 "psllq $2, %%mm4 \n\t"\
692 "por %%mm3, %%mm2 \n\t"\
693 "por %%mm4, %%mm1 \n\t"\
695 MOVNTQ(%%mm2, (dst, index, 2))\
696 MOVNTQ(%%mm1, 8(dst, index, 2))\
698 "add $8, "#index" \n\t"\
699 "cmp "#dstw", "#index" \n\t"\
701 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
703 #define WRITEBGR24MMX(dst, dstw, index) \
704 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
705 "movq %%mm2, %%mm1 \n\t" /* B */\
706 "movq %%mm5, %%mm6 \n\t" /* R */\
707 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
708 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
709 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
710 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
711 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
712 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
713 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
714 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
715 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
716 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
718 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
719 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
720 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
721 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
723 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
724 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
725 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
726 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
728 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
729 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
730 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
731 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
733 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
734 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
735 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
736 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
737 MOVNTQ(%%mm0, (dst))\
739 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
740 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
741 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
742 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
743 MOVNTQ(%%mm6, 8(dst))\
745 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
746 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
747 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
748 MOVNTQ(%%mm5, 16(dst))\
750 "add $24, "#dst" \n\t"\
752 "add $8, "#index" \n\t"\
753 "cmp "#dstw", "#index" \n\t"\
756 #define WRITEBGR24MMX2(dst, dstw, index) \
757 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
758 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
759 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
760 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
761 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
762 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
764 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
765 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
766 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
768 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
769 "por %%mm1, %%mm6 \n\t"\
770 "por %%mm3, %%mm6 \n\t"\
771 MOVNTQ(%%mm6, (dst))\
773 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
774 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
775 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
776 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
778 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
779 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
780 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
782 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
783 "por %%mm3, %%mm6 \n\t"\
784 MOVNTQ(%%mm6, 8(dst))\
786 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
787 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
788 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
790 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
791 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
792 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
794 "por %%mm1, %%mm3 \n\t"\
795 "por %%mm3, %%mm6 \n\t"\
796 MOVNTQ(%%mm6, 16(dst))\
798 "add $24, "#dst" \n\t"\
800 "add $8, "#index" \n\t"\
801 "cmp "#dstw", "#index" \n\t"\
804 #if COMPILE_TEMPLATE_MMX2
806 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
809 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
812 #define REAL_WRITEYUY2(dst, dstw, index) \
813 "packuswb %%mm3, %%mm3 \n\t"\
814 "packuswb %%mm4, %%mm4 \n\t"\
815 "packuswb %%mm7, %%mm1 \n\t"\
816 "punpcklbw %%mm4, %%mm3 \n\t"\
817 "movq %%mm1, %%mm7 \n\t"\
818 "punpcklbw %%mm3, %%mm1 \n\t"\
819 "punpckhbw %%mm3, %%mm7 \n\t"\
821 MOVNTQ(%%mm1, (dst, index, 2))\
822 MOVNTQ(%%mm7, 8(dst, index, 2))\
824 "add $8, "#index" \n\t"\
825 "cmp "#dstw", "#index" \n\t"\
827 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
830 static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
831 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
832 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
835 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
836 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
838 if (CONFIG_SWSCALE_ALPHA && aDest) {
839 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
842 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
845 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
846 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
847 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
850 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
851 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
853 if (CONFIG_SWSCALE_ALPHA && aDest) {
854 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
857 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
/*
 * Unscaled (single-source, no vertical filter) store of up to four planes
 * -- alpha, luma, U, V -- with accurate rounding.  The per-plane source,
 * destination and width are staged in the src/dst/counter arrays; pointers
 * are pre-biased by the plane width so the asm can count an index upward
 * toward zero (negative-offset addressing).  U and V both come from the
 * shared chroma buffer, V at offset VOFW.
 * NOTE(review): the loop over plane index p and the null-plane skips are
 * on elided lines -- confirm against the full file.
 */
860 static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
861 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
864 const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
865 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
866 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
871 YSCALEYUV2YV121_ACCURATE
872 :: "r" (src[p]), "r" (dst[p] + counter[p]),
/*
 * Unscaled store of up to four planes (alpha, luma, U, V), fast rounding
 * variant of yuv2yuv1_ar.  Same pointer-biasing scheme: each source and
 * destination is offset by the plane width so the asm loop can run a
 * negative index up to zero.
 * NOTE(review): the asm-macro invocation, the plane loop and null-plane
 * skips are on elided lines -- confirm against the full file.
 */
880 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
881 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
884 const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
885 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
886 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
892 :: "r" (src[p]), "r" (dst[p] + counter[p]),
902 * vertical scale YV12 to RGB
/*
 * Vertical multi-tap filtering directly to a packed output format
 * (BGR32/BGR24/RGB555/RGB565/YUYV), accurate-rounding variant.  Dispatches
 * on c->dstFormat; each case is a YSCALEYUV2PACKEDX_ACCURATE asm block
 * followed by the format-specific WRITE* store macro.  Unsupported
 * formats fall through to the C implementation yuv2packedXinC().
 * NOTE(review): several case labels and the __asm__ statement openers are
 * on elided lines -- confirm against the full file.
 */
904 static inline void RENAME(yuv2packedX_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
905 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
906 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
909 x86_reg dstW_reg = dstW;
911 switch(c->dstFormat) {
913 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
/* 32-bit output with a real alpha plane: B/G/R are parked in the context
 * scratch slots while the alpha filter pass runs, then restored. */
914 YSCALEYUV2PACKEDX_ACCURATE
916 "movq %%mm2, "U_TEMP"(%0) \n\t"
917 "movq %%mm4, "V_TEMP"(%0) \n\t"
918 "movq %%mm5, "Y_TEMP"(%0) \n\t"
919 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
920 "movq "Y_TEMP"(%0), %%mm5 \n\t"
921 "psraw $3, %%mm1 \n\t"
922 "psraw $3, %%mm7 \n\t"
923 "packuswb %%mm7, %%mm1 \n\t"
924 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
926 YSCALEYUV2PACKEDX_END
/* No alpha plane: fill the alpha channel with all-ones (opaque). */
928 YSCALEYUV2PACKEDX_ACCURATE
930 "pcmpeqd %%mm7, %%mm7 \n\t"
931 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
933 YSCALEYUV2PACKEDX_END
/* 24-bit output: dest + 3*index computed via lea into REG_c. */
937 YSCALEYUV2PACKEDX_ACCURATE
939 "pxor %%mm7, %%mm7 \n\t"
940 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
941 "add %4, %%"REG_c" \n\t"
942 WRITEBGR24(%%REGc, %5, %%REGa)
945 :: "r" (&c->redDither),
946 "m" (dummy), "m" (dummy), "m" (dummy),
947 "r" (dest), "m" (dstW_reg)
948 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
/* 15-bit (555) output with ordered dither added before packing. */
952 YSCALEYUV2PACKEDX_ACCURATE
954 "pxor %%mm7, %%mm7 \n\t"
955 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
957 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
958 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
959 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
962 WRITERGB15(%4, %5, %%REGa)
963 YSCALEYUV2PACKEDX_END
/* 16-bit (565) output with ordered dither added before packing. */
966 YSCALEYUV2PACKEDX_ACCURATE
968 "pxor %%mm7, %%mm7 \n\t"
969 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
971 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
972 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
973 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
976 WRITERGB16(%4, %5, %%REGa)
977 YSCALEYUV2PACKEDX_END
979 case PIX_FMT_YUYV422:
980 YSCALEYUV2PACKEDX_ACCURATE
981 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
983 "psraw $3, %%mm3 \n\t"
984 "psraw $3, %%mm4 \n\t"
985 "psraw $3, %%mm1 \n\t"
986 "psraw $3, %%mm7 \n\t"
987 WRITEYUY2(%4, %5, %%REGa)
988 YSCALEYUV2PACKEDX_END
/* Default: fall back to the portable C path. */
992 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
993 chrFilter, chrSrc, chrFilterSize,
994 alpSrc, dest, dstW, dstY);
/*
 * Vertical multi-tap filtering directly to a packed output format, fast
 * (non-accurate rounding) sibling of yuv2packedX_ar.  Same dispatch on
 * c->dstFormat and the same WRITE* store macros; unsupported formats fall
 * through to yuv2packedXinC().
 * NOTE(review): the YSCALEYUV2PACKEDX openers for several cases sit on
 * elided lines -- confirm against the full file.
 */
997 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
998 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
999 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1002 x86_reg dstW_reg = dstW;
1004 switch(c->dstFormat) {
1006 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
/* 32-bit output with real alpha from the alpha filter pass. */
1009 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1010 "psraw $3, %%mm1 \n\t"
1011 "psraw $3, %%mm7 \n\t"
1012 "packuswb %%mm7, %%mm1 \n\t"
1013 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1014 YSCALEYUV2PACKEDX_END
/* No alpha plane: opaque alpha via all-ones register. */
1018 "pcmpeqd %%mm7, %%mm7 \n\t"
1019 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1020 YSCALEYUV2PACKEDX_END
/* 24-bit output: dest + 3*index computed via lea into REG_c. */
1026 "pxor %%mm7, %%mm7 \n\t"
1027 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1028 "add %4, %%"REG_c" \n\t"
1029 WRITEBGR24(%%REGc, %5, %%REGa)
1031 :: "r" (&c->redDither),
1032 "m" (dummy), "m" (dummy), "m" (dummy),
1033 "r" (dest), "m" (dstW_reg)
1034 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1037 case PIX_FMT_RGB555:
/* 15-bit output with ordered dither. */
1040 "pxor %%mm7, %%mm7 \n\t"
1041 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1043 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1044 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1045 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1048 WRITERGB15(%4, %5, %%REGa)
1049 YSCALEYUV2PACKEDX_END
1051 case PIX_FMT_RGB565:
/* 16-bit output with ordered dither. */
1054 "pxor %%mm7, %%mm7 \n\t"
1055 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1057 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1058 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1059 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1062 WRITERGB16(%4, %5, %%REGa)
1063 YSCALEYUV2PACKEDX_END
1065 case PIX_FMT_YUYV422:
1067 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1069 "psraw $3, %%mm3 \n\t"
1070 "psraw $3, %%mm4 \n\t"
1071 "psraw $3, %%mm1 \n\t"
1072 "psraw $3, %%mm7 \n\t"
1073 WRITEYUY2(%4, %5, %%REGa)
1074 YSCALEYUV2PACKEDX_END
/* Default: fall back to the portable C path. */
1078 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1079 chrFilter, chrSrc, chrFilterSize,
1080 alpSrc, dest, dstW, dstY);
1084 * vertical bilinear scale YV12 to RGB
/*
 * Vertical bilinear (two-tap, buf0/buf1 blended by yalpha, uvbuf0/uvbuf1
 * by uvalpha) scaling to a packed output format.  On 32-bit x86 the asm
 * is register-starved, so REG_b is spilled to the context (ESP_OFFSET)
 * and REG_BP is pushed around each block; %5 is &c->redDither, used as a
 * base for context scratch fields.  An x86-64-only path (using %%r8)
 * exists for the alpha BGR32 case.  Unsupported formats fall back to
 * yuv2packed2_c().
 * NOTE(review): the __asm__ openers, #if ARCH_X86_64 guards, case labels
 * and input-list tails sit on elided lines -- confirm against the full
 * file.
 */
1086 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1087 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1089 switch(c->dstFormat) {
1090 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1092 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
/* x86-64 variant: enough registers to pass abuf0/abuf1 directly. */
1095 YSCALEYUV2RGB(%%r8, %5)
1096 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1097 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1098 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1099 "packuswb %%mm7, %%mm1 \n\t"
1100 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1102 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1104 ,"r" (abuf0), "r" (abuf1)
/* 32-bit variant: stash the alpha buffers in the context so they can be
 * reloaded from U_TEMP/V_TEMP inside the asm. */
1108 *(const uint16_t **)(&c->u_temp)=abuf0;
1109 *(const uint16_t **)(&c->v_temp)=abuf1;
1111 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1112 "mov %4, %%"REG_b" \n\t"
1113 "push %%"REG_BP" \n\t"
1114 YSCALEYUV2RGB(%%REGBP, %5)
1117 "mov "U_TEMP"(%5), %0 \n\t"
1118 "mov "V_TEMP"(%5), %1 \n\t"
1119 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1120 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1121 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1122 "packuswb %%mm7, %%mm1 \n\t"
1125 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1126 "pop %%"REG_BP" \n\t"
1127 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1129 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* BGR32 without an alpha plane: opaque alpha via pcmpeqd. */
1135 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1136 "mov %4, %%"REG_b" \n\t"
1137 "push %%"REG_BP" \n\t"
1138 YSCALEYUV2RGB(%%REGBP, %5)
1139 "pcmpeqd %%mm7, %%mm7 \n\t"
1140 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1141 "pop %%"REG_BP" \n\t"
1142 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1144 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* BGR24. */
1151 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1152 "mov %4, %%"REG_b" \n\t"
1153 "push %%"REG_BP" \n\t"
1154 YSCALEYUV2RGB(%%REGBP, %5)
1155 "pxor %%mm7, %%mm7 \n\t"
1156 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1157 "pop %%"REG_BP" \n\t"
1158 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1159 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1163 case PIX_FMT_RGB555:
1165 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1166 "mov %4, %%"REG_b" \n\t"
1167 "push %%"REG_BP" \n\t"
1168 YSCALEYUV2RGB(%%REGBP, %5)
1169 "pxor %%mm7, %%mm7 \n\t"
1170 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1172 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1173 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1174 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1177 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1178 "pop %%"REG_BP" \n\t"
1179 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1181 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1185 case PIX_FMT_RGB565:
1187 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1188 "mov %4, %%"REG_b" \n\t"
1189 "push %%"REG_BP" \n\t"
1190 YSCALEYUV2RGB(%%REGBP, %5)
1191 "pxor %%mm7, %%mm7 \n\t"
1192 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1194 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1195 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1196 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1199 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1200 "pop %%"REG_BP" \n\t"
1201 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1202 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1206 case PIX_FMT_YUYV422:
1208 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1209 "mov %4, %%"REG_b" \n\t"
1210 "push %%"REG_BP" \n\t"
1211 YSCALEYUV2PACKED(%%REGBP, %5)
1212 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1213 "pop %%"REG_BP" \n\t"
1214 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1215 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* Default: fall back to the portable C path. */
1221 yuv2packed2_c(c, buf0, buf1, uvbuf0, uvbuf1, abuf0, abuf1,
1222 dest, dstW, yalpha, uvalpha, y);
1226 * YV12 to RGB without scaling or interpolating
/*
 * Convert one YV12 line to a packed format without vertical scaling or
 * interpolation of the luma; chroma is either taken from uvbuf0 alone
 * (uvalpha < 2048 -- faster, shifts chroma by half a pixel, see note
 * below) via the YSCALEYUV2*1 macros, or averaged from uvbuf0/uvbuf1 via
 * the YSCALEYUV2*1b macros.  Register handling matches yuv2packed2:
 * REG_b spilled to ESP_OFFSET, REG_BP pushed, %5 = &c->redDither.
 * Unsupported formats fall back to yuv2packed1_c().
 * NOTE(review): __asm__ openers, #if guards, some case labels and the
 * input-list tails sit on elided lines -- confirm against the full file.
 */
1228 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1229 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1231 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1233 if (flags&SWS_FULL_CHR_H_INT) {
/* Full-chroma-interpolation requested: delegate to the 2-tap path. */
1234 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1238 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1241 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
/* BGR32 with alpha taken from abuf0 (passed in "d"). */
1243 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1244 "mov %4, %%"REG_b" \n\t"
1245 "push %%"REG_BP" \n\t"
1246 YSCALEYUV2RGB1(%%REGBP, %5)
1247 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1248 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1249 "pop %%"REG_BP" \n\t"
1250 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1252 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* BGR32, opaque alpha. */
1257 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1258 "mov %4, %%"REG_b" \n\t"
1259 "push %%"REG_BP" \n\t"
1260 YSCALEYUV2RGB1(%%REGBP, %5)
1261 "pcmpeqd %%mm7, %%mm7 \n\t"
1262 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1263 "pop %%"REG_BP" \n\t"
1264 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1266 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* BGR24. */
1273 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1274 "mov %4, %%"REG_b" \n\t"
1275 "push %%"REG_BP" \n\t"
1276 YSCALEYUV2RGB1(%%REGBP, %5)
1277 "pxor %%mm7, %%mm7 \n\t"
1278 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1279 "pop %%"REG_BP" \n\t"
1280 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1282 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1286 case PIX_FMT_RGB555:
1288 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1289 "mov %4, %%"REG_b" \n\t"
1290 "push %%"REG_BP" \n\t"
1291 YSCALEYUV2RGB1(%%REGBP, %5)
1292 "pxor %%mm7, %%mm7 \n\t"
1293 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1295 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1296 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1297 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1299 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1300 "pop %%"REG_BP" \n\t"
1301 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1303 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1307 case PIX_FMT_RGB565:
1309 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1310 "mov %4, %%"REG_b" \n\t"
1311 "push %%"REG_BP" \n\t"
1312 YSCALEYUV2RGB1(%%REGBP, %5)
1313 "pxor %%mm7, %%mm7 \n\t"
1314 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1316 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1317 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1318 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1321 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1322 "pop %%"REG_BP" \n\t"
1323 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1325 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1329 case PIX_FMT_YUYV422:
1331 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1332 "mov %4, %%"REG_b" \n\t"
1333 "push %%"REG_BP" \n\t"
1334 YSCALEYUV2PACKED1(%%REGBP, %5)
1335 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1336 "pop %%"REG_BP" \n\t"
1337 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1339 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* uvalpha >= 2048: average the two chroma lines (…1b macro variants). */
1347 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1349 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1350 "mov %4, %%"REG_b" \n\t"
1351 "push %%"REG_BP" \n\t"
1352 YSCALEYUV2RGB1b(%%REGBP, %5)
1353 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1354 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1355 "pop %%"REG_BP" \n\t"
1356 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1358 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1363 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1364 "mov %4, %%"REG_b" \n\t"
1365 "push %%"REG_BP" \n\t"
1366 YSCALEYUV2RGB1b(%%REGBP, %5)
1367 "pcmpeqd %%mm7, %%mm7 \n\t"
1368 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1369 "pop %%"REG_BP" \n\t"
1370 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1372 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1379 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1380 "mov %4, %%"REG_b" \n\t"
1381 "push %%"REG_BP" \n\t"
1382 YSCALEYUV2RGB1b(%%REGBP, %5)
1383 "pxor %%mm7, %%mm7 \n\t"
1384 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1385 "pop %%"REG_BP" \n\t"
1386 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1388 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1392 case PIX_FMT_RGB555:
1394 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1395 "mov %4, %%"REG_b" \n\t"
1396 "push %%"REG_BP" \n\t"
1397 YSCALEYUV2RGB1b(%%REGBP, %5)
1398 "pxor %%mm7, %%mm7 \n\t"
1399 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1401 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1402 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1403 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1405 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1406 "pop %%"REG_BP" \n\t"
1407 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1409 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1413 case PIX_FMT_RGB565:
1415 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1416 "mov %4, %%"REG_b" \n\t"
1417 "push %%"REG_BP" \n\t"
1418 YSCALEYUV2RGB1b(%%REGBP, %5)
1419 "pxor %%mm7, %%mm7 \n\t"
1420 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1422 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1423 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1424 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1427 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1428 "pop %%"REG_BP" \n\t"
1429 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1431 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1435 case PIX_FMT_YUYV422:
1437 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1438 "mov %4, %%"REG_b" \n\t"
1439 "push %%"REG_BP" \n\t"
1440 YSCALEYUV2PACKED1b(%%REGBP, %5)
1441 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1442 "pop %%"REG_BP" \n\t"
1443 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1445 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* Default: fall back to the portable C path. */
1452 yuv2packed1_c(c, buf0, uvbuf0, uvbuf1, abuf0, dest,
1453 dstW, uvalpha, dstFormat, flags, y);
1456 //FIXME yuy2* can read up to 7 samples too much
/*
 * Extract the luma plane from packed YUY2 (YUYV) input: mask each 16-bit
 * Y/C pair down to its low byte with bm01010101, pack two quadwords and
 * store 8 luma bytes per iteration.  Uses the negative-index idiom: src
 * and dst are biased by the width and REG_a counts up from -width.
 * NOTE(review): the __asm__ opener, loop label/branch and clobber list
 * are on elided lines -- confirm against the full file.
 */
1458 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1461 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1462 "mov %0, %%"REG_a" \n\t"
1464 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1465 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1466 "pand %%mm2, %%mm0 \n\t"
1467 "pand %%mm2, %%mm1 \n\t"
1468 "packuswb %%mm1, %%mm0 \n\t"
1469 "movq %%mm0, (%2, %%"REG_a") \n\t"
1470 "add $8, %%"REG_a" \n\t"
1472 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
/*
 * Extract and deinterleave the chroma samples from packed YUY2 input:
 * shift out the luma bytes, then split the remaining UVUV stream into
 * separate U (via the bm01010101 mask) and V (via a further shift)
 * nibbles, storing 4 bytes to each plane per iteration.  src1 must equal
 * src2 (both views of the same packed buffer).
 * NOTE(review): the precondition is asserted only after the asm has run;
 * consider asserting before use.  The __asm__ opener, loop branch and
 * clobber list are on elided lines -- confirm against the full file.
 */
1477 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1480 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1481 "mov %0, %%"REG_a" \n\t"
1483 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1484 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1485 "psrlw $8, %%mm0 \n\t"
1486 "psrlw $8, %%mm1 \n\t"
1487 "packuswb %%mm1, %%mm0 \n\t"
1488 "movq %%mm0, %%mm1 \n\t"
1489 "psrlw $8, %%mm0 \n\t"
1490 "pand %%mm4, %%mm1 \n\t"
1491 "packuswb %%mm0, %%mm0 \n\t"
1492 "packuswb %%mm1, %%mm1 \n\t"
1493 "movd %%mm0, (%3, %%"REG_a") \n\t"
1494 "movd %%mm1, (%2, %%"REG_a") \n\t"
1495 "add $4, %%"REG_a" \n\t"
1497 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1500 assert(src1 == src2);
/*
 * Take the high byte of each little-endian 16-bit sample from two
 * independent source streams (src1 -> dstU, src2 -> dstV): shift right by
 * 8, pack, and store 8 bytes per plane per iteration.
 * NOTE(review): the __asm__ opener, loop branch and clobber list are on
 * elided lines -- confirm against the full file.
 */
1503 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1506 "mov %0, %%"REG_a" \n\t"
1508 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1509 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1510 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1511 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1512 "psrlw $8, %%mm0 \n\t"
1513 "psrlw $8, %%mm1 \n\t"
1514 "psrlw $8, %%mm2 \n\t"
1515 "psrlw $8, %%mm3 \n\t"
1516 "packuswb %%mm1, %%mm0 \n\t"
1517 "packuswb %%mm3, %%mm2 \n\t"
1518 "movq %%mm0, (%3, %%"REG_a") \n\t"
1519 "movq %%mm2, (%4, %%"REG_a") \n\t"
1520 "add $8, %%"REG_a" \n\t"
1522 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1527 /* This is almost identical to the previous function, and exists only because
1528 * yuy2ToY/yuy2ToUV(dst, src+1, ...) would have 100% unaligned accesses. */
1529 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1532 "mov %0, %%"REG_a" \n\t"
1534 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1535 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1536 "psrlw $8, %%mm0 \n\t"
1537 "psrlw $8, %%mm1 \n\t"
1538 "packuswb %%mm1, %%mm0 \n\t"
1539 "movq %%mm0, (%2, %%"REG_a") \n\t"
1540 "add $8, %%"REG_a" \n\t"
1542 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
/*
 * Extract and deinterleave chroma from packed UYVY input: the chroma
 * bytes are the low byte of each pair, so they are selected with the
 * bm01010101 mask first, then split into U and V exactly like yuy2ToUV.
 * src1 must equal src2.
 * NOTE(review): the precondition is asserted only after the asm has run;
 * consider asserting before use.  The __asm__ opener, loop branch and
 * clobber list are on elided lines -- confirm against the full file.
 */
1547 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1550 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1551 "mov %0, %%"REG_a" \n\t"
1553 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1554 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1555 "pand %%mm4, %%mm0 \n\t"
1556 "pand %%mm4, %%mm1 \n\t"
1557 "packuswb %%mm1, %%mm0 \n\t"
1558 "movq %%mm0, %%mm1 \n\t"
1559 "psrlw $8, %%mm0 \n\t"
1560 "pand %%mm4, %%mm1 \n\t"
1561 "packuswb %%mm0, %%mm0 \n\t"
1562 "packuswb %%mm1, %%mm1 \n\t"
1563 "movd %%mm0, (%3, %%"REG_a") \n\t"
1564 "movd %%mm1, (%2, %%"REG_a") \n\t"
1565 "add $4, %%"REG_a" \n\t"
1567 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1570 assert(src1 == src2);
/*
 * Take the low byte of each big-endian 16-bit sample (i.e. the high-order
 * byte of the value) from two independent streams (src1 -> dstU,
 * src2 -> dstV) using the bm01010101 mask; pack and store 8 bytes per
 * plane per iteration.  Counterpart of LEToUV, which shifts instead.
 * NOTE(review): the __asm__ opener, loop branch and clobber list are on
 * elided lines -- confirm against the full file.
 */
1573 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1576 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1577 "mov %0, %%"REG_a" \n\t"
1579 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1580 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1581 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1582 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1583 "pand %%mm4, %%mm0 \n\t"
1584 "pand %%mm4, %%mm1 \n\t"
1585 "pand %%mm4, %%mm2 \n\t"
1586 "pand %%mm4, %%mm3 \n\t"
1587 "packuswb %%mm1, %%mm0 \n\t"
1588 "packuswb %%mm3, %%mm2 \n\t"
1589 "movq %%mm0, (%3, %%"REG_a") \n\t"
1590 "movq %%mm2, (%4, %%"REG_a") \n\t"
1591 "add $8, %%"REG_a" \n\t"
1593 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
/*
 * Deinterleave one NV12/NV21-style packed chroma line: even bytes (masked
 * with bm01010101) go to dst1, odd bytes (shifted right by 8) to dst2,
 * 8 bytes per plane per iteration.  The nv12/nv21 wrappers below choose
 * which destination receives U and which receives V.
 * NOTE(review): the __asm__ opener, loop branch and clobber list are on
 * elided lines -- confirm against the full file.
 */
1598 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1599 const uint8_t *src, long width)
1602 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1603 "mov %0, %%"REG_a" \n\t"
1605 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1606 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1607 "movq %%mm0, %%mm2 \n\t"
1608 "movq %%mm1, %%mm3 \n\t"
1609 "pand %%mm4, %%mm0 \n\t"
1610 "pand %%mm4, %%mm1 \n\t"
1611 "psrlw $8, %%mm2 \n\t"
1612 "psrlw $8, %%mm3 \n\t"
1613 "packuswb %%mm1, %%mm0 \n\t"
1614 "packuswb %%mm3, %%mm2 \n\t"
1615 "movq %%mm0, (%2, %%"REG_a") \n\t"
1616 "movq %%mm2, (%3, %%"REG_a") \n\t"
1617 "add $8, %%"REG_a" \n\t"
1619 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
/* NV12 chroma: interleaved order is U,V -> even bytes to dstU, odd to
 * dstV.  src2 and unused are ignored. */
1624 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1625 const uint8_t *src1, const uint8_t *src2,
1626 long width, uint32_t *unused)
1628 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
/* NV21 chroma: interleaved order is V,U -> destinations swapped relative
 * to nv12ToUV.  src2 and unused are ignored. */
1631 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1632 const uint8_t *src1, const uint8_t *src2,
1633 long width, uint32_t *unused)
1635 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
/*
 * Convert packed 24-bit BGR or RGB to luma.  First loads the appropriate
 * coefficient pair (mm5/mm6) depending on srcFormat, then per iteration
 * unpacks 4 pixels' worth of bytes to words, multiplies with pmaddwd,
 * sums, adds the rounding/offset constant (mm4), shifts down by 15 and
 * packs 4 luma bytes to dst.
 * NOTE(review): the __asm__ openers, loop branch and pointer-advance
 * lines are elided from this listing -- confirm against the full file.
 */
1638 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1641 if(srcFormat == PIX_FMT_BGR24) {
1643 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1644 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1649 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1650 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1656 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1657 "mov %2, %%"REG_a" \n\t"
1658 "pxor %%mm7, %%mm7 \n\t"
1660 PREFETCH" 64(%0) \n\t"
/* Overlapping dword loads pick up two 3-byte pixels each. */
1661 "movd (%0), %%mm0 \n\t"
1662 "movd 2(%0), %%mm1 \n\t"
1663 "movd 6(%0), %%mm2 \n\t"
1664 "movd 8(%0), %%mm3 \n\t"
1666 "punpcklbw %%mm7, %%mm0 \n\t"
1667 "punpcklbw %%mm7, %%mm1 \n\t"
1668 "punpcklbw %%mm7, %%mm2 \n\t"
1669 "punpcklbw %%mm7, %%mm3 \n\t"
1670 "pmaddwd %%mm5, %%mm0 \n\t"
1671 "pmaddwd %%mm6, %%mm1 \n\t"
1672 "pmaddwd %%mm5, %%mm2 \n\t"
1673 "pmaddwd %%mm6, %%mm3 \n\t"
1674 "paddd %%mm1, %%mm0 \n\t"
1675 "paddd %%mm3, %%mm2 \n\t"
1676 "paddd %%mm4, %%mm0 \n\t"
1677 "paddd %%mm4, %%mm2 \n\t"
1678 "psrad $15, %%mm0 \n\t"
1679 "psrad $15, %%mm2 \n\t"
1680 "packssdw %%mm2, %%mm0 \n\t"
1681 "packuswb %%mm0, %%mm0 \n\t"
1682 "movd %%mm0, (%1, %%"REG_a") \n\t"
1683 "add $4, %%"REG_a" \n\t"
1686 : "r" (dst+width), "g" ((x86_reg)-width)
/*
 * Convert packed 24-bit BGR/RGB to U and V chroma.  The coefficient table
 * for the requested byte order is passed through operand %4
 * (ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]); its four quadwords are the
 * U/V pmaddwd coefficient pairs (the last cached in mm6).  Per iteration
 * two pixel pairs are unpacked, multiplied, offset by ff_bgr24toUVOffset,
 * shifted by 15 and packed to 4 bytes each of dstU and dstV.
 * NOTE(review): the __asm__ opener, loop branch and pointer-advance lines
 * are elided from this listing -- confirm against the full file.
 */
1691 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1694 "movq 24(%4), %%mm6 \n\t"
1695 "mov %3, %%"REG_a" \n\t"
1696 "pxor %%mm7, %%mm7 \n\t"
1698 PREFETCH" 64(%0) \n\t"
1699 "movd (%0), %%mm0 \n\t"
1700 "movd 2(%0), %%mm1 \n\t"
1701 "punpcklbw %%mm7, %%mm0 \n\t"
1702 "punpcklbw %%mm7, %%mm1 \n\t"
1703 "movq %%mm0, %%mm2 \n\t"
1704 "movq %%mm1, %%mm3 \n\t"
1705 "pmaddwd (%4), %%mm0 \n\t"
1706 "pmaddwd 8(%4), %%mm1 \n\t"
1707 "pmaddwd 16(%4), %%mm2 \n\t"
1708 "pmaddwd %%mm6, %%mm3 \n\t"
1709 "paddd %%mm1, %%mm0 \n\t"
1710 "paddd %%mm3, %%mm2 \n\t"
/* Second pixel pair of this iteration. */
1712 "movd 6(%0), %%mm1 \n\t"
1713 "movd 8(%0), %%mm3 \n\t"
1715 "punpcklbw %%mm7, %%mm1 \n\t"
1716 "punpcklbw %%mm7, %%mm3 \n\t"
1717 "movq %%mm1, %%mm4 \n\t"
1718 "movq %%mm3, %%mm5 \n\t"
1719 "pmaddwd (%4), %%mm1 \n\t"
1720 "pmaddwd 8(%4), %%mm3 \n\t"
1721 "pmaddwd 16(%4), %%mm4 \n\t"
1722 "pmaddwd %%mm6, %%mm5 \n\t"
1723 "paddd %%mm3, %%mm1 \n\t"
1724 "paddd %%mm5, %%mm4 \n\t"
1726 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1727 "paddd %%mm3, %%mm0 \n\t"
1728 "paddd %%mm3, %%mm2 \n\t"
1729 "paddd %%mm3, %%mm1 \n\t"
1730 "paddd %%mm3, %%mm4 \n\t"
1731 "psrad $15, %%mm0 \n\t"
1732 "psrad $15, %%mm2 \n\t"
1733 "psrad $15, %%mm1 \n\t"
1734 "psrad $15, %%mm4 \n\t"
1735 "packssdw %%mm1, %%mm0 \n\t"
1736 "packssdw %%mm4, %%mm2 \n\t"
1737 "packuswb %%mm0, %%mm0 \n\t"
1738 "packuswb %%mm2, %%mm2 \n\t"
1739 "movd %%mm0, (%1, %%"REG_a") \n\t"
1740 "movd %%mm2, (%2, %%"REG_a") \n\t"
1741 "add $4, %%"REG_a" \n\t"
1744 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
/* BGR24 -> luma: thin wrapper selecting the BGR byte order for the shared
 * MMX implementation.  unused is ignored. */
1749 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1751 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1754 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1756 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1757 assert(src1 == src2);
/* RGB24 -> luma: thin wrapper selecting the RGB byte order for the shared
 * MMX implementation.  unused is ignored. */
1760 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1762 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
/* RGB24 -> U/V chroma: thin wrapper selecting the RGB byte order for the
 * shared MMX implementation.  src2 is expected to alias src1 (see the
 * bgr24ToUV counterpart); unused is ignored. */
1765 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1768 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1772 // bilinear / bicubic scaling
/*
 * Horizontal scaling: for each output sample, a filterSize-tap dot
 * product of 8-bit source pixels with 16-bit coefficients, result
 * shifted right by 7 and stored as int16.  Three asm specializations:
 * filterSize == 4, filterSize == 8 (both fully unrolled per output,
 * using eax/ebx for the two filterPos lookups), and a generic inner-loop
 * version for any multiple of 4.  The counter runs from -2*dstW up to 0
 * (filterPos/dst pre-biased).  REG_b and REG_BP are saved/restored by
 * hand because both are consumed as scratch.
 * NOTE(review): loop labels/branches, #if guards and some constraint
 * lines are elided from this listing -- confirm against the full file.
 */
1773 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
1774 const int16_t *filter, const int16_t *filterPos, long filterSize)
1776 assert(filterSize % 4 == 0 && filterSize>0);
1777 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
1778 x86_reg counter= -2*dstW;
1780 filterPos-= counter/2;
1784 "push %%"REG_b" \n\t"
1786 "pxor %%mm7, %%mm7 \n\t"
1787 "push %%"REG_BP" \n\t" // we use 7 regs here ...
1788 "mov %%"REG_a", %%"REG_BP" \n\t"
/* Two output samples per iteration: positions in eax/ebx, 4 taps each. */
1791 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
1792 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1793 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
1794 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
1795 "movd (%3, %%"REG_a"), %%mm0 \n\t"
1796 "movd (%3, %%"REG_b"), %%mm2 \n\t"
1797 "punpcklbw %%mm7, %%mm0 \n\t"
1798 "punpcklbw %%mm7, %%mm2 \n\t"
1799 "pmaddwd %%mm1, %%mm0 \n\t"
1800 "pmaddwd %%mm2, %%mm3 \n\t"
1801 "movq %%mm0, %%mm4 \n\t"
1802 "punpckldq %%mm3, %%mm0 \n\t"
1803 "punpckhdq %%mm3, %%mm4 \n\t"
1804 "paddd %%mm4, %%mm0 \n\t"
1805 "psrad $7, %%mm0 \n\t"
1806 "packssdw %%mm0, %%mm0 \n\t"
1807 "movd %%mm0, (%4, %%"REG_BP") \n\t"
1808 "add $4, %%"REG_BP" \n\t"
1811 "pop %%"REG_BP" \n\t"
1813 "pop %%"REG_b" \n\t"
1816 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1821 } else if (filterSize==8) {
1822 x86_reg counter= -2*dstW;
1824 filterPos-= counter/2;
1828 "push %%"REG_b" \n\t"
1830 "pxor %%mm7, %%mm7 \n\t"
1831 "push %%"REG_BP" \n\t" // we use 7 regs here ...
1832 "mov %%"REG_a", %%"REG_BP" \n\t"
/* Same structure as the 4-tap case, with a second 4-tap partial sum. */
1835 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
1836 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1837 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
1838 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
1839 "movd (%3, %%"REG_a"), %%mm0 \n\t"
1840 "movd (%3, %%"REG_b"), %%mm2 \n\t"
1841 "punpcklbw %%mm7, %%mm0 \n\t"
1842 "punpcklbw %%mm7, %%mm2 \n\t"
1843 "pmaddwd %%mm1, %%mm0 \n\t"
1844 "pmaddwd %%mm2, %%mm3 \n\t"
1846 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
1847 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
1848 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
1849 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
1850 "punpcklbw %%mm7, %%mm4 \n\t"
1851 "punpcklbw %%mm7, %%mm2 \n\t"
1852 "pmaddwd %%mm1, %%mm4 \n\t"
1853 "pmaddwd %%mm2, %%mm5 \n\t"
1854 "paddd %%mm4, %%mm0 \n\t"
1855 "paddd %%mm5, %%mm3 \n\t"
1856 "movq %%mm0, %%mm4 \n\t"
1857 "punpckldq %%mm3, %%mm0 \n\t"
1858 "punpckhdq %%mm3, %%mm4 \n\t"
1859 "paddd %%mm4, %%mm0 \n\t"
1860 "psrad $7, %%mm0 \n\t"
1861 "packssdw %%mm0, %%mm0 \n\t"
1862 "movd %%mm0, (%4, %%"REG_BP") \n\t"
1863 "add $4, %%"REG_BP" \n\t"
1866 "pop %%"REG_BP" \n\t"
1868 "pop %%"REG_b" \n\t"
1871 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* Generic case: inner loop over the taps, 4 at a time, accumulating two
 * outputs in mm4/mm5; `offset` marks the end of one filter row. */
1877 const uint8_t *offset = src+filterSize;
1878 x86_reg counter= -2*dstW;
1879 //filter-= counter*filterSize/2;
1880 filterPos-= counter/2;
1883 "pxor %%mm7, %%mm7 \n\t"
1886 "mov %2, %%"REG_c" \n\t"
1887 "movzwl (%%"REG_c", %0), %%eax \n\t"
1888 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
1889 "mov %5, %%"REG_c" \n\t"
1890 "pxor %%mm4, %%mm4 \n\t"
1891 "pxor %%mm5, %%mm5 \n\t"
1893 "movq (%1), %%mm1 \n\t"
1894 "movq (%1, %6), %%mm3 \n\t"
1895 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
1896 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
1897 "punpcklbw %%mm7, %%mm0 \n\t"
1898 "punpcklbw %%mm7, %%mm2 \n\t"
1899 "pmaddwd %%mm1, %%mm0 \n\t"
1900 "pmaddwd %%mm2, %%mm3 \n\t"
1901 "paddd %%mm3, %%mm5 \n\t"
1902 "paddd %%mm0, %%mm4 \n\t"
1904 "add $4, %%"REG_c" \n\t"
1905 "cmp %4, %%"REG_c" \n\t"
1908 "movq %%mm4, %%mm0 \n\t"
1909 "punpckldq %%mm5, %%mm4 \n\t"
1910 "punpckhdq %%mm5, %%mm0 \n\t"
1911 "paddd %%mm0, %%mm4 \n\t"
1912 "psrad $7, %%mm4 \n\t"
1913 "packssdw %%mm4, %%mm4 \n\t"
1914 "mov %3, %%"REG_a" \n\t"
1915 "movd %%mm4, (%%"REG_a", %0) \n\t"
1919 : "+r" (counter), "+r" (filter)
1920 : "m" (filterPos), "m" (dst), "m"(offset),
1921 "m" (src), "r" ((x86_reg)filterSize*2)
1922 : "%"REG_a, "%"REG_c, "%"REG_d
1927 #if COMPILE_TEMPLATE_MMX2
/*
 * MMX2 fast horizontal luma scaling: executes runtime-generated filter
 * code (c->lumMmx2FilterCode) via CALL_MMX2_FILTER_CODE, which loads the
 * next code address from the filterPos-derived table in REG_b and jumps
 * into it.  REG_b is saved to the `ebxsave` stack slot around the asm
 * because it is consumed as the table pointer.  After the asm, the C tail
 * fills output samples whose source position would read past srcW-1.
 * NOTE(review): the #if ARCH_X86_64 guards, the call instruction inside
 * CALL_MMX2_FILTER_CODE, the constraint tails and the canMMX2BeUsed
 * fallback path are on elided lines -- confirm against the full file.
 */
1928 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
1929 long dstWidth, const uint8_t *src, int srcW,
1932 int32_t *filterPos = c->hLumFilterPos;
1933 int16_t *filter = c->hLumFilter;
1934 int canMMX2BeUsed = c->canMMX2BeUsed;
1935 void *mmx2FilterCode= c->lumMmx2FilterCode;
1938 DECLARE_ALIGNED(8, uint64_t, ebxsave);
1943 "mov %%"REG_b", %5 \n\t"
1945 "pxor %%mm7, %%mm7 \n\t"
1946 "mov %0, %%"REG_c" \n\t"
1947 "mov %1, %%"REG_D" \n\t"
1948 "mov %2, %%"REG_d" \n\t"
1949 "mov %3, %%"REG_b" \n\t"
1950 "xor %%"REG_a", %%"REG_a" \n\t" // i
1951 PREFETCH" (%%"REG_c") \n\t"
1952 PREFETCH" 32(%%"REG_c") \n\t"
1953 PREFETCH" 64(%%"REG_c") \n\t"
1957 #define CALL_MMX2_FILTER_CODE \
1958 "movl (%%"REG_b"), %%esi \n\t"\
1960 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
1961 "add %%"REG_S", %%"REG_c" \n\t"\
1962 "add %%"REG_a", %%"REG_D" \n\t"\
1963 "xor %%"REG_a", %%"REG_a" \n\t"\
1967 #define CALL_MMX2_FILTER_CODE \
1968 "movl (%%"REG_b"), %%esi \n\t"\
1970 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
1971 "add %%"REG_a", %%"REG_D" \n\t"\
1972 "xor %%"REG_a", %%"REG_a" \n\t"\
1974 #endif /* ARCH_X86_64 */
1976 CALL_MMX2_FILTER_CODE
1977 CALL_MMX2_FILTER_CODE
1978 CALL_MMX2_FILTER_CODE
1979 CALL_MMX2_FILTER_CODE
1980 CALL_MMX2_FILTER_CODE
1981 CALL_MMX2_FILTER_CODE
1982 CALL_MMX2_FILTER_CODE
1983 CALL_MMX2_FILTER_CODE
1986 "mov %5, %%"REG_b" \n\t"
1988 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
1989 "m" (mmx2FilterCode)
1993 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* C tail: replicate the last source pixel for out-of-range positions. */
1998 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
/*
 * MMX2 fast horizontal chroma scaling: same runtime-generated-code scheme
 * as hyscale_fast, run twice -- once for src1 into dst, once for src2
 * into dst + VOF (the V half of the shared chroma buffer).  REG_b is
 * spilled to `ebxsave` around the asm.  The C tail replicates the last
 * source pixel of each plane for out-of-range positions.
 * NOTE(review): the #if guards, constraint tails and the canMMX2BeUsed
 * fallback path are on elided lines -- confirm against the full file.
 */
2001 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2002 long dstWidth, const uint8_t *src1,
2003 const uint8_t *src2, int srcW, int xInc)
2005 int32_t *filterPos = c->hChrFilterPos;
2006 int16_t *filter = c->hChrFilter;
2007 int canMMX2BeUsed = c->canMMX2BeUsed;
2008 void *mmx2FilterCode= c->chrMmx2FilterCode;
2011 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2016 "mov %%"REG_b", %6 \n\t"
2018 "pxor %%mm7, %%mm7 \n\t"
2019 "mov %0, %%"REG_c" \n\t"
2020 "mov %1, %%"REG_D" \n\t"
2021 "mov %2, %%"REG_d" \n\t"
2022 "mov %3, %%"REG_b" \n\t"
2023 "xor %%"REG_a", %%"REG_a" \n\t" // i
2024 PREFETCH" (%%"REG_c") \n\t"
2025 PREFETCH" 32(%%"REG_c") \n\t"
2026 PREFETCH" 64(%%"REG_c") \n\t"
/* First pass: src1 -> dst (U half). */
2028 CALL_MMX2_FILTER_CODE
2029 CALL_MMX2_FILTER_CODE
2030 CALL_MMX2_FILTER_CODE
2031 CALL_MMX2_FILTER_CODE
/* Second pass: reset index, switch to src2 and dst + VOF (V half). */
2032 "xor %%"REG_a", %%"REG_a" \n\t" // i
2033 "mov %5, %%"REG_c" \n\t" // src
2034 "mov %1, %%"REG_D" \n\t" // buf1
2035 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2036 PREFETCH" (%%"REG_c") \n\t"
2037 PREFETCH" 32(%%"REG_c") \n\t"
2038 PREFETCH" 64(%%"REG_c") \n\t"
2040 CALL_MMX2_FILTER_CODE
2041 CALL_MMX2_FILTER_CODE
2042 CALL_MMX2_FILTER_CODE
2043 CALL_MMX2_FILTER_CODE
2046 "mov %6, %%"REG_b" \n\t"
2048 :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
2049 "m" (mmx2FilterCode), "m" (src2)
2053 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2058 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2059 dst[i] = src1[srcW-1]*128;
2060 dst[i+VOFW] = src2[srcW-1]*128;
2063 #endif /* COMPILE_TEMPLATE_MMX2 */
2065 #if !COMPILE_TEMPLATE_MMX2
/* Per-output-line setup for the MMX vertical scaling path.
 *
 * Selects the dither constants for this output line and packs the vertical
 * filter coefficients together with the source-row pointers into the
 * lumMmxFilter / chrMmxFilter / alpMmxFilter tables that the inline-asm
 * vertical-scale loops read.
 *
 * dstY                      - index of the output luma line being produced
 * lumBufIndex, chrBufIndex  - current write positions in the line ring buffers
 * lastInLumBuf, lastInChrBuf- last source lines present in the ring buffers
 *
 * NOTE(review): this listing is elided (some else/brace lines between the
 * embedded original line numbers are not shown); comments below describe only
 * what the visible code establishes.
 */
2066 static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
2067 int lastInLumBuf, int lastInChrBuf)
2069 const int dstH= c->dstH;
2070 const int flags= c->flags;
// ring buffers of horizontally scaled input lines (luma / chroma / alpha)
2071 int16_t **lumPixBuf= c->lumPixBuf;
2072 int16_t **chrPixBuf= c->chrPixBuf;
2073 int16_t **alpPixBuf= c->alpPixBuf;
2074 const int vLumBufSize= c->vLumBufSize;
2075 const int vChrBufSize= c->vChrBufSize;
// vertical filter: per-output-line source start positions and 16-bit coefficients
2076 int16_t *vLumFilterPos= c->vLumFilterPos;
2077 int16_t *vChrFilterPos= c->vChrFilterPos;
2078 int16_t *vLumFilter= c->vLumFilter;
2079 int16_t *vChrFilter= c->vChrFilter;
// packed pointer+coefficient tables consumed by the MMX inline assembly
2080 int32_t *lumMmxFilter= c->lumMmxFilter;
2081 int32_t *chrMmxFilter= c->chrMmxFilter;
2082 int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2083 const int vLumFilterSize= c->vLumFilterSize;
2084 const int vChrFilterSize= c->vChrFilterSize;
// chroma output line corresponding to dstY (vertical subsampling)
2085 const int chrDstY= dstY>>c->chrDstVSubSample;
2086 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2087 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
// alternate the dither patterns between even and odd output lines;
// RGB555/BGR555 have 5-bit green, so green gets the coarser 8-level dither there
2089 c->blueDither= ff_dither8[dstY&1];
2090 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2091 c->greenDither= ff_dither8[dstY&1];
// NOTE(review): an 'else' (original line 2092) is elided from this listing;
// non-555 formats use the finer 4-level green dither below.
2093 c->greenDither= ff_dither4[dstY&1];
2094 c->redDither= ff_dither8[(dstY+1)&1];
2095 if (dstY < dstH - 2) {
// point at the first ring-buffer line each plane needs; the "+ vXxxBufSize"
// bias keeps the index non-negative before the asm wraps around the ring
2096 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2097 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2098 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
// accurate-rounding layout: two filter taps per table entry — two source-row
// pointers plus both 16-bit coefficients merged into one 32-bit word
2100 if (flags & SWS_ACCURATE_RND) {
2101 int s= APCK_SIZE / 8;
2102 for (i=0; i<vLumFilterSize; i+=2) {
2103 *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
// (vLumFilterSize>1) avoids reading past the array when there is a single tap
2104 *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
// low 16 bits = coefficient i, high 16 bits = coefficient i+1 (if any),
// replicated into two consecutive 32-bit slots
2105 lumMmxFilter[s*i+APCK_COEF/4 ]=
2106 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2107 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2108 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
// alpha reuses the luma coefficients, only the row pointers differ
2109 *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2110 *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2111 alpMmxFilter[s*i+APCK_COEF/4 ]=
2112 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
// same two-tap packing for chroma, indexed by the subsampled chrDstY
2115 for (i=0; i<vChrFilterSize; i+=2) {
2116 *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2117 *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2118 chrMmxFilter[s*i+APCK_COEF/4 ]=
2119 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2120 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
// default layout: one tap per 4-int entry — source-row pointer split into
// low/high 32-bit halves (works on both 32- and 64-bit hosts), then the
// 16-bit coefficient replicated into all four 16-bit lanes via *0x10001
2123 for (i=0; i<vLumFilterSize; i++) {
2124 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2125 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2126 lumMmxFilter[4*i+2]=
2127 lumMmxFilter[4*i+3]=
2128 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2129 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
// alpha shares the luma coefficient, only the row pointer differs
2130 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2131 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2132 alpMmxFilter[4*i+2]=
2133 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2136 for (i=0; i<vChrFilterSize; i++) {
2137 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2138 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2139 chrMmxFilter[4*i+2]=
2140 chrMmxFilter[4*i+3]=
2141 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2146 #endif /* !COMPILE_TEMPLATE_MMX2 */
2148 static void RENAME(sws_init_swScale)(SwsContext *c)
2150 enum PixelFormat srcFormat = c->srcFormat;
2152 if (!(c->flags & SWS_BITEXACT)) {
2153 if (c->flags & SWS_ACCURATE_RND) {
2154 c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
2155 c->yuv2yuvX = RENAME(yuv2yuvX_ar );
2156 c->yuv2packedX = RENAME(yuv2packedX_ar );
2158 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2159 c->yuv2yuvX = RENAME(yuv2yuvX );
2160 c->yuv2packedX = RENAME(yuv2packedX );
2162 c->yuv2packed1 = RENAME(yuv2packed1 );
2163 c->yuv2packed2 = RENAME(yuv2packed2 );
2166 c->hScale = RENAME(hScale );
2168 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2169 #if COMPILE_TEMPLATE_MMX2
2170 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2172 c->hyscale_fast = RENAME(hyscale_fast);
2173 c->hcscale_fast = RENAME(hcscale_fast);
2175 #endif /* COMPILE_TEMPLATE_MMX2 */
2176 c->hyscale_fast = NULL;
2177 c->hcscale_fast = NULL;
2178 #if COMPILE_TEMPLATE_MMX2
2180 #endif /* COMPILE_TEMPLATE_MMX2 */
2183 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
2184 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
2185 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
2186 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
2187 case PIX_FMT_YUV420P16BE:
2188 case PIX_FMT_YUV422P16BE:
2189 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2190 case PIX_FMT_YUV420P16LE:
2191 case PIX_FMT_YUV422P16LE:
2192 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2195 if (!c->chrSrcHSubSample) {
2197 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
2198 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
2203 switch (srcFormat) {
2204 case PIX_FMT_YUYV422 :
2205 case PIX_FMT_YUV420P16BE:
2206 case PIX_FMT_YUV422P16BE:
2207 case PIX_FMT_YUV444P16BE:
2208 case PIX_FMT_Y400A :
2209 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
2210 case PIX_FMT_UYVY422 :
2211 case PIX_FMT_YUV420P16LE:
2212 case PIX_FMT_YUV422P16LE:
2213 case PIX_FMT_YUV444P16LE:
2214 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
2215 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
2216 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
2220 switch (srcFormat) {
2221 case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;