2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
/* CPU-capability string macros used inside inline-asm templates:
 * PREFETCH/PREFETCHW (3DNow vs. MMX2 vs. no-op), SFENCE, PAVGB
 * (pavgb vs. pavgusb) and MOVNTQ (non-temporal vs. plain movq).
 * NOTE(review): several #ifdef/#else/#endif lines are missing from
 * this extraction (the embedded line numbers jump); the conditional
 * structure must be restored from the original source before this
 * compiles — do not trust the #elif lines in isolation. */
35 #define PREFETCH "prefetch"
36 #define PREFETCHW "prefetchw"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
46 #define SFENCE "sfence"
52 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53 #elif defined (HAVE_3DNOW)
54 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
58 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
60 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
62 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
65 #include "swscale_altivec_template.c"
/* YSCALEYUV2YV12X(x, offset): vertical multi-tap filter for one plane.
 * Walks a NULL-terminated (filterCoeff, srcRow) list starting at
 * offset(%0), accumulating pmulhw products into a rounder-initialized
 * pair of accumulators (mm3/mm4), then >>3, packs to unsigned bytes and
 * stores 8 pixels via MOVNTQ. %0=context, %1=dest, %2=width, x=byte
 * offset into each source row (0 for U, 4096 for V — see callers).
 * NOTE(review): the asm loop labels and branch instructions are missing
 * from this extraction (gaps at original lines 75, 86, 97-98) —
 * confirm against the original source. */
68 #define YSCALEYUV2YV12X(x, offset) \
69 "xor %%"REG_a", %%"REG_a" \n\t"\
70 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71 "movq %%mm3, %%mm4 \n\t"\
72 "lea " offset "(%0), %%"REG_d" \n\t"\
73 "mov (%%"REG_d"), %%"REG_S" \n\t"\
74 ".balign 16 \n\t" /* FIXME Unroll? */\
76 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
77 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79 "add $16, %%"REG_d" \n\t"\
80 "mov (%%"REG_d"), %%"REG_S" \n\t"\
81 "test %%"REG_S", %%"REG_S" \n\t"\
82 "pmulhw %%mm0, %%mm2 \n\t"\
83 "pmulhw %%mm0, %%mm5 \n\t"\
84 "paddw %%mm2, %%mm3 \n\t"\
85 "paddw %%mm5, %%mm4 \n\t"\
87 "psraw $3, %%mm3 \n\t"\
88 "psraw $3, %%mm4 \n\t"\
89 "packuswb %%mm4, %%mm3 \n\t"\
90 MOVNTQ(%%mm3, (%1, %%REGa))\
91 "add $8, %%"REG_a" \n\t"\
92 "cmp %2, %%"REG_a" \n\t"\
93 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94 "movq %%mm3, %%mm4 \n\t"\
95 "lea " offset "(%0), %%"REG_d" \n\t"\
96 "mov (%%"REG_d"), %%"REG_S" \n\t"\
/* YSCALEYUV2YV121: trivial 1-tap vertical scale — reads 16-bit samples
 * from %0, >>7 to 8-bit range, packs and stores 8 bytes at a time into
 * %1, indexed from -%2 (negative count) up to zero.
 * NOTE(review): loop label and branch lines are missing from this
 * extraction — confirm against the original source. */
99 #define YSCALEYUV2YV121 \
100 "mov %2, %%"REG_a" \n\t"\
101 ".balign 16 \n\t" /* FIXME Unroll? */\
103 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
105 "psraw $7, %%mm0 \n\t"\
106 "psraw $7, %%mm1 \n\t"\
107 "packuswb %%mm1, %%mm0 \n\t"\
108 MOVNTQ(%%mm0, (%1, %%REGa))\
109 "add $8, %%"REG_a" \n\t"\
/* NOTE(review): the lines below look like a detached asm operand/clobber
 * list (inputs and clobbered registers) — in the original source they sit
 * inside a block comment / a different asm statement whose surrounding
 * lines are missing here. Verify against the original before editing. */
113 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
114 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115 "r" (dest), "m" (dstW),
116 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* YSCALEYUV2PACKEDX: vertical multi-tap filtering of both chroma
 * (U at offset 0, V at +4096 within each chroma row) and luma (two
 * adjacent 8-pixel halves) for the packed-output paths. Results land
 * in mm3/mm4 (chroma) and mm1/mm7 (luma) for a following YSCALEYUV2RGBX
 * or WRITEYUY2 stage. %0 = context pointer (c->redDither base).
 * NOTE(review): loop labels/branches between the chroma and luma loops
 * are missing from this extraction — confirm against the original. */
119 #define YSCALEYUV2PACKEDX \
120 "xor %%"REG_a", %%"REG_a" \n\t"\
124 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125 "mov (%%"REG_d"), %%"REG_S" \n\t"\
126 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127 "movq %%mm3, %%mm4 \n\t"\
130 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
131 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
132 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133 "add $16, %%"REG_d" \n\t"\
134 "mov (%%"REG_d"), %%"REG_S" \n\t"\
135 "pmulhw %%mm0, %%mm2 \n\t"\
136 "pmulhw %%mm0, %%mm5 \n\t"\
137 "paddw %%mm2, %%mm3 \n\t"\
138 "paddw %%mm5, %%mm4 \n\t"\
139 "test %%"REG_S", %%"REG_S" \n\t"\
142 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143 "mov (%%"REG_d"), %%"REG_S" \n\t"\
144 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145 "movq %%mm1, %%mm7 \n\t"\
148 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
149 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
150 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151 "add $16, %%"REG_d" \n\t"\
152 "mov (%%"REG_d"), %%"REG_S" \n\t"\
153 "pmulhw %%mm0, %%mm2 \n\t"\
154 "pmulhw %%mm0, %%mm5 \n\t"\
155 "paddw %%mm2, %%mm1 \n\t"\
156 "paddw %%mm5, %%mm7 \n\t"\
157 "test %%"REG_S", %%"REG_S" \n\t"\
/* YSCALEYUV2RGBX: YUV -> RGB color-space conversion stage for the
 * packedX path. Takes chroma in mm3/mm4 and luma in mm1/mm7 (as left
 * by YSCALEYUV2PACKEDX), subtracts the U/V/Y offsets, scales by the
 * per-context coefficients, interleaves the two 4-pixel halves and
 * packs to bytes: on exit mm2=B, mm4=G, mm5=R (8 pixels), mm7=0.
 * Offsets/coefficients are read relative to the context pointer %0. */
161 #define YSCALEYUV2RGBX \
163 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
164 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
165 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
166 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
167 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
168 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
169 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
170 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
171 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
172 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
173 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
174 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
175 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
176 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177 "paddw %%mm3, %%mm4 \n\t"\
178 "movq %%mm2, %%mm0 \n\t"\
179 "movq %%mm5, %%mm6 \n\t"\
180 "movq %%mm4, %%mm3 \n\t"\
181 "punpcklwd %%mm2, %%mm2 \n\t"\
182 "punpcklwd %%mm5, %%mm5 \n\t"\
183 "punpcklwd %%mm4, %%mm4 \n\t"\
184 "paddw %%mm1, %%mm2 \n\t"\
185 "paddw %%mm1, %%mm5 \n\t"\
186 "paddw %%mm1, %%mm4 \n\t"\
187 "punpckhwd %%mm0, %%mm0 \n\t"\
188 "punpckhwd %%mm6, %%mm6 \n\t"\
189 "punpckhwd %%mm3, %%mm3 \n\t"\
190 "paddw %%mm7, %%mm0 \n\t"\
191 "paddw %%mm7, %%mm6 \n\t"\
192 "paddw %%mm7, %%mm3 \n\t"\
193 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194 "packuswb %%mm0, %%mm2 \n\t"\
195 "packuswb %%mm6, %%mm5 \n\t"\
196 "packuswb %%mm3, %%mm4 \n\t"\
197 "pxor %%mm7, %%mm7 \n\t"
/* FULL_YSCALEYUV2RGB: full-chroma-interpolation bilinear path — blends
 * two luma rows (%0/%1, weight %6=yalpha1) and two chroma rows (%2/%3,
 * weight %7=uvalpha1), then converts to R/G/B using the MANGLE()d
 * global coefficient constants instead of per-context offsets.
 * Produces packed B in mm3, R in mm0, G in mm1 (each duplicated low).
 * NOTE(review): loop label and some interleaved lines (orig. 208-209,
 * 228-229, 237-238, 245, 249) are missing from this extraction. */
199 #define FULL_YSCALEYUV2RGB \
200 "pxor %%mm7, %%mm7 \n\t"\
201 "movd %6, %%mm6 \n\t" /*yalpha1*/\
202 "punpcklwd %%mm6, %%mm6 \n\t"\
203 "punpcklwd %%mm6, %%mm6 \n\t"\
204 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
205 "punpcklwd %%mm5, %%mm5 \n\t"\
206 "punpcklwd %%mm5, %%mm5 \n\t"\
207 "xor %%"REG_a", %%"REG_a" \n\t"\
210 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
213 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
214 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
215 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
219 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
220 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
222 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
223 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
225 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
226 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
227 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
230 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
232 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
233 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
234 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
235 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
236 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
239 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
240 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
242 "paddw %%mm1, %%mm3 \n\t" /* B*/\
243 "paddw %%mm1, %%mm0 \n\t" /* R*/\
244 "packuswb %%mm3, %%mm3 \n\t"\
246 "packuswb %%mm0, %%mm0 \n\t"\
247 "paddw %%mm4, %%mm2 \n\t"\
248 "paddw %%mm2, %%mm1 \n\t" /* G*/\
250 "packuswb %%mm1, %%mm1 \n\t"
/* REAL_YSCALEYUV2PACKED(index, c): 2-row bilinear vertical blend for
 * packed-YUV output (no RGB conversion). Pre-shifts the stored filter
 * weights by 3, blends chroma rows %2/%3 into mm3/mm4 (>>7 range) and
 * luma rows %0/%1 into mm1/mm7. "c" is the context pointer register,
 * "index" the x counter. NOTE(review): loop label lines (orig. 261-262)
 * are missing from this extraction. */
253 #define REAL_YSCALEYUV2PACKED(index, c) \
254 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256 "psraw $3, %%mm0 \n\t"\
257 "psraw $3, %%mm1 \n\t"\
258 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
260 "xor "#index", "#index" \n\t"\
263 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
264 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
265 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
267 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
269 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
270 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
273 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
274 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
275 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
276 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
277 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
278 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
280 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
281 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
282 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
284 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
285 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
286 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
289 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* REAL_YSCALEYUV2RGB(index, c): 2-row bilinear vertical blend plus
 * YUV->RGB conversion. Blends chroma rows %2/%3 and luma rows %0/%1
 * with the weights stored in the context "c", then applies the same
 * offset/coefficient pipeline as YSCALEYUV2RGBX. On exit mm2=B,
 * mm4=G, mm5=R (8 packed pixels), mm7=0.
 * NOTE(review): loop label lines (orig. 293-294) are missing from this
 * extraction. */
291 #define REAL_YSCALEYUV2RGB(index, c) \
292 "xor "#index", "#index" \n\t"\
295 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
296 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
297 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
299 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
301 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
302 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
307 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
308 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
309 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
310 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
311 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
312 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
314 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
316 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
317 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
319 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
320 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
321 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
323 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
327 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
330 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
331 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
332 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
333 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334 "paddw %%mm3, %%mm4 \n\t"\
335 "movq %%mm2, %%mm0 \n\t"\
336 "movq %%mm5, %%mm6 \n\t"\
337 "movq %%mm4, %%mm3 \n\t"\
338 "punpcklwd %%mm2, %%mm2 \n\t"\
339 "punpcklwd %%mm5, %%mm5 \n\t"\
340 "punpcklwd %%mm4, %%mm4 \n\t"\
341 "paddw %%mm1, %%mm2 \n\t"\
342 "paddw %%mm1, %%mm5 \n\t"\
343 "paddw %%mm1, %%mm4 \n\t"\
344 "punpckhwd %%mm0, %%mm0 \n\t"\
345 "punpckhwd %%mm6, %%mm6 \n\t"\
346 "punpckhwd %%mm3, %%mm3 \n\t"\
347 "paddw %%mm7, %%mm0 \n\t"\
348 "paddw %%mm7, %%mm6 \n\t"\
349 "paddw %%mm7, %%mm3 \n\t"\
350 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351 "packuswb %%mm0, %%mm2 \n\t"\
352 "packuswb %%mm6, %%mm5 \n\t"\
353 "packuswb %%mm3, %%mm4 \n\t"\
354 "pxor %%mm7, %%mm7 \n\t"
355 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/* REAL_YSCALEYUV2PACKED1(index, c): 1-row ("unscaled") variant of the
 * packed-YUV blend — reads a single chroma row (%2, V at +4096) and a
 * single luma row (%0), shifting each >>7 into output range; no
 * interpolation. NOTE(review): loop label lines (orig. 359-360) are
 * missing from this extraction. */
357 #define REAL_YSCALEYUV2PACKED1(index, c) \
358 "xor "#index", "#index" \n\t"\
361 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
362 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
363 "psraw $7, %%mm3 \n\t" \
364 "psraw $7, %%mm4 \n\t" \
365 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
366 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
367 "psraw $7, %%mm1 \n\t" \
368 "psraw $7, %%mm7 \n\t" \
370 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* REAL_YSCALEYUV2RGB1(index, c): 1-row YUV->RGB conversion (no vertical
 * interpolation) — single chroma row %2 and single luma row %0 are
 * shifted >>4 and pushed through the same offset/coefficient pipeline
 * as REAL_YSCALEYUV2RGB. On exit mm2=B, mm4=G, mm5=R, mm7=0.
 * NOTE(review): loop label lines (orig. 374-375) are missing from this
 * extraction. */
372 #define REAL_YSCALEYUV2RGB1(index, c) \
373 "xor "#index", "#index" \n\t"\
376 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
377 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
378 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
380 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
381 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
382 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
383 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
384 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
386 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
387 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
388 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
389 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
394 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
395 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
396 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
397 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398 "paddw %%mm3, %%mm4 \n\t"\
399 "movq %%mm2, %%mm0 \n\t"\
400 "movq %%mm5, %%mm6 \n\t"\
401 "movq %%mm4, %%mm3 \n\t"\
402 "punpcklwd %%mm2, %%mm2 \n\t"\
403 "punpcklwd %%mm5, %%mm5 \n\t"\
404 "punpcklwd %%mm4, %%mm4 \n\t"\
405 "paddw %%mm1, %%mm2 \n\t"\
406 "paddw %%mm1, %%mm5 \n\t"\
407 "paddw %%mm1, %%mm4 \n\t"\
408 "punpckhwd %%mm0, %%mm0 \n\t"\
409 "punpckhwd %%mm6, %%mm6 \n\t"\
410 "punpckhwd %%mm3, %%mm3 \n\t"\
411 "paddw %%mm7, %%mm0 \n\t"\
412 "paddw %%mm7, %%mm6 \n\t"\
413 "paddw %%mm7, %%mm3 \n\t"\
414 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415 "packuswb %%mm0, %%mm2 \n\t"\
416 "packuswb %%mm6, %%mm5 \n\t"\
417 "packuswb %%mm3, %%mm4 \n\t"\
418 "pxor %%mm7, %%mm7 \n\t"
419 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* REAL_YSCALEYUV2PACKED1b(index, c): like YSCALEYUV2PACKED1 but averages
 * the two chroma rows %2/%3 (paddw then >>8) — used when chroma must be
 * vertically interpolated with equal weights; luma stays 1-row (>>7).
 * NOTE(review): loop label lines (orig. 423-424) are missing from this
 * extraction. */
421 #define REAL_YSCALEYUV2PACKED1b(index, c) \
422 "xor "#index", "#index" \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431 "psrlw $8, %%mm3 \n\t" \
432 "psrlw $8, %%mm4 \n\t" \
433 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
434 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
435 "psraw $7, %%mm1 \n\t" \
436 "psraw $7, %%mm7 \n\t"
437 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
439 // do vertical chrominance interpolation
/* REAL_YSCALEYUV2RGB1b(index, c): 1-row luma + averaged 2-row chroma
 * YUV->RGB conversion (chroma rows %2/%3 summed then >>5, see FIXME
 * about possible overflow). Output register layout matches the other
 * RGB variants: mm2=B, mm4=G, mm5=R, mm7=0.
 * NOTE(review): loop label lines (orig. 442-443) are missing from this
 * extraction. */
440 #define REAL_YSCALEYUV2RGB1b(index, c) \
441 "xor "#index", "#index" \n\t"\
444 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
445 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
446 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
448 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
450 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
451 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
452 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
453 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
454 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
455 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
456 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
458 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
459 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
460 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
461 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
463 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
466 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
467 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
468 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
469 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470 "paddw %%mm3, %%mm4 \n\t"\
471 "movq %%mm2, %%mm0 \n\t"\
472 "movq %%mm5, %%mm6 \n\t"\
473 "movq %%mm4, %%mm3 \n\t"\
474 "punpcklwd %%mm2, %%mm2 \n\t"\
475 "punpcklwd %%mm5, %%mm5 \n\t"\
476 "punpcklwd %%mm4, %%mm4 \n\t"\
477 "paddw %%mm1, %%mm2 \n\t"\
478 "paddw %%mm1, %%mm5 \n\t"\
479 "paddw %%mm1, %%mm4 \n\t"\
480 "punpckhwd %%mm0, %%mm0 \n\t"\
481 "punpckhwd %%mm6, %%mm6 \n\t"\
482 "punpckhwd %%mm3, %%mm3 \n\t"\
483 "paddw %%mm7, %%mm0 \n\t"\
484 "paddw %%mm7, %%mm6 \n\t"\
485 "paddw %%mm7, %%mm3 \n\t"\
486 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487 "packuswb %%mm0, %%mm2 \n\t"\
488 "packuswb %%mm6, %%mm5 \n\t"\
489 "packuswb %%mm3, %%mm4 \n\t"\
490 "pxor %%mm7, %%mm7 \n\t"
491 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* REAL_WRITEBGR32(dst, dstw, index): interleave mm2=B, mm4=G, mm5=R
 * (mm7 must be 0) into four 0RGB dwords per quad and store 32 bytes
 * (8 pixels x 4 bpp) with MOVNTQ, then advance index by 8 and compare
 * against dstw. NOTE(review): the branch instruction after the cmp
 * (orig. line 515) is missing from this extraction. */
493 #define REAL_WRITEBGR32(dst, dstw, index) \
494 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495 "movq %%mm2, %%mm1 \n\t" /* B */\
496 "movq %%mm5, %%mm6 \n\t" /* R */\
497 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
498 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
499 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
500 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
501 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
502 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
503 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
504 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
505 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
506 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
508 MOVNTQ(%%mm0, (dst, index, 4))\
509 MOVNTQ(%%mm2, 8(dst, index, 4))\
510 MOVNTQ(%%mm1, 16(dst, index, 4))\
511 MOVNTQ(%%mm3, 24(dst, index, 4))\
513 "add $8, "#index" \n\t"\
514 "cmp "#dstw", "#index" \n\t"\
516 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/* REAL_WRITEBGR16(dst, dstw, index): pack mm2=B, mm4=G, mm5=R into
 * RGB565 — mask to 5/6/5 significant bits, shift into field positions
 * and OR together, storing 16 bytes (8 pixels x 2 bpp).
 * NOTE(review): interleaved shift lines (orig. 523, 526, 531, 534, 537)
 * and the loop branch are missing from this extraction — the visible
 * shifts alone do not complete the 565 packing; confirm against the
 * original source. */
518 #define REAL_WRITEBGR16(dst, dstw, index) \
519 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
520 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
521 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
522 "psrlq $3, %%mm2 \n\t"\
524 "movq %%mm2, %%mm1 \n\t"\
525 "movq %%mm4, %%mm3 \n\t"\
527 "punpcklbw %%mm7, %%mm3 \n\t"\
528 "punpcklbw %%mm5, %%mm2 \n\t"\
529 "punpckhbw %%mm7, %%mm4 \n\t"\
530 "punpckhbw %%mm5, %%mm1 \n\t"\
532 "psllq $3, %%mm3 \n\t"\
533 "psllq $3, %%mm4 \n\t"\
535 "por %%mm3, %%mm2 \n\t"\
536 "por %%mm4, %%mm1 \n\t"\
538 MOVNTQ(%%mm2, (dst, index, 2))\
539 MOVNTQ(%%mm1, 8(dst, index, 2))\
541 "add $8, "#index" \n\t"\
542 "cmp "#dstw", "#index" \n\t"\
544 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
/* REAL_WRITEBGR15(dst, dstw, index): pack mm2=B, mm4=G, mm5=R into
 * RGB555 (5/5/5) — same structure as WRITEBGR16 but G masked to 5 bits
 * and R pre-shifted right by 1. Stores 16 bytes (8 pixels x 2 bpp).
 * NOTE(review): interior lines (orig. 552, 555, 560, 563, 566) and the
 * loop branch are missing from this extraction — confirm against the
 * original source. */
546 #define REAL_WRITEBGR15(dst, dstw, index) \
547 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
548 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
549 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
550 "psrlq $3, %%mm2 \n\t"\
551 "psrlq $1, %%mm5 \n\t"\
553 "movq %%mm2, %%mm1 \n\t"\
554 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklbw %%mm7, %%mm3 \n\t"\
557 "punpcklbw %%mm5, %%mm2 \n\t"\
558 "punpckhbw %%mm7, %%mm4 \n\t"\
559 "punpckhbw %%mm5, %%mm1 \n\t"\
561 "psllq $2, %%mm3 \n\t"\
562 "psllq $2, %%mm4 \n\t"\
564 "por %%mm3, %%mm2 \n\t"\
565 "por %%mm4, %%mm1 \n\t"\
567 MOVNTQ(%%mm2, (dst, index, 2))\
568 MOVNTQ(%%mm1, 8(dst, index, 2))\
570 "add $8, "#index" \n\t"\
571 "cmp "#dstw", "#index" \n\t"\
573 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
/* WRITEBGR24OLD(dst, dstw, index): legacy 24-bit packer — first builds
 * four 0RGB dwords (like WRITEBGR32), then shifts/masks the padding
 * byte out of each dword to produce three tightly packed 8-byte stores
 * (24 bytes = 8 pixels x 3 bpp). Kept for reference; superseded by
 * WRITEBGR24MMX/WRITEBGR24MMX2 below.
 * NOTE(review): the loop branch after the cmp (orig. 629-630) is
 * missing from this extraction. */
575 #define WRITEBGR24OLD(dst, dstw, index) \
576 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577 "movq %%mm2, %%mm1 \n\t" /* B */\
578 "movq %%mm5, %%mm6 \n\t" /* R */\
579 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
580 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
581 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
582 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
583 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
584 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
585 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
586 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
587 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
588 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
590 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
591 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
592 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
594 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
595 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
596 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
597 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
599 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
600 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
601 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
602 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
603 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
604 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
605 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
606 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
608 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
609 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
610 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
611 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
613 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
614 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
615 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
616 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
618 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
619 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
620 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
622 MOVNTQ(%%mm0, (dst))\
623 MOVNTQ(%%mm2, 8(dst))\
624 MOVNTQ(%%mm3, 16(dst))\
625 "add $24, "#dst" \n\t"\
627 "add $8, "#index" \n\t"\
628 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24MMX(dst, dstw, index): plain-MMX 24-bit packer — builds
 * four 0RGBRGB0 quads via punpckhdq after pre-shifting, then shifts and
 * ORs adjacent quads to emit three contiguous 8-byte stores (24 bytes =
 * 8 pixels x 3 bpp); advances dst by 24 and index by 8.
 * NOTE(review): the loop branch after the cmp (orig. 682-683) is
 * missing from this extraction. */
631 #define WRITEBGR24MMX(dst, dstw, index) \
632 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633 "movq %%mm2, %%mm1 \n\t" /* B */\
634 "movq %%mm5, %%mm6 \n\t" /* R */\
635 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
636 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
637 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
638 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
639 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
640 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
641 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
642 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
643 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
644 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
646 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
647 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
648 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
649 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
651 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
652 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
653 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
654 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
656 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
657 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
658 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
659 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
661 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
662 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
663 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
664 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
665 MOVNTQ(%%mm0, (dst))\
667 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
668 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
669 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
670 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
671 MOVNTQ(%%mm6, 8(dst))\
673 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
674 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
675 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
676 MOVNTQ(%%mm5, 16(dst))\
678 "add $24, "#dst" \n\t"\
680 "add $8, "#index" \n\t"\
681 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24MMX2(dst, dstw, index): MMX2 24-bit packer using pshufw to
 * replicate/select B, G and R bytes, then masking with the M24A/M24B/
 * M24C constants and ORing into three 8-byte groups (24 bytes = 8
 * pixels x 3 bpp). Faster than WRITEBGR24MMX where pshufw exists.
 * NOTE(review): the loop branch after the cmp (orig. 730-731) is
 * missing from this extraction. */
684 #define WRITEBGR24MMX2(dst, dstw, index) \
685 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686 "movq "MANGLE(M24A)", %%mm0 \n\t"\
687 "movq "MANGLE(M24C)", %%mm7 \n\t"\
688 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
689 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
690 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
692 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
693 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
694 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
696 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
697 "por %%mm1, %%mm6 \n\t"\
698 "por %%mm3, %%mm6 \n\t"\
699 MOVNTQ(%%mm6, (dst))\
701 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
702 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
703 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
704 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
706 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
707 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
708 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
710 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
711 "por %%mm3, %%mm6 \n\t"\
712 MOVNTQ(%%mm6, 8(dst))\
714 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
715 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
716 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
718 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
719 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
720 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
722 "por %%mm1, %%mm3 \n\t"\
723 "por %%mm3, %%mm6 \n\t"\
724 MOVNTQ(%%mm6, 16(dst))\
726 "add $24, "#dst" \n\t"\
728 "add $8, "#index" \n\t"\
729 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24 dispatch: MMX2 build uses the pshufw variant, plain MMX
 * the shift/OR variant. NOTE(review): the #ifdef HAVE_MMX2 / #else /
 * #endif lines surrounding these two definitions are missing from this
 * extraction — as written both #defines are unconditional (a redefine);
 * restore the conditionals from the original source. */
734 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
737 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/* REAL_WRITEYUY2(dst, dstw, index): pack luma (mm1/mm7) and chroma
 * (mm3=U, mm4=V) into interleaved YUYV and store 16 bytes (8 pixels x
 * 2 bpp). NOTE(review): the loop branch after the cmp (orig. 754) is
 * missing from this extraction. */
740 #define REAL_WRITEYUY2(dst, dstw, index) \
741 "packuswb %%mm3, %%mm3 \n\t"\
742 "packuswb %%mm4, %%mm4 \n\t"\
743 "packuswb %%mm7, %%mm1 \n\t"\
744 "punpcklbw %%mm4, %%mm3 \n\t"\
745 "movq %%mm1, %%mm7 \n\t"\
746 "punpcklbw %%mm3, %%mm1 \n\t"\
747 "punpckhbw %%mm3, %%mm7 \n\t"\
749 MOVNTQ(%%mm1, (dst, index, 2))\
750 MOVNTQ(%%mm7, 8(dst, index, 2))\
752 "add $8, "#index" \n\t"\
753 "cmp "#dstw", "#index" \n\t"\
755 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* yuv2yuvX: vertical multi-tap scaling to planar YV12 — runs the
 * YSCALEYUV2YV12X asm once per plane (U, V at +4096, then Y) on MMX
 * builds, falls back to yuv2yuvX_altivec_real on AltiVec and to the
 * portable yuv2yuvXinC otherwise.
 * NOTE(review): this extraction is missing the function's braces, the
 * #ifdef HAVE_MMX / #ifdef HAVE_ALTIVEC structure, and the
 * asm volatile(...) statement openers around each YSCALEYUV2YV12X use
 * (gaps at orig. 761-765, 770-772, 777-780, 785-787, 791, 796-798) —
 * restore from the original source before compiling. */
758 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
759 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
760 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
766 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
767 :: "r" (&c->redDither),
768 "r" (uDest), "p" (chrDstW)
769 : "%"REG_a, "%"REG_d, "%"REG_S
773 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
774 :: "r" (&c->redDither),
775 "r" (vDest), "p" (chrDstW)
776 : "%"REG_a, "%"REG_d, "%"REG_S
781 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
782 :: "r" (&c->redDither),
783 "r" (dest), "p" (dstW)
784 : "%"REG_a, "%"REG_d, "%"REG_S
788 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
789 chrFilter, chrSrc, chrFilterSize,
790 dest, uDest, vDest, dstW, chrDstW);
792 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
793 chrFilter, chrSrc, chrFilterSize,
794 dest, uDest, vDest, dstW, chrDstW);
795 #endif //!HAVE_ALTIVEC
/* yuv2nv12X: vertical multi-tap scaling to NV12/NV21 — simply forwards
 * to the portable C implementation yuv2nv12XinC (no SIMD path here).
 * NOTE(review): the function braces (orig. 802, 806) are missing from
 * this extraction. */
799 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
800 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
801 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
803 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
804 chrFilter, chrSrc, chrFilterSize,
805 dest, uDest, dstW, chrDstW, dstFormat);
/* yuv2yuv1: 1-tap (copy/round) vertical scale to planar YV12 — on MMX
 * builds runs YSCALEYUV2YV121 per plane (U, V at +2048 words, Y); the
 * C fallback shifts each 16-bit sample >>7 and clips to 0..255.
 * NOTE(review): this extraction is missing the braces, #ifdef HAVE_MMX
 * split, asm volatile(...) wrappers, and most of the C fallback loop
 * bodies (clip tests for val/u and the stores) — restore from the
 * original source before compiling. */
808 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
809 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
816 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
823 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
831 :: "r" (lumSrc + dstW), "r" (dest + dstW),
837 for(i=0; i<dstW; i++)
839 int val= lumSrc[i]>>7;
850 for(i=0; i<chrDstW; i++)
853 int v=chrSrc[i + 2048]>>7;
857 else if (u>255) u=255;
859 else if (v>255) v=255;
870 * vertical scale YV12 to RGB
/* yuv2packedX: vertical multi-tap scale with packed output — on MMX
 * builds switches on c->dstFormat and runs YSCALEYUV2PACKEDX plus the
 * matching writer (WRITEBGR32 / WRITEBGR24 via a lea-computed dst*3 /
 * WRITEBGR15 and WRITEBGR16 with optional dithering / WRITEYUY2);
 * AltiVec builds use altivec_yuv2packedX for the formats it supports;
 * everything else falls back to yuv2packedXinC.
 * NOTE(review): this extraction is missing the braces, the switch/case
 * lines, the asm volatile(...) openers, YSCALEYUV2RGBX invocations and
 * the #ifdef DITHER1XBPP guards (numerous gaps in the embedded line
 * numbers) — restore from the original source before compiling. */
872 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
873 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
874 uint8_t *dest, int dstW, int dstY)
884 WRITEBGR32(%4, %5, %%REGa)
886 :: "r" (&c->redDither),
887 "m" (dummy), "m" (dummy), "m" (dummy),
888 "r" (dest), "m" (dstW)
889 : "%"REG_a, "%"REG_d, "%"REG_S
897 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
898 "add %4, %%"REG_b" \n\t"
899 WRITEBGR24(%%REGb, %5, %%REGa)
901 :: "r" (&c->redDither),
902 "m" (dummy), "m" (dummy), "m" (dummy),
903 "r" (dest), "m" (dstW)
904 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
912 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
914 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
915 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
916 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
919 WRITEBGR15(%4, %5, %%REGa)
921 :: "r" (&c->redDither),
922 "m" (dummy), "m" (dummy), "m" (dummy),
923 "r" (dest), "m" (dstW)
924 : "%"REG_a, "%"REG_d, "%"REG_S
932 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
934 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
935 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
936 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
939 WRITEBGR16(%4, %5, %%REGa)
941 :: "r" (&c->redDither),
942 "m" (dummy), "m" (dummy), "m" (dummy),
943 "r" (dest), "m" (dstW)
944 : "%"REG_a, "%"REG_d, "%"REG_S
952 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
954 "psraw $3, %%mm3 \n\t"
955 "psraw $3, %%mm4 \n\t"
956 "psraw $3, %%mm1 \n\t"
957 "psraw $3, %%mm7 \n\t"
958 WRITEYUY2(%4, %5, %%REGa)
960 :: "r" (&c->redDither),
961 "m" (dummy), "m" (dummy), "m" (dummy),
962 "r" (dest), "m" (dstW)
963 : "%"REG_a, "%"REG_d, "%"REG_S
970 /* The following list of supported dstFormat values should
971 match what's found in the body of altivec_yuv2packedX() */
972 if(c->dstFormat==IMGFMT_ABGR || c->dstFormat==IMGFMT_BGRA ||
973 c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
974 c->dstFormat==IMGFMT_RGBA || c->dstFormat==IMGFMT_ARGB)
975 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
976 chrFilter, chrSrc, chrFilterSize,
980 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
981 chrFilter, chrSrc, chrFilterSize,
988 * vertical bilinear scale YV12 to RGB
/*
 * Vertical bilinear blend of two source lines (buf0/buf1 for luma,
 * uvbuf0/uvbuf1 for chroma) combined with YUV->packed-RGB/YUY2 conversion
 * in a single pass. yalpha/uvalpha are the blend weights; yalpha1/uvalpha1
 * are their complements (x ^ 4095, i.e. 4095 - x for 12-bit weights).
 * NOTE(review): this extract is missing many interior lines (asm statement
 * openers, braces, #ifdef/#endif pairs), and each line carries a stray
 * leading number from the extraction. Code is left byte-identical.
 */
990 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
991 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
993 int yalpha1=yalpha^4095;
994 int uvalpha1=uvalpha^4095;
/* Full horizontal chroma resolution path: MMX fragments for BGR32/BGR24/
 * BGR15/BGR16 follow, then per-pixel C fallbacks. */
998 if(flags&SWS_FULL_CHR_H_INT)
/* BGR32: interleave B/G/R/0 bytes and stream out two quadwords per 4 pixels. */
1008 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1009 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1011 "movq %%mm3, %%mm1 \n\t"
1012 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1013 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1015 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1016 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1018 "add $4, %%"REG_a" \n\t"
1019 "cmp %5, %%"REG_a" \n\t"
1023 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1024 "m" (yalpha1), "m" (uvalpha1)
/* BGR24: build BGR0 pairs, then shift/mask them into packed 3-byte pixels. */
1034 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1035 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1037 "movq %%mm3, %%mm1 \n\t"
1038 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1039 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1041 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1042 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1043 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1044 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1045 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1046 "movq %%mm1, %%mm2 \n\t"
1047 "psllq $48, %%mm1 \n\t" // 000000BG
1048 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1050 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1051 "psrld $16, %%mm2 \n\t" // R000R000
1052 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1053 "por %%mm2, %%mm1 \n\t" // RBGRR000
1055 "mov %4, %%"REG_b" \n\t"
1056 "add %%"REG_a", %%"REG_b" \n\t"
/* movntq store variant (non-temporal) vs. movd fallback below —
 * presumably selected by an #ifdef that is missing from this extract. */
1060 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1061 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1063 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1064 "psrlq $32, %%mm3 \n\t"
1065 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1066 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1068 "add $4, %%"REG_a" \n\t"
1069 "cmp %5, %%"REG_a" \n\t"
1072 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1073 "m" (yalpha1), "m" (uvalpha1)
1074 : "%"REG_a, "%"REG_b
/* BGR15: dither, expand to words, shift into 5-5-5 fields and mask/merge. */
1082 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1083 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1084 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1086 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1087 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1088 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1090 "psrlw $3, %%mm3 \n\t"
1091 "psllw $2, %%mm1 \n\t"
1092 "psllw $7, %%mm0 \n\t"
1093 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1094 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1096 "por %%mm3, %%mm1 \n\t"
1097 "por %%mm1, %%mm0 \n\t"
1099 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1101 "add $4, %%"REG_a" \n\t"
1102 "cmp %5, %%"REG_a" \n\t"
1105 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1106 "m" (yalpha1), "m" (uvalpha1)
/* BGR16: same as BGR15 but 5-6-5 field widths (g6Dither, shift by 3/8). */
1115 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1116 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1117 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1119 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1120 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1121 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1123 "psrlw $3, %%mm3 \n\t"
1124 "psllw $3, %%mm1 \n\t"
1125 "psllw $8, %%mm0 \n\t"
1126 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1127 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1129 "por %%mm3, %%mm1 \n\t"
1130 "por %%mm1, %%mm0 \n\t"
1132 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1134 "add $4, %%"REG_a" \n\t"
1135 "cmp %5, %%"REG_a" \n\t"
1138 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1139 "m" (yalpha1), "m" (uvalpha1)
/* C fallbacks: per-pixel vertical blend (>>19 of 12-bit-weighted sums)
 * followed by table-driven YUV->RGB with a clip table. */
1148 if(dstFormat==IMGFMT_BGR32)
1151 #ifdef WORDS_BIGENDIAN
1154 for(i=0;i<dstW;i++){
1155 // vertical linear interpolation && yuv2rgb in a single step:
1156 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1157 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1158 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1159 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1160 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1161 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1165 else if(dstFormat==IMGFMT_BGR24)
1168 for(i=0;i<dstW;i++){
1169 // vertical linear interpolation && yuv2rgb in a single step:
1170 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1171 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1172 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1173 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1174 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1175 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1179 else if(dstFormat==IMGFMT_BGR16)
1182 for(i=0;i<dstW;i++){
1183 // vertical linear interpolation && yuv2rgb in a single step:
1184 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1185 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1186 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1188 ((uint16_t*)dest)[i] =
1189 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1190 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1191 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1194 else if(dstFormat==IMGFMT_BGR15)
1197 for(i=0;i<dstW;i++){
1198 // vertical linear interpolation && yuv2rgb in a single step:
1199 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1200 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1201 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1203 ((uint16_t*)dest)[i] =
1204 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1205 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1206 clip_table15r[(Y + yuvtab_3343[V]) >>13];
/* Regular (non-full-chroma) MMX path, one case per destination format.
 * Each case saves the stack pointer into ESP_OFFSET(c), repoints REG_SP at
 * the destination (freeing it as an extra pointer register for the write
 * macros), and restores it after the asm body. */
1214 switch(c->dstFormat)
1216 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1219 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1220 "mov %4, %%"REG_SP" \n\t"
1221 YSCALEYUV2RGB(%%REGa, %5)
1222 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1223 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1225 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1232 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1233 "mov %4, %%"REG_SP" \n\t"
1234 YSCALEYUV2RGB(%%REGa, %5)
1235 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1236 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1237 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1244 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1245 "mov %4, %%"REG_SP" \n\t"
1246 YSCALEYUV2RGB(%%REGa, %5)
1247 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1249 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1250 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1251 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1254 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1255 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1257 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1264 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1265 "mov %4, %%"REG_SP" \n\t"
1266 YSCALEYUV2RGB(%%REGa, %5)
1267 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1269 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1270 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1271 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1274 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1275 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1276 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1283 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1284 "mov %4, %%"REG_SP" \n\t"
1285 YSCALEYUV2PACKED(%%REGa, %5)
1286 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1287 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1288 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
/* Generic C fallback macro — presumably the non-x86 path; defined elsewhere. */
1296 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1300 * YV12 to RGB without scaling or interpolating
/*
 * Convert a single YV12 line to a packed format with no vertical scaling
 * or interpolation (yalpha fixed). The full-chroma case is delegated to
 * yuv2packed2 with a zero luma blend. uvalpha < 2048 selects the
 * non-averaging chroma path; otherwise the *1b asm variants are used,
 * which presumably blend uvbuf0/uvbuf1 — confirm against the macro defs.
 * NOTE(review): many interior lines (asm openers, braces, case labels) are
 * missing from this extract; code is left byte-identical.
 */
1302 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1303 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1305 const int yalpha1=0;
1308 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1309 const int yalpha= 4096; //FIXME ...
1311 if(flags&SWS_FULL_CHR_H_INT)
1313 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1318 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
/* dstFormat switch, YSCALEYUV2RGB1 variants (single chroma line).
 * Same REG_SP save/restore trick as in yuv2packed2. */
1324 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1325 "mov %4, %%"REG_SP" \n\t"
1326 YSCALEYUV2RGB1(%%REGa, %5)
1327 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1328 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1330 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1337 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1338 "mov %4, %%"REG_SP" \n\t"
1339 YSCALEYUV2RGB1(%%REGa, %5)
1340 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1341 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1343 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1350 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1351 "mov %4, %%"REG_SP" \n\t"
1352 YSCALEYUV2RGB1(%%REGa, %5)
1353 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1355 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1356 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1357 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1359 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1360 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1362 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1369 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1370 "mov %4, %%"REG_SP" \n\t"
1371 YSCALEYUV2RGB1(%%REGa, %5)
1372 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1374 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1375 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1376 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1379 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1380 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1382 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1389 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1390 "mov %4, %%"REG_SP" \n\t"
1391 YSCALEYUV2PACKED1(%%REGa, %5)
1392 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1393 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1395 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
/* uvalpha >= 2048: the YSCALEYUV2*1b variants are used instead. */
1408 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1409 "mov %4, %%"REG_SP" \n\t"
1410 YSCALEYUV2RGB1b(%%REGa, %5)
1411 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1412 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1414 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1421 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1422 "mov %4, %%"REG_SP" \n\t"
1423 YSCALEYUV2RGB1b(%%REGa, %5)
1424 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1425 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1427 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1434 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1435 "mov %4, %%"REG_SP" \n\t"
1436 YSCALEYUV2RGB1b(%%REGa, %5)
1437 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1439 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1440 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1441 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1443 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1444 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1446 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1453 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1454 "mov %4, %%"REG_SP" \n\t"
1455 YSCALEYUV2RGB1b(%%REGa, %5)
1456 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1458 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1459 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1460 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1463 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1464 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1466 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1473 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1474 "mov %4, %%"REG_SP" \n\t"
1475 YSCALEYUV2PACKED1b(%%REGa, %5)
1476 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1477 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1479 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
/* C fallback, mirroring the uvalpha split above. */
1487 if( uvalpha < 2048 )
1489 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1491 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1495 //FIXME the yuy2* functions can read up to 7 samples too much
/*
 * Extract the luma (even) bytes from a YUY2 line into dst.
 * MMX path: mask every other byte with bm01010101, pack, and store 8 luma
 * bytes per iteration, iterating with a negative index from -width to 0.
 * NOTE(review): the scalar loop body and closing lines are missing from
 * this extract; code left byte-identical.
 */
1497 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1501 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1502 "mov %0, %%"REG_a" \n\t"
1504 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1505 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1506 "pand %%mm2, %%mm0 \n\t"
1507 "pand %%mm2, %%mm1 \n\t"
1508 "packuswb %%mm1, %%mm0 \n\t"
1509 "movq %%mm0, (%2, %%"REG_a") \n\t"
1510 "add $8, %%"REG_a" \n\t"
1512 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1517 for(i=0; i<width; i++)
/*
 * Extract and vertically average the chroma samples of two YUY2 lines
 * (src1/src2) into separate U and V planes. The scalar fallback shows the
 * contract: dstU[i] = (src1[4i+1]+src2[4i+1])>>1, dstV likewise from the
 * 4i+3 bytes. The MMX2/3DNow path processes 4 chroma pairs per iteration.
 * NOTE(review): the averaging instructions between lines 1532 and 1535 are
 * missing from this extract; code left byte-identical.
 */
1522 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1524 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1526 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1527 "mov %0, %%"REG_a" \n\t"
1529 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1530 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1531 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1532 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1535 "psrlw $8, %%mm0 \n\t"
1536 "psrlw $8, %%mm1 \n\t"
1537 "packuswb %%mm1, %%mm0 \n\t"
1538 "movq %%mm0, %%mm1 \n\t"
1539 "psrlw $8, %%mm0 \n\t"
1540 "pand %%mm4, %%mm1 \n\t"
1541 "packuswb %%mm0, %%mm0 \n\t"
1542 "packuswb %%mm1, %%mm1 \n\t"
1543 "movd %%mm0, (%4, %%"REG_a") \n\t"
1544 "movd %%mm1, (%3, %%"REG_a") \n\t"
1545 "add $4, %%"REG_a" \n\t"
1547 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1552 for(i=0; i<width; i++)
1554 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1555 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1560 // This is almost identical to the previous function, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses.
/*
 * Extract the luma (odd) bytes from a UYVY line into dst. MMX path:
 * shift each word right by 8 to isolate the luma byte, pack, store 8
 * bytes per iteration (negative-index loop from -width to 0).
 * NOTE(review): the scalar loop body and closing lines are missing from
 * this extract; code left byte-identical.
 */
1561 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1565 "mov %0, %%"REG_a" \n\t"
1567 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1568 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1569 "psrlw $8, %%mm0 \n\t"
1570 "psrlw $8, %%mm1 \n\t"
1571 "packuswb %%mm1, %%mm0 \n\t"
1572 "movq %%mm0, (%2, %%"REG_a") \n\t"
1573 "add $8, %%"REG_a" \n\t"
1575 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1580 for(i=0; i<width; i++)
/*
 * Extract and vertically average the chroma samples of two UYVY lines into
 * separate U and V planes. Scalar fallback shows the contract:
 * dstU[i] = (src1[4i+0]+src2[4i+0])>>1, dstV from the 4i+2 bytes.
 * Mirrors yuy2ToUV but masks the even bytes (pand bm01010101) instead of
 * shifting, since UYVY stores chroma first.
 * NOTE(review): the averaging instructions between lines 1595 and 1598 are
 * missing from this extract; code left byte-identical.
 */
1585 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1587 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1589 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1590 "mov %0, %%"REG_a" \n\t"
1592 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1593 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1594 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1595 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1598 "pand %%mm4, %%mm0 \n\t"
1599 "pand %%mm4, %%mm1 \n\t"
1600 "packuswb %%mm1, %%mm0 \n\t"
1601 "movq %%mm0, %%mm1 \n\t"
1602 "psrlw $8, %%mm0 \n\t"
1603 "pand %%mm4, %%mm1 \n\t"
1604 "packuswb %%mm0, %%mm0 \n\t"
1605 "packuswb %%mm1, %%mm1 \n\t"
1606 "movd %%mm0, (%4, %%"REG_a") \n\t"
1607 "movd %%mm1, (%3, %%"REG_a") \n\t"
1608 "add $4, %%"REG_a" \n\t"
1610 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1615 for(i=0; i<width; i++)
1617 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1618 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
/*
 * Scalar BGR32 -> luma conversion. Unpacks B (low byte), G, R from each
 * 32-bit pixel and applies the RY/GY/BY weighted sum; the constant
 * (33<<(RGB2YUV_SHIFT-1)) folds the +16 luma offset and rounding into one
 * addend before the shift.
 */
1623 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1626 for(i=0; i<width; i++)
1628 int b= ((uint32_t*)src)[i]&0xFF;
1629 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1630 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1632 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * Scalar BGR32 -> chroma conversion with 2x2 averaging: sums four pixels
 * (two horizontal from each of two lines) using SWAR masks — l collects
 * the B and R byte lanes (0xFF00FF), h the G lane (0x00FF00) — then
 * divides by 4 via the RGB2YUV_SHIFT+2 shift. b is the summed blue field
 * (l & 0x3FF); the lines extracting g and r are missing from this extract.
 */
1636 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1639 for(i=0; i<width; i++)
1641 const int a= ((uint32_t*)src1)[2*i+0];
1642 const int e= ((uint32_t*)src1)[2*i+1];
1643 const int c= ((uint32_t*)src2)[2*i+0];
1644 const int d= ((uint32_t*)src2)[2*i+1];
1645 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1646 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1647 const int b= l&0x3FF;
1651 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1652 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * BGR24 -> luma conversion, 8 output pixels per MMX iteration.
 * Each pixel's 3 bytes are loaded with movd at byte offsets 0/3/6/9...,
 * expanded to words, multiplied by the packed coefficients in bgr2YCoeff,
 * horizontally summed via pmaddwd with w1111, and offset by bgr2YOffset.
 * The psrad $8 steps are skipped when FAST_BGR2YV12 is defined (lower
 * precision for speed). REG_b tracks the 3-byte-per-pixel source offset
 * (REG_a * 3 via lea).
 * NOTE(review): loop labels, the scalar loop's b/g/r extraction lines, and
 * closing lines are missing from this extract; code left byte-identical.
 */
1656 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1660 "mov %2, %%"REG_a" \n\t"
1661 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1662 "movq "MANGLE(w1111)", %%mm5 \n\t"
1663 "pxor %%mm7, %%mm7 \n\t"
1664 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1667 PREFETCH" 64(%0, %%"REG_b") \n\t"
1668 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1669 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
1670 "punpcklbw %%mm7, %%mm0 \n\t"
1671 "punpcklbw %%mm7, %%mm1 \n\t"
1672 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
1673 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
1674 "punpcklbw %%mm7, %%mm2 \n\t"
1675 "punpcklbw %%mm7, %%mm3 \n\t"
1676 "pmaddwd %%mm6, %%mm0 \n\t"
1677 "pmaddwd %%mm6, %%mm1 \n\t"
1678 "pmaddwd %%mm6, %%mm2 \n\t"
1679 "pmaddwd %%mm6, %%mm3 \n\t"
1680 #ifndef FAST_BGR2YV12
1681 "psrad $8, %%mm0 \n\t"
1682 "psrad $8, %%mm1 \n\t"
1683 "psrad $8, %%mm2 \n\t"
1684 "psrad $8, %%mm3 \n\t"
1686 "packssdw %%mm1, %%mm0 \n\t"
1687 "packssdw %%mm3, %%mm2 \n\t"
1688 "pmaddwd %%mm5, %%mm0 \n\t"
1689 "pmaddwd %%mm5, %%mm2 \n\t"
1690 "packssdw %%mm2, %%mm0 \n\t"
1691 "psraw $7, %%mm0 \n\t"
/* second group of 4 pixels (byte offsets 12..21) */
1693 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1694 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
1695 "punpcklbw %%mm7, %%mm4 \n\t"
1696 "punpcklbw %%mm7, %%mm1 \n\t"
1697 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
1698 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
1699 "punpcklbw %%mm7, %%mm2 \n\t"
1700 "punpcklbw %%mm7, %%mm3 \n\t"
1701 "pmaddwd %%mm6, %%mm4 \n\t"
1702 "pmaddwd %%mm6, %%mm1 \n\t"
1703 "pmaddwd %%mm6, %%mm2 \n\t"
1704 "pmaddwd %%mm6, %%mm3 \n\t"
1705 #ifndef FAST_BGR2YV12
1706 "psrad $8, %%mm4 \n\t"
1707 "psrad $8, %%mm1 \n\t"
1708 "psrad $8, %%mm2 \n\t"
1709 "psrad $8, %%mm3 \n\t"
1711 "packssdw %%mm1, %%mm4 \n\t"
1712 "packssdw %%mm3, %%mm2 \n\t"
1713 "pmaddwd %%mm5, %%mm4 \n\t"
1714 "pmaddwd %%mm5, %%mm2 \n\t"
1715 "add $24, %%"REG_b" \n\t"
1716 "packssdw %%mm2, %%mm4 \n\t"
1717 "psraw $7, %%mm4 \n\t"
1719 "packuswb %%mm4, %%mm0 \n\t"
1720 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1722 "movq %%mm0, (%1, %%"REG_a") \n\t"
1723 "add $8, %%"REG_a" \n\t"
1725 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1726 : "%"REG_a, "%"REG_b
1730 for(i=0; i<width; i++)
1736 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * BGR24 -> chroma conversion with 2x2 averaging, 4 U and 4 V output
 * samples per MMX iteration. Pixel pairs from both lines (src1/src2) are
 * summed, averaged (psrlw $2), then multiplied by bgr2UCoeff (mm6) and
 * bgr2VCoeff, horizontally reduced with w1111 (mm5), packed into
 * "V1 V0 U1 U0" / "V3 V2 U3 U2" words, offset by bgr2UVOffset, and
 * written as one movd to each plane. The MMX2/3DNow branches load 8 bytes
 * at a time instead of movd per pixel. psrad $8 precision steps are
 * skipped under FAST_BGR2YV12. REG_b is the 6-byte-per-output source
 * offset (REG_a * 3 * 2).
 * NOTE(review): loop labels, several #else/#endif lines and closing braces
 * are missing from this extract; code left byte-identical. Note mm5 is
 * temporarily reused as a pixel accumulator and reloaded with w1111 at
 * line 1854.
 */
1741 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1745 "mov %4, %%"REG_a" \n\t"
1746 "movq "MANGLE(w1111)", %%mm5 \n\t"
1747 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1748 "pxor %%mm7, %%mm7 \n\t"
1749 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
1750 "add %%"REG_b", %%"REG_b" \n\t"
1753 PREFETCH" 64(%0, %%"REG_b") \n\t"
1754 PREFETCH" 64(%1, %%"REG_b") \n\t"
1755 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1756 "movq (%0, %%"REG_b"), %%mm0 \n\t"
1757 "movq (%1, %%"REG_b"), %%mm1 \n\t"
1758 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
1759 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
1762 "movq %%mm0, %%mm1 \n\t"
1763 "movq %%mm2, %%mm3 \n\t"
1764 "psrlq $24, %%mm0 \n\t"
1765 "psrlq $24, %%mm2 \n\t"
1768 "punpcklbw %%mm7, %%mm0 \n\t"
1769 "punpcklbw %%mm7, %%mm2 \n\t"
/* plain-MMX fallback: per-pixel movd loads for the first 4 pixels */
1771 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1772 "movd (%1, %%"REG_b"), %%mm1 \n\t"
1773 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
1774 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
1775 "punpcklbw %%mm7, %%mm0 \n\t"
1776 "punpcklbw %%mm7, %%mm1 \n\t"
1777 "punpcklbw %%mm7, %%mm2 \n\t"
1778 "punpcklbw %%mm7, %%mm3 \n\t"
1779 "paddw %%mm1, %%mm0 \n\t"
1780 "paddw %%mm3, %%mm2 \n\t"
1781 "paddw %%mm2, %%mm0 \n\t"
1782 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
1783 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
1784 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
1785 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
1786 "punpcklbw %%mm7, %%mm4 \n\t"
1787 "punpcklbw %%mm7, %%mm1 \n\t"
1788 "punpcklbw %%mm7, %%mm2 \n\t"
1789 "punpcklbw %%mm7, %%mm3 \n\t"
1790 "paddw %%mm1, %%mm4 \n\t"
1791 "paddw %%mm3, %%mm2 \n\t"
1792 "paddw %%mm4, %%mm2 \n\t"
1793 "psrlw $2, %%mm0 \n\t"
1794 "psrlw $2, %%mm2 \n\t"
1796 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1797 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1799 "pmaddwd %%mm0, %%mm1 \n\t"
1800 "pmaddwd %%mm2, %%mm3 \n\t"
1801 "pmaddwd %%mm6, %%mm0 \n\t"
1802 "pmaddwd %%mm6, %%mm2 \n\t"
1803 #ifndef FAST_BGR2YV12
1804 "psrad $8, %%mm0 \n\t"
1805 "psrad $8, %%mm1 \n\t"
1806 "psrad $8, %%mm2 \n\t"
1807 "psrad $8, %%mm3 \n\t"
1809 "packssdw %%mm2, %%mm0 \n\t"
1810 "packssdw %%mm3, %%mm1 \n\t"
1811 "pmaddwd %%mm5, %%mm0 \n\t"
1812 "pmaddwd %%mm5, %%mm1 \n\t"
1813 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1814 "psraw $7, %%mm0 \n\t"
1816 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1817 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
1818 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
1819 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
1820 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
1823 "movq %%mm4, %%mm1 \n\t"
1824 "movq %%mm2, %%mm3 \n\t"
1825 "psrlq $24, %%mm4 \n\t"
1826 "psrlq $24, %%mm2 \n\t"
1829 "punpcklbw %%mm7, %%mm4 \n\t"
1830 "punpcklbw %%mm7, %%mm2 \n\t"
/* second group of 4 pixels (byte offsets 12..21) */
1832 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1833 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
1834 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
1835 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
1836 "punpcklbw %%mm7, %%mm4 \n\t"
1837 "punpcklbw %%mm7, %%mm1 \n\t"
1838 "punpcklbw %%mm7, %%mm2 \n\t"
1839 "punpcklbw %%mm7, %%mm3 \n\t"
1840 "paddw %%mm1, %%mm4 \n\t"
1841 "paddw %%mm3, %%mm2 \n\t"
1842 "paddw %%mm2, %%mm4 \n\t"
1843 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
1844 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
1845 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
1846 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
1847 "punpcklbw %%mm7, %%mm5 \n\t"
1848 "punpcklbw %%mm7, %%mm1 \n\t"
1849 "punpcklbw %%mm7, %%mm2 \n\t"
1850 "punpcklbw %%mm7, %%mm3 \n\t"
1851 "paddw %%mm1, %%mm5 \n\t"
1852 "paddw %%mm3, %%mm2 \n\t"
1853 "paddw %%mm5, %%mm2 \n\t"
1854 "movq "MANGLE(w1111)", %%mm5 \n\t"
1855 "psrlw $2, %%mm4 \n\t"
1856 "psrlw $2, %%mm2 \n\t"
1858 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1859 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1861 "pmaddwd %%mm4, %%mm1 \n\t"
1862 "pmaddwd %%mm2, %%mm3 \n\t"
1863 "pmaddwd %%mm6, %%mm4 \n\t"
1864 "pmaddwd %%mm6, %%mm2 \n\t"
1865 #ifndef FAST_BGR2YV12
1866 "psrad $8, %%mm4 \n\t"
1867 "psrad $8, %%mm1 \n\t"
1868 "psrad $8, %%mm2 \n\t"
1869 "psrad $8, %%mm3 \n\t"
1871 "packssdw %%mm2, %%mm4 \n\t"
1872 "packssdw %%mm3, %%mm1 \n\t"
1873 "pmaddwd %%mm5, %%mm4 \n\t"
1874 "pmaddwd %%mm5, %%mm1 \n\t"
1875 "add $24, %%"REG_b" \n\t"
1876 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1877 "psraw $7, %%mm4 \n\t"
1879 "movq %%mm0, %%mm1 \n\t"
1880 "punpckldq %%mm4, %%mm0 \n\t"
1881 "punpckhdq %%mm4, %%mm1 \n\t"
1882 "packsswb %%mm1, %%mm0 \n\t"
1883 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1885 "movd %%mm0, (%2, %%"REG_a") \n\t"
1886 "punpckhdq %%mm0, %%mm0 \n\t"
1887 "movd %%mm0, (%3, %%"REG_a") \n\t"
1888 "add $4, %%"REG_a" \n\t"
1890 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1891 : "%"REG_a, "%"REG_b
/* scalar fallback: 2x2 box average of B/G/R then U/V weighted sums */
1895 for(i=0; i<width; i++)
1897 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1898 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1899 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1901 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1902 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * Scalar BGR16 (5-6-5) -> luma. r is the top 5 bits of each 16-bit pixel;
 * the lines extracting b and g are missing from this extract. The 2x
 * weights on R/B and the RGB2YUV_SHIFT-2 shift compensate for the reduced
 * 5/6-bit component precision.
 */
1907 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1910 for(i=0; i<width; i++)
1912 int d= ((uint16_t*)src)[i];
1915 int r= (d>>11)&0x1F;
1917 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/*
 * Scalar BGR16 -> chroma with 2x2 averaging done via SWAR: two pixels per
 * 32-bit load from each line, summed in masked field groups (dl holds
 * B|G-high|R-ish fields via 0x07E0F81F, dh the shifted complement), then
 * recombined (dh2) before extracting summed components. The lines deriving
 * d, b and g from dl/dh2 are missing from this extract.
 */
1921 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1924 for(i=0; i<width; i++)
1926 int d0= ((uint32_t*)src1)[i];
1927 int d1= ((uint32_t*)src2)[i];
1929 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1930 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1932 int dh2= (dh>>11) + (dh<<21);
1936 int r= (d>>11)&0x7F;
1938 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1939 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
/*
 * Scalar BGR15 (5-5-5) -> luma. r is bits 10..14 of each 16-bit pixel;
 * the lines extracting b and g are missing from this extract. The
 * RGB2YUV_SHIFT-3 shift compensates for the 5-bit component precision.
 */
1943 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1946 for(i=0; i<width; i++)
1948 int d= ((uint16_t*)src)[i];
1951 int r= (d>>10)&0x1F;
1953 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/*
 * Scalar BGR15 -> chroma with 2x2 averaging, same SWAR scheme as
 * bgr16ToUV but with 5-5-5 field masks (0x03E07C1F / 0x03E0F81F). The
 * lines deriving d, b and g from dl/dh2 are missing from this extract.
 */
1957 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1960 for(i=0; i<width; i++)
1962 int d0= ((uint32_t*)src1)[i];
1963 int d1= ((uint32_t*)src2)[i];
1965 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1966 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1968 int dh2= (dh>>11) + (dh<<21);
1972 int r= (d>>10)&0x7F;
1974 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1975 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
/*
 * Scalar RGB32 -> luma. Identical to bgr32ToY except the byte order is
 * reversed: R is the low byte, B the third. Same folded offset/rounding
 * constant (33<<(RGB2YUV_SHIFT-1)).
 */
1980 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1983 for(i=0; i<width; i++)
1985 int r= ((uint32_t*)src)[i]&0xFF;
1986 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1987 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1989 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * Scalar RGB32 -> chroma with 2x2 averaging; mirrors bgr32ToUV but with
 * R in the low byte lane, so r = l & 0x3FF. The lines extracting the
 * summed g and b components are missing from this extract.
 */
1993 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1996 for(i=0; i<width; i++)
1998 const int a= ((uint32_t*)src1)[2*i+0];
1999 const int e= ((uint32_t*)src1)[2*i+1];
2000 const int c= ((uint32_t*)src2)[2*i+0];
2001 const int d= ((uint32_t*)src2)[2*i+1];
2002 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
2003 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
2004 const int r= l&0x3FF;
2008 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2009 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * Scalar RGB24 -> luma. The lines extracting r/g/b from the 3-byte pixels
 * are missing from this extract; same weighted sum and folded
 * offset/rounding constant as the other *ToY functions.
 */
2013 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2016 for(i=0; i<width; i++)
2022 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * Scalar RGB24 -> chroma with 2x2 averaging. Mirrors bgr24ToUV's scalar
 * fallback with R first in each 3-byte pixel; divides the 4-pixel sums by
 * 4 via the RGB2YUV_SHIFT+2 shift.
 */
2026 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2029 for(i=0; i<width; i++)
2031 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2032 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2033 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2035 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2036 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2041 // Bilinear / Bicubic scaling
/*
 * Horizontal scaling: for each output sample i, a dot product of
 * filterSize source bytes starting at filterPos[i] with the 16-bit
 * coefficients filter[filterSize*i ..], clamped to 15 bits (see the C
 * fallback at the bottom). Specialized MMX inner loops exist for
 * filterSize 4 and 8 (2 outputs per iteration, REG_BP borrowed as the
 * output counter — hence the push/pop), plus a generic MMX loop for any
 * multiple-of-4 size, an AltiVec call, and the scalar fallback.
 * NOTE(review): asm statement openers, loop labels, #if/#else/#endif pairs
 * and braces are missing from this extract; code left byte-identical.
 */
2042 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2043 int16_t *filter, int16_t *filterPos, long filterSize)
2046 assert(filterSize % 4 == 0 && filterSize>0);
2047 if(filterSize==4) // allways true for upscaling, sometimes for down too
2049 long counter= -2*dstW;
2051 filterPos-= counter/2;
2054 "pxor %%mm7, %%mm7 \n\t"
2055 "movq "MANGLE(w02)", %%mm6 \n\t"
2056 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2057 "mov %%"REG_a", %%"REG_BP" \n\t"
2060 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2061 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2062 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2063 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2064 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2065 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2066 "punpcklbw %%mm7, %%mm0 \n\t"
2067 "punpcklbw %%mm7, %%mm2 \n\t"
2068 "pmaddwd %%mm1, %%mm0 \n\t"
2069 "pmaddwd %%mm2, %%mm3 \n\t"
2070 "psrad $8, %%mm0 \n\t"
2071 "psrad $8, %%mm3 \n\t"
2072 "packssdw %%mm3, %%mm0 \n\t"
2073 "pmaddwd %%mm6, %%mm0 \n\t"
2074 "packssdw %%mm0, %%mm0 \n\t"
2075 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2076 "add $4, %%"REG_BP" \n\t"
2079 "pop %%"REG_BP" \n\t"
2081 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* filterSize == 8: same structure, two pmaddwd pairs accumulated */
2085 else if(filterSize==8)
2087 long counter= -2*dstW;
2089 filterPos-= counter/2;
2092 "pxor %%mm7, %%mm7 \n\t"
2093 "movq "MANGLE(w02)", %%mm6 \n\t"
2094 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2095 "mov %%"REG_a", %%"REG_BP" \n\t"
2098 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2099 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2100 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2101 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2102 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2103 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2104 "punpcklbw %%mm7, %%mm0 \n\t"
2105 "punpcklbw %%mm7, %%mm2 \n\t"
2106 "pmaddwd %%mm1, %%mm0 \n\t"
2107 "pmaddwd %%mm2, %%mm3 \n\t"
2109 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2110 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2111 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2112 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2113 "punpcklbw %%mm7, %%mm4 \n\t"
2114 "punpcklbw %%mm7, %%mm2 \n\t"
2115 "pmaddwd %%mm1, %%mm4 \n\t"
2116 "pmaddwd %%mm2, %%mm5 \n\t"
2117 "paddd %%mm4, %%mm0 \n\t"
2118 "paddd %%mm5, %%mm3 \n\t"
2120 "psrad $8, %%mm0 \n\t"
2121 "psrad $8, %%mm3 \n\t"
2122 "packssdw %%mm3, %%mm0 \n\t"
2123 "pmaddwd %%mm6, %%mm0 \n\t"
2124 "packssdw %%mm0, %%mm0 \n\t"
2125 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2126 "add $4, %%"REG_BP" \n\t"
2129 "pop %%"REG_BP" \n\t"
2131 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* generic filterSize (multiple of 4): inner loop accumulates in mm4/mm5 */
2137 uint8_t *offset = src+filterSize;
2138 long counter= -2*dstW;
2139 // filter-= counter*filterSize/2;
2140 filterPos-= counter/2;
2143 "pxor %%mm7, %%mm7 \n\t"
2144 "movq "MANGLE(w02)", %%mm6 \n\t"
2147 "mov %2, %%"REG_c" \n\t"
2148 "movzwl (%%"REG_c", %0), %%eax \n\t"
2149 "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2150 "mov %5, %%"REG_c" \n\t"
2151 "pxor %%mm4, %%mm4 \n\t"
2152 "pxor %%mm5, %%mm5 \n\t"
2154 "movq (%1), %%mm1 \n\t"
2155 "movq (%1, %6), %%mm3 \n\t"
2156 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2157 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2158 "punpcklbw %%mm7, %%mm0 \n\t"
2159 "punpcklbw %%mm7, %%mm2 \n\t"
2160 "pmaddwd %%mm1, %%mm0 \n\t"
2161 "pmaddwd %%mm2, %%mm3 \n\t"
2162 "paddd %%mm3, %%mm5 \n\t"
2163 "paddd %%mm0, %%mm4 \n\t"
2165 "add $4, %%"REG_c" \n\t"
2166 "cmp %4, %%"REG_c" \n\t"
2169 "psrad $8, %%mm4 \n\t"
2170 "psrad $8, %%mm5 \n\t"
2171 "packssdw %%mm5, %%mm4 \n\t"
2172 "pmaddwd %%mm6, %%mm4 \n\t"
2173 "packssdw %%mm4, %%mm4 \n\t"
2174 "mov %3, %%"REG_a" \n\t"
2175 "movd %%mm4, (%%"REG_a", %0) \n\t"
2179 : "+r" (counter), "+r" (filter)
2180 : "m" (filterPos), "m" (dst), "m"(offset),
2181 "m" (src), "r" (filterSize*2)
2182 : "%"REG_b, "%"REG_a, "%"REG_c
2187 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
/* scalar fallback: plain dot product with >>7 and clamp to [0, 2^15-1] */
2190 for(i=0; i<dstW; i++)
2193 int srcPos= filterPos[i];
2195 // printf("filterPos: %d\n", filterPos[i]);
2196 for(j=0; j<filterSize; j++)
2198 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2199 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2201 // filter += hFilterSize;
2202 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2208 // *** horizontal scale Y line to temp buffer
// *** hyscale: horizontally scale one luma (Y) line from 8-bit src into the
// 16-bit intermediate buffer dst.  Output samples carry 7 fractional bits
// (note the <<7 / *128 scaling below).  Packed/RGB source formats are first
// reduced to a plain 8-bit luma line in formatConvBuffer, after which src is
// repointed at that buffer.
// NOTE(review): this extract is gappy (braces, #else/#endif arms and some asm
// statements of the original are not visible here), so comments only.
2209 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2210 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2211 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2212 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2213 int32_t *mmx2FilterPos)
// --- input conversion: each supported packed format has a RENAME(*ToY)
// helper that extracts luma into formatConvBuffer
2215 if(srcFormat==IMGFMT_YUY2)
2217 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2218 src= formatConvBuffer;
2220 else if(srcFormat==IMGFMT_UYVY)
2222 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2223 src= formatConvBuffer;
2225 else if(srcFormat==IMGFMT_BGR32)
2227 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2228 src= formatConvBuffer;
2230 else if(srcFormat==IMGFMT_BGR24)
2232 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2233 src= formatConvBuffer;
2235 else if(srcFormat==IMGFMT_BGR16)
2237 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2238 src= formatConvBuffer;
2240 else if(srcFormat==IMGFMT_BGR15)
2242 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2243 src= formatConvBuffer;
2245 else if(srcFormat==IMGFMT_RGB32)
2247 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2248 src= formatConvBuffer;
2250 else if(srcFormat==IMGFMT_RGB24)
2252 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2253 src= formatConvBuffer;
2257 // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2258 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2260 if(!(flags&SWS_FAST_BILINEAR))
// generic filter path: arbitrary hLumFilterSize-tap horizontal filter
2263 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2265 else // Fast Bilinear upscale / crap downscale
2267 #if defined(ARCH_X86) || defined(ARCH_X86_64)
// MMX2 path: runs the runtime-generated scaler code in funnyYCode,
// driven by the precomputed mmx2Filter/mmx2FilterPos tables
2273 "pxor %%mm7, %%mm7 \n\t"
2274 "mov %0, %%"REG_c" \n\t"
2275 "mov %1, %%"REG_D" \n\t"
2276 "mov %2, %%"REG_d" \n\t"
2277 "mov %3, %%"REG_b" \n\t"
2278 "xor %%"REG_a", %%"REG_a" \n\t" // i
2279 PREFETCH" (%%"REG_c") \n\t"
2280 PREFETCH" 32(%%"REG_c") \n\t"
2281 PREFETCH" 64(%%"REG_c") \n\t"
// per-chunk glue: reload the source offset from mmx2FilterPos and
// advance the src/dst pointers before jumping into funnyYCode
// (this #define arm differs from the one below; the selecting
// #if/#else is outside the visible extract -- presumably 64- vs
// 32-bit, TODO confirm against the full file)
2285 #define FUNNY_Y_CODE \
2286 "movl (%%"REG_b"), %%esi \n\t"\
2288 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2289 "add %%"REG_S", %%"REG_c" \n\t"\
2290 "add %%"REG_a", %%"REG_D" \n\t"\
2291 "xor %%"REG_a", %%"REG_a" \n\t"\
2295 #define FUNNY_Y_CODE \
2296 "movl (%%"REG_b"), %%esi \n\t"\
2298 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2299 "add %%"REG_a", %%"REG_D" \n\t"\
2300 "xor %%"REG_a", %%"REG_a" \n\t"\
2313 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2315 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// pad the tail: any dst sample whose source position falls on/after the
// last input pixel gets that pixel replicated (*128 = 7 fractional bits)
2317 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
// non-MMX2 x86 path: plain integer asm, 16.16 fixed-point position;
// the add/adc pair steps xx by xInc>>16 plus the carry out of the
// fractional accumulator in %cx
2322 int xInc_shr16 = xInc >> 16;
2323 int xInc_mask = xInc & 0xffff;
2324 //NO MMX just normal asm ...
2326 "xor %%"REG_a", %%"REG_a" \n\t" // i
2327 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2328 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
// loop body is unrolled x2: two output samples per iteration
2331 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2332 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2333 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2334 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2335 "shll $16, %%edi \n\t"
2336 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2337 "mov %1, %%"REG_D" \n\t"
2338 "shrl $9, %%esi \n\t"
2339 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2340 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2341 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2343 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2344 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2345 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2346 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2347 "shll $16, %%edi \n\t"
2348 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2349 "mov %1, %%"REG_D" \n\t"
2350 "shrl $9, %%esi \n\t"
2351 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2352 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2353 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2356 "add $2, %%"REG_a" \n\t"
2357 "cmp %2, %%"REG_a" \n\t"
2361 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2362 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2365 } //if MMX2 can't be used
// portable C fallback: same 16.16 fixed-point bilinear as the asm above
// (xalpha keeps 7 bits, so results match the asm's >>9 scaling)
2369 unsigned int xpos=0;
2370 for(i=0;i<dstWidth;i++)
2372 register unsigned int xx=xpos>>16;
2373 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2374 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
// *** hcscale: horizontally scale one chroma line pair (U in src1, V in src2)
// into the 16-bit intermediate buffer: U goes to dst[0..], V to dst[2048..]
// (the fixed 2048-sample plane stride also appears as the byte offset 4096
// in the asm below).  Mirrors hyscale, but converts/scales both planes.
// NOTE(review): extract is gappy (braces, #else/#endif arms and some asm
// statements are not visible here), so comments only.
2381 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2382 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2383 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2384 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2385 int32_t *mmx2FilterPos)
// --- input conversion: RENAME(*ToUV) helpers write U into formatConvBuffer
// and V into formatConvBuffer+2048
2387 if(srcFormat==IMGFMT_YUY2)
2389 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2390 src1= formatConvBuffer;
2391 src2= formatConvBuffer+2048;
2393 else if(srcFormat==IMGFMT_UYVY)
2395 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2396 src1= formatConvBuffer;
2397 src2= formatConvBuffer+2048;
2399 else if(srcFormat==IMGFMT_BGR32)
2401 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2402 src1= formatConvBuffer;
2403 src2= formatConvBuffer+2048;
2405 else if(srcFormat==IMGFMT_BGR24)
2407 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2408 src1= formatConvBuffer;
2409 src2= formatConvBuffer+2048;
2411 else if(srcFormat==IMGFMT_BGR16)
2413 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2414 src1= formatConvBuffer;
2415 src2= formatConvBuffer+2048;
2417 else if(srcFormat==IMGFMT_BGR15)
2419 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2420 src1= formatConvBuffer;
2421 src2= formatConvBuffer+2048;
2423 else if(srcFormat==IMGFMT_RGB32)
2425 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2426 src1= formatConvBuffer;
2427 src2= formatConvBuffer+2048;
2429 else if(srcFormat==IMGFMT_RGB24)
2431 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2432 src1= formatConvBuffer;
2433 src2= formatConvBuffer+2048;
// gray input carries no chroma; handling of this branch is outside the
// visible extract
2435 else if(isGray(srcFormat))
2441 // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2442 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2444 if(!(flags&SWS_FAST_BILINEAR))
// generic filter path, run once per chroma plane
2447 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2448 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2450 else // Fast Bilinear upscale / crap downscale
2452 #if defined(ARCH_X86) || defined(ARCH_X86_64)
// MMX2 path: runtime-generated scaler in funnyUVCode, same scheme as
// hyscale but invoked for the U plane first, then re-primed for V below
2458 "pxor %%mm7, %%mm7 \n\t"
2459 "mov %0, %%"REG_c" \n\t"
2460 "mov %1, %%"REG_D" \n\t"
2461 "mov %2, %%"REG_d" \n\t"
2462 "mov %3, %%"REG_b" \n\t"
2463 "xor %%"REG_a", %%"REG_a" \n\t" // i
2464 PREFETCH" (%%"REG_c") \n\t"
2465 PREFETCH" 32(%%"REG_c") \n\t"
2466 PREFETCH" 64(%%"REG_c") \n\t"
// per-chunk glue macro, two variants; the selecting #if/#else is
// outside the visible extract (presumably 64- vs 32-bit, TODO confirm)
2470 #define FUNNY_UV_CODE \
2471 "movl (%%"REG_b"), %%esi \n\t"\
2473 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2474 "add %%"REG_S", %%"REG_c" \n\t"\
2475 "add %%"REG_a", %%"REG_D" \n\t"\
2476 "xor %%"REG_a", %%"REG_a" \n\t"\
2480 #define FUNNY_UV_CODE \
2481 "movl (%%"REG_b"), %%esi \n\t"\
2483 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2484 "add %%"REG_a", %%"REG_D" \n\t"\
2485 "xor %%"REG_a", %%"REG_a" \n\t"\
// re-prime pointers for the V plane: source = src2 (%5), destination =
// dst + 4096 bytes (= dst + 2048 uint16_t samples)
2493 "xor %%"REG_a", %%"REG_a" \n\t" // i
2494 "mov %5, %%"REG_c" \n\t" // src
2495 "mov %1, %%"REG_D" \n\t" // buf1
2496 "add $4096, %%"REG_D" \n\t"
2497 PREFETCH" (%%"REG_c") \n\t"
2498 PREFETCH" 32(%%"REG_c") \n\t"
2499 PREFETCH" 64(%%"REG_c") \n\t"
2506 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2507 "m" (funnyUVCode), "m" (src2)
2508 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// pad the tail of both planes with the replicated last source pixel
// (*128 = 7 fractional bits, matching hyscale)
2510 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2512 // printf("%d %d %d\n", dstWidth, i, srcW);
2513 dst[i] = src1[srcW-1]*128;
2514 dst[i+2048] = src2[srcW-1]*128;
// non-MMX2 x86 path: 16.16 fixed-point bilinear; each iteration emits
// one U sample and one V sample sharing the same xx/xalpha state
2520 long xInc_shr16 = (long) (xInc >> 16);
2521 int xInc_mask = xInc & 0xffff;
2523 "xor %%"REG_a", %%"REG_a" \n\t" // i
2524 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2525 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2528 "mov %0, %%"REG_S" \n\t"
2529 "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
2530 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
2531 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2532 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2533 "shll $16, %%edi \n\t"
2534 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2535 "mov %1, %%"REG_D" \n\t"
2536 "shrl $9, %%esi \n\t"
2537 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2539 "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2540 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2541 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2542 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2543 "shll $16, %%edi \n\t"
2544 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2545 "mov %1, %%"REG_D" \n\t"
2546 "shrl $9, %%esi \n\t"
2547 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2549 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2550 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2551 "add $1, %%"REG_a" \n\t"
2552 "cmp %2, %%"REG_a" \n\t"
2555 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2556 which is needed to support GCC-4.0 */
2557 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2558 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2560 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2563 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2566 } //if MMX2 can't be used
// portable C fallback (two variants visible; the #if/#else selecting
// between the (xalpha^127)-weighted and the <<7 form is outside this
// extract)
2570 unsigned int xpos=0;
2571 for(i=0;i<dstWidth;i++)
2573 register unsigned int xx=xpos>>16;
2574 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2575 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2576 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2578 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2579 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
// *** swScale: main slice-driven scaling entry point.  Consumes srcSliceH
// input lines starting at srcSliceY, horizontally scales them into the
// lum/chr ring buffers, and vertically filters every destination line that
// becomes computable.  Returns the number of destination lines written
// (dstY - lastDstY).  May be called repeatedly with consecutive slices;
// progress state lives in the context (lumBufIndex etc.).
// NOTE(review): extract is gappy (braces and #ifdef HAVE_MMX arms are not
// all visible), so comments only.
2587 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2588 int srcSliceH, uint8_t* dst[], int dstStride[]){
2590 /* load a few things into local vars to make the code more readable and faster */
2591 const int srcW= c->srcW;
2592 const int dstW= c->dstW;
2593 const int dstH= c->dstH;
2594 const int chrDstW= c->chrDstW;
2595 const int chrSrcW= c->chrSrcW;
2596 const int lumXInc= c->lumXInc;
2597 const int chrXInc= c->chrXInc;
2598 const int dstFormat= c->dstFormat;
2599 const int srcFormat= c->srcFormat;
2600 const int flags= c->flags;
2601 const int canMMX2BeUsed= c->canMMX2BeUsed;
2602 int16_t *vLumFilterPos= c->vLumFilterPos;
2603 int16_t *vChrFilterPos= c->vChrFilterPos;
2604 int16_t *hLumFilterPos= c->hLumFilterPos;
2605 int16_t *hChrFilterPos= c->hChrFilterPos;
2606 int16_t *vLumFilter= c->vLumFilter;
2607 int16_t *vChrFilter= c->vChrFilter;
2608 int16_t *hLumFilter= c->hLumFilter;
2609 int16_t *hChrFilter= c->hChrFilter;
2610 int32_t *lumMmxFilter= c->lumMmxFilter;
2611 int32_t *chrMmxFilter= c->chrMmxFilter;
2612 const int vLumFilterSize= c->vLumFilterSize;
2613 const int vChrFilterSize= c->vChrFilterSize;
2614 const int hLumFilterSize= c->hLumFilterSize;
2615 const int hChrFilterSize= c->hChrFilterSize;
2616 int16_t **lumPixBuf= c->lumPixBuf;
2617 int16_t **chrPixBuf= c->chrPixBuf;
2618 const int vLumBufSize= c->vLumBufSize;
2619 const int vChrBufSize= c->vChrBufSize;
2620 uint8_t *funnyYCode= c->funnyYCode;
2621 uint8_t *funnyUVCode= c->funnyUVCode;
2622 uint8_t *formatConvBuffer= c->formatConvBuffer;
// chroma slice extent in chroma lines; the negate/shift rounds the
// height up instead of down
2623 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2624 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2627 /* vars which will change and which we need to store back in the context */
2629 int lumBufIndex= c->lumBufIndex;
2630 int chrBufIndex= c->chrBufIndex;
2631 int lastInLumBuf= c->lastInLumBuf;
2632 int lastInChrBuf= c->lastInChrBuf;
// packed input: mirror plane-0 stride onto the (unused) chroma strides
2634 if(isPacked(c->srcFormat)){
2640 srcStride[2]= srcStride[0];
2642 srcStride[1]<<= c->vChrDrop;
2643 srcStride[2]<<= c->vChrDrop;
2645 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2646 // (int)dst[0], (int)dst[1], (int)dst[2]);
2648 #if 0 //self test FIXME move to a vfilter or something
2650 static volatile int i=0;
2652 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2653 selfTest(src, srcStride, c->srcW, c->srcH);
2658 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2659 //dstStride[0],dstStride[1],dstStride[2]);
// warn once if the destination strides break 8-byte alignment (the MMX
// stores prefer aligned destinations)
2661 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2663 static int firstTime=1; //FIXME move this into the context perhaps
2664 if(flags & SWS_PRINT_INFO && firstTime)
2666 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2667 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2672 /* Note the user might start scaling the picture in the middle so this will not get executed
2673 this is not really intended but works currently, so ppl might do it */
// --- main loop: one destination line per iteration
2684 for(;dstY < dstH; dstY++){
2685 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2686 const int chrDstY= dstY>>c->chrDstVSubSample;
2687 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2688 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
// source-line window the vertical filter needs for this output line
2690 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2691 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2692 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2693 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2695 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2696 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2697 //handle holes (FAST_BILINEAR & weird filters)
2698 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2699 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2700 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2701 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2702 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2704 // Do we have enough lines in this slice to output the dstY line
2705 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2707 //Do horizontal scaling
// pull every missing luma input line into the ring buffer
2708 while(lastInLumBuf < lastLumSrcY)
2710 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2712 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2713 ASSERT(lumBufIndex < 2*vLumBufSize)
2714 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2715 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2716 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2717 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2718 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2719 funnyYCode, c->srcFormat, formatConvBuffer,
2720 c->lumMmx2Filter, c->lumMmx2FilterPos);
// same for the chroma ring buffer (skipped entirely for gray)
2723 while(lastInChrBuf < lastChrSrcY)
2725 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2726 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2728 ASSERT(chrBufIndex < 2*vChrBufSize)
2729 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2730 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2731 //FIXME replace parameters through context struct (some at least)
2733 if(!(isGray(srcFormat) || isGray(dstFormat)))
2734 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2735 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2736 funnyUVCode, c->srcFormat, formatConvBuffer,
2737 c->chrMmx2Filter, c->chrMmx2FilterPos);
2740 //wrap buf index around to stay inside the ring buffer
2741 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2742 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2744 else // not enough lines left in this slice -> load the rest in the buffer
2746 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2747 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2748 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2749 vChrBufSize, vLumBufSize);*/
2751 //Do horizontal scaling
// buffer whatever remains of this slice so the next call can finish
// the line; same loops as above but bounded by the slice end
2752 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2754 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2756 ASSERT(lumBufIndex < 2*vLumBufSize)
2757 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2758 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2759 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2760 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2761 funnyYCode, c->srcFormat, formatConvBuffer,
2762 c->lumMmx2Filter, c->lumMmx2FilterPos);
2765 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2767 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2768 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2770 ASSERT(chrBufIndex < 2*vChrBufSize)
2771 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2772 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2774 if(!(isGray(srcFormat) || isGray(dstFormat)))
2775 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2776 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2777 funnyUVCode, c->srcFormat, formatConvBuffer,
2778 c->chrMmx2Filter, c->chrMmx2FilterPos);
2781 //wrap buf index around to stay inside the ring buffer
2782 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2783 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2784 break; //we can't output a dstY line so let's try with the next slice
// select the dither rows for RGB15/16 output, alternating per line
2788 b5Dither= dither8[dstY&1];
2789 g6Dither= dither4[dstY&1];
2790 g5Dither= dither8[dstY&1];
2791 r5Dither= dither8[(dstY+1)&1];
// --- vertical scaling + output conversion (MMX branch)
// lumSrcPtr/chrSrcPtr address the filter window inside the ring buffer
2795 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2796 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
// build the interleaved pointer/coefficient tables the MMX vertical
// scalers read; *0x10001 duplicates the 16-bit coefficient into both
// halves of a 32-bit word
2799 for(i=0; i<vLumFilterSize; i++)
2801 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2802 lumMmxFilter[4*i+2]=
2803 lumMmxFilter[4*i+3]=
2804 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2806 for(i=0; i<vChrFilterSize; i++)
2808 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2809 chrMmxFilter[4*i+2]=
2810 chrMmxFilter[4*i+3]=
2811 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
// dispatch on destination format: NV12/NV21, then planar YUV/gray,
// then packed (RGB etc.)
2814 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2815 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2816 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2817 RENAME(yuv2nv12X)(c,
2818 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2819 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2820 dest, uDest, dstW, chrDstW, dstFormat);
2822 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2824 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2825 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2826 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2828 int16_t *lumBuf = lumPixBuf[0];
2829 int16_t *chrBuf= chrPixBuf[0];
2830 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2835 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2836 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2837 dest, uDest, vDest, dstW, chrDstW);
2842 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2843 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
// packed output: pick the cheapest converter the filter sizes allow
2844 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2846 int chrAlpha= vChrFilter[2*dstY+1];
2847 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2848 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2850 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2852 int lumAlpha= vLumFilter[2*dstY+1];
2853 int chrAlpha= vChrFilter[2*dstY+1];
2854 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2855 dest, dstW, lumAlpha, chrAlpha, dstY);
2859 RENAME(yuv2packedX)(c,
2860 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2861 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
// --- non-MMX branch: same dispatch, C/AltiVec vertical scalers
2866 else // hmm looks like we can't use MMX here without overwriting this array's tail
2868 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2869 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2870 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2871 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2872 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2874 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2875 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2876 dest, uDest, dstW, chrDstW, dstFormat);
2878 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2880 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2881 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2883 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2884 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2885 dest, uDest, vDest, dstW, chrDstW);
2889 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2890 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2892 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2893 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
// flush non-temporal stores and leave MMX state (femms/emms; see the
// SFENCE/EMMS definitions at the top of the file)
2900 __asm __volatile(SFENCE:::"memory");
2901 __asm __volatile(EMMS:::"memory");
2903 /* store changed local vars back in the context */
2905 c->lumBufIndex= lumBufIndex;
2906 c->chrBufIndex= chrBufIndex;
2907 c->lastInLumBuf= lastInLumBuf;
2908 c->lastInChrBuf= lastInChrBuf;
// number of destination lines produced by this call
2910 return dstY - lastDstY;