Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

the C code (not assembly, mmx, ...) of the swscaler which has been written
by Michael Niedermayer can be used under the LGPL license too
/* CPU-dispatch string macros spliced into the inline asm below: PREFETCH /
   PREFETCHW (cache hints), SFENCE (store fence), PAVGB (byte average) and
   MOVNTQ (non-temporal 64-bit store, falling back to plain movq without MMX2).
   NOTE(review): the #ifdef HAVE_3DNOW / HAVE_MMX2 / #else / #endif lines that
   select between these alternatives are missing from this chunk, and every
   line carries a stray leading number from a numbered listing — this region
   does not preprocess as-is; restore the conditionals from the original. */
31 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
38 #define PREFETCH "prefetch"
39 #define PREFETCHW "prefetchw"
40 #elif defined ( HAVE_MMX2 )
41 #define PREFETCH "prefetchnta"
42 #define PREFETCHW "prefetcht0"
44 #define PREFETCH "/nop"
45 #define PREFETCHW "/nop"
49 #define SFENCE "sfence"
55 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
56 #elif defined (HAVE_3DNOW)
57 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
61 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
63 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
65 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
68 #include "swscale_altivec_template.c"
/* Vertical scaling loop (fast path): walks a NULL-terminated list of
   (srcPtr, coeff) filter entries at "offset"(%0), multiply-accumulates int16
   source rows with pmulhw on top of the rounder, then >>3, packs to unsigned
   bytes and stores 8 pixels per iteration via MOVNTQ.
   NOTE(review): the asm volatile( opener, loop labels/branches and closing )
   are missing from this chunk, and lines carry stray listing numbers. */
71 #define YSCALEYUV2YV12X(x, offset, dest, width) \
73 "xor %%"REG_a", %%"REG_a" \n\t"\
74 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
75 "movq %%mm3, %%mm4 \n\t"\
76 "lea " offset "(%0), %%"REG_d" \n\t"\
77 "mov (%%"REG_d"), %%"REG_S" \n\t"\
78 ASMALIGN(4) /* FIXME Unroll? */\
80 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
81 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
82 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
83 "add $16, %%"REG_d" \n\t"\
84 "mov (%%"REG_d"), %%"REG_S" \n\t"\
85 "test %%"REG_S", %%"REG_S" \n\t"\
86 "pmulhw %%mm0, %%mm2 \n\t"\
87 "pmulhw %%mm0, %%mm5 \n\t"\
88 "paddw %%mm2, %%mm3 \n\t"\
89 "paddw %%mm5, %%mm4 \n\t"\
91 "psraw $3, %%mm3 \n\t"\
92 "psraw $3, %%mm4 \n\t"\
93 "packuswb %%mm4, %%mm3 \n\t"\
94 MOVNTQ(%%mm3, (%1, %%REGa))\
95 "add $8, %%"REG_a" \n\t"\
96 "cmp %2, %%"REG_a" \n\t"\
97 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
98 "movq %%mm3, %%mm4 \n\t"\
99 "lea " offset "(%0), %%"REG_d" \n\t"\
100 "mov (%%"REG_d"), %%"REG_S" \n\t"\
102 :: "r" (&c->redDither),\
103 "r" (dest), "p" (width)\
104 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Vertical scaling loop (accurate path): same filter walk as YSCALEYUV2YV12X
   but processes two source rows per step, widening with punpcklwd/punpckhwd
   and accumulating in 32-bit via pmaddwd before >>16, packssdw, rounder add,
   >>3 and byte pack — trades speed for precision.
   NOTE(review): asm volatile( wrapper and loop labels missing in this chunk. */
107 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
109 "lea " offset "(%0), %%"REG_d" \n\t"\
110 "xor %%"REG_a", %%"REG_a" \n\t"\
111 "pxor %%mm4, %%mm4 \n\t"\
112 "pxor %%mm5, %%mm5 \n\t"\
113 "pxor %%mm6, %%mm6 \n\t"\
114 "pxor %%mm7, %%mm7 \n\t"\
115 "mov (%%"REG_d"), %%"REG_S" \n\t"\
118 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\
119 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
120 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
121 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\
122 "movq %%mm0, %%mm3 \n\t"\
123 "punpcklwd %%mm1, %%mm0 \n\t"\
124 "punpckhwd %%mm1, %%mm3 \n\t"\
125 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
126 "pmaddwd %%mm1, %%mm0 \n\t"\
127 "pmaddwd %%mm1, %%mm3 \n\t"\
128 "paddd %%mm0, %%mm4 \n\t"\
129 "paddd %%mm3, %%mm5 \n\t"\
130 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\
131 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
132 "add $16, %%"REG_d" \n\t"\
133 "test %%"REG_S", %%"REG_S" \n\t"\
134 "movq %%mm2, %%mm0 \n\t"\
135 "punpcklwd %%mm3, %%mm2 \n\t"\
136 "punpckhwd %%mm3, %%mm0 \n\t"\
137 "pmaddwd %%mm1, %%mm2 \n\t"\
138 "pmaddwd %%mm1, %%mm0 \n\t"\
139 "paddd %%mm2, %%mm6 \n\t"\
140 "paddd %%mm0, %%mm7 \n\t"\
142 "psrad $16, %%mm4 \n\t"\
143 "psrad $16, %%mm5 \n\t"\
144 "psrad $16, %%mm6 \n\t"\
145 "psrad $16, %%mm7 \n\t"\
146 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
147 "packssdw %%mm5, %%mm4 \n\t"\
148 "packssdw %%mm7, %%mm6 \n\t"\
149 "paddw %%mm0, %%mm4 \n\t"\
150 "paddw %%mm0, %%mm6 \n\t"\
151 "psraw $3, %%mm4 \n\t"\
152 "psraw $3, %%mm6 \n\t"\
153 "packuswb %%mm6, %%mm4 \n\t"\
154 MOVNTQ(%%mm4, (%1, %%REGa))\
155 "add $8, %%"REG_a" \n\t"\
156 "cmp %2, %%"REG_a" \n\t"\
157 "lea " offset "(%0), %%"REG_d" \n\t"\
158 "pxor %%mm4, %%mm4 \n\t"\
159 "pxor %%mm5, %%mm5 \n\t"\
160 "pxor %%mm6, %%mm6 \n\t"\
161 "pxor %%mm7, %%mm7 \n\t"\
162 "mov (%%"REG_d"), %%"REG_S" \n\t"\
164 :: "r" (&c->redDither),\
165 "r" (dest), "p" (width)\
166 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Unfiltered 1:1 vertical pass: reads int16 samples, >>7 back to 8-bit range,
   packs and stores 8 bytes per iteration.
   NOTE(review): the operand/clobber lines after the macro (lumFilterSize,
   chrMmxFilter, ...) belong to a different asm statement whose body is
   missing from this chunk — they are orphaned here. */
169 #define YSCALEYUV2YV121 \
170 "mov %2, %%"REG_a" \n\t"\
171 ASMALIGN(4) /* FIXME Unroll? */\
173 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
174 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
175 "psraw $7, %%mm0 \n\t"\
176 "psraw $7, %%mm1 \n\t"\
177 "packuswb %%mm1, %%mm0 \n\t"\
178 MOVNTQ(%%mm0, (%1, %%REGa))\
179 "add $8, %%"REG_a" \n\t"\
183 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
184 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
185 "r" (dest), "m" (dstW),
186 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
187 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* Packed-output vertical scaler prologue (fast path): accumulates the chroma
   filter taps (U at 0, V at byte offset 4096 from the same pointer) into
   mm3/mm4, then the luma taps (two adjacent Y qwords) into mm1/mm7, using
   pmulhw against each NULL-terminated filter list.  Paired with
   YSCALEYUV2PACKEDX_END, which closes the asm with its operand lists.
   NOTE(review): asm volatile( opener and loop labels missing in this chunk. */
189 #define YSCALEYUV2PACKEDX \
191 "xor %%"REG_a", %%"REG_a" \n\t"\
195 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
196 "mov (%%"REG_d"), %%"REG_S" \n\t"\
197 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
198 "movq %%mm3, %%mm4 \n\t"\
201 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
202 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
203 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
204 "add $16, %%"REG_d" \n\t"\
205 "mov (%%"REG_d"), %%"REG_S" \n\t"\
206 "pmulhw %%mm0, %%mm2 \n\t"\
207 "pmulhw %%mm0, %%mm5 \n\t"\
208 "paddw %%mm2, %%mm3 \n\t"\
209 "paddw %%mm5, %%mm4 \n\t"\
210 "test %%"REG_S", %%"REG_S" \n\t"\
213 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
214 "mov (%%"REG_d"), %%"REG_S" \n\t"\
215 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
216 "movq %%mm1, %%mm7 \n\t"\
219 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
220 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
221 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
222 "add $16, %%"REG_d" \n\t"\
223 "mov (%%"REG_d"), %%"REG_S" \n\t"\
224 "pmulhw %%mm0, %%mm2 \n\t"\
225 "pmulhw %%mm0, %%mm5 \n\t"\
226 "paddw %%mm2, %%mm1 \n\t"\
227 "paddw %%mm5, %%mm7 \n\t"\
228 "test %%"REG_S", %%"REG_S" \n\t"\
231 #define YSCALEYUV2PACKEDX_END\
232 :: "r" (&c->redDither), \
233 "m" (dummy), "m" (dummy), "m" (dummy),\
234 "r" (dest), "m" (dstW)\
235 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Packed-output vertical scaler prologue (accurate path): like
   YSCALEYUV2PACKEDX but accumulates in 32 bits via pmaddwd over pairs of
   filter taps; the finished chroma sums are parked in the context at
   U_TEMP/V_TEMP while the luma taps are accumulated, then reloaded into
   mm3/mm4 for the RGB stage.
   NOTE(review): asm volatile( opener and loop labels missing in this chunk. */
238 #define YSCALEYUV2PACKEDX_ACCURATE \
240 "xor %%"REG_a", %%"REG_a" \n\t"\
244 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
245 "mov (%%"REG_d"), %%"REG_S" \n\t"\
246 "pxor %%mm4, %%mm4 \n\t"\
247 "pxor %%mm5, %%mm5 \n\t"\
248 "pxor %%mm6, %%mm6 \n\t"\
249 "pxor %%mm7, %%mm7 \n\t"\
252 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
253 "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
254 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
255 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
256 "movq %%mm0, %%mm3 \n\t"\
257 "punpcklwd %%mm1, %%mm0 \n\t"\
258 "punpckhwd %%mm1, %%mm3 \n\t"\
259 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
260 "pmaddwd %%mm1, %%mm0 \n\t"\
261 "pmaddwd %%mm1, %%mm3 \n\t"\
262 "paddd %%mm0, %%mm4 \n\t"\
263 "paddd %%mm3, %%mm5 \n\t"\
264 "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
265 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
266 "add $16, %%"REG_d" \n\t"\
267 "test %%"REG_S", %%"REG_S" \n\t"\
268 "movq %%mm2, %%mm0 \n\t"\
269 "punpcklwd %%mm3, %%mm2 \n\t"\
270 "punpckhwd %%mm3, %%mm0 \n\t"\
271 "pmaddwd %%mm1, %%mm2 \n\t"\
272 "pmaddwd %%mm1, %%mm0 \n\t"\
273 "paddd %%mm2, %%mm6 \n\t"\
274 "paddd %%mm0, %%mm7 \n\t"\
276 "psrad $16, %%mm4 \n\t"\
277 "psrad $16, %%mm5 \n\t"\
278 "psrad $16, %%mm6 \n\t"\
279 "psrad $16, %%mm7 \n\t"\
280 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
281 "packssdw %%mm5, %%mm4 \n\t"\
282 "packssdw %%mm7, %%mm6 \n\t"\
283 "paddw %%mm0, %%mm4 \n\t"\
284 "paddw %%mm0, %%mm6 \n\t"\
285 "movq %%mm4, "U_TEMP"(%0) \n\t"\
286 "movq %%mm6, "V_TEMP"(%0) \n\t"\
288 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
289 "mov (%%"REG_d"), %%"REG_S" \n\t"\
290 "pxor %%mm1, %%mm1 \n\t"\
291 "pxor %%mm5, %%mm5 \n\t"\
292 "pxor %%mm7, %%mm7 \n\t"\
293 "pxor %%mm6, %%mm6 \n\t"\
296 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
297 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
298 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
299 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
300 "movq %%mm0, %%mm3 \n\t"\
301 "punpcklwd %%mm4, %%mm0 \n\t"\
302 "punpckhwd %%mm4, %%mm3 \n\t"\
303 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
304 "pmaddwd %%mm4, %%mm0 \n\t"\
305 "pmaddwd %%mm4, %%mm3 \n\t"\
306 "paddd %%mm0, %%mm1 \n\t"\
307 "paddd %%mm3, %%mm5 \n\t"\
308 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
309 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
310 "add $16, %%"REG_d" \n\t"\
311 "test %%"REG_S", %%"REG_S" \n\t"\
312 "movq %%mm2, %%mm0 \n\t"\
313 "punpcklwd %%mm3, %%mm2 \n\t"\
314 "punpckhwd %%mm3, %%mm0 \n\t"\
315 "pmaddwd %%mm4, %%mm2 \n\t"\
316 "pmaddwd %%mm4, %%mm0 \n\t"\
317 "paddd %%mm2, %%mm7 \n\t"\
318 "paddd %%mm0, %%mm6 \n\t"\
320 "psrad $16, %%mm1 \n\t"\
321 "psrad $16, %%mm5 \n\t"\
322 "psrad $16, %%mm7 \n\t"\
323 "psrad $16, %%mm6 \n\t"\
324 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
325 "packssdw %%mm5, %%mm1 \n\t"\
326 "packssdw %%mm6, %%mm7 \n\t"\
327 "paddw %%mm0, %%mm1 \n\t"\
328 "paddw %%mm0, %%mm7 \n\t"\
329 "movq "U_TEMP"(%0), %%mm3 \n\t"\
330 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* YUV->RGB matrix stage shared by the PACKEDX paths.  Expects Y1/Y2 in
   mm1/mm7 and U/V in mm3/mm4; applies the per-context offsets and pmulhw
   coefficient tables (U_OFFSET.., UG_COEFF.. relative to %0), interleaves the
   low/high halves and leaves packed bytes as
   mm2=B, mm4=G, mm5=R (B1|B2 etc. merged by packuswb), mm7=0 — the register
   layout the WRITEBGR* writers below consume. */
332 #define YSCALEYUV2RGBX \
333 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
334 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
335 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
336 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
337 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
338 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
339 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
340 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
341 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
342 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
343 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
344 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
345 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
346 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
347 "paddw %%mm3, %%mm4 \n\t"\
348 "movq %%mm2, %%mm0 \n\t"\
349 "movq %%mm5, %%mm6 \n\t"\
350 "movq %%mm4, %%mm3 \n\t"\
351 "punpcklwd %%mm2, %%mm2 \n\t"\
352 "punpcklwd %%mm5, %%mm5 \n\t"\
353 "punpcklwd %%mm4, %%mm4 \n\t"\
354 "paddw %%mm1, %%mm2 \n\t"\
355 "paddw %%mm1, %%mm5 \n\t"\
356 "paddw %%mm1, %%mm4 \n\t"\
357 "punpckhwd %%mm0, %%mm0 \n\t"\
358 "punpckhwd %%mm6, %%mm6 \n\t"\
359 "punpckhwd %%mm3, %%mm3 \n\t"\
360 "paddw %%mm7, %%mm0 \n\t"\
361 "paddw %%mm7, %%mm6 \n\t"\
362 "paddw %%mm7, %%mm3 \n\t"\
363 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
364 "packuswb %%mm0, %%mm2 \n\t"\
365 "packuswb %%mm6, %%mm5 \n\t"\
366 "packuswb %%mm3, %%mm4 \n\t"\
367 "pxor %%mm7, %%mm7 \n\t"
/* Two-row blend + YUV->RGB in one macro: interpolates buf0/buf1 (luma) and
   uvbuf0/uvbuf1 (chroma, V at +4096 words) with yalpha1 (%6) / uvalpha1 (%7)
   weights, then converts through the MANGLE()d global coefficient tables
   (yCoeff, ubCoeff, ...) to packed B/G/R bytes in mm3/mm1/mm0.
   NOTE(review): several interior lines (labels, w400 subtraction for U vs V
   path, etc.) are missing from this chunk — the listing numbering jumps. */
369 #define FULL_YSCALEYUV2RGB \
370 "pxor %%mm7, %%mm7 \n\t"\
371 "movd %6, %%mm6 \n\t" /*yalpha1*/\
372 "punpcklwd %%mm6, %%mm6 \n\t"\
373 "punpcklwd %%mm6, %%mm6 \n\t"\
374 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
375 "punpcklwd %%mm5, %%mm5 \n\t"\
376 "punpcklwd %%mm5, %%mm5 \n\t"\
377 "xor %%"REG_a", %%"REG_a" \n\t"\
380 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
381 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
382 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
383 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
384 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
385 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
386 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
387 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
388 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
389 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
390 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
391 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
392 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
393 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
394 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
395 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
396 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
397 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
400 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
401 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
402 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
403 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
404 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
405 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
406 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
409 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
410 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
411 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
412 "paddw %%mm1, %%mm3 \n\t" /* B*/\
413 "paddw %%mm1, %%mm0 \n\t" /* R*/\
414 "packuswb %%mm3, %%mm3 \n\t"\
416 "packuswb %%mm0, %%mm0 \n\t"\
417 "paddw %%mm4, %%mm2 \n\t"\
418 "paddw %%mm2, %%mm1 \n\t" /* G*/\
420 "packuswb %%mm1, %%mm1 \n\t"
/* Two-row interpolation producing packed YUV (no RGB matrix): pre-shifts the
   stored filter coefficients by 3 (in-place in the context), then blends
   chroma into mm3/mm4 and luma into mm1/mm7 with >>7 normalization — feeds
   WRITEYUY2.  The in-place coefficient update means the context's filter
   values are clobbered after the first call. */
423 #define REAL_YSCALEYUV2PACKED(index, c) \
424 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
425 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
426 "psraw $3, %%mm0 \n\t"\
427 "psraw $3, %%mm1 \n\t"\
428 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
429 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
430 "xor "#index", "#index" \n\t"\
433 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
434 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
435 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
436 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
437 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
438 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
439 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
440 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
441 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
442 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
443 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
444 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
445 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
446 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
447 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
448 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
449 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
450 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
451 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
452 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
453 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
454 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
455 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
456 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
457 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* Two-row blend + YUV->RGB using per-context coefficient tables at offsets
   from "c": interpolates chroma then luma between row pairs, applies the
   U/V/Y offsets and coefficients, and ends with the same B/G/R register
   layout as YSCALEYUV2RGBX (mm2=B, mm4=G, mm5=R, mm7=0) for the writers. */
461 #define REAL_YSCALEYUV2RGB(index, c) \
462 "xor "#index", "#index" \n\t"\
465 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
466 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
467 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
468 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
469 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
470 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
471 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
472 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
473 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
474 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
475 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
476 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
477 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
478 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
479 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
480 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
481 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
482 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
483 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
484 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
485 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
486 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
487 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
488 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
489 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
490 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
491 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
492 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
493 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
494 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
495 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
496 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
497 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
498 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
499 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
500 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
501 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
502 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
503 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
504 "paddw %%mm3, %%mm4 \n\t"\
505 "movq %%mm2, %%mm0 \n\t"\
506 "movq %%mm5, %%mm6 \n\t"\
507 "movq %%mm4, %%mm3 \n\t"\
508 "punpcklwd %%mm2, %%mm2 \n\t"\
509 "punpcklwd %%mm5, %%mm5 \n\t"\
510 "punpcklwd %%mm4, %%mm4 \n\t"\
511 "paddw %%mm1, %%mm2 \n\t"\
512 "paddw %%mm1, %%mm5 \n\t"\
513 "paddw %%mm1, %%mm4 \n\t"\
514 "punpckhwd %%mm0, %%mm0 \n\t"\
515 "punpckhwd %%mm6, %%mm6 \n\t"\
516 "punpckhwd %%mm3, %%mm3 \n\t"\
517 "paddw %%mm7, %%mm0 \n\t"\
518 "paddw %%mm7, %%mm6 \n\t"\
519 "paddw %%mm7, %%mm3 \n\t"\
520 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
521 "packuswb %%mm0, %%mm2 \n\t"\
522 "packuswb %%mm6, %%mm5 \n\t"\
523 "packuswb %%mm3, %%mm4 \n\t"\
524 "pxor %%mm7, %%mm7 \n\t"
525 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/* Single-row packed-YUV load: no interpolation, just >>7 normalization of
   one luma and one chroma row (V at +4096 words) into mm1/mm7 and mm3/mm4. */
527 #define REAL_YSCALEYUV2PACKED1(index, c) \
528 "xor "#index", "#index" \n\t"\
531 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
532 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
533 "psraw $7, %%mm3 \n\t" \
534 "psraw $7, %%mm4 \n\t" \
535 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
536 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
537 "psraw $7, %%mm1 \n\t" \
538 "psraw $7, %%mm7 \n\t" \
540 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* Single-row YUV->RGB (no vertical interpolation): >>4 normalization, then
   the same per-context offset/coefficient matrix and interleave as
   REAL_YSCALEYUV2RGB, ending with mm2=B, mm4=G, mm5=R, mm7=0. */
542 #define REAL_YSCALEYUV2RGB1(index, c) \
543 "xor "#index", "#index" \n\t"\
546 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
547 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
548 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
549 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
550 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
551 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
552 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
553 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
554 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
555 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
556 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
557 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
558 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
559 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
560 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
561 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
562 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
563 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
564 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
565 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
566 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
567 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
568 "paddw %%mm3, %%mm4 \n\t"\
569 "movq %%mm2, %%mm0 \n\t"\
570 "movq %%mm5, %%mm6 \n\t"\
571 "movq %%mm4, %%mm3 \n\t"\
572 "punpcklwd %%mm2, %%mm2 \n\t"\
573 "punpcklwd %%mm5, %%mm5 \n\t"\
574 "punpcklwd %%mm4, %%mm4 \n\t"\
575 "paddw %%mm1, %%mm2 \n\t"\
576 "paddw %%mm1, %%mm5 \n\t"\
577 "paddw %%mm1, %%mm4 \n\t"\
578 "punpckhwd %%mm0, %%mm0 \n\t"\
579 "punpckhwd %%mm6, %%mm6 \n\t"\
580 "punpckhwd %%mm3, %%mm3 \n\t"\
581 "paddw %%mm7, %%mm0 \n\t"\
582 "paddw %%mm7, %%mm6 \n\t"\
583 "paddw %%mm7, %%mm3 \n\t"\
584 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
585 "packuswb %%mm0, %%mm2 \n\t"\
586 "packuswb %%mm6, %%mm5 \n\t"\
587 "packuswb %%mm3, %%mm4 \n\t"\
588 "pxor %%mm7, %%mm7 \n\t"
589 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* Packed-YUV load with 50/50 chroma blend of two rows (paddw then >>8),
   luma taken from a single row with >>7 — the "1b" averaging variant. */
591 #define REAL_YSCALEYUV2PACKED1b(index, c) \
592 "xor "#index", "#index" \n\t"\
595 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
596 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
597 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
598 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
599 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
600 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
601 "psrlw $8, %%mm3 \n\t" \
602 "psrlw $8, %%mm4 \n\t" \
603 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
604 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
605 "psraw $7, %%mm1 \n\t" \
606 "psraw $7, %%mm7 \n\t"
607 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
609 // do vertical chrominance interpolation
/* Like REAL_YSCALEYUV2RGB1 but averages the chroma of two rows
   (paddw then >>5 — the FIXME notes the sum can overflow int16) before the
   per-context YUV->RGB matrix; same mm2=B, mm4=G, mm5=R, mm7=0 exit layout. */
610 #define REAL_YSCALEYUV2RGB1b(index, c) \
611 "xor "#index", "#index" \n\t"\
614 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
615 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
616 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
617 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
618 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
619 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
620 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
621 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
622 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
623 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
624 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
625 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
626 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
627 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
628 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
629 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
630 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
631 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
632 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
633 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
634 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
635 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
636 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
637 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
638 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
639 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
640 "paddw %%mm3, %%mm4 \n\t"\
641 "movq %%mm2, %%mm0 \n\t"\
642 "movq %%mm5, %%mm6 \n\t"\
643 "movq %%mm4, %%mm3 \n\t"\
644 "punpcklwd %%mm2, %%mm2 \n\t"\
645 "punpcklwd %%mm5, %%mm5 \n\t"\
646 "punpcklwd %%mm4, %%mm4 \n\t"\
647 "paddw %%mm1, %%mm2 \n\t"\
648 "paddw %%mm1, %%mm5 \n\t"\
649 "paddw %%mm1, %%mm4 \n\t"\
650 "punpckhwd %%mm0, %%mm0 \n\t"\
651 "punpckhwd %%mm6, %%mm6 \n\t"\
652 "punpckhwd %%mm3, %%mm3 \n\t"\
653 "paddw %%mm7, %%mm0 \n\t"\
654 "paddw %%mm7, %%mm6 \n\t"\
655 "paddw %%mm7, %%mm3 \n\t"\
656 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
657 "packuswb %%mm0, %%mm2 \n\t"\
658 "packuswb %%mm6, %%mm5 \n\t"\
659 "packuswb %%mm3, %%mm4 \n\t"\
660 "pxor %%mm7, %%mm7 \n\t"
661 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* Store 8 pixels as 32-bit 0RGB: interleaves the packed B/G/R bytes from the
   conversion stage into four 0RGB0RGB qwords and writes them with MOVNTQ,
   then advances index by 8 and compares against dstw (branch line missing). */
663 #define REAL_WRITEBGR32(dst, dstw, index) \
664 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
665 "movq %%mm2, %%mm1 \n\t" /* B */\
666 "movq %%mm5, %%mm6 \n\t" /* R */\
667 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
668 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
669 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
670 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
671 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
672 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
673 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
674 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
675 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
676 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
678 MOVNTQ(%%mm0, (dst, index, 4))\
679 MOVNTQ(%%mm2, 8(dst, index, 4))\
680 MOVNTQ(%%mm1, 16(dst, index, 4))\
681 MOVNTQ(%%mm3, 24(dst, index, 4))\
683 "add $8, "#index" \n\t"\
684 "cmp "#dstw", "#index" \n\t"\
686 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/* Store 8 pixels as RGB565: masks B/G/R to 5/6/5 significant bits
   (bF8/bFC), shifts fields into place and ORs them into two qwords. */
688 #define REAL_WRITEBGR16(dst, dstw, index) \
689 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
690 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
691 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
692 "psrlq $3, %%mm2 \n\t"\
694 "movq %%mm2, %%mm1 \n\t"\
695 "movq %%mm4, %%mm3 \n\t"\
697 "punpcklbw %%mm7, %%mm3 \n\t"\
698 "punpcklbw %%mm5, %%mm2 \n\t"\
699 "punpckhbw %%mm7, %%mm4 \n\t"\
700 "punpckhbw %%mm5, %%mm1 \n\t"\
702 "psllq $3, %%mm3 \n\t"\
703 "psllq $3, %%mm4 \n\t"\
705 "por %%mm3, %%mm2 \n\t"\
706 "por %%mm4, %%mm1 \n\t"\
708 MOVNTQ(%%mm2, (dst, index, 2))\
709 MOVNTQ(%%mm1, 8(dst, index, 2))\
711 "add $8, "#index" \n\t"\
712 "cmp "#dstw", "#index" \n\t"\
714 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
/* Store 8 pixels as RGB555: same scheme as WRITEBGR16 but all channels
   masked to 5 bits (bF8) with R pre-shifted right by 1, G shifted by 2. */
716 #define REAL_WRITEBGR15(dst, dstw, index) \
717 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
718 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
719 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
720 "psrlq $3, %%mm2 \n\t"\
721 "psrlq $1, %%mm5 \n\t"\
723 "movq %%mm2, %%mm1 \n\t"\
724 "movq %%mm4, %%mm3 \n\t"\
726 "punpcklbw %%mm7, %%mm3 \n\t"\
727 "punpcklbw %%mm5, %%mm2 \n\t"\
728 "punpckhbw %%mm7, %%mm4 \n\t"\
729 "punpckhbw %%mm5, %%mm1 \n\t"\
731 "psllq $2, %%mm3 \n\t"\
732 "psllq $2, %%mm4 \n\t"\
734 "por %%mm3, %%mm2 \n\t"\
735 "por %%mm4, %%mm1 \n\t"\
737 MOVNTQ(%%mm2, (dst, index, 2))\
738 MOVNTQ(%%mm1, 8(dst, index, 2))\
740 "add $8, "#index" \n\t"\
741 "cmp "#dstw", "#index" \n\t"\
743 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
/* Legacy 24-bit packer: builds 0RGB qwords, then squeezes out the zero bytes
   with shift/mask (bm000...) combinations to emit 24 bytes per 8 pixels.
   Superseded by WRITEBGR24MMX/WRITEBGR24MMX2 below; kept for reference. */
745 #define WRITEBGR24OLD(dst, dstw, index) \
746 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
747 "movq %%mm2, %%mm1 \n\t" /* B */\
748 "movq %%mm5, %%mm6 \n\t" /* R */\
749 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
750 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
751 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
752 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
753 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
754 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
755 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
756 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
757 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
758 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
760 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
761 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
762 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
763 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
764 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
765 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
766 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
767 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
769 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
770 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
771 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
772 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
773 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
774 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
775 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
776 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
777 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
778 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
779 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
780 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
781 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
783 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
784 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
785 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
786 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
787 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
788 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
789 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
790 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
792 MOVNTQ(%%mm0, (dst))\
793 MOVNTQ(%%mm2, 8(dst))\
794 MOVNTQ(%%mm3, 16(dst))\
795 "add $24, "#dst" \n\t"\
797 "add $8, "#index" \n\t"\
798 "cmp "#dstw", "#index" \n\t"\
/* Plain-MMX 24-bit packer: forms 0RGBRGB0 qwords via psllq/punpckhdq, then
   shifts and ORs neighbouring qwords together to write three contiguous
   8-byte groups (24 bytes = 8 pixels) and advance dst by 24. */
801 #define WRITEBGR24MMX(dst, dstw, index) \
802 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
803 "movq %%mm2, %%mm1 \n\t" /* B */\
804 "movq %%mm5, %%mm6 \n\t" /* R */\
805 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
806 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
807 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
808 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
809 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
810 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
811 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
812 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
813 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
814 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
816 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
817 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
818 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
819 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
821 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
822 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
823 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
824 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
826 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
827 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
828 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
829 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
831 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
832 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
833 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
834 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
835 MOVNTQ(%%mm0, (dst))\
837 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
838 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
839 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
840 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
841 MOVNTQ(%%mm6, 8(dst))\
843 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
844 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
845 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
846 MOVNTQ(%%mm5, 16(dst))\
848 "add $24, "#dst" \n\t"\
850 "add $8, "#index" \n\t"\
851 "cmp "#dstw", "#index" \n\t"\
/* Pack B/G/R words (mm2/mm4/mm5) into 24bpp BGR using MMX2 pshufw shuffles
 * and the M24A/M24B/M24C byte-select masks, writing 24 output bytes per
 * iteration of 8 pixels.  NOTE(review): the trailing backslash on the final
 * "cmp" line shows the macro continues onto a line elided from this listing
 * (presumably the loop branch, e.g. "jb 1b") — confirm against the full file. */
854 #define WRITEBGR24MMX2(dst, dstw, index) \
855 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
856 "movq "MANGLE(M24A)", %%mm0 \n\t"\
857 "movq "MANGLE(M24C)", %%mm7 \n\t"\
858 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
859 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
860 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
862 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
863 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
864 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
866 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
867 "por %%mm1, %%mm6 \n\t"\
868 "por %%mm3, %%mm6 \n\t"\
869 MOVNTQ(%%mm6, (dst))\
871 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
872 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
873 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
874 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
876 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
877 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
878 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
880 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
881 "por %%mm3, %%mm6 \n\t"\
882 MOVNTQ(%%mm6, 8(dst))\
884 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
885 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
886 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
888 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
889 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
890 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
892 "por %%mm1, %%mm3 \n\t"\
893 "por %%mm3, %%mm6 \n\t"\
894 MOVNTQ(%%mm6, 16(dst))\
896 "add $24, "#dst" \n\t"\
898 "add $8, "#index" \n\t"\
899 "cmp "#dstw", "#index" \n\t"\
/* Select the BGR24 writer at compile time: the MMX2 (pshufw) variant when
 * available, otherwise the plain-MMX shift/or variant.  NOTE(review): the
 * #ifdef HAVE_MMX2 / #else / #endif lines that guard these two definitions
 * are elided from this listing — as shown, both defines would conflict. */
904 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
907 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/* Pack luma (mm1/mm7) and chroma (mm3=U, mm4=V) words into interleaved
 * YUYV422 bytes and store 16 output bytes per iteration via MOVNTQ.
 * WRITEYUY2 is the usual one-level indirection so macro arguments get
 * expanded before stringization.  NOTE(review): the loop-branch line that
 * should follow the "cmp" is elided from this listing. */
910 #define REAL_WRITEYUY2(dst, dstw, index) \
911 "packuswb %%mm3, %%mm3 \n\t"\
912 "packuswb %%mm4, %%mm4 \n\t"\
913 "packuswb %%mm7, %%mm1 \n\t"\
914 "punpcklbw %%mm4, %%mm3 \n\t"\
915 "movq %%mm1, %%mm7 \n\t"\
916 "punpcklbw %%mm3, %%mm1 \n\t"\
917 "punpckhbw %%mm3, %%mm7 \n\t"\
919 MOVNTQ(%%mm1, (dst, index, 2))\
920 MOVNTQ(%%mm7, 8(dst, index, 2))\
922 "add $8, "#index" \n\t"\
923 "cmp "#dstw", "#index" \n\t"\
925 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* Vertically scale/filter planar YUV: apply lumFilter over lumSrc lines into
 * dest (width dstW) and chrFilter over chrSrc into uDest/vDest (width
 * chrDstW).  MMX path uses the YSCALEYUV2YV12X(_ACCURATE) asm macros; the
 * chroma V plane lives at a +4096 (2048 int16) offset inside chrSrc.
 * NOTE(review): the function's braces and the #ifdef HAVE_MMX / HAVE_ALTIVEC
 * preprocessor lines selecting between the three paths are elided from this
 * listing — only one of the asm / AltiVec / C fallbacks runs per build. */
928 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
929 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
930 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
/* Bit-exact (slower) rounding path, requested via SWS_ACCURATE_RND. */
933 if(c->flags & SWS_ACCURATE_RND){
935 YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
936 YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
939 YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
/* Fast approximate rounding path. */
942 YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
943 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
946 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
/* AltiVec implementation (PowerPC). */
950 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
951 chrFilter, chrSrc, chrFilterSize,
952 dest, uDest, vDest, dstW, chrDstW);
/* Portable C fallback. */
954 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
955 chrFilter, chrSrc, chrFilterSize,
956 dest, uDest, vDest, dstW, chrDstW);
957 #endif //!HAVE_ALTIVEC
/* Vertical scale to NV12/NV21 (interleaved chroma): no SIMD version exists,
 * so this simply forwards to the C implementation.  dstFormat selects the
 * UV vs VU byte order inside yuv2nv12XinC. */
961 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
962 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
963 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
965 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
966 chrFilter, chrSrc, chrFilterSize,
967 dest, uDest, dstW, chrDstW, dstFormat);
/* 1:1 vertical copy (no filtering): convert the 16-bit intermediate lines to
 * 8-bit planes by >>7 with clipping.  The asm loops (bodies elided in this
 * listing) index backwards from src+width / dst+width; the C fallback below
 * does the same with explicit clamping.  V samples sit at chrSrc + 2048. */
970 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
971 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
978 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
985 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
993 :: "r" (lumSrc + dstW), "r" (dest + dstW),
/* C fallback: luma, then chroma, with saturation to [0,255]. */
999 for(i=0; i<dstW; i++)
1001 int val= lumSrc[i]>>7;
1012 for(i=0; i<chrDstW; i++)
1015 int v=chrSrc[i + 2048]>>7;
1019 else if (u>255) u=255;
1021 else if (v>255) v=255;
/* (continuation of block comment) */
1032 * vertical scale YV12 to RGB
/* Vertical multi-tap filter + YUV->packed-pixel conversion in one pass.
 * Dispatches on c->dstFormat; the SWS_ACCURATE_RND branch repeats every case
 * with the bit-exact YSCALEYUV2PACKEDX_ACCURATE macro.  BGR24 needs a
 * separate lea/add to compute dest+3*i since there is no *3 addressing mode;
 * 15/16 bpp cases add the bayer dither constants before packing.
 * NOTE(review): many case labels, #ifdef guards and the closing braces are
 * elided from this listing. */
1034 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1035 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1036 uint8_t *dest, long dstW, long dstY)
1040 if(c->flags & SWS_ACCURATE_RND){
1041 switch(c->dstFormat){
1043 YSCALEYUV2PACKEDX_ACCURATE
1045 WRITEBGR32(%4, %5, %%REGa)
1047 YSCALEYUV2PACKEDX_END
1050 YSCALEYUV2PACKEDX_ACCURATE
1052 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1053 "add %4, %%"REG_c" \n\t"
1054 WRITEBGR24(%%REGc, %5, %%REGa)
1057 :: "r" (&c->redDither),
1058 "m" (dummy), "m" (dummy), "m" (dummy),
1059 "r" (dest), "m" (dstW)
1060 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1063 case PIX_FMT_BGR555:
1064 YSCALEYUV2PACKEDX_ACCURATE
1066 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1068 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1069 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1070 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1073 WRITEBGR15(%4, %5, %%REGa)
1074 YSCALEYUV2PACKEDX_END
1076 case PIX_FMT_BGR565:
1077 YSCALEYUV2PACKEDX_ACCURATE
1079 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1081 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1082 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1083 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1086 WRITEBGR16(%4, %5, %%REGa)
1087 YSCALEYUV2PACKEDX_END
1089 case PIX_FMT_YUYV422:
1090 YSCALEYUV2PACKEDX_ACCURATE
1091 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1093 "psraw $3, %%mm3 \n\t"
1094 "psraw $3, %%mm4 \n\t"
1095 "psraw $3, %%mm1 \n\t"
1096 "psraw $3, %%mm7 \n\t"
1097 WRITEYUY2(%4, %5, %%REGa)
1098 YSCALEYUV2PACKEDX_END
/* Fast (default rounding) variants of the same cases. */
1102 switch(c->dstFormat)
1107 WRITEBGR32(%4, %5, %%REGa)
1108 YSCALEYUV2PACKEDX_END
1113 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1114 "add %4, %%"REG_c" \n\t"
1115 WRITEBGR24(%%REGc, %5, %%REGa)
1117 :: "r" (&c->redDither),
1118 "m" (dummy), "m" (dummy), "m" (dummy),
1119 "r" (dest), "m" (dstW)
1120 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1123 case PIX_FMT_BGR555:
1126 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1128 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1129 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1130 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1133 WRITEBGR15(%4, %5, %%REGa)
1134 YSCALEYUV2PACKEDX_END
1136 case PIX_FMT_BGR565:
1139 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1141 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1142 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1143 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1146 WRITEBGR16(%4, %5, %%REGa)
1147 YSCALEYUV2PACKEDX_END
1149 case PIX_FMT_YUYV422:
1151 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1153 "psraw $3, %%mm3 \n\t"
1154 "psraw $3, %%mm4 \n\t"
1155 "psraw $3, %%mm1 \n\t"
1156 "psraw $3, %%mm7 \n\t"
1157 WRITEYUY2(%4, %5, %%REGa)
1158 YSCALEYUV2PACKEDX_END
1164 /* The following list of supported dstFormat values should
1165 match what's found in the body of altivec_yuv2packedX() */
1166 if(c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1167 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1168 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1169 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1170 chrFilter, chrSrc, chrFilterSize,
/* C fallback for formats the SIMD paths do not handle. */
1174 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1175 chrFilter, chrSrc, chrFilterSize,
/* (continuation of block comment) */
1180 * vertical bilinear scale YV12 to RGB
/* Vertical 2-tap (bilinear) blend of buf0/buf1 (luma) and uvbuf0/uvbuf1
 * (chroma, V at +2048) weighted by yalpha/uvalpha, converted to the packed
 * dstFormat.  yalpha1/uvalpha1 are the complementary weights (x ^ 4095
 * == 4095 - x for 12-bit weights).  The SWS_FULL_CHR_H_INT branch handles
 * full-chroma RGB32/BGR24 with hand-written asm; the later switch uses the
 * YSCALEYUV2RGB/PACKED + WRITE* macro pairs with the EBP-save dance needed
 * because the macros use REG_BP as the loop index.  NOTE(review): several
 * #ifdef HAVE_MMX/MMX2 guards, case labels and braces are elided here. */
1182 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1183 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1185 int yalpha1=yalpha^4095;
1186 int uvalpha1=uvalpha^4095;
1190 if(flags&SWS_FULL_CHR_H_INT)
/* Full-chroma RGB32 path: interleave B,G,R,0 bytes and stream out. */
1200 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1201 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1203 "movq %%mm3, %%mm1 \n\t"
1204 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1205 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1207 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1208 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1210 "add $4, %%"REG_a" \n\t"
1211 "cmp %5, %%"REG_a" \n\t"
1215 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1216 "m" (yalpha1), "m" (uvalpha1)
/* Full-chroma BGR24 path: squeeze the BGR0 dwords into 3-byte groups. */
1226 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1227 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1229 "movq %%mm3, %%mm1 \n\t"
1230 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1231 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1233 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1234 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1235 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1236 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1237 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1238 "movq %%mm1, %%mm2 \n\t"
1239 "psllq $48, %%mm1 \n\t" // 000000BG
1240 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1242 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1243 "psrld $16, %%mm2 \n\t" // R000R000
1244 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1245 "por %%mm2, %%mm1 \n\t" // RBGRR000
1247 "mov %4, %%"REG_b" \n\t"
1248 "add %%"REG_a", %%"REG_b" \n\t"
/* MMX2 streaming store vs. plain movd stores (guard elided). */
1252 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1253 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1255 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1256 "psrlq $32, %%mm3 \n\t"
1257 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1258 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1260 "add $4, %%"REG_a" \n\t"
1261 "cmp %5, %%"REG_a" \n\t"
1264 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1265 "m" (yalpha1), "m" (uvalpha1)
1266 : "%"REG_a, "%"REG_b
1269 case PIX_FMT_BGR555:
/* 15 bpp: dither, shift each channel into its 5-bit field, OR together. */
1274 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1275 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1276 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1278 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1279 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1280 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1282 "psrlw $3, %%mm3 \n\t"
1283 "psllw $2, %%mm1 \n\t"
1284 "psllw $7, %%mm0 \n\t"
1285 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1286 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1288 "por %%mm3, %%mm1 \n\t"
1289 "por %%mm1, %%mm0 \n\t"
1291 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1293 "add $4, %%"REG_a" \n\t"
1294 "cmp %5, %%"REG_a" \n\t"
1297 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1298 "m" (yalpha1), "m" (uvalpha1)
1302 case PIX_FMT_BGR565:
/* 16 bpp: same as 15 bpp but the G field is 6 bits wide. */
1307 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1308 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1309 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1311 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1312 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1313 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1315 "psrlw $3, %%mm3 \n\t"
1316 "psllw $3, %%mm1 \n\t"
1317 "psllw $8, %%mm0 \n\t"
1318 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1319 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1321 "por %%mm3, %%mm1 \n\t"
1322 "por %%mm1, %%mm0 \n\t"
1324 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1326 "add $4, %%"REG_a" \n\t"
1327 "cmp %5, %%"REG_a" \n\t"
1330 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1331 "m" (yalpha1), "m" (uvalpha1)
/* Non-MMX full-chroma C fallbacks using the yuvtab_* lookup tables. */
1340 if(dstFormat==PIX_FMT_RGB32)
1343 #ifdef WORDS_BIGENDIAN
1346 for(i=0;i<dstW;i++){
1347 // vertical linear interpolation && yuv2rgb in a single step:
1348 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1349 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1350 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1351 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1352 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1353 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1357 else if(dstFormat==PIX_FMT_BGR24)
1360 for(i=0;i<dstW;i++){
1361 // vertical linear interpolation && yuv2rgb in a single step:
1362 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1363 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1364 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1365 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1366 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1367 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1371 else if(dstFormat==PIX_FMT_BGR565)
1374 for(i=0;i<dstW;i++){
1375 // vertical linear interpolation && yuv2rgb in a single step:
1376 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1377 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1378 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1380 ((uint16_t*)dest)[i] =
1381 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1382 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1383 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1386 else if(dstFormat==PIX_FMT_BGR555)
1389 for(i=0;i<dstW;i++){
1390 // vertical linear interpolation && yuv2rgb in a single step:
1391 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1392 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1393 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1395 ((uint16_t*)dest)[i] =
1396 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1397 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1398 clip_table15r[(Y + yuvtab_3343[V]) >>13];
/* Normal (not full-chroma) path: macro-based per-format asm.  REG_b is
 * saved/restored around each block because "b" may be the PIC register;
 * REG_BP is pushed because the macros use it as the loop counter. */
1406 switch(c->dstFormat)
1408 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1411 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1412 "mov %4, %%"REG_b" \n\t"
1413 "push %%"REG_BP" \n\t"
1414 YSCALEYUV2RGB(%%REGBP, %5)
1415 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1416 "pop %%"REG_BP" \n\t"
1417 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1419 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1425 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1426 "mov %4, %%"REG_b" \n\t"
1427 "push %%"REG_BP" \n\t"
1428 YSCALEYUV2RGB(%%REGBP, %5)
1429 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1430 "pop %%"REG_BP" \n\t"
1431 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1432 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1436 case PIX_FMT_BGR555:
1438 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1439 "mov %4, %%"REG_b" \n\t"
1440 "push %%"REG_BP" \n\t"
1441 YSCALEYUV2RGB(%%REGBP, %5)
1442 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1444 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1445 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1446 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1449 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1450 "pop %%"REG_BP" \n\t"
1451 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1453 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1457 case PIX_FMT_BGR565:
1459 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1460 "mov %4, %%"REG_b" \n\t"
1461 "push %%"REG_BP" \n\t"
1462 YSCALEYUV2RGB(%%REGBP, %5)
1463 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1465 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1466 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1467 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1470 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1471 "pop %%"REG_BP" \n\t"
1472 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1473 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1477 case PIX_FMT_YUYV422:
1479 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1480 "mov %4, %%"REG_b" \n\t"
1481 "push %%"REG_BP" \n\t"
1482 YSCALEYUV2PACKED(%%REGBP, %5)
1483 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1484 "pop %%"REG_BP" \n\t"
1485 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1486 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* Generic C fallback covering every remaining packed format. */
1493 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
/* (continuation of block comment) */
1497 * YV12 to RGB without scaling or interpolating
/* Single-source-line YV12 -> packed conversion (no vertical blend).  When
 * uvalpha < 2048 the chroma of the nearer line is used directly (documented
 * below as a half-pixel chroma shift, accepted for speed); otherwise the
 * *1b macro variants average uvbuf0/uvbuf1.  Structure mirrors yuv2packed2's
 * macro dispatch, including the REG_b / REG_BP save-restore.
 * NOTE(review): #ifdef guards, case labels and braces are elided here. */
1499 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1500 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1502 const int yalpha1=0;
1505 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1506 const int yalpha= 4096; //FIXME ...
1508 if(flags&SWS_FULL_CHR_H_INT)
/* Full-chroma request: delegate to the bilinear routine with zero y blend. */
1510 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1515 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1521 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1522 "mov %4, %%"REG_b" \n\t"
1523 "push %%"REG_BP" \n\t"
1524 YSCALEYUV2RGB1(%%REGBP, %5)
1525 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1526 "pop %%"REG_BP" \n\t"
1527 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1529 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1535 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1536 "mov %4, %%"REG_b" \n\t"
1537 "push %%"REG_BP" \n\t"
1538 YSCALEYUV2RGB1(%%REGBP, %5)
1539 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1540 "pop %%"REG_BP" \n\t"
1541 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1543 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1547 case PIX_FMT_BGR555:
1549 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1550 "mov %4, %%"REG_b" \n\t"
1551 "push %%"REG_BP" \n\t"
1552 YSCALEYUV2RGB1(%%REGBP, %5)
1553 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1555 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1556 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1557 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1559 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1560 "pop %%"REG_BP" \n\t"
1561 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1563 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1567 case PIX_FMT_BGR565:
1569 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1570 "mov %4, %%"REG_b" \n\t"
1571 "push %%"REG_BP" \n\t"
1572 YSCALEYUV2RGB1(%%REGBP, %5)
1573 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1575 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1576 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1577 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1580 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1581 "pop %%"REG_BP" \n\t"
1582 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1584 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1588 case PIX_FMT_YUYV422:
1590 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1591 "mov %4, %%"REG_b" \n\t"
1592 "push %%"REG_BP" \n\t"
1593 YSCALEYUV2PACKED1(%%REGBP, %5)
1594 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1595 "pop %%"REG_BP" \n\t"
1596 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1598 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* uvalpha >= 2048: use the *1b variants which average the two chroma lines. */
1610 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1611 "mov %4, %%"REG_b" \n\t"
1612 "push %%"REG_BP" \n\t"
1613 YSCALEYUV2RGB1b(%%REGBP, %5)
1614 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1615 "pop %%"REG_BP" \n\t"
1616 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1618 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1624 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1625 "mov %4, %%"REG_b" \n\t"
1626 "push %%"REG_BP" \n\t"
1627 YSCALEYUV2RGB1b(%%REGBP, %5)
1628 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1629 "pop %%"REG_BP" \n\t"
1630 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1632 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1636 case PIX_FMT_BGR555:
1638 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1639 "mov %4, %%"REG_b" \n\t"
1640 "push %%"REG_BP" \n\t"
1641 YSCALEYUV2RGB1b(%%REGBP, %5)
1642 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1644 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1645 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1646 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1648 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1649 "pop %%"REG_BP" \n\t"
1650 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1652 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1656 case PIX_FMT_BGR565:
1658 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1659 "mov %4, %%"REG_b" \n\t"
1660 "push %%"REG_BP" \n\t"
1661 YSCALEYUV2RGB1b(%%REGBP, %5)
1662 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1664 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1665 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1666 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1669 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1670 "pop %%"REG_BP" \n\t"
1671 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1673 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1677 case PIX_FMT_YUYV422:
1679 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1680 "mov %4, %%"REG_b" \n\t"
1681 "push %%"REG_BP" \n\t"
1682 YSCALEYUV2PACKED1b(%%REGBP, %5)
1683 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1684 "pop %%"REG_BP" \n\t"
1685 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1687 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* C fallback, again split on the nearest-vs-averaged chroma choice. */
1694 if( uvalpha < 2048 )
1696 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1698 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
// FIXME: the yuy2* readers can read up to 7 samples too many past the end.
1702 //FIXME yuy2* can read upto 7 samples to much
/* Extract the Y (even) bytes from a YUYV422 line into dst[0..width).
 * MMX path masks with bm01010101 and packs; loop indexes backward from
 * the buffer ends via a negative counter.  C fallback body elided here. */
1704 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1708 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1709 "mov %0, %%"REG_a" \n\t"
1711 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1712 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1713 "pand %%mm2, %%mm0 \n\t"
1714 "pand %%mm2, %%mm1 \n\t"
1715 "packuswb %%mm1, %%mm0 \n\t"
1716 "movq %%mm0, (%2, %%"REG_a") \n\t"
1717 "add $8, %%"REG_a" \n\t"
1719 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1724 for(i=0; i<width; i++)
/* Extract and vertically average the U/V samples of two adjacent YUYV422
 * lines (src1, src2) into separate dstU/dstV planes of `width` samples.
 * NOTE(review): the paddings/averaging instructions between the loads and
 * the first psrlw are elided from this listing. */
1729 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1731 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1733 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1734 "mov %0, %%"REG_a" \n\t"
1736 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1737 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1738 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1739 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1742 "psrlw $8, %%mm0 \n\t"
1743 "psrlw $8, %%mm1 \n\t"
1744 "packuswb %%mm1, %%mm0 \n\t"
1745 "movq %%mm0, %%mm1 \n\t"
1746 "psrlw $8, %%mm0 \n\t"
1747 "pand %%mm4, %%mm1 \n\t"
1748 "packuswb %%mm0, %%mm0 \n\t"
1749 "packuswb %%mm1, %%mm1 \n\t"
1750 "movd %%mm0, (%4, %%"REG_a") \n\t"
1751 "movd %%mm1, (%3, %%"REG_a") \n\t"
1752 "add $4, %%"REG_a" \n\t"
1754 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
/* C fallback: U at byte 1, V at byte 3 of each 4-byte YUYV group. */
1759 for(i=0; i<width; i++)
1761 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1762 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
// This is almost identical to the previous function and exists only because
// calling yuy2ToY/UV(dst, src+1, ...) would make every access unaligned.
1767 //this is allmost identical to the previous, end exists only cuz yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses
/* Extract the Y (odd) bytes from a UYVY422 line: shift each word right by 8
 * instead of masking the low byte as yuy2ToY does. */
1768 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1772 "mov %0, %%"REG_a" \n\t"
1774 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1775 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1776 "psrlw $8, %%mm0 \n\t"
1777 "psrlw $8, %%mm1 \n\t"
1778 "packuswb %%mm1, %%mm0 \n\t"
1779 "movq %%mm0, (%2, %%"REG_a") \n\t"
1780 "add $8, %%"REG_a" \n\t"
1782 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1787 for(i=0; i<width; i++)
/* UYVY422 counterpart of yuy2ToUV: average the chroma of two lines into
 * dstU/dstV; chroma sits at even byte offsets (0 = U, 2 = V), hence the
 * pand with bm01010101 instead of the psrlw used for YUYV.
 * NOTE(review): the averaging instructions between the loads and the first
 * pand are elided from this listing. */
1792 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1794 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1796 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1797 "mov %0, %%"REG_a" \n\t"
1799 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1800 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1801 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1802 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1805 "pand %%mm4, %%mm0 \n\t"
1806 "pand %%mm4, %%mm1 \n\t"
1807 "packuswb %%mm1, %%mm0 \n\t"
1808 "movq %%mm0, %%mm1 \n\t"
1809 "psrlw $8, %%mm0 \n\t"
1810 "pand %%mm4, %%mm1 \n\t"
1811 "packuswb %%mm0, %%mm0 \n\t"
1812 "packuswb %%mm1, %%mm1 \n\t"
1813 "movd %%mm0, (%4, %%"REG_a") \n\t"
1814 "movd %%mm1, (%3, %%"REG_a") \n\t"
1815 "add $4, %%"REG_a" \n\t"
1817 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
/* C fallback: U at byte 0, V at byte 2 of each 4-byte UYVY group. */
1822 for(i=0; i<width; i++)
1824 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1825 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
/* Convert a BGR32 (B in the low byte) line to 8-bit luma using the RY/GY/BY
 * fixed-point coefficients; the (33<<(SHIFT-1)) term folds in rounding and
 * the +16 luma offset. */
1830 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1833 for(i=0; i<width; i++)
1835 int b= ((uint32_t*)src)[i]&0xFF;
1836 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1837 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1839 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/* Chroma from BGR32 with 2x2 box averaging: sum 2 horizontal pixels from
 * each of two lines using the split 0xFF00FF / 0x00FF00 accumulator trick,
 * then apply the U/V coefficient matrices (>>+2 divides by the 4 summed
 * pixels).  NOTE(review): the lines deriving g and r from l/h are elided
 * from this listing. */
1843 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1846 for(i=0; i<width; i++)
1848 const int a= ((uint32_t*)src1)[2*i+0];
1849 const int e= ((uint32_t*)src1)[2*i+1];
1850 const int c= ((uint32_t*)src2)[2*i+0];
1851 const int d= ((uint32_t*)src2)[2*i+1];
1852 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1853 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1854 const int b= l&0x3FF;
1858 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1859 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/* Convert a BGR24 line to 8-bit luma.  MMX path: load 4 pixels at a time as
 * movd at 3-byte strides, pmaddwd against bgr2YCoeff, reduce with w1111,
 * then pack 8 results and add bgr2YOffset; REG_d walks the 3*width source,
 * REG_a the destination, both via negative offsets from the buffer ends.
 * FAST_BGR2YV12 skips the extra psrad precision step.  C fallback at the
 * bottom (its b/g/r loads are elided from this listing). */
1863 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1867 "mov %2, %%"REG_a" \n\t"
1868 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1869 "movq "MANGLE(w1111)", %%mm5 \n\t"
1870 "pxor %%mm7, %%mm7 \n\t"
1871 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
1874 PREFETCH" 64(%0, %%"REG_d") \n\t"
1875 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1876 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1877 "punpcklbw %%mm7, %%mm0 \n\t"
1878 "punpcklbw %%mm7, %%mm1 \n\t"
1879 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1880 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1881 "punpcklbw %%mm7, %%mm2 \n\t"
1882 "punpcklbw %%mm7, %%mm3 \n\t"
1883 "pmaddwd %%mm6, %%mm0 \n\t"
1884 "pmaddwd %%mm6, %%mm1 \n\t"
1885 "pmaddwd %%mm6, %%mm2 \n\t"
1886 "pmaddwd %%mm6, %%mm3 \n\t"
1887 #ifndef FAST_BGR2YV12
1888 "psrad $8, %%mm0 \n\t"
1889 "psrad $8, %%mm1 \n\t"
1890 "psrad $8, %%mm2 \n\t"
1891 "psrad $8, %%mm3 \n\t"
1893 "packssdw %%mm1, %%mm0 \n\t"
1894 "packssdw %%mm3, %%mm2 \n\t"
1895 "pmaddwd %%mm5, %%mm0 \n\t"
1896 "pmaddwd %%mm5, %%mm2 \n\t"
1897 "packssdw %%mm2, %%mm0 \n\t"
1898 "psraw $7, %%mm0 \n\t"
1900 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1901 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1902 "punpcklbw %%mm7, %%mm4 \n\t"
1903 "punpcklbw %%mm7, %%mm1 \n\t"
1904 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1905 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1906 "punpcklbw %%mm7, %%mm2 \n\t"
1907 "punpcklbw %%mm7, %%mm3 \n\t"
1908 "pmaddwd %%mm6, %%mm4 \n\t"
1909 "pmaddwd %%mm6, %%mm1 \n\t"
1910 "pmaddwd %%mm6, %%mm2 \n\t"
1911 "pmaddwd %%mm6, %%mm3 \n\t"
1912 #ifndef FAST_BGR2YV12
1913 "psrad $8, %%mm4 \n\t"
1914 "psrad $8, %%mm1 \n\t"
1915 "psrad $8, %%mm2 \n\t"
1916 "psrad $8, %%mm3 \n\t"
1918 "packssdw %%mm1, %%mm4 \n\t"
1919 "packssdw %%mm3, %%mm2 \n\t"
1920 "pmaddwd %%mm5, %%mm4 \n\t"
1921 "pmaddwd %%mm5, %%mm2 \n\t"
1922 "add $24, %%"REG_d" \n\t"
1923 "packssdw %%mm2, %%mm4 \n\t"
1924 "psraw $7, %%mm4 \n\t"
1926 "packuswb %%mm4, %%mm0 \n\t"
1927 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1929 "movq %%mm0, (%1, %%"REG_a") \n\t"
1930 "add $8, %%"REG_a" \n\t"
1932 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1933 : "%"REG_a, "%"REG_d
1937 for(i=0; i<width; i++)
1943 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/* Chroma from BGR24 with 2x2 averaging across lines src1/src2.  The MMX
 * path sums pairs of pixels from both lines (with an MMX2/3DNow! movq-based
 * load variant vs. a movd variant for plain MMX), multiplies by
 * bgr2UCoeff/bgr2VCoeff, reduces with w1111, and interleaves the packed
 * U and V nibbles to dstU/dstV via punpck(h)dq + bgr2UVOffset.  Note mm5 is
 * reloaded with w1111 mid-loop because it is clobbered as a temporary.
 * C fallback at the bottom averages 2x2 blocks explicitly.
 * NOTE(review): several #ifdef/#else/#endif guard lines are elided. */
1948 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1952 "mov %4, %%"REG_a" \n\t"
1953 "movq "MANGLE(w1111)", %%mm5 \n\t"
1954 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1955 "pxor %%mm7, %%mm7 \n\t"
1956 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1957 "add %%"REG_d", %%"REG_d" \n\t"
1960 PREFETCH" 64(%0, %%"REG_d") \n\t"
1961 PREFETCH" 64(%1, %%"REG_d") \n\t"
1962 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1963 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1964 "movq (%1, %%"REG_d"), %%mm1 \n\t"
1965 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1966 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1969 "movq %%mm0, %%mm1 \n\t"
1970 "movq %%mm2, %%mm3 \n\t"
1971 "psrlq $24, %%mm0 \n\t"
1972 "psrlq $24, %%mm2 \n\t"
1975 "punpcklbw %%mm7, %%mm0 \n\t"
1976 "punpcklbw %%mm7, %%mm2 \n\t"
1978 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1979 "movd (%1, %%"REG_d"), %%mm1 \n\t"
1980 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1981 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1982 "punpcklbw %%mm7, %%mm0 \n\t"
1983 "punpcklbw %%mm7, %%mm1 \n\t"
1984 "punpcklbw %%mm7, %%mm2 \n\t"
1985 "punpcklbw %%mm7, %%mm3 \n\t"
1986 "paddw %%mm1, %%mm0 \n\t"
1987 "paddw %%mm3, %%mm2 \n\t"
1988 "paddw %%mm2, %%mm0 \n\t"
1989 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1990 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1991 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1992 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1993 "punpcklbw %%mm7, %%mm4 \n\t"
1994 "punpcklbw %%mm7, %%mm1 \n\t"
1995 "punpcklbw %%mm7, %%mm2 \n\t"
1996 "punpcklbw %%mm7, %%mm3 \n\t"
1997 "paddw %%mm1, %%mm4 \n\t"
1998 "paddw %%mm3, %%mm2 \n\t"
1999 "paddw %%mm4, %%mm2 \n\t"
2000 "psrlw $2, %%mm0 \n\t"
2001 "psrlw $2, %%mm2 \n\t"
2003 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2004 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2006 "pmaddwd %%mm0, %%mm1 \n\t"
2007 "pmaddwd %%mm2, %%mm3 \n\t"
2008 "pmaddwd %%mm6, %%mm0 \n\t"
2009 "pmaddwd %%mm6, %%mm2 \n\t"
2010 #ifndef FAST_BGR2YV12
2011 "psrad $8, %%mm0 \n\t"
2012 "psrad $8, %%mm1 \n\t"
2013 "psrad $8, %%mm2 \n\t"
2014 "psrad $8, %%mm3 \n\t"
2016 "packssdw %%mm2, %%mm0 \n\t"
2017 "packssdw %%mm3, %%mm1 \n\t"
2018 "pmaddwd %%mm5, %%mm0 \n\t"
2019 "pmaddwd %%mm5, %%mm1 \n\t"
2020 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2021 "psraw $7, %%mm0 \n\t"
2023 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2024 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2025 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2026 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2027 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2030 "movq %%mm4, %%mm1 \n\t"
2031 "movq %%mm2, %%mm3 \n\t"
2032 "psrlq $24, %%mm4 \n\t"
2033 "psrlq $24, %%mm2 \n\t"
2036 "punpcklbw %%mm7, %%mm4 \n\t"
2037 "punpcklbw %%mm7, %%mm2 \n\t"
2039 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2040 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2041 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2042 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2043 "punpcklbw %%mm7, %%mm4 \n\t"
2044 "punpcklbw %%mm7, %%mm1 \n\t"
2045 "punpcklbw %%mm7, %%mm2 \n\t"
2046 "punpcklbw %%mm7, %%mm3 \n\t"
2047 "paddw %%mm1, %%mm4 \n\t"
2048 "paddw %%mm3, %%mm2 \n\t"
2049 "paddw %%mm2, %%mm4 \n\t"
2050 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2051 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2052 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2053 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2054 "punpcklbw %%mm7, %%mm5 \n\t"
2055 "punpcklbw %%mm7, %%mm1 \n\t"
2056 "punpcklbw %%mm7, %%mm2 \n\t"
2057 "punpcklbw %%mm7, %%mm3 \n\t"
2058 "paddw %%mm1, %%mm5 \n\t"
2059 "paddw %%mm3, %%mm2 \n\t"
2060 "paddw %%mm5, %%mm2 \n\t"
2061 "movq "MANGLE(w1111)", %%mm5 \n\t"
2062 "psrlw $2, %%mm4 \n\t"
2063 "psrlw $2, %%mm2 \n\t"
2065 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2066 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2068 "pmaddwd %%mm4, %%mm1 \n\t"
2069 "pmaddwd %%mm2, %%mm3 \n\t"
2070 "pmaddwd %%mm6, %%mm4 \n\t"
2071 "pmaddwd %%mm6, %%mm2 \n\t"
2072 #ifndef FAST_BGR2YV12
2073 "psrad $8, %%mm4 \n\t"
2074 "psrad $8, %%mm1 \n\t"
2075 "psrad $8, %%mm2 \n\t"
2076 "psrad $8, %%mm3 \n\t"
2078 "packssdw %%mm2, %%mm4 \n\t"
2079 "packssdw %%mm3, %%mm1 \n\t"
2080 "pmaddwd %%mm5, %%mm4 \n\t"
2081 "pmaddwd %%mm5, %%mm1 \n\t"
2082 "add $24, %%"REG_d" \n\t"
2083 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2084 "psraw $7, %%mm4 \n\t"
2086 "movq %%mm0, %%mm1 \n\t"
2087 "punpckldq %%mm4, %%mm0 \n\t"
2088 "punpckhdq %%mm4, %%mm1 \n\t"
2089 "packsswb %%mm1, %%mm0 \n\t"
2090 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2092 "movd %%mm0, (%2, %%"REG_a") \n\t"
2093 "punpckhdq %%mm0, %%mm0 \n\t"
2094 "movd %%mm0, (%3, %%"REG_a") \n\t"
2095 "add $4, %%"REG_a" \n\t"
2097 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2098 : "%"REG_a, "%"REG_d
2102 for(i=0; i<width; i++)
2104 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2105 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2106 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2108 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2109 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/* Luma from RGB565 (B in the low 5 bits, R in the top 5): the 2* factors
 * and the -2 shift rescale the 5-bit R/B fields against 6-bit G.
 * NOTE(review): the line extracting g ((d>>5)&0x3F) is elided here. */
2114 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2117 for(i=0; i<width; i++)
2119 int d= ((uint16_t*)src)[i];
2122 int r= (d>>11)&0x1F;
2124 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/* Chroma from RGB565 with 2x2 averaging: each uint32 load covers two 16-bit
 * pixels; the masked dl/dh accumulators sum the channel fields of all four
 * pixels without overflow, then d = dl + dh2 re-merges them so b/g/r can be
 * sliced back out.  NOTE(review): the lines computing d, b and g from
 * dl/dh2 are elided from this listing. */
2128 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2131 for(i=0; i<width; i++)
2133 int d0= ((uint32_t*)src1)[i];
2134 int d1= ((uint32_t*)src2)[i];
2136 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
2137 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
2139 int dh2= (dh>>11) + (dh<<21);
2143 int r= (d>>11)&0x7F;
2145 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
2146 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
/* Luma from RGB555 (all channels 5 bits, R at bit 10); the -3 shift rescales
 * 5-bit channels.  NOTE(review): the b/g extraction lines are elided. */
2150 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2153 for(i=0; i<width; i++)
2155 int d= ((uint16_t*)src)[i];
2158 int r= (d>>10)&0x1F;
2160 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/* Chroma from RGB555 with 2x2 averaging; same masked-accumulator scheme as
 * bgr16ToUV but with 5-5-5 field masks.  NOTE(review): the lines computing
 * d, b and g from dl/dh2 are elided from this listing. */
2164 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2167 for(i=0; i<width; i++)
2169 int d0= ((uint32_t*)src1)[i];
2170 int d1= ((uint32_t*)src2)[i];
2172 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
2173 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
2175 int dh2= (dh>>11) + (dh<<21);
2179 int r= (d>>10)&0x7F;
2181 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
2182 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
/* Luma from RGB32 — identical to bgr32ToY except R sits in the low byte and
 * B in bits 16..23 (byte order swapped). */
2187 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2190 for(i=0; i<width; i++)
2192 int r= ((uint32_t*)src)[i]&0xFF;
2193 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2194 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2196 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/* Convert packed 32 bit pixels to subsampled 8 bit chroma.
 * Sums a 2x2 block (pixels 2*i and 2*i+1 from rows src1 and src2) per output
 * sample. l accumulates the two byte lanes R and B (mask 0xFF00FF), h the G
 * lane (mask 0x00FF00); four pixels per lane fit because each 10 bit sub-sum
 * stays within its lane.
 * NOTE(review): the g/b extraction lines are elided from this listing —
 * presumably b=(l>>16)&0x3FF and g=(h>>8)&0x3FF; confirm against the file. */
2200 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2203 for(i=0; i<width; i++)
2205 const int a= ((uint32_t*)src1)[2*i+0];
2206 const int e= ((uint32_t*)src1)[2*i+1];
2207 const int c= ((uint32_t*)src2)[2*i+0];
2208 const int d= ((uint32_t*)src2)[2*i+1];
/* per-lane 4-pixel sums: R and B lanes in l, G lane in h */
2209 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
2210 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
2211 const int r= l&0x3FF;
/* >>(RGB2YUV_SHIFT+2): the extra 2 compensates for the 4-pixel sum */
2215 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2216 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/* Convert one row of packed 24 bit RGB pixels to 8 bit luma.
 * Same rounding/offset constant trick as rgb32ToY.
 * NOTE(review): the r/g/b byte loads (src[3*i+0..2]) are on lines elided
 * from this listing. */
2220 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2223 for(i=0; i<width; i++)
2229 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/* Convert packed 24 bit RGB pixels to subsampled 8 bit chroma.
 * For each output sample, sums a 2x2 block: two adjacent pixels (byte strides
 * 6*i and 6*i+3) from each of the two source rows src1/src2. */
2233 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2236 for(i=0; i<width; i++)
2238 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2239 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2240 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
/* +2 in the shift compensates for the 4-pixel sum; +128 is the chroma bias */
2242 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2243 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2248 // Bilinear / Bicubic scaling
/* Horizontal scaling of one line with an arbitrary FIR filter.
 * dst        - output, 16 bit fixed-point samples
 * dstW       - number of output samples
 * src        - input line, 8 bit samples
 * filter     - filterSize coefficients per output sample (16 bit)
 * filterPos  - source start index for each output sample
 * filterSize - taps per output sample (asserted to be a positive multiple of 4)
 * Three MMX inline-asm fast paths (filterSize 4, 8, and generic) plus a plain
 * C loop; which one is compiled depends on the surrounding #ifdefs (elided
 * here). The asm paths process two output samples per iteration, driving the
 * loop with a negative counter that runs up to zero. */
2249 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2250 int16_t *filter, int16_t *filterPos, long filterSize)
2253 assert(filterSize % 4 == 0 && filterSize>0);
2254 if(filterSize==4) // always true for upscaling, sometimes for down too
2256 long counter= -2*dstW;
/* bias filterPos so it can be indexed by the negative counter */
2258 filterPos-= counter/2;
2262 "push %%"REG_b" \n\t"
2264 "pxor %%mm7, %%mm7 \n\t"
2265 "movq "MANGLE(w02)", %%mm6 \n\t"
2266 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2267 "mov %%"REG_a", %%"REG_BP" \n\t"
/* per iteration: load two filterPos entries, two 4-tap coefficient sets,
   multiply-accumulate against 4 source bytes each, then round/pack/store
   two 16 bit results */
2270 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2271 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2272 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2273 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2274 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2275 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2276 "punpcklbw %%mm7, %%mm0 \n\t"
2277 "punpcklbw %%mm7, %%mm2 \n\t"
2278 "pmaddwd %%mm1, %%mm0 \n\t"
2279 "pmaddwd %%mm2, %%mm3 \n\t"
2280 "psrad $8, %%mm0 \n\t"
2281 "psrad $8, %%mm3 \n\t"
2282 "packssdw %%mm3, %%mm0 \n\t"
2283 "pmaddwd %%mm6, %%mm0 \n\t"
2284 "packssdw %%mm0, %%mm0 \n\t"
2285 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2286 "add $4, %%"REG_BP" \n\t"
2289 "pop %%"REG_BP" \n\t"
2291 "pop %%"REG_b" \n\t"
2294 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* 8-tap variant: same structure, two pmaddwd accumulation rounds */
2300 else if(filterSize==8)
2302 long counter= -2*dstW;
2304 filterPos-= counter/2;
2308 "push %%"REG_b" \n\t"
2310 "pxor %%mm7, %%mm7 \n\t"
2311 "movq "MANGLE(w02)", %%mm6 \n\t"
2312 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2313 "mov %%"REG_a", %%"REG_BP" \n\t"
2316 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2317 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2318 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2319 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2320 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2321 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2322 "punpcklbw %%mm7, %%mm0 \n\t"
2323 "punpcklbw %%mm7, %%mm2 \n\t"
2324 "pmaddwd %%mm1, %%mm0 \n\t"
2325 "pmaddwd %%mm2, %%mm3 \n\t"
/* second half of the 8 taps */
2327 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2328 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2329 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2330 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2331 "punpcklbw %%mm7, %%mm4 \n\t"
2332 "punpcklbw %%mm7, %%mm2 \n\t"
2333 "pmaddwd %%mm1, %%mm4 \n\t"
2334 "pmaddwd %%mm2, %%mm5 \n\t"
2335 "paddd %%mm4, %%mm0 \n\t"
2336 "paddd %%mm5, %%mm3 \n\t"
2338 "psrad $8, %%mm0 \n\t"
2339 "psrad $8, %%mm3 \n\t"
2340 "packssdw %%mm3, %%mm0 \n\t"
2341 "pmaddwd %%mm6, %%mm0 \n\t"
2342 "packssdw %%mm0, %%mm0 \n\t"
2343 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2344 "add $4, %%"REG_BP" \n\t"
2347 "pop %%"REG_BP" \n\t"
2349 "pop %%"REG_b" \n\t"
2352 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* generic filterSize: inner asm loop walks the taps 4 at a time until the
   coefficient pointer reaches 'offset' (== src+filterSize sentinel) */
2360 uint8_t *offset = src+filterSize;
2361 long counter= -2*dstW;
2362 // filter-= counter*filterSize/2;
2363 filterPos-= counter/2;
2366 "pxor %%mm7, %%mm7 \n\t"
2367 "movq "MANGLE(w02)", %%mm6 \n\t"
2370 "mov %2, %%"REG_c" \n\t"
2371 "movzwl (%%"REG_c", %0), %%eax \n\t"
2372 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2373 "mov %5, %%"REG_c" \n\t"
2374 "pxor %%mm4, %%mm4 \n\t"
2375 "pxor %%mm5, %%mm5 \n\t"
2377 "movq (%1), %%mm1 \n\t"
2378 "movq (%1, %6), %%mm3 \n\t"
2379 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2380 "movd (%%"REG_c", %%"REG_d"), %%mm2\n\t"
2381 "punpcklbw %%mm7, %%mm0 \n\t"
2382 "punpcklbw %%mm7, %%mm2 \n\t"
2383 "pmaddwd %%mm1, %%mm0 \n\t"
2384 "pmaddwd %%mm2, %%mm3 \n\t"
2385 "paddd %%mm3, %%mm5 \n\t"
2386 "paddd %%mm0, %%mm4 \n\t"
2388 "add $4, %%"REG_c" \n\t"
2389 "cmp %4, %%"REG_c" \n\t"
2392 "psrad $8, %%mm4 \n\t"
2393 "psrad $8, %%mm5 \n\t"
2394 "packssdw %%mm5, %%mm4 \n\t"
2395 "pmaddwd %%mm6, %%mm4 \n\t"
2396 "packssdw %%mm4, %%mm4 \n\t"
2397 "mov %3, %%"REG_a" \n\t"
2398 "movd %%mm4, (%%"REG_a", %0) \n\t"
2402 : "+r" (counter), "+r" (filter)
2403 : "m" (filterPos), "m" (dst), "m"(offset),
2404 "m" (src), "r" (filterSize*2)
2405 : "%"REG_a, "%"REG_c, "%"REG_d
/* AltiVec path (PPC) */
2410 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
/* Plain C reference implementation: straightforward FIR per output sample,
   clipped to the 15 bit unsigned range because the cubic filter can overflow */
2413 for(i=0; i<dstW; i++)
2416 int srcPos= filterPos[i];
2418 // printf("filterPos: %d\n", filterPos[i]);
2419 for(j=0; j<filterSize; j++)
2421 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2422 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2424 // filter += hFilterSize;
2425 dst[i] = FFMIN(FFMAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2431 // *** horizontal scale Y line to temp buffer
/* Horizontally scale one luma line into the 16 bit temp buffer dst.
 * First converts packed input formats (YUY2/UYVY/RGB variants) to 8 bit grey
 * via formatConvBuffer, then scales with either:
 *  - RENAME(hScale) (exact filter) when SWS_FAST_BILINEAR is off or MMX2
 *    cannot be used,
 *  - the runtime-generated "funny" MMX2 code (funnyYCode + mmx2Filter/
 *    mmx2FilterPos) for fast bilinear,
 *  - a non-MMX x86 asm bilinear loop, or the plain C bilinear loop.
 * NOTE(review): several #ifdef/brace lines are elided from this listing; the
 * path actually compiled depends on HAVE_MMX2/ARCH defines. */
2432 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2433 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2434 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2435 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2436 int32_t *mmx2FilterPos)
/* --- input format conversion: reduce every packed format to an 8 bit
       luma-only line in formatConvBuffer --- */
2438 if(srcFormat==PIX_FMT_YUYV422)
2440 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2441 src= formatConvBuffer;
2443 else if(srcFormat==PIX_FMT_UYVY422)
2445 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2446 src= formatConvBuffer;
2448 else if(srcFormat==PIX_FMT_RGB32)
2450 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2451 src= formatConvBuffer;
2453 else if(srcFormat==PIX_FMT_BGR24)
2455 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2456 src= formatConvBuffer;
2458 else if(srcFormat==PIX_FMT_BGR565)
2460 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2461 src= formatConvBuffer;
2463 else if(srcFormat==PIX_FMT_BGR555)
2465 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2466 src= formatConvBuffer;
2468 else if(srcFormat==PIX_FMT_BGR32)
2470 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2471 src= formatConvBuffer;
2473 else if(srcFormat==PIX_FMT_RGB24)
2475 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2476 src= formatConvBuffer;
2480 // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
2481 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2483 if(!(flags&SWS_FAST_BILINEAR))
2486 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2488 else // Fast Bilinear upscale / crap downscale
2490 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/* PIC builds clobber ebx, so it must be saved/restored manually */
2494 uint64_t ebxsave __attribute__((aligned(8)));
2500 "mov %%"REG_b", %5 \n\t"
2502 "pxor %%mm7, %%mm7 \n\t"
2503 "mov %0, %%"REG_c" \n\t"
2504 "mov %1, %%"REG_D" \n\t"
2505 "mov %2, %%"REG_d" \n\t"
2506 "mov %3, %%"REG_b" \n\t"
2507 "xor %%"REG_a", %%"REG_a" \n\t" // i
2508 PREFETCH" (%%"REG_c") \n\t"
2509 PREFETCH" 32(%%"REG_c") \n\t"
2510 PREFETCH" 64(%%"REG_c") \n\t"
/* 64 bit variant: funnyYCode chunks are entered via a jump table; after each
   chunk, advance src by the recorded amount and reset the inner counter */
2514 #define FUNNY_Y_CODE \
2515 "movl (%%"REG_b"), %%esi \n\t"\
2517 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2518 "add %%"REG_S", %%"REG_c" \n\t"\
2519 "add %%"REG_a", %%"REG_D" \n\t"\
2520 "xor %%"REG_a", %%"REG_a" \n\t"\
/* 32 bit variant of the same chunk glue */
2524 #define FUNNY_Y_CODE \
2525 "movl (%%"REG_b"), %%esi \n\t"\
2527 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2528 "add %%"REG_a", %%"REG_D" \n\t"\
2529 "xor %%"REG_a", %%"REG_a" \n\t"\
2543 "mov %5, %%"REG_b" \n\t"
2545 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2550 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* the generated code may read past srcW-1; patch the tail samples */
2555 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
/* non-MMX2 x86 asm bilinear: xx advances by xInc>>16 plus the carry from the
   16 bit fractional accumulator in %cx; two output samples per iteration */
2560 long xInc_shr16 = xInc >> 16;
2561 uint16_t xInc_mask = xInc & 0xffff;
2562 //NO MMX just normal asm ...
2564 "xor %%"REG_a", %%"REG_a" \n\t" // i
2565 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2566 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2569 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2570 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2571 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2572 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2573 "shll $16, %%edi \n\t"
2574 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2575 "mov %1, %%"REG_D" \n\t"
2576 "shrl $9, %%esi \n\t"
2577 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2578 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2579 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2581 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2582 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2583 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2584 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2585 "shll $16, %%edi \n\t"
2586 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2587 "mov %1, %%"REG_D" \n\t"
2588 "shrl $9, %%esi \n\t"
2589 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2590 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2591 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2594 "add $2, %%"REG_a" \n\t"
2595 "cmp %2, %%"REG_a" \n\t"
2599 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2600 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2603 } //if MMX2 can't be used
/* portable C bilinear fallback: 16.16 fixed-point position, 7 bit alpha */
2607 unsigned int xpos=0;
2608 for(i=0;i<dstWidth;i++)
2610 register unsigned int xx=xpos>>16;
2611 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2612 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
/* Horizontally scale one chroma line pair (U in src1, V in src2) into the
 * 16 bit temp buffer: U goes to dst[0..], V to dst[2048..] (the two planes
 * are 2048 samples apart in the buffer). Structure mirrors hyscale: packed
 * input formats are first converted into formatConvBuffer (U at offset 0,
 * V at offset 2048), then one of the exact-filter / MMX2-funny-code /
 * x86-asm-bilinear / C-bilinear paths runs, selected by flags and #ifdefs
 * (several #ifdef/brace lines are elided from this listing). */
2619 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2620 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2621 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2622 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2623 int32_t *mmx2FilterPos)
/* --- input format conversion to planar 8 bit U/V lines --- */
2625 if(srcFormat==PIX_FMT_YUYV422)
2627 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2628 src1= formatConvBuffer;
2629 src2= formatConvBuffer+2048;
2631 else if(srcFormat==PIX_FMT_UYVY422)
2633 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2634 src1= formatConvBuffer;
2635 src2= formatConvBuffer+2048;
2637 else if(srcFormat==PIX_FMT_RGB32)
2639 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2640 src1= formatConvBuffer;
2641 src2= formatConvBuffer+2048;
2643 else if(srcFormat==PIX_FMT_BGR24)
2645 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2646 src1= formatConvBuffer;
2647 src2= formatConvBuffer+2048;
2649 else if(srcFormat==PIX_FMT_BGR565)
2651 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2652 src1= formatConvBuffer;
2653 src2= formatConvBuffer+2048;
2655 else if(srcFormat==PIX_FMT_BGR555)
2657 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2658 src1= formatConvBuffer;
2659 src2= formatConvBuffer+2048;
2661 else if(srcFormat==PIX_FMT_BGR32)
2663 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2664 src1= formatConvBuffer;
2665 src2= formatConvBuffer+2048;
2667 else if(srcFormat==PIX_FMT_RGB24)
2669 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2670 src1= formatConvBuffer;
2671 src2= formatConvBuffer+2048;
/* grey input has no chroma: nothing to do (body elided in this listing) */
2673 else if(isGray(srcFormat))
2679 // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
2680 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2682 if(!(flags&SWS_FAST_BILINEAR))
/* exact FIR path: scale U and V planes independently */
2685 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2686 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2688 else // Fast Bilinear upscale / crap downscale
2690 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/* PIC builds clobber ebx, so it must be saved/restored manually */
2694 uint64_t ebxsave __attribute__((aligned(8)));
2700 "mov %%"REG_b", %6 \n\t"
2702 "pxor %%mm7, %%mm7 \n\t"
2703 "mov %0, %%"REG_c" \n\t"
2704 "mov %1, %%"REG_D" \n\t"
2705 "mov %2, %%"REG_d" \n\t"
2706 "mov %3, %%"REG_b" \n\t"
2707 "xor %%"REG_a", %%"REG_a" \n\t" // i
2708 PREFETCH" (%%"REG_c") \n\t"
2709 PREFETCH" 32(%%"REG_c") \n\t"
2710 PREFETCH" 64(%%"REG_c") \n\t"
/* 64 bit funny-code chunk glue (see hyscale's FUNNY_Y_CODE) */
2714 #define FUNNY_UV_CODE \
2715 "movl (%%"REG_b"), %%esi \n\t"\
2717 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2718 "add %%"REG_S", %%"REG_c" \n\t"\
2719 "add %%"REG_a", %%"REG_D" \n\t"\
2720 "xor %%"REG_a", %%"REG_a" \n\t"\
/* 32 bit variant */
2724 #define FUNNY_UV_CODE \
2725 "movl (%%"REG_b"), %%esi \n\t"\
2727 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2728 "add %%"REG_a", %%"REG_D" \n\t"\
2729 "xor %%"REG_a", %%"REG_a" \n\t"\
/* second pass: same generated code over the V plane (dst+4096 bytes) */
2737 "xor %%"REG_a", %%"REG_a" \n\t" // i
2738 "mov %5, %%"REG_c" \n\t" // src
2739 "mov %1, %%"REG_D" \n\t" // buf1
2740 "add $4096, %%"REG_D" \n\t"
2741 PREFETCH" (%%"REG_c") \n\t"
2742 PREFETCH" 32(%%"REG_c") \n\t"
2743 PREFETCH" 64(%%"REG_c") \n\t"
2751 "mov %6, %%"REG_b" \n\t"
2753 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2754 "m" (funnyUVCode), "m" (src2)
2758 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* patch tail samples the generated code may have read past the line end */
2763 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2765 // printf("%d %d %d\n", dstWidth, i, srcW);
2766 dst[i] = src1[srcW-1]*128;
2767 dst[i+2048] = src2[srcW-1]*128;
/* non-MMX2 x86 asm bilinear: one U and one V sample per iteration, sharing
   the same 16.16 fixed-point position accumulator */
2773 long xInc_shr16 = (long) (xInc >> 16);
2774 uint16_t xInc_mask = xInc & 0xffff;
2776 "xor %%"REG_a", %%"REG_a" \n\t" // i
2777 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2778 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2781 "mov %0, %%"REG_S" \n\t"
2782 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2783 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2784 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2785 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2786 "shll $16, %%edi \n\t"
2787 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2788 "mov %1, %%"REG_D" \n\t"
2789 "shrl $9, %%esi \n\t"
2790 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2792 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2793 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2794 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2795 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2796 "shll $16, %%edi \n\t"
2797 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2798 "mov %1, %%"REG_D" \n\t"
2799 "shrl $9, %%esi \n\t"
2800 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2802 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2803 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2804 "add $1, %%"REG_a" \n\t"
2805 "cmp %2, %%"REG_a" \n\t"
2808 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2809 which is needed to support GCC-4.0 */
2810 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2811 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2813 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2816 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2819 } //if MMX2 can't be used
/* portable C bilinear fallback; two equivalent formulations are present,
   one of them behind a preprocessor conditional elided from this listing */
2823 unsigned int xpos=0;
2824 for(i=0;i<dstWidth;i++)
2826 register unsigned int xx=xpos>>16;
2827 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2828 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2829 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2831 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2832 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
/* Main template scaling entry point: scale one horizontal slice of the
 * source picture into the destination.
 * c          - scaler context holding filters, ring buffers and state
 * src/srcStride - source planes and strides for this slice
 * srcSliceY/srcSliceH - vertical position and height of the slice
 * dst/dstStride - destination planes and strides
 * Returns the number of destination lines written (dstY - lastDstY).
 * Overall flow: for each output line, horizontally scale the needed source
 * lines into the lumPixBuf/chrPixBuf ring buffers (hyscale/hcscale), then
 * vertically scale + output via the yuv2nv12X / yuv2yuv1 / yuv2yuvX /
 * yuv2packed* template functions. */
2840 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2841 int srcSliceH, uint8_t* dst[], int dstStride[]){
2843 /* load a few things into local vars to make the code more readable and faster */
2844 const int srcW= c->srcW;
2845 const int dstW= c->dstW;
2846 const int dstH= c->dstH;
2847 const int chrDstW= c->chrDstW;
2848 const int chrSrcW= c->chrSrcW;
2849 const int lumXInc= c->lumXInc;
2850 const int chrXInc= c->chrXInc;
2851 const int dstFormat= c->dstFormat;
2852 const int srcFormat= c->srcFormat;
2853 const int flags= c->flags;
2854 const int canMMX2BeUsed= c->canMMX2BeUsed;
2855 int16_t *vLumFilterPos= c->vLumFilterPos;
2856 int16_t *vChrFilterPos= c->vChrFilterPos;
2857 int16_t *hLumFilterPos= c->hLumFilterPos;
2858 int16_t *hChrFilterPos= c->hChrFilterPos;
2859 int16_t *vLumFilter= c->vLumFilter;
2860 int16_t *vChrFilter= c->vChrFilter;
2861 int16_t *hLumFilter= c->hLumFilter;
2862 int16_t *hChrFilter= c->hChrFilter;
2863 int32_t *lumMmxFilter= c->lumMmxFilter;
2864 int32_t *chrMmxFilter= c->chrMmxFilter;
2865 const int vLumFilterSize= c->vLumFilterSize;
2866 const int vChrFilterSize= c->vChrFilterSize;
2867 const int hLumFilterSize= c->hLumFilterSize;
2868 const int hChrFilterSize= c->hChrFilterSize;
2869 int16_t **lumPixBuf= c->lumPixBuf;
2870 int16_t **chrPixBuf= c->chrPixBuf;
2871 const int vLumBufSize= c->vLumBufSize;
2872 const int vChrBufSize= c->vChrBufSize;
2873 uint8_t *funnyYCode= c->funnyYCode;
2874 uint8_t *funnyUVCode= c->funnyUVCode;
2875 uint8_t *formatConvBuffer= c->formatConvBuffer;
2876 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
/* round the chroma slice height up (negate-shift-negate = ceiling divide) */
2877 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2880 /* vars which will change and which we need to store back in the context */
2882 int lumBufIndex= c->lumBufIndex;
2883 int chrBufIndex= c->chrBufIndex;
2884 int lastInLumBuf= c->lastInLumBuf;
2885 int lastInChrBuf= c->lastInChrBuf;
/* packed input has a single plane; planar chroma strides honour vChrDrop */
2887 if(isPacked(c->srcFormat)){
2893 srcStride[2]= srcStride[0];
2895 srcStride[1]<<= c->vChrDrop;
2896 srcStride[2]<<= c->vChrDrop;
2898 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2899 // (int)dst[0], (int)dst[1], (int)dst[2]);
2901 #if 0 //self test FIXME move to a vfilter or something
2903 static volatile int i=0;
2905 if(srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2906 selfTest(src, srcStride, c->srcW, c->srcH);
2911 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2912 //dstStride[0],dstStride[1],dstStride[2]);
2914 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2916 static int firstTime=1; //FIXME move this into the context perhaps
2917 if(flags & SWS_PRINT_INFO && firstTime)
2919 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2920 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2925 /* Note the user might start scaling the picture in the middle so this will not get executed
2926 this is not really intended but works currently, so ppl might do it */
/* --- main per-output-line loop --- */
2937 for(;dstY < dstH; dstY++){
2938 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2939 const int chrDstY= dstY>>c->chrDstVSubSample;
2940 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2941 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2943 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2944 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2945 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2946 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2948 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2949 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2950 //handle holes (FAST_BILINEAR & weird filters)
2951 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2952 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2953 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2954 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2955 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2957 // Do we have enough lines in this slice to output the dstY line
2958 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2960 //Do horizontal scaling
2961 while(lastInLumBuf < lastLumSrcY)
2963 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2965 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2966 ASSERT(lumBufIndex < 2*vLumBufSize)
2967 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2968 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2969 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2970 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2971 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2972 funnyYCode, c->srcFormat, formatConvBuffer,
2973 c->lumMmx2Filter, c->lumMmx2FilterPos);
2976 while(lastInChrBuf < lastChrSrcY)
2978 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2979 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2981 ASSERT(chrBufIndex < 2*vChrBufSize)
2982 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2983 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2984 //FIXME replace parameters through context struct (some at least)
2986 if(!(isGray(srcFormat) || isGray(dstFormat)))
2987 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2988 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2989 funnyUVCode, c->srcFormat, formatConvBuffer,
2990 c->chrMmx2Filter, c->chrMmx2FilterPos);
2993 //wrap buf index around to stay inside the ring buffer
2994 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2995 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2997 else // not enough lines left in this slice -> load the rest in the buffer
2999 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3000 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3001 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3002 vChrBufSize, vLumBufSize);*/
3004 //Do horizontal scaling
3005 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3007 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3009 ASSERT(lumBufIndex < 2*vLumBufSize)
3010 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3011 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3012 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3013 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3014 funnyYCode, c->srcFormat, formatConvBuffer,
3015 c->lumMmx2Filter, c->lumMmx2FilterPos);
3018 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3020 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3021 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3023 ASSERT(chrBufIndex < 2*vChrBufSize)
3024 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
3025 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3027 if(!(isGray(srcFormat) || isGray(dstFormat)))
3028 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3029 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3030 funnyUVCode, c->srcFormat, formatConvBuffer,
3031 c->chrMmx2Filter, c->chrMmx2FilterPos);
3034 //wrap buf index around to stay inside the ring buffer
3035 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
3036 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
3037 break; //we can't output a dstY line so let's try with the next slice
/* per-line dither tables for the 15/16 bit RGB output paths */
3041 b5Dither= dither8[dstY&1];
3042 g6Dither= dither4[dstY&1];
3043 g5Dither= dither8[dstY&1];
3044 r5Dither= dither8[(dstY+1)&1];
/* --- vertical scaling + output (MMX branch: filter data is packed into
       lumMmxFilter/chrMmxFilter as pointer+coefficient pairs) --- */
3048 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3049 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3052 if(flags & SWS_ACCURATE_RND){
3053 for(i=0; i<vLumFilterSize; i+=2){
3054 lumMmxFilter[2*i+0]= lumSrcPtr[i ];
3055 lumMmxFilter[2*i+1]= lumSrcPtr[i+(vLumFilterSize>1)];
3056 lumMmxFilter[2*i+2]=
3057 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ]
3058 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3060 for(i=0; i<vChrFilterSize; i+=2){
3061 chrMmxFilter[2*i+0]= chrSrcPtr[i ];
3062 chrMmxFilter[2*i+1]= chrSrcPtr[i+(vChrFilterSize>1)];
3063 chrMmxFilter[2*i+2]=
3064 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ]
3065 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3068 for(i=0; i<vLumFilterSize; i++)
3070 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3071 lumMmxFilter[4*i+2]=
3072 lumMmxFilter[4*i+3]=
/* *0x10001 duplicates the 16 bit coefficient into both halves of the dword */
3073 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3075 for(i=0; i<vChrFilterSize; i++)
3077 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3078 chrMmxFilter[4*i+2]=
3079 chrMmxFilter[4*i+3]=
3080 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
/* dispatch on destination format */
3084 if(dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3085 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3086 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3087 RENAME(yuv2nv12X)(c,
3088 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3089 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3090 dest, uDest, dstW, chrDstW, dstFormat);
3092 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3094 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3095 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3096 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
3098 int16_t *lumBuf = lumPixBuf[0];
3099 int16_t *chrBuf= chrPixBuf[0];
3100 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3105 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3106 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3107 dest, uDest, vDest, dstW, chrDstW);
3112 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3113 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3114 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
3116 int chrAlpha= vChrFilter[2*dstY+1];
3117 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3118 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3120 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
3122 int lumAlpha= vLumFilter[2*dstY+1];
3123 int chrAlpha= vChrFilter[2*dstY+1];
3125 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3127 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3128 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3129 dest, dstW, lumAlpha, chrAlpha, dstY);
3133 RENAME(yuv2packedX)(c,
3134 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3135 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3140 else // hmm looks like we can't use MMX here without overwriting this array's tail
3142 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3143 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3144 if(dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3145 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3146 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3148 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3149 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3150 dest, uDest, dstW, chrDstW, dstFormat);
3152 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3154 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3155 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3157 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3158 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3159 dest, uDest, vDest, dstW, chrDstW);
3163 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3164 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3166 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3167 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
/* flush write-combining stores and leave MMX state clean for the FPU */
3174 __asm __volatile(SFENCE:::"memory");
3175 __asm __volatile(EMMS:::"memory");
3177 /* store changed local vars back in the context */
3179 c->lumBufIndex= lumBufIndex;
3180 c->chrBufIndex= chrBufIndex;
3181 c->lastInLumBuf= lastInLumBuf;
3182 c->lastInChrBuf= lastInChrBuf;
3184 return dstY - lastDstY;