2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
/* CPU-capability instruction-name macros: cache prefetch hints (3DNow! vs
 * MMX2/SSE forms, "/nop" fallback), store fence, byte average (PAVGB:
 * SSE pavgb vs 3DNow! pavgusb), and MOVNTQ which is a non-temporal store
 * when available and a plain movq otherwise.
 * NOTE(review): the #ifdef/#else/#endif lines guarding these branches are
 * not visible in this chunk — the #elif lines imply them; confirm against
 * the full file. */
30 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
37 #define PREFETCH "prefetch"
38 #define PREFETCHW "prefetchw"
39 #elif defined ( HAVE_MMX2 )
40 #define PREFETCH "prefetchnta"
41 #define PREFETCHW "prefetcht0"
43 #define PREFETCH "/nop"
44 #define PREFETCHW "/nop"
48 #define SFENCE "sfence"
54 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
55 #elif defined (HAVE_3DNOW)
56 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
62 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
64 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
67 #include "swscale_altivec_template.c"
/* YSCALEYUV2YV12X(x, offset, dest, width): vertical scaling of one planar
 * component.  Walks the {srcPtr, coeff} filter list at "offset"(%0)
 * (%0 = &c->redDither, the context base), multiply-accumulating each source
 * line into mm3/mm4 with pmulhw, then adds the VROUNDER bias, >>3, packs to
 * unsigned bytes and stores 8 pixels per iteration via MOVNTQ.
 * A NULL srcPtr ("test %%REG_S") terminates the inner coefficient loop.
 * NOTE(review): the asm volatile(...) wrapper, the "1:"/"2:" loop labels and
 * the conditional jumps are not visible in this chunk. */
70 #define YSCALEYUV2YV12X(x, offset, dest, width) \
72 "xor %%"REG_a", %%"REG_a" \n\t"\
73 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
74 "movq %%mm3, %%mm4 \n\t"\
75 "lea " offset "(%0), %%"REG_d" \n\t"\
76 "mov (%%"REG_d"), %%"REG_S" \n\t"\
77 ASMALIGN16 /* FIXME Unroll? */\
79 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
80 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
81 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
82 "add $16, %%"REG_d" \n\t"\
83 "mov (%%"REG_d"), %%"REG_S" \n\t"\
84 "test %%"REG_S", %%"REG_S" \n\t"\
85 "pmulhw %%mm0, %%mm2 \n\t"\
86 "pmulhw %%mm0, %%mm5 \n\t"\
87 "paddw %%mm2, %%mm3 \n\t"\
88 "paddw %%mm5, %%mm4 \n\t"\
90 "psraw $3, %%mm3 \n\t"\
91 "psraw $3, %%mm4 \n\t"\
92 "packuswb %%mm4, %%mm3 \n\t"\
93 MOVNTQ(%%mm3, (%1, %%REGa))\
94 "add $8, %%"REG_a" \n\t"\
95 "cmp %2, %%"REG_a" \n\t"\
96 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
97 "movq %%mm3, %%mm4 \n\t"\
98 "lea " offset "(%0), %%"REG_d" \n\t"\
99 "mov (%%"REG_d"), %%"REG_S" \n\t"\
101 :: "r" (&c->redDither),\
102 "r" (dest), "p" (width)\
103 : "%"REG_a, "%"REG_d, "%"REG_S\
/* YSCALEYUV2YV12X_ACCURATE: higher-precision variant of YSCALEYUV2YV12X.
 * Instead of pmulhw (16-bit products), it interleaves word pairs from two
 * adjacent source lines (punpcklwd/punpckhwd) and accumulates full 32-bit
 * pmaddwd products in mm4-mm7, then >>16, packssdw, rounder, >>3 and pack.
 * Same operand layout as the fast path: %0 = context base, %1 = dest,
 * %2 = width.
 * NOTE(review): asm wrapper, loop labels and branches not visible here. */
106 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
108 "lea " offset "(%0), %%"REG_d" \n\t"\
109 "xor %%"REG_a", %%"REG_a" \n\t"\
110 "pxor %%mm4, %%mm4 \n\t"\
111 "pxor %%mm5, %%mm5 \n\t"\
112 "pxor %%mm6, %%mm6 \n\t"\
113 "pxor %%mm7, %%mm7 \n\t"\
114 "mov (%%"REG_d"), %%"REG_S" \n\t"\
117 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\
118 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
119 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
120 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\
121 "movq %%mm0, %%mm3 \n\t"\
122 "punpcklwd %%mm1, %%mm0 \n\t"\
123 "punpckhwd %%mm1, %%mm3 \n\t"\
124 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
125 "pmaddwd %%mm1, %%mm0 \n\t"\
126 "pmaddwd %%mm1, %%mm3 \n\t"\
127 "paddd %%mm0, %%mm4 \n\t"\
128 "paddd %%mm3, %%mm5 \n\t"\
129 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\
130 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
131 "add $16, %%"REG_d" \n\t"\
132 "test %%"REG_S", %%"REG_S" \n\t"\
133 "movq %%mm2, %%mm0 \n\t"\
134 "punpcklwd %%mm3, %%mm2 \n\t"\
135 "punpckhwd %%mm3, %%mm0 \n\t"\
136 "pmaddwd %%mm1, %%mm2 \n\t"\
137 "pmaddwd %%mm1, %%mm0 \n\t"\
138 "paddd %%mm2, %%mm6 \n\t"\
139 "paddd %%mm0, %%mm7 \n\t"\
141 "psrad $16, %%mm4 \n\t"\
142 "psrad $16, %%mm5 \n\t"\
143 "psrad $16, %%mm6 \n\t"\
144 "psrad $16, %%mm7 \n\t"\
145 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
146 "packssdw %%mm5, %%mm4 \n\t"\
147 "packssdw %%mm7, %%mm6 \n\t"\
148 "paddw %%mm0, %%mm4 \n\t"\
149 "paddw %%mm0, %%mm6 \n\t"\
150 "psraw $3, %%mm4 \n\t"\
151 "psraw $3, %%mm6 \n\t"\
152 "packuswb %%mm6, %%mm4 \n\t"\
153 MOVNTQ(%%mm4, (%1, %%REGa))\
154 "add $8, %%"REG_a" \n\t"\
155 "cmp %2, %%"REG_a" \n\t"\
156 "lea " offset "(%0), %%"REG_d" \n\t"\
157 "pxor %%mm4, %%mm4 \n\t"\
158 "pxor %%mm5, %%mm5 \n\t"\
159 "pxor %%mm6, %%mm6 \n\t"\
160 "pxor %%mm7, %%mm7 \n\t"\
161 "mov (%%"REG_d"), %%"REG_S" \n\t"\
163 :: "r" (&c->redDither),\
164 "r" (dest), "p" (width)\
165 : "%"REG_a, "%"REG_d, "%"REG_S\
/* YSCALEYUV2YV121: unscaled vertical pass — reads 16-bit intermediate
 * samples from %0, >>7 back to 8-bit range, packs and stores 8 bytes per
 * iteration to %1; %2 seeds the (negative) index in REG_a.
 * NOTE(review): the ::-operand lines below (lumFilterSize/chrFilterSize
 * etc., original lines 182-186) appear to belong to a DIFFERENT asm
 * statement whose body is not visible in this chunk — do not assume they
 * close this macro. */
168 #define YSCALEYUV2YV121 \
169 "mov %2, %%"REG_a" \n\t"\
170 ASMALIGN16 /* FIXME Unroll? */\
172 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
173 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
174 "psraw $7, %%mm0 \n\t"\
175 "psraw $7, %%mm1 \n\t"\
176 "packuswb %%mm1, %%mm0 \n\t"\
177 MOVNTQ(%%mm0, (%1, %%REGa))\
178 "add $8, %%"REG_a" \n\t"\
182 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
183 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
184 "r" (dest), "m" (dstW),
185 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
186 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* YSCALEYUV2PACKEDX: vertical scaling for packed output.  First loop
 * accumulates chroma (U at offset 0, V at offset 4096 within the chroma
 * buffer) into mm3/mm4 using the CHR filter list; second loop accumulates
 * luma Y1/Y2 into mm1/mm7 using the LUM filter list.  Leaves the results
 * in registers for a following YSCALEYUV2RGBX-style conversion.
 * YSCALEYUV2PACKEDX_END supplies the closing operand/clobber lists
 * (%0 = context base, %4-ish = dest, dstW; three dummies pad the slots).
 * NOTE(review): asm wrapper, loop labels and branches not visible here. */
188 #define YSCALEYUV2PACKEDX \
190 "xor %%"REG_a", %%"REG_a" \n\t"\
194 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
195 "mov (%%"REG_d"), %%"REG_S" \n\t"\
196 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
197 "movq %%mm3, %%mm4 \n\t"\
200 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
201 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
202 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
203 "add $16, %%"REG_d" \n\t"\
204 "mov (%%"REG_d"), %%"REG_S" \n\t"\
205 "pmulhw %%mm0, %%mm2 \n\t"\
206 "pmulhw %%mm0, %%mm5 \n\t"\
207 "paddw %%mm2, %%mm3 \n\t"\
208 "paddw %%mm5, %%mm4 \n\t"\
209 "test %%"REG_S", %%"REG_S" \n\t"\
212 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
213 "mov (%%"REG_d"), %%"REG_S" \n\t"\
214 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
215 "movq %%mm1, %%mm7 \n\t"\
218 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
219 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
220 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
221 "add $16, %%"REG_d" \n\t"\
222 "mov (%%"REG_d"), %%"REG_S" \n\t"\
223 "pmulhw %%mm0, %%mm2 \n\t"\
224 "pmulhw %%mm0, %%mm5 \n\t"\
225 "paddw %%mm2, %%mm1 \n\t"\
226 "paddw %%mm5, %%mm7 \n\t"\
227 "test %%"REG_S", %%"REG_S" \n\t"\
230 #define YSCALEYUV2PACKEDX_END\
231 :: "r" (&c->redDither), \
232 "m" (dummy), "m" (dummy), "m" (dummy),\
233 "r" (dest), "m" (dstW)\
234 : "%"REG_a, "%"REG_d, "%"REG_S\
/* YSCALEYUV2PACKEDX_ACCURATE: 32-bit-accumulation (pmaddwd) variant of
 * YSCALEYUV2PACKEDX.  Chroma pass accumulates U/V in mm4-mm7, rounds,
 * >>3 implicit in later stages, and parks the packed results in the
 * context scratch slots U_TEMP/V_TEMP; luma pass then accumulates Y1/Y2
 * in mm1/mm5/mm7/mm6, rounds, and reloads U/V from the scratch slots into
 * mm3/mm4 so register layout matches what YSCALEYUV2RGBX expects.
 * NOTE(review): asm wrapper, loop labels and branches not visible here. */
237 #define YSCALEYUV2PACKEDX_ACCURATE \
239 "xor %%"REG_a", %%"REG_a" \n\t"\
243 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
244 "mov (%%"REG_d"), %%"REG_S" \n\t"\
245 "pxor %%mm4, %%mm4 \n\t"\
246 "pxor %%mm5, %%mm5 \n\t"\
247 "pxor %%mm6, %%mm6 \n\t"\
248 "pxor %%mm7, %%mm7 \n\t"\
251 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
252 "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
253 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
254 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
255 "movq %%mm0, %%mm3 \n\t"\
256 "punpcklwd %%mm1, %%mm0 \n\t"\
257 "punpckhwd %%mm1, %%mm3 \n\t"\
258 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
259 "pmaddwd %%mm1, %%mm0 \n\t"\
260 "pmaddwd %%mm1, %%mm3 \n\t"\
261 "paddd %%mm0, %%mm4 \n\t"\
262 "paddd %%mm3, %%mm5 \n\t"\
263 "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
264 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
265 "add $16, %%"REG_d" \n\t"\
266 "test %%"REG_S", %%"REG_S" \n\t"\
267 "movq %%mm2, %%mm0 \n\t"\
268 "punpcklwd %%mm3, %%mm2 \n\t"\
269 "punpckhwd %%mm3, %%mm0 \n\t"\
270 "pmaddwd %%mm1, %%mm2 \n\t"\
271 "pmaddwd %%mm1, %%mm0 \n\t"\
272 "paddd %%mm2, %%mm6 \n\t"\
273 "paddd %%mm0, %%mm7 \n\t"\
275 "psrad $16, %%mm4 \n\t"\
276 "psrad $16, %%mm5 \n\t"\
277 "psrad $16, %%mm6 \n\t"\
278 "psrad $16, %%mm7 \n\t"\
279 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
280 "packssdw %%mm5, %%mm4 \n\t"\
281 "packssdw %%mm7, %%mm6 \n\t"\
282 "paddw %%mm0, %%mm4 \n\t"\
283 "paddw %%mm0, %%mm6 \n\t"\
284 "movq %%mm4, "U_TEMP"(%0) \n\t"\
285 "movq %%mm6, "V_TEMP"(%0) \n\t"\
287 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
288 "mov (%%"REG_d"), %%"REG_S" \n\t"\
289 "pxor %%mm1, %%mm1 \n\t"\
290 "pxor %%mm5, %%mm5 \n\t"\
291 "pxor %%mm7, %%mm7 \n\t"\
292 "pxor %%mm6, %%mm6 \n\t"\
295 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
296 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
297 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
298 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
299 "movq %%mm0, %%mm3 \n\t"\
300 "punpcklwd %%mm4, %%mm0 \n\t"\
301 "punpckhwd %%mm4, %%mm3 \n\t"\
302 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
303 "pmaddwd %%mm4, %%mm0 \n\t"\
304 "pmaddwd %%mm4, %%mm3 \n\t"\
305 "paddd %%mm0, %%mm1 \n\t"\
306 "paddd %%mm3, %%mm5 \n\t"\
307 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
308 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
309 "add $16, %%"REG_d" \n\t"\
310 "test %%"REG_S", %%"REG_S" \n\t"\
311 "movq %%mm2, %%mm0 \n\t"\
312 "punpcklwd %%mm3, %%mm2 \n\t"\
313 "punpckhwd %%mm3, %%mm0 \n\t"\
314 "pmaddwd %%mm4, %%mm2 \n\t"\
315 "pmaddwd %%mm4, %%mm0 \n\t"\
316 "paddd %%mm2, %%mm7 \n\t"\
317 "paddd %%mm0, %%mm6 \n\t"\
319 "psrad $16, %%mm1 \n\t"\
320 "psrad $16, %%mm5 \n\t"\
321 "psrad $16, %%mm7 \n\t"\
322 "psrad $16, %%mm6 \n\t"\
323 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
324 "packssdw %%mm5, %%mm1 \n\t"\
325 "packssdw %%mm6, %%mm7 \n\t"\
326 "paddw %%mm0, %%mm1 \n\t"\
327 "paddw %%mm0, %%mm7 \n\t"\
328 "movq "U_TEMP"(%0), %%mm3 \n\t"\
329 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* YSCALEYUV2RGBX: the YUV -> RGB matrix step.  Expects mm1/mm7 = Y1/Y2 and
 * mm3/mm4 = U/V from a preceding YSCALEYUV2PACKEDX* macro.  Subtracts the
 * U/V/Y offsets, multiplies by the per-context fixed-point coefficients
 * (UB/UG/VG/VR/Y at known offsets from %0), interleaves the low/high pixel
 * halves and packs, ending with B in mm2, G in mm4, R in mm5 (plus the
 * high-half copies in mm0/mm3/mm6) and mm7 zeroed for the WRITE* macros. */
331 #define YSCALEYUV2RGBX \
332 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
333 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
334 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
335 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
336 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
337 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
338 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
339 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
340 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
341 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
342 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
343 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
344 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
345 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
346 "paddw %%mm3, %%mm4 \n\t"\
347 "movq %%mm2, %%mm0 \n\t"\
348 "movq %%mm5, %%mm6 \n\t"\
349 "movq %%mm4, %%mm3 \n\t"\
350 "punpcklwd %%mm2, %%mm2 \n\t"\
351 "punpcklwd %%mm5, %%mm5 \n\t"\
352 "punpcklwd %%mm4, %%mm4 \n\t"\
353 "paddw %%mm1, %%mm2 \n\t"\
354 "paddw %%mm1, %%mm5 \n\t"\
355 "paddw %%mm1, %%mm4 \n\t"\
356 "punpckhwd %%mm0, %%mm0 \n\t"\
357 "punpckhwd %%mm6, %%mm6 \n\t"\
358 "punpckhwd %%mm3, %%mm3 \n\t"\
359 "paddw %%mm7, %%mm0 \n\t"\
360 "paddw %%mm7, %%mm6 \n\t"\
361 "paddw %%mm7, %%mm3 \n\t"\
362 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
363 "packuswb %%mm0, %%mm2 \n\t"\
364 "packuswb %%mm6, %%mm5 \n\t"\
365 "packuswb %%mm3, %%mm4 \n\t"\
366 "pxor %%mm7, %%mm7 \n\t"
/* FULL_YSCALEYUV2RGB: bilinear vertical interpolation between two luma
 * buffers (%0/%1, weight yalpha1 in %6) and two chroma buffers (%2/%3,
 * weight uvalpha1 in %7), followed by a full YUV->RGB conversion using the
 * MANGLE()d global constant tables (w80, w400, yCoeff, ub/ug/vr/vgCoeff).
 * Produces packed B in mm3, R in mm0, G in mm1 (one pixel group each).
 * NOTE(review): asm wrapper, loop label and branch-back are not visible in
 * this chunk. */
368 #define FULL_YSCALEYUV2RGB \
369 "pxor %%mm7, %%mm7 \n\t"\
370 "movd %6, %%mm6 \n\t" /*yalpha1*/\
371 "punpcklwd %%mm6, %%mm6 \n\t"\
372 "punpcklwd %%mm6, %%mm6 \n\t"\
373 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
374 "punpcklwd %%mm5, %%mm5 \n\t"\
375 "punpcklwd %%mm5, %%mm5 \n\t"\
376 "xor %%"REG_a", %%"REG_a" \n\t"\
379 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
380 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
381 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
382 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
383 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
384 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
385 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
386 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
387 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
388 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
389 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
390 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
391 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
392 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
393 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
394 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
395 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
396 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
399 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
400 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
401 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
402 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
403 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
404 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
405 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
408 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
409 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
410 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
411 "paddw %%mm1, %%mm3 \n\t" /* B*/\
412 "paddw %%mm1, %%mm0 \n\t" /* R*/\
413 "packuswb %%mm3, %%mm3 \n\t"\
415 "packuswb %%mm0, %%mm0 \n\t"\
416 "paddw %%mm4, %%mm2 \n\t"\
417 "paddw %%mm2, %%mm1 \n\t" /* G*/\
419 "packuswb %%mm1, %%mm1 \n\t"
/* REAL_YSCALEYUV2PACKED(index, c): two-line vertical interpolation for
 * packed-YUV output.  Pre-shifts the stored CHR/LUM interpolation
 * coefficients right by 3 (writing them back into the context at "c"),
 * then per iteration blends uvbuf0/uvbuf1 chroma into mm3/mm4 and
 * buf0/buf1 luma into mm1/mm7 with pmulhw + >>7.  The YSCALEYUV2PACKED
 * wrapper exists to force macro-argument expansion before pasting.
 * NOTE(review): loop label and branch lines are not visible here; the >>4
 * wording in some trailing comments predates the >>7 shift actually used. */
422 #define REAL_YSCALEYUV2PACKED(index, c) \
423 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
424 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
425 "psraw $3, %%mm0 \n\t"\
426 "psraw $3, %%mm1 \n\t"\
427 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
428 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
429 "xor "#index", "#index" \n\t"\
432 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
433 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
434 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
435 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
436 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
437 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
438 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
439 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
440 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
441 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
442 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
443 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
444 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
445 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
446 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
447 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
448 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
449 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
450 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
451 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
452 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
453 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
454 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
455 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
456 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
458 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* REAL_YSCALEYUV2RGB(index, c): two-line vertical interpolation (buf0/buf1
 * luma, uvbuf0/uvbuf1 chroma, weights stored in the context at "c")
 * followed by the same YUV->RGB matrix as YSCALEYUV2RGBX, using
 * per-context coefficient slots.  Ends with B=mm2, G=mm4, R=mm5 packed
 * (high halves in mm0/mm3/mm6) and mm7 zeroed for a WRITE* macro.
 * NOTE(review): loop label and branch lines are not visible in this chunk. */
460 #define REAL_YSCALEYUV2RGB(index, c) \
461 "xor "#index", "#index" \n\t"\
464 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
465 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
466 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
467 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
468 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
469 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
470 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
471 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
472 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
473 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
474 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
475 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
476 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
477 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
478 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
479 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
480 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
481 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
482 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
483 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
484 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
485 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
486 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
487 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
488 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
489 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
490 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
491 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
492 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
493 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
494 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
495 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
496 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
497 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
498 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
499 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
500 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
501 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
502 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
503 "paddw %%mm3, %%mm4 \n\t"\
504 "movq %%mm2, %%mm0 \n\t"\
505 "movq %%mm5, %%mm6 \n\t"\
506 "movq %%mm4, %%mm3 \n\t"\
507 "punpcklwd %%mm2, %%mm2 \n\t"\
508 "punpcklwd %%mm5, %%mm5 \n\t"\
509 "punpcklwd %%mm4, %%mm4 \n\t"\
510 "paddw %%mm1, %%mm2 \n\t"\
511 "paddw %%mm1, %%mm5 \n\t"\
512 "paddw %%mm1, %%mm4 \n\t"\
513 "punpckhwd %%mm0, %%mm0 \n\t"\
514 "punpckhwd %%mm6, %%mm6 \n\t"\
515 "punpckhwd %%mm3, %%mm3 \n\t"\
516 "paddw %%mm7, %%mm0 \n\t"\
517 "paddw %%mm7, %%mm6 \n\t"\
518 "paddw %%mm7, %%mm3 \n\t"\
519 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
520 "packuswb %%mm0, %%mm2 \n\t"\
521 "packuswb %%mm6, %%mm5 \n\t"\
522 "packuswb %%mm3, %%mm4 \n\t"\
523 "pxor %%mm7, %%mm7 \n\t"
524 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/* REAL_YSCALEYUV2PACKED1(index, c): single-source-line (no vertical
 * interpolation) packed-YUV path — just reads uvbuf0 chroma and buf0 luma
 * and shifts the 16-bit intermediates down by 7 into mm3/mm4 (U/V) and
 * mm1/mm7 (Y).  NOTE(review): loop label/branch lines not visible here. */
526 #define REAL_YSCALEYUV2PACKED1(index, c) \
527 "xor "#index", "#index" \n\t"\
530 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
531 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
532 "psraw $7, %%mm3 \n\t" \
533 "psraw $7, %%mm4 \n\t" \
534 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
535 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
536 "psraw $7, %%mm1 \n\t" \
537 "psraw $7, %%mm7 \n\t" \
539 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* REAL_YSCALEYUV2RGB1(index, c): single-source-line YUV->RGB — no vertical
 * blend, intermediates >>4 straight into the same matrix sequence as
 * REAL_YSCALEYUV2RGB.  Output layout identical: B=mm2, G=mm4, R=mm5
 * packed, mm7 zeroed.  NOTE(review): loop label/branch lines not visible. */
541 #define REAL_YSCALEYUV2RGB1(index, c) \
542 "xor "#index", "#index" \n\t"\
545 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
546 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
547 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
548 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
549 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
550 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
551 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
552 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
553 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
554 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
555 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
556 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
557 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
558 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
559 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
560 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
561 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
562 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
563 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
564 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
565 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
566 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
567 "paddw %%mm3, %%mm4 \n\t"\
568 "movq %%mm2, %%mm0 \n\t"\
569 "movq %%mm5, %%mm6 \n\t"\
570 "movq %%mm4, %%mm3 \n\t"\
571 "punpcklwd %%mm2, %%mm2 \n\t"\
572 "punpcklwd %%mm5, %%mm5 \n\t"\
573 "punpcklwd %%mm4, %%mm4 \n\t"\
574 "paddw %%mm1, %%mm2 \n\t"\
575 "paddw %%mm1, %%mm5 \n\t"\
576 "paddw %%mm1, %%mm4 \n\t"\
577 "punpckhwd %%mm0, %%mm0 \n\t"\
578 "punpckhwd %%mm6, %%mm6 \n\t"\
579 "punpckhwd %%mm3, %%mm3 \n\t"\
580 "paddw %%mm7, %%mm0 \n\t"\
581 "paddw %%mm7, %%mm6 \n\t"\
582 "paddw %%mm7, %%mm3 \n\t"\
583 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
584 "packuswb %%mm0, %%mm2 \n\t"\
585 "packuswb %%mm6, %%mm5 \n\t"\
586 "packuswb %%mm3, %%mm4 \n\t"\
587 "pxor %%mm7, %%mm7 \n\t"
588 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* REAL_YSCALEYUV2PACKED1b(index, c): packed-YUV path that averages the two
 * chroma lines (uvbuf0+uvbuf1, then >>8) instead of weighting them, while
 * luma still comes from buf0 only (>>7).
 * NOTE(review): loop label/branch lines not visible in this chunk. */
590 #define REAL_YSCALEYUV2PACKED1b(index, c) \
591 "xor "#index", "#index" \n\t"\
594 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
595 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
596 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
597 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
598 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
599 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
600 "psrlw $8, %%mm3 \n\t" \
601 "psrlw $8, %%mm4 \n\t" \
602 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
603 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
604 "psraw $7, %%mm1 \n\t" \
605 "psraw $7, %%mm7 \n\t"
606 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
608 // Averages the two chroma lines (vertical chrominance interpolation with
// fixed 1:1 weights) before running the standard YUV->RGB matrix; luma is
// taken from buf0 only.  Output layout matches the other RGB macros:
// B=mm2, G=mm4, R=mm5 packed, mm7 zeroed.
// NOTE(review): loop label/branch lines are not visible in this chunk.
609 #define REAL_YSCALEYUV2RGB1b(index, c) \
610 "xor "#index", "#index" \n\t"\
613 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
614 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
615 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
616 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
617 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
618 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
619 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
620 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
621 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
622 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
623 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
624 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
625 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
626 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
627 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
628 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
629 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
630 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
631 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
632 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
633 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
634 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
635 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
636 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
637 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
638 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
639 "paddw %%mm3, %%mm4 \n\t"\
640 "movq %%mm2, %%mm0 \n\t"\
641 "movq %%mm5, %%mm6 \n\t"\
642 "movq %%mm4, %%mm3 \n\t"\
643 "punpcklwd %%mm2, %%mm2 \n\t"\
644 "punpcklwd %%mm5, %%mm5 \n\t"\
645 "punpcklwd %%mm4, %%mm4 \n\t"\
646 "paddw %%mm1, %%mm2 \n\t"\
647 "paddw %%mm1, %%mm5 \n\t"\
648 "paddw %%mm1, %%mm4 \n\t"\
649 "punpckhwd %%mm0, %%mm0 \n\t"\
650 "punpckhwd %%mm6, %%mm6 \n\t"\
651 "punpckhwd %%mm3, %%mm3 \n\t"\
652 "paddw %%mm7, %%mm0 \n\t"\
653 "paddw %%mm7, %%mm6 \n\t"\
654 "paddw %%mm7, %%mm3 \n\t"\
655 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
656 "packuswb %%mm0, %%mm2 \n\t"\
657 "packuswb %%mm6, %%mm5 \n\t"\
658 "packuswb %%mm3, %%mm4 \n\t"\
659 "pxor %%mm7, %%mm7 \n\t"
660 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* REAL_WRITEBGR32(dst, dstw, index): interleaves the packed B/G/R bytes
 * (mm2/mm4/mm5, mm7=0) into four 0RGB dwords and stores 8 pixels (32 B)
 * at dst + index*4, then advances index by 8 and compares against dstw.
 * NOTE(review): the branch consuming the cmp result is not visible here. */
662 #define REAL_WRITEBGR32(dst, dstw, index) \
663 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
664 "movq %%mm2, %%mm1 \n\t" /* B */\
665 "movq %%mm5, %%mm6 \n\t" /* R */\
666 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
667 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
668 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
669 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
670 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
671 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
672 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
673 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
674 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
675 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
677 MOVNTQ(%%mm0, (dst, index, 4))\
678 MOVNTQ(%%mm2, 8(dst, index, 4))\
679 MOVNTQ(%%mm1, 16(dst, index, 4))\
680 MOVNTQ(%%mm3, 24(dst, index, 4))\
682 "add $8, "#index" \n\t"\
683 "cmp "#dstw", "#index" \n\t"\
685 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/* REAL_WRITEBGR16(dst, dstw, index): packs B/G/R (mm2/mm4/mm5) into RGB565:
 * masks to 5/6/5 significant bits (bF8/bFC), shifts the components into
 * place and ORs them together, storing 8 pixels (16 B) at dst + index*2.
 * NOTE(review): the branch consuming the cmp result is not visible here. */
687 #define REAL_WRITEBGR16(dst, dstw, index) \
688 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
689 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
690 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
691 "psrlq $3, %%mm2 \n\t"\
693 "movq %%mm2, %%mm1 \n\t"\
694 "movq %%mm4, %%mm3 \n\t"\
696 "punpcklbw %%mm7, %%mm3 \n\t"\
697 "punpcklbw %%mm5, %%mm2 \n\t"\
698 "punpckhbw %%mm7, %%mm4 \n\t"\
699 "punpckhbw %%mm5, %%mm1 \n\t"\
701 "psllq $3, %%mm3 \n\t"\
702 "psllq $3, %%mm4 \n\t"\
704 "por %%mm3, %%mm2 \n\t"\
705 "por %%mm4, %%mm1 \n\t"\
707 MOVNTQ(%%mm2, (dst, index, 2))\
708 MOVNTQ(%%mm1, 8(dst, index, 2))\
710 "add $8, "#index" \n\t"\
711 "cmp "#dstw", "#index" \n\t"\
713 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
/* REAL_WRITEBGR15(dst, dstw, index): same scheme as WRITEBGR16 but for
 * RGB555 — all three components masked to 5 bits (bF8), red pre-shifted
 * right by 1, green shifted by 2 instead of 3.  Stores 8 pixels (16 B).
 * NOTE(review): the branch consuming the cmp result is not visible here. */
715 #define REAL_WRITEBGR15(dst, dstw, index) \
716 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
717 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
718 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
719 "psrlq $3, %%mm2 \n\t"\
720 "psrlq $1, %%mm5 \n\t"\
722 "movq %%mm2, %%mm1 \n\t"\
723 "movq %%mm4, %%mm3 \n\t"\
725 "punpcklbw %%mm7, %%mm3 \n\t"\
726 "punpcklbw %%mm5, %%mm2 \n\t"\
727 "punpckhbw %%mm7, %%mm4 \n\t"\
728 "punpckhbw %%mm5, %%mm1 \n\t"\
730 "psllq $2, %%mm3 \n\t"\
731 "psllq $2, %%mm4 \n\t"\
733 "por %%mm3, %%mm2 \n\t"\
734 "por %%mm4, %%mm1 \n\t"\
736 MOVNTQ(%%mm2, (dst, index, 2))\
737 MOVNTQ(%%mm1, 8(dst, index, 2))\
739 "add $8, "#index" \n\t"\
740 "cmp "#dstw", "#index" \n\t"\
742 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
/* WRITEBGR24OLD(dst, dstw, index): legacy 24-bpp writer — builds four
 * 0RGB dwords, then squeezes out the padding bytes with shift/mask/or
 * sequences (bm* bit-mask constants) into three packed 8-byte stores
 * (24 bytes = 8 pixels), advancing dst by 24 per iteration.
 * NOTE(review): the branch consuming the cmp result is not visible here. */
744 #define WRITEBGR24OLD(dst, dstw, index) \
745 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
746 "movq %%mm2, %%mm1 \n\t" /* B */\
747 "movq %%mm5, %%mm6 \n\t" /* R */\
748 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
749 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
750 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
751 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
752 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
753 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
754 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
755 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
756 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
757 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
759 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
760 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
761 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
762 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
763 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
764 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
765 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
766 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
768 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
769 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
770 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
771 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
772 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
773 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
774 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
775 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
776 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
777 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
778 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
779 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
780 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
782 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
783 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
784 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
785 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
786 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
787 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
788 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
789 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
791 MOVNTQ(%%mm0, (dst))\
792 MOVNTQ(%%mm2, 8(dst))\
793 MOVNTQ(%%mm3, 16(dst))\
794 "add $24, "#dst" \n\t"\
796 "add $8, "#index" \n\t"\
797 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24MMX(dst, dstw, index): plain-MMX 24-bpp writer — expands the
 * four 0RGB dwords, shifts each into RGB00000/0RGBRGB0 form via
 * psllq/punpckhdq, then merges neighbouring pixels across quadword
 * boundaries with shift+or into three 8-byte stores (24 B = 8 pixels).
 * NOTE(review): the branch consuming the cmp result is not visible here. */
800 #define WRITEBGR24MMX(dst, dstw, index) \
801 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
802 "movq %%mm2, %%mm1 \n\t" /* B */\
803 "movq %%mm5, %%mm6 \n\t" /* R */\
804 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
805 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
806 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
807 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
808 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
809 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
810 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
811 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
812 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
813 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
815 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
816 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
817 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
818 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
820 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
821 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
822 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
823 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
825 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
826 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
827 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
828 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
830 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
831 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
832 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
833 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
834 MOVNTQ(%%mm0, (dst))\
836 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
837 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
838 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
839 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
840 MOVNTQ(%%mm6, 8(dst))\
842 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
843 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
844 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
845 MOVNTQ(%%mm5, 16(dst))\
847 "add $24, "#dst" \n\t"\
849 "add $8, "#index" \n\t"\
850 "cmp "#dstw", "#index" \n\t"\
853 #define WRITEBGR24MMX2(dst, dstw, index) \
854 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
855 "movq "MANGLE(M24A)", %%mm0 \n\t"\
856 "movq "MANGLE(M24C)", %%mm7 \n\t"\
857 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
858 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
859 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
861 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
862 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
863 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
865 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
866 "por %%mm1, %%mm6 \n\t"\
867 "por %%mm3, %%mm6 \n\t"\
868 MOVNTQ(%%mm6, (dst))\
870 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
871 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
872 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
873 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
875 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
876 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
877 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
879 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
880 "por %%mm3, %%mm6 \n\t"\
881 MOVNTQ(%%mm6, 8(dst))\
883 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
884 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
885 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
887 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
888 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
889 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
891 "por %%mm1, %%mm3 \n\t"\
892 "por %%mm3, %%mm6 \n\t"\
893 MOVNTQ(%%mm6, 16(dst))\
895 "add $24, "#dst" \n\t"\
897 "add $8, "#index" \n\t"\
898 "cmp "#dstw", "#index" \n\t"\
903 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
906 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
909 #define REAL_WRITEYUY2(dst, dstw, index) \
910 "packuswb %%mm3, %%mm3 \n\t"\
911 "packuswb %%mm4, %%mm4 \n\t"\
912 "packuswb %%mm7, %%mm1 \n\t"\
913 "punpcklbw %%mm4, %%mm3 \n\t"\
914 "movq %%mm1, %%mm7 \n\t"\
915 "punpcklbw %%mm3, %%mm1 \n\t"\
916 "punpckhbw %%mm3, %%mm7 \n\t"\
918 MOVNTQ(%%mm1, (dst, index, 2))\
919 MOVNTQ(%%mm7, 8(dst, index, 2))\
921 "add $8, "#index" \n\t"\
922 "cmp "#dstw", "#index" \n\t"\
924 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* Vertical output stage: applies the luma and chroma FIR filters across
 * several intermediate source lines and emits planar 8-bit YUV planes
 * (dest/uDest/vDest of widths dstW/chrDstW).  Dispatches to MMX asm
 * (with an accurate-rounding variant when SWS_ACCURATE_RND is set),
 * AltiVec, or the plain C fallback.
 * NOTE(review): the #ifdef/brace lines between these statements are not
 * visible in this chunk; verify structure against the full file. */
927 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
928 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
929 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
932 if(c->flags & SWS_ACCURATE_RND){
/* chroma: U at byte offset 0, V at byte offset 4096 of the intermediate
 * chroma buffer (matches the chrSrc[i+2048] int16 layout used by the C
 * paths elsewhere in this file) */
934 YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
935 YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
938 YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
/* fast MMX path (less accurate rounding) */
941 YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
942 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
945 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
/* AltiVec implementation */
949 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
950 chrFilter, chrSrc, chrFilterSize,
951 dest, uDest, vDest, dstW, chrDstW);
/* portable C fallback */
953 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
954 chrFilter, chrSrc, chrFilterSize,
955 dest, uDest, vDest, dstW, chrDstW);
956 #endif //!HAVE_ALTIVEC
/* Vertical output stage for interleaved-chroma formats (dstFormat is
 * forwarded — presumably NV12/NV21; confirm against yuv2nv12XinC).
 * No SIMD version exists, so this always delegates to the C
 * implementation. */
960 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
961 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
962 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
964 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
965 chrFilter, chrSrc, chrFilterSize,
966 dest, uDest, dstW, chrDstW, dstFormat);
/* 1:1 vertical output (no filtering): converts 16-bit intermediate
 * samples to 8-bit planes with a >>7 shift plus clipping.
 * NOTE(review): only the asm operand lists and fragments of the C
 * fallback are visible in this chunk; the asm string bodies and the
 * clipping of 'val'/'u' live on lines not shown here. */
969 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
970 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
/* U plane: pointers biased by chrDstW — presumably paired with a
 * negative index counting up to 0, as in the asm loops below; confirm */
977 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
/* V plane: V samples are stored 2048 int16_t after the U samples */
984 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
/* luma plane */
992 :: "r" (lumSrc + dstW), "r" (dest + dstW),
/* C fallback: luma */
998 for(i=0; i<dstW; i++)
1000 int val= lumSrc[i]>>7;
/* C fallback: chroma (u from chrSrc[i], v from chrSrc[i+2048]) */
1011 for(i=0; i<chrDstW; i++)
1014 int v=chrSrc[i + 2048]>>7;
1018 else if (u>255) u=255;
1020 else if (v>255) v=255;
1031 * vertical scale YV12 to RGB
1033 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1034 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1035 uint8_t *dest, long dstW, long dstY)
1039 if(c->flags & SWS_ACCURATE_RND){
1040 switch(c->dstFormat){
1042 YSCALEYUV2PACKEDX_ACCURATE
1044 WRITEBGR32(%4, %5, %%REGa)
1046 YSCALEYUV2PACKEDX_END
1049 YSCALEYUV2PACKEDX_ACCURATE
1051 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
1052 "add %4, %%"REG_b" \n\t"
1053 WRITEBGR24(%%REGb, %5, %%REGa)
1056 :: "r" (&c->redDither),
1057 "m" (dummy), "m" (dummy), "m" (dummy),
1058 "r" (dest), "m" (dstW)
1059 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
1063 YSCALEYUV2PACKEDX_ACCURATE
1065 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1067 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1068 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1069 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1072 WRITEBGR15(%4, %5, %%REGa)
1073 YSCALEYUV2PACKEDX_END
1076 YSCALEYUV2PACKEDX_ACCURATE
1078 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1080 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1081 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1082 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1085 WRITEBGR16(%4, %5, %%REGa)
1086 YSCALEYUV2PACKEDX_END
1089 YSCALEYUV2PACKEDX_ACCURATE
1090 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1092 "psraw $3, %%mm3 \n\t"
1093 "psraw $3, %%mm4 \n\t"
1094 "psraw $3, %%mm1 \n\t"
1095 "psraw $3, %%mm7 \n\t"
1096 WRITEYUY2(%4, %5, %%REGa)
1097 YSCALEYUV2PACKEDX_END
1101 switch(c->dstFormat)
1106 WRITEBGR32(%4, %5, %%REGa)
1107 YSCALEYUV2PACKEDX_END
1112 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
1113 "add %4, %%"REG_b" \n\t"
1114 WRITEBGR24(%%REGb, %5, %%REGa)
1116 :: "r" (&c->redDither),
1117 "m" (dummy), "m" (dummy), "m" (dummy),
1118 "r" (dest), "m" (dstW)
1119 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
1125 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1127 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1128 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1129 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1132 WRITEBGR15(%4, %5, %%REGa)
1133 YSCALEYUV2PACKEDX_END
1138 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1140 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1141 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1142 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1145 WRITEBGR16(%4, %5, %%REGa)
1146 YSCALEYUV2PACKEDX_END
1150 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1152 "psraw $3, %%mm3 \n\t"
1153 "psraw $3, %%mm4 \n\t"
1154 "psraw $3, %%mm1 \n\t"
1155 "psraw $3, %%mm7 \n\t"
1156 WRITEYUY2(%4, %5, %%REGa)
1157 YSCALEYUV2PACKEDX_END
1163 /* The following list of supported dstFormat values should
1164 match what's found in the body of altivec_yuv2packedX() */
1165 if(c->dstFormat==IMGFMT_ABGR || c->dstFormat==IMGFMT_BGRA ||
1166 c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
1167 c->dstFormat==IMGFMT_RGBA || c->dstFormat==IMGFMT_ARGB)
1168 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1169 chrFilter, chrSrc, chrFilterSize,
1173 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1174 chrFilter, chrSrc, chrFilterSize,
1179 * vertical bilinear scale YV12 to RGB
1181 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1182 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1184 int yalpha1=yalpha^4095;
1185 int uvalpha1=uvalpha^4095;
1189 if(flags&SWS_FULL_CHR_H_INT)
1199 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1200 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1202 "movq %%mm3, %%mm1 \n\t"
1203 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1204 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1206 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1207 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1209 "add $4, %%"REG_a" \n\t"
1210 "cmp %5, %%"REG_a" \n\t"
1214 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1215 "m" (yalpha1), "m" (uvalpha1)
1225 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1226 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1228 "movq %%mm3, %%mm1 \n\t"
1229 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1230 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1232 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1233 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1234 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1235 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1236 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1237 "movq %%mm1, %%mm2 \n\t"
1238 "psllq $48, %%mm1 \n\t" // 000000BG
1239 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1241 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1242 "psrld $16, %%mm2 \n\t" // R000R000
1243 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1244 "por %%mm2, %%mm1 \n\t" // RBGRR000
1246 "mov %4, %%"REG_b" \n\t"
1247 "add %%"REG_a", %%"REG_b" \n\t"
1251 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1252 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1254 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1255 "psrlq $32, %%mm3 \n\t"
1256 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1257 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1259 "add $4, %%"REG_a" \n\t"
1260 "cmp %5, %%"REG_a" \n\t"
1263 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1264 "m" (yalpha1), "m" (uvalpha1)
1265 : "%"REG_a, "%"REG_b
1273 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1274 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1275 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1277 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1278 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1279 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1281 "psrlw $3, %%mm3 \n\t"
1282 "psllw $2, %%mm1 \n\t"
1283 "psllw $7, %%mm0 \n\t"
1284 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1285 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1287 "por %%mm3, %%mm1 \n\t"
1288 "por %%mm1, %%mm0 \n\t"
1290 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1292 "add $4, %%"REG_a" \n\t"
1293 "cmp %5, %%"REG_a" \n\t"
1296 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1297 "m" (yalpha1), "m" (uvalpha1)
1306 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1307 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1308 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1310 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1311 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1312 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1314 "psrlw $3, %%mm3 \n\t"
1315 "psllw $3, %%mm1 \n\t"
1316 "psllw $8, %%mm0 \n\t"
1317 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1318 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1320 "por %%mm3, %%mm1 \n\t"
1321 "por %%mm1, %%mm0 \n\t"
1323 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1325 "add $4, %%"REG_a" \n\t"
1326 "cmp %5, %%"REG_a" \n\t"
1329 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1330 "m" (yalpha1), "m" (uvalpha1)
1339 if(dstFormat==IMGFMT_BGR32)
1342 #ifdef WORDS_BIGENDIAN
1345 for(i=0;i<dstW;i++){
1346 // vertical linear interpolation && yuv2rgb in a single step:
1347 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1348 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1349 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1350 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1351 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1352 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1356 else if(dstFormat==IMGFMT_BGR24)
1359 for(i=0;i<dstW;i++){
1360 // vertical linear interpolation && yuv2rgb in a single step:
1361 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1362 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1363 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1364 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1365 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1366 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1370 else if(dstFormat==IMGFMT_BGR16)
1373 for(i=0;i<dstW;i++){
1374 // vertical linear interpolation && yuv2rgb in a single step:
1375 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1376 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1377 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1379 ((uint16_t*)dest)[i] =
1380 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1381 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1382 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1385 else if(dstFormat==IMGFMT_BGR15)
1388 for(i=0;i<dstW;i++){
1389 // vertical linear interpolation && yuv2rgb in a single step:
1390 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1391 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1392 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1394 ((uint16_t*)dest)[i] =
1395 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1396 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1397 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1405 switch(c->dstFormat)
1407 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1410 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1411 "mov %4, %%"REG_b" \n\t"
1412 "push %%"REG_BP" \n\t"
1413 YSCALEYUV2RGB(%%REGBP, %5)
1414 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1415 "pop %%"REG_BP" \n\t"
1416 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1418 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1424 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1425 "mov %4, %%"REG_b" \n\t"
1426 "push %%"REG_BP" \n\t"
1427 YSCALEYUV2RGB(%%REGBP, %5)
1428 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1429 "pop %%"REG_BP" \n\t"
1430 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1431 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1437 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1438 "mov %4, %%"REG_b" \n\t"
1439 "push %%"REG_BP" \n\t"
1440 YSCALEYUV2RGB(%%REGBP, %5)
1441 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1443 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1444 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1445 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1448 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1449 "pop %%"REG_BP" \n\t"
1450 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1452 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1458 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1459 "mov %4, %%"REG_b" \n\t"
1460 "push %%"REG_BP" \n\t"
1461 YSCALEYUV2RGB(%%REGBP, %5)
1462 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1464 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1465 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1466 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1469 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1470 "pop %%"REG_BP" \n\t"
1471 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1472 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1478 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1479 "mov %4, %%"REG_b" \n\t"
1480 "push %%"REG_BP" \n\t"
1481 YSCALEYUV2PACKED(%%REGBP, %5)
1482 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1483 "pop %%"REG_BP" \n\t"
1484 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1485 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1492 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1496 * YV12 to RGB without scaling or interpolating
1498 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1499 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1501 const int yalpha1=0;
1504 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1505 const int yalpha= 4096; //FIXME ...
1507 if(flags&SWS_FULL_CHR_H_INT)
1509 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1514 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1520 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1521 "mov %4, %%"REG_b" \n\t"
1522 "push %%"REG_BP" \n\t"
1523 YSCALEYUV2RGB1(%%REGBP, %5)
1524 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1525 "pop %%"REG_BP" \n\t"
1526 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1528 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1534 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1535 "mov %4, %%"REG_b" \n\t"
1536 "push %%"REG_BP" \n\t"
1537 YSCALEYUV2RGB1(%%REGBP, %5)
1538 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1539 "pop %%"REG_BP" \n\t"
1540 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1542 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1548 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1549 "mov %4, %%"REG_b" \n\t"
1550 "push %%"REG_BP" \n\t"
1551 YSCALEYUV2RGB1(%%REGBP, %5)
1552 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1554 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1555 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1556 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1558 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1559 "pop %%"REG_BP" \n\t"
1560 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1562 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1568 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1569 "mov %4, %%"REG_b" \n\t"
1570 "push %%"REG_BP" \n\t"
1571 YSCALEYUV2RGB1(%%REGBP, %5)
1572 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1574 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1575 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1576 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1579 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1580 "pop %%"REG_BP" \n\t"
1581 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1583 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1589 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1590 "mov %4, %%"REG_b" \n\t"
1591 "push %%"REG_BP" \n\t"
1592 YSCALEYUV2PACKED1(%%REGBP, %5)
1593 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1594 "pop %%"REG_BP" \n\t"
1595 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1597 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1609 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1610 "mov %4, %%"REG_b" \n\t"
1611 "push %%"REG_BP" \n\t"
1612 YSCALEYUV2RGB1b(%%REGBP, %5)
1613 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1614 "pop %%"REG_BP" \n\t"
1615 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1617 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1623 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1624 "mov %4, %%"REG_b" \n\t"
1625 "push %%"REG_BP" \n\t"
1626 YSCALEYUV2RGB1b(%%REGBP, %5)
1627 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1628 "pop %%"REG_BP" \n\t"
1629 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1631 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1637 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1638 "mov %4, %%"REG_b" \n\t"
1639 "push %%"REG_BP" \n\t"
1640 YSCALEYUV2RGB1b(%%REGBP, %5)
1641 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1643 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1644 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1645 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1647 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1648 "pop %%"REG_BP" \n\t"
1649 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1651 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1657 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1658 "mov %4, %%"REG_b" \n\t"
1659 "push %%"REG_BP" \n\t"
1660 YSCALEYUV2RGB1b(%%REGBP, %5)
1661 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1663 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1664 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1665 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1668 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1669 "pop %%"REG_BP" \n\t"
1670 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1672 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1678 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1679 "mov %4, %%"REG_b" \n\t"
1680 "push %%"REG_BP" \n\t"
1681 YSCALEYUV2PACKED1b(%%REGBP, %5)
1682 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1683 "pop %%"REG_BP" \n\t"
1684 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1686 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1693 if( uvalpha < 2048 )
1695 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1697 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1701 //FIXME yuy2* can read up to 7 samples too much
/* Extract the luma plane from packed YUY2 (Y U Y V ...) input.
 * MMX path handles 8 output pixels per iteration: bm01010101 masks the
 * even (luma) bytes, packuswb narrows them, and 8 bytes are stored at
 * once.  The src/dst pointers are pre-biased by width so the index
 * register can run from -width up to 0. */
1703 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1707 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1708 "mov %0, %%"REG_a" \n\t"
1710 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1711 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
/* keep even bytes (luma) of 16 input bytes */
1712 "pand %%mm2, %%mm0 \n\t"
1713 "pand %%mm2, %%mm1 \n\t"
1714 "packuswb %%mm1, %%mm0 \n\t"
1715 "movq %%mm0, (%2, %%"REG_a") \n\t"
1716 "add $8, %%"REG_a" \n\t"
1718 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
/* C fallback: dst[i] = src[2*i] (body line not visible in this chunk) */
1723 for(i=0; i<width; i++)
/* Extract and deinterleave chroma from two adjacent YUY2 lines into
 * separate U and V planes (the C fallback averages the two lines).
 * NOTE(review): asm lines between the loads and the word shifts are not
 * visible in this chunk — presumably the src1/src2 averaging; confirm
 * against the full file. */
1728 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1730 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1732 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1733 "mov %0, %%"REG_a" \n\t"
1735 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1736 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1737 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1738 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
/* drop luma bytes, leaving the interleaved U/V chroma bytes */
1741 "psrlw $8, %%mm0 \n\t"
1742 "psrlw $8, %%mm1 \n\t"
1743 "packuswb %%mm1, %%mm0 \n\t"
1744 "movq %%mm0, %%mm1 \n\t"
/* split the U0 V0 U1 V1 ... stream: mm0 -> V bytes, mm1 -> U bytes */
1745 "psrlw $8, %%mm0 \n\t"
1746 "pand %%mm4, %%mm1 \n\t"
1747 "packuswb %%mm0, %%mm0 \n\t"
1748 "packuswb %%mm1, %%mm1 \n\t"
/* %4 = dstV, %3 = dstU (see operand list below) */
1749 "movd %%mm0, (%4, %%"REG_a") \n\t"
1750 "movd %%mm1, (%3, %%"REG_a") \n\t"
1751 "add $4, %%"REG_a" \n\t"
1753 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
/* C fallback: average the chroma of the two source lines */
1758 for(i=0; i<width; i++)
1760 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1761 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1766 //this is almost identical to the previous function, and exists only because yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses
/* Extract the luma plane from packed UYVY (U Y V Y ...) input: luma is
 * in the odd bytes, so a >>8 word shift isolates it (cf. yuy2ToY above,
 * which masks the even bytes instead). */
1767 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1771 "mov %0, %%"REG_a" \n\t"
1773 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1774 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
/* shift each word right to keep the odd (luma) bytes */
1775 "psrlw $8, %%mm0 \n\t"
1776 "psrlw $8, %%mm1 \n\t"
1777 "packuswb %%mm1, %%mm0 \n\t"
1778 "movq %%mm0, (%2, %%"REG_a") \n\t"
1779 "add $8, %%"REG_a" \n\t"
1781 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
/* C fallback: dst[i] = src[2*i + 1] (body line not visible in chunk) */
1786 for(i=0; i<width; i++)
/* Extract and deinterleave chroma from two adjacent UYVY lines into U
 * and V planes; chroma sits in the even bytes here, so pand with
 * bm01010101 replaces the >>8 used in the YUY2 variant.
 * NOTE(review): some asm lines between the loads and the masks are not
 * visible in this chunk — presumably the src1/src2 averaging. */
1791 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1793 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1795 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1796 "mov %0, %%"REG_a" \n\t"
1798 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1799 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1800 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1801 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
/* keep the even (chroma) bytes */
1804 "pand %%mm4, %%mm0 \n\t"
1805 "pand %%mm4, %%mm1 \n\t"
1806 "packuswb %%mm1, %%mm0 \n\t"
1807 "movq %%mm0, %%mm1 \n\t"
/* split interleaved chroma: mm0 -> V bytes, mm1 -> U bytes */
1808 "psrlw $8, %%mm0 \n\t"
1809 "pand %%mm4, %%mm1 \n\t"
1810 "packuswb %%mm0, %%mm0 \n\t"
1811 "packuswb %%mm1, %%mm1 \n\t"
/* %4 = dstV, %3 = dstU (see operand list below) */
1812 "movd %%mm0, (%4, %%"REG_a") \n\t"
1813 "movd %%mm1, (%3, %%"REG_a") \n\t"
1814 "add $4, %%"REG_a" \n\t"
1816 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
/* C fallback: average the chroma of the two source lines */
1821 for(i=0; i<width; i++)
1823 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1824 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
/* Convert BGR32 (little-endian pixel, byte order B G R X) to 8-bit
 * luma.  The constant (33<<(RGB2YUV_SHIFT-1)) equals
 * 16.5<<RGB2YUV_SHIFT, i.e. it folds the Y offset of 16 and the 0.5
 * rounding term into a single addend.
 * NOTE(review): the uint32_t* cast assumes src is 4-byte aligned and a
 * little-endian byte layout — confirm for new targets. */
1829 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1832 for(i=0; i<width; i++)
1834 int b= ((uint32_t*)src)[i]&0xFF;
1835 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1836 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1838 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/* Convert BGR32 to horizontally/vertically subsampled U/V: sums a 2x2
 * pixel block (two pixels per line from src1/src2) SWAR-style — 'l'
 * accumulates the B and R byte lanes (mask 0xFF00FF), 'h' the G lane —
 * then applies the RGB->UV coefficients with the /4 average folded into
 * the RGB2YUV_SHIFT+2 shift.
 * NOTE(review): the lines deriving g and r from l/h are not visible in
 * this chunk. */
1842 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1845 for(i=0; i<width; i++)
1847 const int a= ((uint32_t*)src1)[2*i+0];
1848 const int e= ((uint32_t*)src1)[2*i+1];
1849 const int c= ((uint32_t*)src2)[2*i+0];
1850 const int d= ((uint32_t*)src2)[2*i+1];
1851 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1852 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1853 const int b= l&0x3FF;
1857 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1858 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1862 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1866 "mov %2, %%"REG_a" \n\t"
1867 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1868 "movq "MANGLE(w1111)", %%mm5 \n\t"
1869 "pxor %%mm7, %%mm7 \n\t"
1870 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1873 PREFETCH" 64(%0, %%"REG_b") \n\t"
1874 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1875 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
1876 "punpcklbw %%mm7, %%mm0 \n\t"
1877 "punpcklbw %%mm7, %%mm1 \n\t"
1878 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
1879 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
1880 "punpcklbw %%mm7, %%mm2 \n\t"
1881 "punpcklbw %%mm7, %%mm3 \n\t"
1882 "pmaddwd %%mm6, %%mm0 \n\t"
1883 "pmaddwd %%mm6, %%mm1 \n\t"
1884 "pmaddwd %%mm6, %%mm2 \n\t"
1885 "pmaddwd %%mm6, %%mm3 \n\t"
1886 #ifndef FAST_BGR2YV12
1887 "psrad $8, %%mm0 \n\t"
1888 "psrad $8, %%mm1 \n\t"
1889 "psrad $8, %%mm2 \n\t"
1890 "psrad $8, %%mm3 \n\t"
1892 "packssdw %%mm1, %%mm0 \n\t"
1893 "packssdw %%mm3, %%mm2 \n\t"
1894 "pmaddwd %%mm5, %%mm0 \n\t"
1895 "pmaddwd %%mm5, %%mm2 \n\t"
1896 "packssdw %%mm2, %%mm0 \n\t"
1897 "psraw $7, %%mm0 \n\t"
1899 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1900 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
1901 "punpcklbw %%mm7, %%mm4 \n\t"
1902 "punpcklbw %%mm7, %%mm1 \n\t"
1903 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
1904 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
1905 "punpcklbw %%mm7, %%mm2 \n\t"
1906 "punpcklbw %%mm7, %%mm3 \n\t"
1907 "pmaddwd %%mm6, %%mm4 \n\t"
1908 "pmaddwd %%mm6, %%mm1 \n\t"
1909 "pmaddwd %%mm6, %%mm2 \n\t"
1910 "pmaddwd %%mm6, %%mm3 \n\t"
1911 #ifndef FAST_BGR2YV12
1912 "psrad $8, %%mm4 \n\t"
1913 "psrad $8, %%mm1 \n\t"
1914 "psrad $8, %%mm2 \n\t"
1915 "psrad $8, %%mm3 \n\t"
1917 "packssdw %%mm1, %%mm4 \n\t"
1918 "packssdw %%mm3, %%mm2 \n\t"
1919 "pmaddwd %%mm5, %%mm4 \n\t"
1920 "pmaddwd %%mm5, %%mm2 \n\t"
1921 "add $24, %%"REG_b" \n\t"
1922 "packssdw %%mm2, %%mm4 \n\t"
1923 "psraw $7, %%mm4 \n\t"
1925 "packuswb %%mm4, %%mm0 \n\t"
1926 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1928 "movq %%mm0, (%1, %%"REG_a") \n\t"
1929 "add $8, %%"REG_a" \n\t"
1931 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1932 : "%"REG_a, "%"REG_b
1936 for(i=0; i<width; i++)
1942 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1947 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1951 "mov %4, %%"REG_a" \n\t"
1952 "movq "MANGLE(w1111)", %%mm5 \n\t"
1953 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1954 "pxor %%mm7, %%mm7 \n\t"
1955 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
1956 "add %%"REG_b", %%"REG_b" \n\t"
1959 PREFETCH" 64(%0, %%"REG_b") \n\t"
1960 PREFETCH" 64(%1, %%"REG_b") \n\t"
1961 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1962 "movq (%0, %%"REG_b"), %%mm0 \n\t"
1963 "movq (%1, %%"REG_b"), %%mm1 \n\t"
1964 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
1965 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
1968 "movq %%mm0, %%mm1 \n\t"
1969 "movq %%mm2, %%mm3 \n\t"
1970 "psrlq $24, %%mm0 \n\t"
1971 "psrlq $24, %%mm2 \n\t"
1974 "punpcklbw %%mm7, %%mm0 \n\t"
1975 "punpcklbw %%mm7, %%mm2 \n\t"
1977 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1978 "movd (%1, %%"REG_b"), %%mm1 \n\t"
1979 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
1980 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
1981 "punpcklbw %%mm7, %%mm0 \n\t"
1982 "punpcklbw %%mm7, %%mm1 \n\t"
1983 "punpcklbw %%mm7, %%mm2 \n\t"
1984 "punpcklbw %%mm7, %%mm3 \n\t"
1985 "paddw %%mm1, %%mm0 \n\t"
1986 "paddw %%mm3, %%mm2 \n\t"
1987 "paddw %%mm2, %%mm0 \n\t"
1988 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
1989 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
1990 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
1991 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
1992 "punpcklbw %%mm7, %%mm4 \n\t"
1993 "punpcklbw %%mm7, %%mm1 \n\t"
1994 "punpcklbw %%mm7, %%mm2 \n\t"
1995 "punpcklbw %%mm7, %%mm3 \n\t"
1996 "paddw %%mm1, %%mm4 \n\t"
1997 "paddw %%mm3, %%mm2 \n\t"
1998 "paddw %%mm4, %%mm2 \n\t"
1999 "psrlw $2, %%mm0 \n\t"
2000 "psrlw $2, %%mm2 \n\t"
2002 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2003 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2005 "pmaddwd %%mm0, %%mm1 \n\t"
2006 "pmaddwd %%mm2, %%mm3 \n\t"
2007 "pmaddwd %%mm6, %%mm0 \n\t"
2008 "pmaddwd %%mm6, %%mm2 \n\t"
2009 #ifndef FAST_BGR2YV12
2010 "psrad $8, %%mm0 \n\t"
2011 "psrad $8, %%mm1 \n\t"
2012 "psrad $8, %%mm2 \n\t"
2013 "psrad $8, %%mm3 \n\t"
2015 "packssdw %%mm2, %%mm0 \n\t"
2016 "packssdw %%mm3, %%mm1 \n\t"
2017 "pmaddwd %%mm5, %%mm0 \n\t"
2018 "pmaddwd %%mm5, %%mm1 \n\t"
2019 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2020 "psraw $7, %%mm0 \n\t"
2022 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2023 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
2024 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
2025 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
2026 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
2029 "movq %%mm4, %%mm1 \n\t"
2030 "movq %%mm2, %%mm3 \n\t"
2031 "psrlq $24, %%mm4 \n\t"
2032 "psrlq $24, %%mm2 \n\t"
2035 "punpcklbw %%mm7, %%mm4 \n\t"
2036 "punpcklbw %%mm7, %%mm2 \n\t"
2038 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2039 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
2040 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
2041 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
2042 "punpcklbw %%mm7, %%mm4 \n\t"
2043 "punpcklbw %%mm7, %%mm1 \n\t"
2044 "punpcklbw %%mm7, %%mm2 \n\t"
2045 "punpcklbw %%mm7, %%mm3 \n\t"
2046 "paddw %%mm1, %%mm4 \n\t"
2047 "paddw %%mm3, %%mm2 \n\t"
2048 "paddw %%mm2, %%mm4 \n\t"
2049 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
2050 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
2051 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
2052 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
2053 "punpcklbw %%mm7, %%mm5 \n\t"
2054 "punpcklbw %%mm7, %%mm1 \n\t"
2055 "punpcklbw %%mm7, %%mm2 \n\t"
2056 "punpcklbw %%mm7, %%mm3 \n\t"
2057 "paddw %%mm1, %%mm5 \n\t"
2058 "paddw %%mm3, %%mm2 \n\t"
2059 "paddw %%mm5, %%mm2 \n\t"
2060 "movq "MANGLE(w1111)", %%mm5 \n\t"
2061 "psrlw $2, %%mm4 \n\t"
2062 "psrlw $2, %%mm2 \n\t"
2064 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2065 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2067 "pmaddwd %%mm4, %%mm1 \n\t"
2068 "pmaddwd %%mm2, %%mm3 \n\t"
2069 "pmaddwd %%mm6, %%mm4 \n\t"
2070 "pmaddwd %%mm6, %%mm2 \n\t"
2071 #ifndef FAST_BGR2YV12
2072 "psrad $8, %%mm4 \n\t"
2073 "psrad $8, %%mm1 \n\t"
2074 "psrad $8, %%mm2 \n\t"
2075 "psrad $8, %%mm3 \n\t"
2077 "packssdw %%mm2, %%mm4 \n\t"
2078 "packssdw %%mm3, %%mm1 \n\t"
2079 "pmaddwd %%mm5, %%mm4 \n\t"
2080 "pmaddwd %%mm5, %%mm1 \n\t"
2081 "add $24, %%"REG_b" \n\t"
2082 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2083 "psraw $7, %%mm4 \n\t"
2085 "movq %%mm0, %%mm1 \n\t"
2086 "punpckldq %%mm4, %%mm0 \n\t"
2087 "punpckhdq %%mm4, %%mm1 \n\t"
2088 "packsswb %%mm1, %%mm0 \n\t"
2089 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2091 "movd %%mm0, (%2, %%"REG_a") \n\t"
2092 "punpckhdq %%mm0, %%mm0 \n\t"
2093 "movd %%mm0, (%3, %%"REG_a") \n\t"
2094 "add $4, %%"REG_a" \n\t"
2096 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2097 : "%"REG_a, "%"REG_b
2101 for(i=0; i<width; i++)
2103 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2104 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2105 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2107 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2108 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/* Convert 16-bit RGB565 to luma.  The 5-bit B/R channels are weighted
 * 2*BY / 2*RY against the 6-bit G channel, and the reduced shift
 * (RGB2YUV_SHIFT-2) rescales the narrower channels.
 * NOTE(review): the lines extracting b and g from d are not visible in
 * this chunk. */
2113 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2116 for(i=0; i<width; i++)
2118 int d= ((uint16_t*)src)[i];
2121 int r= (d>>11)&0x1F;
2123 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/* Convert RGB565 to subsampled U/V, averaging a 2x2 block.  Two 16-bit
 * pixels are loaded at once as a uint32_t and the channel sums are
 * accumulated SWAR-style in dl/dh before the weighted RGB->UV sum.
 * NOTE(review): the lines deriving d, b and g from dl/dh2 are not
 * visible in this chunk. */
2127 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2130 for(i=0; i<width; i++)
2132 int d0= ((uint32_t*)src1)[i];
2133 int d1= ((uint32_t*)src2)[i];
2135 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
2136 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
2138 int dh2= (dh>>11) + (dh<<21);
2142 int r= (d>>11)&0x7F;
2144 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
2145 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
/* Convert 15-bit RGB555 to luma; all three channels are 5 bits, so the
 * plain RY/GY/BY weights are used with shift RGB2YUV_SHIFT-3.
 * NOTE(review): the lines extracting b and g from d are not visible in
 * this chunk. */
2149 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2152 for(i=0; i<width; i++)
2154 int d= ((uint16_t*)src)[i];
2157 int r= (d>>10)&0x1F;
2159 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/* Convert RGB555 to subsampled U/V over a 2x2 block, using the same
 * SWAR channel-summing trick as the RGB565 variant above (different
 * masks for the 5-5-5 layout).
 * NOTE(review): the lines deriving d, b and g are not visible in this
 * chunk. */
2163 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2166 for(i=0; i<width; i++)
2168 int d0= ((uint32_t*)src1)[i];
2169 int d1= ((uint32_t*)src2)[i];
2171 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
2172 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
2174 int dh2= (dh>>11) + (dh<<21);
2178 int r= (d>>10)&0x7F;
2180 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
2181 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
/* Convert RGB32 (byte order R G B X) to luma — mirror of bgr32ToY with
 * the R and B byte positions swapped; same folded 16-offset/rounding
 * constant (33<<(RGB2YUV_SHIFT-1)). */
2186 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2189 for(i=0; i<width; i++)
2191 int r= ((uint32_t*)src)[i]&0xFF;
2192 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2193 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2195 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/* Convert RGB32 to subsampled U/V over a 2x2 block — mirror of
 * bgr32ToUV with R and B swapped: here 'l' accumulates the R and B byte
 * lanes and 'h' the G lane.
 * NOTE(review): the lines deriving g and b from l/h are not visible in
 * this chunk. */
2199 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2202 for(i=0; i<width; i++)
2204 const int a= ((uint32_t*)src1)[2*i+0];
2205 const int e= ((uint32_t*)src1)[2*i+1];
2206 const int c= ((uint32_t*)src2)[2*i+0];
2207 const int d= ((uint32_t*)src2)[2*i+1];
2208 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
2209 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
2210 const int r= l&0x3FF;
2214 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2215 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/* C reference path: luma conversion for packed 24-bit RGB.
 * Same rounding/offset constant as rgb32ToY (33 == 2*16 + 1).
 * NOTE(review): the r/g/b byte loads (src[3*i+0..2]) are elided in this
 * chunk -- presumed analogous to rgb24ToUV below; confirm in full file. */
2219 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2222 for(i=0; i<width; i++)
2228 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/* C reference path: chroma conversion for packed 24-bit RGB; each output
 * sample averages a 2x2 block (two adjacent 3-byte pixels from each of
 * two source lines), so r/g/b are sums of four 8-bit components. */
2232 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2235 for(i=0; i<width; i++)
2237 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2238 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2239 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2241 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; /* +2 in the shift divides the 4-pixel sum */
2242 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2247 // Bilinear / Bicubic scaling
/* Horizontal FIR scaler. Reference semantics (see the plain-C fallback at
 * the bottom): for each output sample i,
 *   dst[i] = clip( (sum_j src[filterPos[i]+j] * filter[filterSize*i+j]) >> 7,
 *                  0, 2^15 - 1 )
 * On x86 three MMX fast paths exist (filterSize 4, 8, and generic); on
 * AltiVec builds the work is delegated to hScale_altivec_real.
 * NOTE(review): the #if/#else scaffolding and asm loop labels are elided
 * in this chunk; the asm below clobbers EBX/RBX (REG_b), which the full
 * file guards for PIC builds -- not visible here. */
2248 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2249 int16_t *filter, int16_t *filterPos, long filterSize)
2252 assert(filterSize % 4 == 0 && filterSize>0);
2253 if(filterSize==4) // always true for upscaling, sometimes for downscaling too
2255 long counter= -2*dstW;
2257 filterPos-= counter/2; /* bias so that indexing with the negative counter starts at element 0 */
/* MMX path, filterSize==4: two output samples per iteration, taps in
 * mm1/mm3, pixels zero-extended via punpcklbw, multiply-accumulate with
 * pmaddwd, then >>8, re-pack and a final pmaddwd by w02 to finish the
 * horizontal sums. */
2260 "pxor %%mm7, %%mm7 \n\t"
2261 "movq "MANGLE(w02)", %%mm6 \n\t"
2262 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2263 "mov %%"REG_a", %%"REG_BP" \n\t"
2266 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2267 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2268 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2269 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2270 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2271 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2272 "punpcklbw %%mm7, %%mm0 \n\t"
2273 "punpcklbw %%mm7, %%mm2 \n\t"
2274 "pmaddwd %%mm1, %%mm0 \n\t"
2275 "pmaddwd %%mm2, %%mm3 \n\t"
2276 "psrad $8, %%mm0 \n\t"
2277 "psrad $8, %%mm3 \n\t"
2278 "packssdw %%mm3, %%mm0 \n\t"
2279 "pmaddwd %%mm6, %%mm0 \n\t"
2280 "packssdw %%mm0, %%mm0 \n\t"
2281 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2282 "add $4, %%"REG_BP" \n\t"
2285 "pop %%"REG_BP" \n\t"
2287 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2291 else if(filterSize==8)
2293 long counter= -2*dstW;
2295 filterPos-= counter/2;
/* MMX path, filterSize==8: same structure as above but two pmaddwd
 * rounds per output pair (taps 0..3 then 4..7), accumulated with paddd. */
2298 "pxor %%mm7, %%mm7 \n\t"
2299 "movq "MANGLE(w02)", %%mm6 \n\t"
2300 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2301 "mov %%"REG_a", %%"REG_BP" \n\t"
2304 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2305 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2306 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2307 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2308 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2309 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2310 "punpcklbw %%mm7, %%mm0 \n\t"
2311 "punpcklbw %%mm7, %%mm2 \n\t"
2312 "pmaddwd %%mm1, %%mm0 \n\t"
2313 "pmaddwd %%mm2, %%mm3 \n\t"
2315 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2316 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2317 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2318 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2319 "punpcklbw %%mm7, %%mm4 \n\t"
2320 "punpcklbw %%mm7, %%mm2 \n\t"
2321 "pmaddwd %%mm1, %%mm4 \n\t"
2322 "pmaddwd %%mm2, %%mm5 \n\t"
2323 "paddd %%mm4, %%mm0 \n\t"
2324 "paddd %%mm5, %%mm3 \n\t"
2326 "psrad $8, %%mm0 \n\t"
2327 "psrad $8, %%mm3 \n\t"
2328 "packssdw %%mm3, %%mm0 \n\t"
2329 "pmaddwd %%mm6, %%mm0 \n\t"
2330 "packssdw %%mm0, %%mm0 \n\t"
2331 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2332 "add $4, %%"REG_BP" \n\t"
2335 "pop %%"REG_BP" \n\t"
2337 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* Generic MMX path for arbitrary (multiple-of-4) filterSize: inner loop
 * walks the taps 4 at a time, accumulating two output samples in mm4/mm5;
 * "offset" (src+filterSize) is the inner-loop end marker compared against
 * REG_c. */
2343 uint8_t *offset = src+filterSize;
2344 long counter= -2*dstW;
2345 // filter-= counter*filterSize/2;
2346 filterPos-= counter/2;
2349 "pxor %%mm7, %%mm7 \n\t"
2350 "movq "MANGLE(w02)", %%mm6 \n\t"
2353 "mov %2, %%"REG_c" \n\t"
2354 "movzwl (%%"REG_c", %0), %%eax \n\t"
2355 "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2356 "mov %5, %%"REG_c" \n\t"
2357 "pxor %%mm4, %%mm4 \n\t"
2358 "pxor %%mm5, %%mm5 \n\t"
2360 "movq (%1), %%mm1 \n\t"
2361 "movq (%1, %6), %%mm3 \n\t"
2362 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2363 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2364 "punpcklbw %%mm7, %%mm0 \n\t"
2365 "punpcklbw %%mm7, %%mm2 \n\t"
2366 "pmaddwd %%mm1, %%mm0 \n\t"
2367 "pmaddwd %%mm2, %%mm3 \n\t"
2368 "paddd %%mm3, %%mm5 \n\t"
2369 "paddd %%mm0, %%mm4 \n\t"
2371 "add $4, %%"REG_c" \n\t"
2372 "cmp %4, %%"REG_c" \n\t"
2375 "psrad $8, %%mm4 \n\t"
2376 "psrad $8, %%mm5 \n\t"
2377 "packssdw %%mm5, %%mm4 \n\t"
2378 "pmaddwd %%mm6, %%mm4 \n\t"
2379 "packssdw %%mm4, %%mm4 \n\t"
2380 "mov %3, %%"REG_a" \n\t"
2381 "movd %%mm4, (%%"REG_a", %0) \n\t"
2385 : "+r" (counter), "+r" (filter)
2386 : "m" (filterPos), "m" (dst), "m"(offset),
2387 "m" (src), "r" (filterSize*2)
2388 : "%"REG_b, "%"REG_a, "%"REG_c
/* AltiVec builds: hand the whole job to the vector implementation. */
2393 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
/* Plain-C fallback: this is the reference behavior for all paths above. */
2396 for(i=0; i<dstW; i++)
2399 int srcPos= filterPos[i];
2401 // printf("filterPos: %d\n", filterPos[i]);
2402 for(j=0; j<filterSize; j++)
2404 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2405 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2407 // filter += hFilterSize;
2408 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2414 // *** horizontal scale Y line to temp buffer
/* Horizontally scale one luma line into dst (16-bit intermediate samples,
 * value = 8-bit luma << 7 in the bilinear paths).
 * Packed/RGB source formats are first converted to planar 8-bit luma in
 * formatConvBuffer. Then one of three paths runs:
 *   - generic FIR via hScale() when not SWS_FAST_BILINEAR or MMX2 unusable,
 *   - runtime-generated MMX2 "funny" code (funnyYCode) when canMMX2BeUsed,
 *   - hand-written x86 bilinear asm, or the plain-C bilinear fallback.
 * NOTE(review): #ifdef scaffolding and several asm lines are elided in
 * this chunk; comments below describe only what is visible. */
2415 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2416 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2417 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2418 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2419 int32_t *mmx2FilterPos)
/* --- input format conversion: packed YUV / RGB variants to 8-bit luma --- */
2421 if(srcFormat==IMGFMT_YUY2)
2423 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2424 src= formatConvBuffer;
2426 else if(srcFormat==IMGFMT_UYVY)
2428 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2429 src= formatConvBuffer;
2431 else if(srcFormat==IMGFMT_BGR32)
2433 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2434 src= formatConvBuffer;
2436 else if(srcFormat==IMGFMT_BGR24)
2438 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2439 src= formatConvBuffer;
2441 else if(srcFormat==IMGFMT_BGR16)
2443 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2444 src= formatConvBuffer;
2446 else if(srcFormat==IMGFMT_BGR15)
2448 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2449 src= formatConvBuffer;
2451 else if(srcFormat==IMGFMT_RGB32)
2453 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2454 src= formatConvBuffer;
2456 else if(srcFormat==IMGFMT_RGB24)
2458 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2459 src= formatConvBuffer;
2463 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2464 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2466 if(!(flags&SWS_FAST_BILINEAR))
2469 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2471 else // Fast Bilinear upscale / crap downscale
2473 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/* MMX2 path: jump into runtime-generated scaling code via funnyYCode.
 * mmx2FilterPos holds per-chunk source offsets, mmx2Filter the weights. */
2479 "pxor %%mm7, %%mm7 \n\t"
2480 "mov %0, %%"REG_c" \n\t"
2481 "mov %1, %%"REG_D" \n\t"
2482 "mov %2, %%"REG_d" \n\t"
2483 "mov %3, %%"REG_b" \n\t"
2484 "xor %%"REG_a", %%"REG_a" \n\t" // i
2485 PREFETCH" (%%"REG_c") \n\t"
2486 PREFETCH" 32(%%"REG_c") \n\t"
2487 PREFETCH" 64(%%"REG_c") \n\t"
2491 #define FUNNY_Y_CODE \
2492 "movl (%%"REG_b"), %%esi \n\t"\
2494 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2495 "add %%"REG_S", %%"REG_c" \n\t"\
2496 "add %%"REG_a", %%"REG_D" \n\t"\
2497 "xor %%"REG_a", %%"REG_a" \n\t"\
2501 #define FUNNY_Y_CODE \
2502 "movl (%%"REG_b"), %%esi \n\t"\
2504 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2505 "add %%"REG_a", %%"REG_D" \n\t"\
2506 "xor %%"REG_a", %%"REG_a" \n\t"\
2519 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2521 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* Rightmost samples can't interpolate past srcW-1: replicate last pixel. */
2523 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2528 long xInc_shr16 = xInc >> 16;
2529 uint16_t xInc_mask = xInc & 0xffff;
2530 //NO MMX just normal asm ...
/* Non-MMX2 x86 bilinear: 16.16 fixed point; cx carries 2*xalpha, the
 * add/adc pair advances xx by xInc with carry. Two pixels per loop. */
2532 "xor %%"REG_a", %%"REG_a" \n\t" // i
2533 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2534 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2537 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2538 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2539 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2540 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2541 "shll $16, %%edi \n\t"
2542 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2543 "mov %1, %%"REG_D" \n\t"
2544 "shrl $9, %%esi \n\t"
2545 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2546 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2547 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2549 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2550 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2551 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2552 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2553 "shll $16, %%edi \n\t"
2554 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2555 "mov %1, %%"REG_D" \n\t"
2556 "shrl $9, %%esi \n\t"
2557 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2558 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2559 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2562 "add $2, %%"REG_a" \n\t"
2563 "cmp %2, %%"REG_a" \n\t"
2567 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2568 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2571 } //if MMX2 can't be used
/* Plain-C bilinear fallback: 16.16 fixed-point step, 7-bit alpha. */
2575 unsigned int xpos=0;
2576 for(i=0;i<dstWidth;i++)
2578 register unsigned int xx=xpos>>16;
2579 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2580 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
/* Horizontally scale one pair of chroma lines (U into dst, V into
 * dst+2048) -- the chroma twin of hyscale above, same three paths
 * (generic hScale FIR, runtime-generated MMX2 "funny" code, x86 asm /
 * plain-C bilinear). Packed/RGB inputs are first converted into
 * formatConvBuffer (U) and formatConvBuffer+2048 (V).
 * NOTE(review): #ifdef scaffolding and several asm lines are elided in
 * this chunk; comments describe only what is visible. */
2587 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2588 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2589 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2590 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2591 int32_t *mmx2FilterPos)
/* --- input format conversion: packed YUV / RGB variants to planar U,V --- */
2593 if(srcFormat==IMGFMT_YUY2)
2595 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2596 src1= formatConvBuffer;
2597 src2= formatConvBuffer+2048;
2599 else if(srcFormat==IMGFMT_UYVY)
2601 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2602 src1= formatConvBuffer;
2603 src2= formatConvBuffer+2048;
2605 else if(srcFormat==IMGFMT_BGR32)
2607 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2608 src1= formatConvBuffer;
2609 src2= formatConvBuffer+2048;
2611 else if(srcFormat==IMGFMT_BGR24)
2613 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2614 src1= formatConvBuffer;
2615 src2= formatConvBuffer+2048;
2617 else if(srcFormat==IMGFMT_BGR16)
2619 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2620 src1= formatConvBuffer;
2621 src2= formatConvBuffer+2048;
2623 else if(srcFormat==IMGFMT_BGR15)
2625 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2626 src1= formatConvBuffer;
2627 src2= formatConvBuffer+2048;
2629 else if(srcFormat==IMGFMT_RGB32)
2631 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2632 src1= formatConvBuffer;
2633 src2= formatConvBuffer+2048;
2635 else if(srcFormat==IMGFMT_RGB24)
2637 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2638 src1= formatConvBuffer;
2639 src2= formatConvBuffer+2048;
2641 else if(isGray(srcFormat))
2647 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2648 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2650 if(!(flags&SWS_FAST_BILINEAR))
2653 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2654 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2656 else // Fast Bilinear upscale / crap downscale
2658 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/* MMX2 path: run funnyUVCode once for U (src1 -> dst), then again for V
 * (src2 -> dst+4096 bytes == dst+2048 samples). */
2664 "pxor %%mm7, %%mm7 \n\t"
2665 "mov %0, %%"REG_c" \n\t"
2666 "mov %1, %%"REG_D" \n\t"
2667 "mov %2, %%"REG_d" \n\t"
2668 "mov %3, %%"REG_b" \n\t"
2669 "xor %%"REG_a", %%"REG_a" \n\t" // i
2670 PREFETCH" (%%"REG_c") \n\t"
2671 PREFETCH" 32(%%"REG_c") \n\t"
2672 PREFETCH" 64(%%"REG_c") \n\t"
2676 #define FUNNY_UV_CODE \
2677 "movl (%%"REG_b"), %%esi \n\t"\
2679 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2680 "add %%"REG_S", %%"REG_c" \n\t"\
2681 "add %%"REG_a", %%"REG_D" \n\t"\
2682 "xor %%"REG_a", %%"REG_a" \n\t"\
2686 #define FUNNY_UV_CODE \
2687 "movl (%%"REG_b"), %%esi \n\t"\
2689 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2690 "add %%"REG_a", %%"REG_D" \n\t"\
2691 "xor %%"REG_a", %%"REG_a" \n\t"\
/* second pass: V plane */
2699 "xor %%"REG_a", %%"REG_a" \n\t" // i
2700 "mov %5, %%"REG_c" \n\t" // src
2701 "mov %1, %%"REG_D" \n\t" // buf1
2702 "add $4096, %%"REG_D" \n\t"
2703 PREFETCH" (%%"REG_c") \n\t"
2704 PREFETCH" 32(%%"REG_c") \n\t"
2705 PREFETCH" 64(%%"REG_c") \n\t"
2712 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2713 "m" (funnyUVCode), "m" (src2)
2714 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* Rightmost samples can't interpolate past srcW-1: replicate last pixel. */
2716 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2718 // printf("%d %d %d\n", dstWidth, i, srcW);
2719 dst[i] = src1[srcW-1]*128;
2720 dst[i+2048] = src2[srcW-1]*128;
2726 long xInc_shr16 = (long) (xInc >> 16);
2727 uint16_t xInc_mask = xInc & 0xffff;
/* Non-MMX2 x86 bilinear: interpolates U and V with the same xx/xalpha
 * per iteration; see hyscale for the fixed-point scheme. */
2729 "xor %%"REG_a", %%"REG_a" \n\t" // i
2730 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2731 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2734 "mov %0, %%"REG_S" \n\t"
2735 "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
2736 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
2737 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2738 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2739 "shll $16, %%edi \n\t"
2740 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2741 "mov %1, %%"REG_D" \n\t"
2742 "shrl $9, %%esi \n\t"
2743 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2745 "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2746 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2747 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2748 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2749 "shll $16, %%edi \n\t"
2750 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2751 "mov %1, %%"REG_D" \n\t"
2752 "shrl $9, %%esi \n\t"
2753 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2755 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2756 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2757 "add $1, %%"REG_a" \n\t"
2758 "cmp %2, %%"REG_a" \n\t"
2761 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2762 which is needed to support GCC-4.0 */
2763 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2764 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2766 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2769 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2772 } //if MMX2 can't be used
/* Plain-C bilinear fallback (two variants appear here in the full file,
 * selected by preprocessor conditionals elided from this chunk). */
2776 unsigned int xpos=0;
2777 for(i=0;i<dstWidth;i++)
2779 register unsigned int xx=xpos>>16;
2780 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2781 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2782 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2784 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2785 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
/* Main per-slice scaling entry point.
 * Scales the source slice [srcSliceY, srcSliceY+srcSliceH) and writes all
 * output lines that become computable. Horizontal scaling (hyscale /
 * hcscale) fills the lumPixBuf / chrPixBuf ring buffers; the vertical
 * filter then combines vLumFilterSize / vChrFilterSize buffered lines per
 * output line via the yuv2* output routines.
 * Returns the number of output lines produced (dstY - lastDstY).
 * NOTE(review): many lines (declarations, braces, #ifdefs, the dstY /
 * lastDstY setup and the closing of the function) are elided from this
 * chunk; comments describe only what is visible. */
2793 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2794 int srcSliceH, uint8_t* dst[], int dstStride[]){
2796 /* load a few things into local vars to make the code more readable and faster */
2797 const int srcW= c->srcW;
2798 const int dstW= c->dstW;
2799 const int dstH= c->dstH;
2800 const int chrDstW= c->chrDstW;
2801 const int chrSrcW= c->chrSrcW;
2802 const int lumXInc= c->lumXInc;
2803 const int chrXInc= c->chrXInc;
2804 const int dstFormat= c->dstFormat;
2805 const int srcFormat= c->srcFormat;
2806 const int flags= c->flags;
2807 const int canMMX2BeUsed= c->canMMX2BeUsed;
2808 int16_t *vLumFilterPos= c->vLumFilterPos;
2809 int16_t *vChrFilterPos= c->vChrFilterPos;
2810 int16_t *hLumFilterPos= c->hLumFilterPos;
2811 int16_t *hChrFilterPos= c->hChrFilterPos;
2812 int16_t *vLumFilter= c->vLumFilter;
2813 int16_t *vChrFilter= c->vChrFilter;
2814 int16_t *hLumFilter= c->hLumFilter;
2815 int16_t *hChrFilter= c->hChrFilter;
2816 int32_t *lumMmxFilter= c->lumMmxFilter;
2817 int32_t *chrMmxFilter= c->chrMmxFilter;
2818 const int vLumFilterSize= c->vLumFilterSize;
2819 const int vChrFilterSize= c->vChrFilterSize;
2820 const int hLumFilterSize= c->hLumFilterSize;
2821 const int hChrFilterSize= c->hChrFilterSize;
2822 int16_t **lumPixBuf= c->lumPixBuf;
2823 int16_t **chrPixBuf= c->chrPixBuf;
2824 const int vLumBufSize= c->vLumBufSize;
2825 const int vChrBufSize= c->vChrBufSize;
2826 uint8_t *funnyYCode= c->funnyYCode;
2827 uint8_t *funnyUVCode= c->funnyUVCode;
2828 uint8_t *formatConvBuffer= c->formatConvBuffer;
2829 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2830 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); /* ceiling division by the subsample factor */
2833 /* vars which will change and which we need to store back in the context */
2835 int lumBufIndex= c->lumBufIndex;
2836 int chrBufIndex= c->chrBufIndex;
2837 int lastInLumBuf= c->lastInLumBuf;
2838 int lastInChrBuf= c->lastInChrBuf;
2840 if(isPacked(c->srcFormat)){
2846 srcStride[2]= srcStride[0];
2848 srcStride[1]<<= c->vChrDrop;
2849 srcStride[2]<<= c->vChrDrop;
2851 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2852 // (int)dst[0], (int)dst[1], (int)dst[2]);
2854 #if 0 //self test FIXME move to a vfilter or something
2856 static volatile int i=0;
2858 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2859 selfTest(src, srcStride, c->srcW, c->srcH);
2864 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2865 //dstStride[0],dstStride[1],dstStride[2]);
2867 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2869 static int firstTime=1; //FIXME move this into the context perhaps
2870 if(flags & SWS_PRINT_INFO && firstTime)
2872 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2873 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2878 /* Note the user might start scaling the picture in the middle so this will not get executed
2879 this is not really intended but works currently, so ppl might do it */
/* --- main output loop: one destination line per iteration --- */
2890 for(;dstY < dstH; dstY++){
2891 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2892 const int chrDstY= dstY>>c->chrDstVSubSample;
2893 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2894 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2896 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2897 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2898 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2899 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2901 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2902 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2903 //handle holes (FAST_BILINEAR & weird filters)
2904 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2905 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2906 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2907 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2908 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2910 // Do we have enough lines in this slice to output the dstY line
2911 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2913 //Do horizontal scaling
2914 while(lastInLumBuf < lastLumSrcY)
2916 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2918 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2919 ASSERT(lumBufIndex < 2*vLumBufSize)
2920 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2921 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2922 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2923 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2924 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2925 funnyYCode, c->srcFormat, formatConvBuffer,
2926 c->lumMmx2Filter, c->lumMmx2FilterPos);
2929 while(lastInChrBuf < lastChrSrcY)
2931 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2932 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2934 ASSERT(chrBufIndex < 2*vChrBufSize)
2935 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2936 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2937 //FIXME replace parameters through context struct (some at least)
2939 if(!(isGray(srcFormat) || isGray(dstFormat)))
2940 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2941 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2942 funnyUVCode, c->srcFormat, formatConvBuffer,
2943 c->chrMmx2Filter, c->chrMmx2FilterPos);
2946 //wrap buf index around to stay inside the ring buffer
2947 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2948 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2950 else // not enough lines left in this slice -> load the rest in the buffer
2952 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2953 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2954 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2955 vChrBufSize, vLumBufSize);*/
2957 //Do horizontal scaling
2958 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2960 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2962 ASSERT(lumBufIndex < 2*vLumBufSize)
2963 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2964 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2965 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2966 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2967 funnyYCode, c->srcFormat, formatConvBuffer,
2968 c->lumMmx2Filter, c->lumMmx2FilterPos);
2971 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2973 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2974 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2976 ASSERT(chrBufIndex < 2*vChrBufSize)
2977 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2978 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2980 if(!(isGray(srcFormat) || isGray(dstFormat)))
2981 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2982 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2983 funnyUVCode, c->srcFormat, formatConvBuffer,
2984 c->chrMmx2Filter, c->chrMmx2FilterPos);
2987 //wrap buf index around to stay inside the ring buffer
2988 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2989 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2990 break; //we can't output a dstY line so let's try with the next slice
/* per-line dither tables for 15/16-bit RGB output, alternating by row parity */
2994 b5Dither= dither8[dstY&1];
2995 g6Dither= dither4[dstY&1];
2996 g5Dither= dither8[dstY&1];
2997 r5Dither= dither8[(dstY+1)&1];
/* --- vertical scaling & output (MMX branch): build the per-line MMX
 * filter descriptors (pointer + replicated coefficient pairs), then
 * dispatch to the yuv2* output routine for the destination format --- */
3001 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3002 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3005 if(flags & SWS_ACCURATE_RND){
3006 for(i=0; i<vLumFilterSize; i+=2){
3007 lumMmxFilter[2*i+0]= lumSrcPtr[i ];
3008 lumMmxFilter[2*i+1]= lumSrcPtr[i+(vLumFilterSize>1)];
3009 lumMmxFilter[2*i+2]=
3010 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ]
3011 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3013 for(i=0; i<vChrFilterSize; i+=2){
3014 chrMmxFilter[2*i+0]= chrSrcPtr[i ];
3015 chrMmxFilter[2*i+1]= chrSrcPtr[i+(vChrFilterSize>1)];
3016 chrMmxFilter[2*i+2]=
3017 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ]
3018 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3021 for(i=0; i<vLumFilterSize; i++)
3023 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3024 lumMmxFilter[4*i+2]=
3025 lumMmxFilter[4*i+3]=
3026 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; /* coefficient replicated into both 16-bit halves */
3028 for(i=0; i<vChrFilterSize; i++)
3030 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3031 chrMmxFilter[4*i+2]=
3032 chrMmxFilter[4*i+3]=
3033 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3037 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
3038 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3039 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3040 RENAME(yuv2nv12X)(c,
3041 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3042 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3043 dest, uDest, dstW, chrDstW, dstFormat);
3045 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3047 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3048 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3049 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
3051 int16_t *lumBuf = lumPixBuf[0];
3052 int16_t *chrBuf= chrPixBuf[0];
3053 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3058 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3059 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3060 dest, uDest, vDest, dstW, chrDstW);
3065 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3066 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3067 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
3069 int chrAlpha= vChrFilter[2*dstY+1];
3070 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3071 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3073 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
3075 int lumAlpha= vLumFilter[2*dstY+1];
3076 int chrAlpha= vChrFilter[2*dstY+1];
3078 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3080 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3081 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3082 dest, dstW, lumAlpha, chrAlpha, dstY);
3086 RENAME(yuv2packedX)(c,
3087 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3088 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3093 else // hmm looks like we can't use MMX here without overwriting this array's tail
3095 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3096 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3097 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
3098 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3099 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3101 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3102 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3103 dest, uDest, dstW, chrDstW, dstFormat);
3105 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3107 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3108 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3110 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3111 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3112 dest, uDest, vDest, dstW, chrDstW);
3116 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3117 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3119 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3120 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
/* flush write-combining buffers and leave MMX state clean for the FPU */
3127 __asm __volatile(SFENCE:::"memory");
3128 __asm __volatile(EMMS:::"memory");
3130 /* store changed local vars back in the context */
3132 c->lumBufIndex= lumBufIndex;
3133 c->chrBufIndex= chrBufIndex;
3134 c->lastInLumBuf= lastInLumBuf;
3135 c->lastInChrBuf= lastInChrBuf;
3137 return dstY - lastDstY;