Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
35 #define PREFETCH "prefetch"
36 #define PREFETCHW "prefetchw"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
46 #define SFENCE "sfence"
52 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53 #elif defined (HAVE_3DNOW)
54 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
58 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
60 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
/* Expansion wrapper: lets a/b be macro-expanded before they are substituted
   into REAL_MOVNTQ (which stringizes them with #a/#b). */
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
65 #include "swscale_altivec_template.c"
68 #define YSCALEYUV2YV12X(x, offset, dest, width) \
70 "xor %%"REG_a", %%"REG_a" \n\t"\
71 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
72 "movq %%mm3, %%mm4 \n\t"\
73 "lea " offset "(%0), %%"REG_d" \n\t"\
74 "mov (%%"REG_d"), %%"REG_S" \n\t"\
75 ASMALIGN(4) /* FIXME Unroll? */\
77 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
78 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
79 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
80 "add $16, %%"REG_d" \n\t"\
81 "mov (%%"REG_d"), %%"REG_S" \n\t"\
82 "test %%"REG_S", %%"REG_S" \n\t"\
83 "pmulhw %%mm0, %%mm2 \n\t"\
84 "pmulhw %%mm0, %%mm5 \n\t"\
85 "paddw %%mm2, %%mm3 \n\t"\
86 "paddw %%mm5, %%mm4 \n\t"\
88 "psraw $3, %%mm3 \n\t"\
89 "psraw $3, %%mm4 \n\t"\
90 "packuswb %%mm4, %%mm3 \n\t"\
91 MOVNTQ(%%mm3, (%1, %%REGa))\
92 "add $8, %%"REG_a" \n\t"\
93 "cmp %2, %%"REG_a" \n\t"\
94 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
95 "movq %%mm3, %%mm4 \n\t"\
96 "lea " offset "(%0), %%"REG_d" \n\t"\
97 "mov (%%"REG_d"), %%"REG_S" \n\t"\
99 :: "r" (&c->redDither),\
100 "r" (dest), "p" (width)\
101 : "%"REG_a, "%"REG_d, "%"REG_S\
104 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
106 "lea " offset "(%0), %%"REG_d" \n\t"\
107 "xor %%"REG_a", %%"REG_a" \n\t"\
108 "pxor %%mm4, %%mm4 \n\t"\
109 "pxor %%mm5, %%mm5 \n\t"\
110 "pxor %%mm6, %%mm6 \n\t"\
111 "pxor %%mm7, %%mm7 \n\t"\
112 "mov (%%"REG_d"), %%"REG_S" \n\t"\
115 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\
116 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
117 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
118 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\
119 "movq %%mm0, %%mm3 \n\t"\
120 "punpcklwd %%mm1, %%mm0 \n\t"\
121 "punpckhwd %%mm1, %%mm3 \n\t"\
122 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
123 "pmaddwd %%mm1, %%mm0 \n\t"\
124 "pmaddwd %%mm1, %%mm3 \n\t"\
125 "paddd %%mm0, %%mm4 \n\t"\
126 "paddd %%mm3, %%mm5 \n\t"\
127 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\
128 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
129 "add $16, %%"REG_d" \n\t"\
130 "test %%"REG_S", %%"REG_S" \n\t"\
131 "movq %%mm2, %%mm0 \n\t"\
132 "punpcklwd %%mm3, %%mm2 \n\t"\
133 "punpckhwd %%mm3, %%mm0 \n\t"\
134 "pmaddwd %%mm1, %%mm2 \n\t"\
135 "pmaddwd %%mm1, %%mm0 \n\t"\
136 "paddd %%mm2, %%mm6 \n\t"\
137 "paddd %%mm0, %%mm7 \n\t"\
139 "psrad $16, %%mm4 \n\t"\
140 "psrad $16, %%mm5 \n\t"\
141 "psrad $16, %%mm6 \n\t"\
142 "psrad $16, %%mm7 \n\t"\
143 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
144 "packssdw %%mm5, %%mm4 \n\t"\
145 "packssdw %%mm7, %%mm6 \n\t"\
146 "paddw %%mm0, %%mm4 \n\t"\
147 "paddw %%mm0, %%mm6 \n\t"\
148 "psraw $3, %%mm4 \n\t"\
149 "psraw $3, %%mm6 \n\t"\
150 "packuswb %%mm6, %%mm4 \n\t"\
151 MOVNTQ(%%mm4, (%1, %%REGa))\
152 "add $8, %%"REG_a" \n\t"\
153 "cmp %2, %%"REG_a" \n\t"\
154 "lea " offset "(%0), %%"REG_d" \n\t"\
155 "pxor %%mm4, %%mm4 \n\t"\
156 "pxor %%mm5, %%mm5 \n\t"\
157 "pxor %%mm6, %%mm6 \n\t"\
158 "pxor %%mm7, %%mm7 \n\t"\
159 "mov (%%"REG_d"), %%"REG_S" \n\t"\
161 :: "r" (&c->redDither),\
162 "r" (dest), "p" (width)\
163 : "%"REG_a, "%"REG_d, "%"REG_S\
166 #define YSCALEYUV2YV121 \
167 "mov %2, %%"REG_a" \n\t"\
168 ASMALIGN(4) /* FIXME Unroll? */\
170 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
171 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
172 "psraw $7, %%mm0 \n\t"\
173 "psraw $7, %%mm1 \n\t"\
174 "packuswb %%mm1, %%mm0 \n\t"\
175 MOVNTQ(%%mm0, (%1, %%REGa))\
176 "add $8, %%"REG_a" \n\t"\
180 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
181 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
182 "r" (dest), "m" (dstW),
183 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
184 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
186 #define YSCALEYUV2PACKEDX \
188 "xor %%"REG_a", %%"REG_a" \n\t"\
192 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
193 "mov (%%"REG_d"), %%"REG_S" \n\t"\
194 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
195 "movq %%mm3, %%mm4 \n\t"\
198 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
199 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
200 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
201 "add $16, %%"REG_d" \n\t"\
202 "mov (%%"REG_d"), %%"REG_S" \n\t"\
203 "pmulhw %%mm0, %%mm2 \n\t"\
204 "pmulhw %%mm0, %%mm5 \n\t"\
205 "paddw %%mm2, %%mm3 \n\t"\
206 "paddw %%mm5, %%mm4 \n\t"\
207 "test %%"REG_S", %%"REG_S" \n\t"\
210 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
211 "mov (%%"REG_d"), %%"REG_S" \n\t"\
212 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
213 "movq %%mm1, %%mm7 \n\t"\
216 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
217 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
218 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
219 "add $16, %%"REG_d" \n\t"\
220 "mov (%%"REG_d"), %%"REG_S" \n\t"\
221 "pmulhw %%mm0, %%mm2 \n\t"\
222 "pmulhw %%mm0, %%mm5 \n\t"\
223 "paddw %%mm2, %%mm1 \n\t"\
224 "paddw %%mm5, %%mm7 \n\t"\
225 "test %%"REG_S", %%"REG_S" \n\t"\
228 #define YSCALEYUV2PACKEDX_END\
229 :: "r" (&c->redDither), \
230 "m" (dummy), "m" (dummy), "m" (dummy),\
231 "r" (dest), "m" (dstW)\
232 : "%"REG_a, "%"REG_d, "%"REG_S\
235 #define YSCALEYUV2PACKEDX_ACCURATE \
237 "xor %%"REG_a", %%"REG_a" \n\t"\
241 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
242 "mov (%%"REG_d"), %%"REG_S" \n\t"\
243 "pxor %%mm4, %%mm4 \n\t"\
244 "pxor %%mm5, %%mm5 \n\t"\
245 "pxor %%mm6, %%mm6 \n\t"\
246 "pxor %%mm7, %%mm7 \n\t"\
249 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
250 "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
251 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
252 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
253 "movq %%mm0, %%mm3 \n\t"\
254 "punpcklwd %%mm1, %%mm0 \n\t"\
255 "punpckhwd %%mm1, %%mm3 \n\t"\
256 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
257 "pmaddwd %%mm1, %%mm0 \n\t"\
258 "pmaddwd %%mm1, %%mm3 \n\t"\
259 "paddd %%mm0, %%mm4 \n\t"\
260 "paddd %%mm3, %%mm5 \n\t"\
261 "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
262 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
263 "add $16, %%"REG_d" \n\t"\
264 "test %%"REG_S", %%"REG_S" \n\t"\
265 "movq %%mm2, %%mm0 \n\t"\
266 "punpcklwd %%mm3, %%mm2 \n\t"\
267 "punpckhwd %%mm3, %%mm0 \n\t"\
268 "pmaddwd %%mm1, %%mm2 \n\t"\
269 "pmaddwd %%mm1, %%mm0 \n\t"\
270 "paddd %%mm2, %%mm6 \n\t"\
271 "paddd %%mm0, %%mm7 \n\t"\
273 "psrad $16, %%mm4 \n\t"\
274 "psrad $16, %%mm5 \n\t"\
275 "psrad $16, %%mm6 \n\t"\
276 "psrad $16, %%mm7 \n\t"\
277 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
278 "packssdw %%mm5, %%mm4 \n\t"\
279 "packssdw %%mm7, %%mm6 \n\t"\
280 "paddw %%mm0, %%mm4 \n\t"\
281 "paddw %%mm0, %%mm6 \n\t"\
282 "movq %%mm4, "U_TEMP"(%0) \n\t"\
283 "movq %%mm6, "V_TEMP"(%0) \n\t"\
285 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
286 "mov (%%"REG_d"), %%"REG_S" \n\t"\
287 "pxor %%mm1, %%mm1 \n\t"\
288 "pxor %%mm5, %%mm5 \n\t"\
289 "pxor %%mm7, %%mm7 \n\t"\
290 "pxor %%mm6, %%mm6 \n\t"\
293 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
294 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
295 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
296 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
297 "movq %%mm0, %%mm3 \n\t"\
298 "punpcklwd %%mm4, %%mm0 \n\t"\
299 "punpckhwd %%mm4, %%mm3 \n\t"\
300 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
301 "pmaddwd %%mm4, %%mm0 \n\t"\
302 "pmaddwd %%mm4, %%mm3 \n\t"\
303 "paddd %%mm0, %%mm1 \n\t"\
304 "paddd %%mm3, %%mm5 \n\t"\
305 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
306 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
307 "add $16, %%"REG_d" \n\t"\
308 "test %%"REG_S", %%"REG_S" \n\t"\
309 "movq %%mm2, %%mm0 \n\t"\
310 "punpcklwd %%mm3, %%mm2 \n\t"\
311 "punpckhwd %%mm3, %%mm0 \n\t"\
312 "pmaddwd %%mm4, %%mm2 \n\t"\
313 "pmaddwd %%mm4, %%mm0 \n\t"\
314 "paddd %%mm2, %%mm7 \n\t"\
315 "paddd %%mm0, %%mm6 \n\t"\
317 "psrad $16, %%mm1 \n\t"\
318 "psrad $16, %%mm5 \n\t"\
319 "psrad $16, %%mm7 \n\t"\
320 "psrad $16, %%mm6 \n\t"\
321 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
322 "packssdw %%mm5, %%mm1 \n\t"\
323 "packssdw %%mm6, %%mm7 \n\t"\
324 "paddw %%mm0, %%mm1 \n\t"\
325 "paddw %%mm0, %%mm7 \n\t"\
326 "movq "U_TEMP"(%0), %%mm3 \n\t"\
327 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* YUV -> RGB conversion core (MMX). On entry: mm1/mm7 hold two quads of luma
   (Y1/Y2), mm3/mm4 hold chroma (U/V) — see the macros that precede this one.
   On exit: mm2=packed B, mm4=packed G, mm5=packed R, mm7=0 (zero constant
   for the WRITE* unpack stages). Uses the coefficient table at (%0). */
#define YSCALEYUV2RGBX \
"psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"(%0), %%mm3 \n\t" /* ug = (U-128)*ugCoeff >>16 */\
"pmulhw "VG_COEFF"(%0), %%mm4 \n\t" /* vg = (V-128)*vgCoeff >>16 */\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"pmulhw "UB_COEFF"(%0), %%mm2 \n\t" /* ub = (U-128)*ubCoeff >>16 */\
"pmulhw "VR_COEFF"(%0), %%mm5 \n\t" /* vr = (V-128)*vrCoeff >>16 */\
"psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"(%0), %%mm1 \n\t" /* Y1 = (Y-16)*yCoeff >>16 */\
"pmulhw "Y_COEFF"(%0), %%mm7 \n\t" /* Y2 = (Y-16)*yCoeff >>16 */\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t" /* g contribution = ug + vg */\
"movq %%mm2, %%mm0 \n\t" /* save ub for the high (Y2) half */\
"movq %%mm5, %%mm6 \n\t" /* save vr for the high (Y2) half */\
"movq %%mm4, %%mm3 \n\t" /* save g for the high (Y2) half */\
"punpcklwd %%mm2, %%mm2 \n\t" /* duplicate low chroma words to pair with Y1 */\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t" /* B1 = Y1 + ub */\
"paddw %%mm1, %%mm5 \n\t" /* R1 = Y1 + vr */\
"paddw %%mm1, %%mm4 \n\t" /* G1 = Y1 + ug + vg */\
"punpckhwd %%mm0, %%mm0 \n\t" /* duplicate high chroma words to pair with Y2 */\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t" /* B2 = Y2 + ub */\
"paddw %%mm7, %%mm6 \n\t" /* R2 = Y2 + vr */\
"paddw %%mm7, %%mm3 \n\t" /* G2 = Y2 + ug + vg */\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t" /* pack & saturate B to 8 bytes */\
"packuswb %%mm6, %%mm5 \n\t" /* pack & saturate R to 8 bytes */\
"packuswb %%mm3, %%mm4 \n\t" /* pack & saturate G to 8 bytes */\
"pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, expected by the WRITE* macros */
366 #define FULL_YSCALEYUV2RGB \
367 "pxor %%mm7, %%mm7 \n\t"\
368 "movd %6, %%mm6 \n\t" /*yalpha1*/\
369 "punpcklwd %%mm6, %%mm6 \n\t"\
370 "punpcklwd %%mm6, %%mm6 \n\t"\
371 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
372 "punpcklwd %%mm5, %%mm5 \n\t"\
373 "punpcklwd %%mm5, %%mm5 \n\t"\
374 "xor %%"REG_a", %%"REG_a" \n\t"\
377 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
378 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
379 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
380 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
381 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
382 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
383 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
384 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
385 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
386 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
387 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
388 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
389 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
390 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
391 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
392 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
393 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
394 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
397 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
398 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
399 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
400 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
401 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
402 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
403 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
406 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
407 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
408 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
409 "paddw %%mm1, %%mm3 \n\t" /* B*/\
410 "paddw %%mm1, %%mm0 \n\t" /* R*/\
411 "packuswb %%mm3, %%mm3 \n\t"\
413 "packuswb %%mm0, %%mm0 \n\t"\
414 "paddw %%mm4, %%mm2 \n\t"\
415 "paddw %%mm2, %%mm1 \n\t" /* G*/\
417 "packuswb %%mm1, %%mm1 \n\t"
420 #define REAL_YSCALEYUV2PACKED(index, c) \
421 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
422 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
423 "psraw $3, %%mm0 \n\t"\
424 "psraw $3, %%mm1 \n\t"\
425 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
426 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
427 "xor "#index", "#index" \n\t"\
430 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
431 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
432 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
433 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
434 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
435 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
436 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
437 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
438 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
439 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
440 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
441 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
442 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
443 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
444 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
445 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
446 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
447 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
448 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
449 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
450 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
451 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
452 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
453 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
454 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/* Expansion wrapper: expands index/c before REAL_YSCALEYUV2PACKED stringizes them. */
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
458 #define REAL_YSCALEYUV2RGB(index, c) \
459 "xor "#index", "#index" \n\t"\
462 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
463 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
464 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
465 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
466 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
467 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
468 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
469 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
470 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
471 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
472 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
473 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
474 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
475 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
476 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
477 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
478 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
479 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
480 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
481 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
482 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
483 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
484 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
485 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
486 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
487 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
488 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
489 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
490 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
491 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
492 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
493 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
494 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
495 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
496 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
497 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
498 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
499 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
500 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
501 "paddw %%mm3, %%mm4 \n\t"\
502 "movq %%mm2, %%mm0 \n\t"\
503 "movq %%mm5, %%mm6 \n\t"\
504 "movq %%mm4, %%mm3 \n\t"\
505 "punpcklwd %%mm2, %%mm2 \n\t"\
506 "punpcklwd %%mm5, %%mm5 \n\t"\
507 "punpcklwd %%mm4, %%mm4 \n\t"\
508 "paddw %%mm1, %%mm2 \n\t"\
509 "paddw %%mm1, %%mm5 \n\t"\
510 "paddw %%mm1, %%mm4 \n\t"\
511 "punpckhwd %%mm0, %%mm0 \n\t"\
512 "punpckhwd %%mm6, %%mm6 \n\t"\
513 "punpckhwd %%mm3, %%mm3 \n\t"\
514 "paddw %%mm7, %%mm0 \n\t"\
515 "paddw %%mm7, %%mm6 \n\t"\
516 "paddw %%mm7, %%mm3 \n\t"\
517 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
518 "packuswb %%mm0, %%mm2 \n\t"\
519 "packuswb %%mm6, %%mm5 \n\t"\
520 "packuswb %%mm3, %%mm4 \n\t"\
521 "pxor %%mm7, %%mm7 \n\t"
/* Expansion wrapper: expands index/c before REAL_YSCALEYUV2RGB stringizes them. */
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
524 #define REAL_YSCALEYUV2PACKED1(index, c) \
525 "xor "#index", "#index" \n\t"\
528 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
529 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
530 "psraw $7, %%mm3 \n\t" \
531 "psraw $7, %%mm4 \n\t" \
532 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
533 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
534 "psraw $7, %%mm1 \n\t" \
535 "psraw $7, %%mm7 \n\t" \
/* Expansion wrapper: expands index/c before REAL_YSCALEYUV2PACKED1 stringizes them. */
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
539 #define REAL_YSCALEYUV2RGB1(index, c) \
540 "xor "#index", "#index" \n\t"\
543 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
544 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
545 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
546 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
547 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
548 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
549 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
550 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
551 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
552 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
553 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
554 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
555 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
556 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
557 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
558 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
559 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
560 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
561 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
562 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
563 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
564 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
565 "paddw %%mm3, %%mm4 \n\t"\
566 "movq %%mm2, %%mm0 \n\t"\
567 "movq %%mm5, %%mm6 \n\t"\
568 "movq %%mm4, %%mm3 \n\t"\
569 "punpcklwd %%mm2, %%mm2 \n\t"\
570 "punpcklwd %%mm5, %%mm5 \n\t"\
571 "punpcklwd %%mm4, %%mm4 \n\t"\
572 "paddw %%mm1, %%mm2 \n\t"\
573 "paddw %%mm1, %%mm5 \n\t"\
574 "paddw %%mm1, %%mm4 \n\t"\
575 "punpckhwd %%mm0, %%mm0 \n\t"\
576 "punpckhwd %%mm6, %%mm6 \n\t"\
577 "punpckhwd %%mm3, %%mm3 \n\t"\
578 "paddw %%mm7, %%mm0 \n\t"\
579 "paddw %%mm7, %%mm6 \n\t"\
580 "paddw %%mm7, %%mm3 \n\t"\
581 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
582 "packuswb %%mm0, %%mm2 \n\t"\
583 "packuswb %%mm6, %%mm5 \n\t"\
584 "packuswb %%mm3, %%mm4 \n\t"\
585 "pxor %%mm7, %%mm7 \n\t"
/* Expansion wrapper: expands index/c before REAL_YSCALEYUV2RGB1 stringizes them. */
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
588 #define REAL_YSCALEYUV2PACKED1b(index, c) \
589 "xor "#index", "#index" \n\t"\
592 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
593 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
594 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
595 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
596 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
597 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
598 "psrlw $8, %%mm3 \n\t" \
599 "psrlw $8, %%mm4 \n\t" \
600 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
601 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
602 "psraw $7, %%mm1 \n\t" \
603 "psraw $7, %%mm7 \n\t"
/* Expansion wrapper: expands index/c before REAL_YSCALEYUV2PACKED1b stringizes them. */
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
606 // do vertical chrominance interpolation
607 #define REAL_YSCALEYUV2RGB1b(index, c) \
608 "xor "#index", "#index" \n\t"\
611 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
612 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
613 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
614 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
615 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
616 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
617 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
618 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
619 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
620 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
621 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
622 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
623 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
624 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
625 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
626 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
627 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
628 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
629 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
630 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
631 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
632 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
633 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
634 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
635 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
636 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
637 "paddw %%mm3, %%mm4 \n\t"\
638 "movq %%mm2, %%mm0 \n\t"\
639 "movq %%mm5, %%mm6 \n\t"\
640 "movq %%mm4, %%mm3 \n\t"\
641 "punpcklwd %%mm2, %%mm2 \n\t"\
642 "punpcklwd %%mm5, %%mm5 \n\t"\
643 "punpcklwd %%mm4, %%mm4 \n\t"\
644 "paddw %%mm1, %%mm2 \n\t"\
645 "paddw %%mm1, %%mm5 \n\t"\
646 "paddw %%mm1, %%mm4 \n\t"\
647 "punpckhwd %%mm0, %%mm0 \n\t"\
648 "punpckhwd %%mm6, %%mm6 \n\t"\
649 "punpckhwd %%mm3, %%mm3 \n\t"\
650 "paddw %%mm7, %%mm0 \n\t"\
651 "paddw %%mm7, %%mm6 \n\t"\
652 "paddw %%mm7, %%mm3 \n\t"\
653 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
654 "packuswb %%mm0, %%mm2 \n\t"\
655 "packuswb %%mm6, %%mm5 \n\t"\
656 "packuswb %%mm3, %%mm4 \n\t"\
657 "pxor %%mm7, %%mm7 \n\t"
/* Expansion wrapper: expands index/c before REAL_YSCALEYUV2RGB1b stringizes them. */
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
660 #define REAL_WRITEBGR32(dst, dstw, index) \
661 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
662 "movq %%mm2, %%mm1 \n\t" /* B */\
663 "movq %%mm5, %%mm6 \n\t" /* R */\
664 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
665 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
666 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
667 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
668 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
669 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
670 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
671 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
672 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
673 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
675 MOVNTQ(%%mm0, (dst, index, 4))\
676 MOVNTQ(%%mm2, 8(dst, index, 4))\
677 MOVNTQ(%%mm1, 16(dst, index, 4))\
678 MOVNTQ(%%mm3, 24(dst, index, 4))\
680 "add $8, "#index" \n\t"\
681 "cmp "#dstw", "#index" \n\t"\
/* Expansion wrapper: expands dst/dstw/index before REAL_WRITEBGR32 stringizes them. */
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
685 #define REAL_WRITEBGR16(dst, dstw, index) \
686 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
687 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
688 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
689 "psrlq $3, %%mm2 \n\t"\
691 "movq %%mm2, %%mm1 \n\t"\
692 "movq %%mm4, %%mm3 \n\t"\
694 "punpcklbw %%mm7, %%mm3 \n\t"\
695 "punpcklbw %%mm5, %%mm2 \n\t"\
696 "punpckhbw %%mm7, %%mm4 \n\t"\
697 "punpckhbw %%mm5, %%mm1 \n\t"\
699 "psllq $3, %%mm3 \n\t"\
700 "psllq $3, %%mm4 \n\t"\
702 "por %%mm3, %%mm2 \n\t"\
703 "por %%mm4, %%mm1 \n\t"\
705 MOVNTQ(%%mm2, (dst, index, 2))\
706 MOVNTQ(%%mm1, 8(dst, index, 2))\
708 "add $8, "#index" \n\t"\
709 "cmp "#dstw", "#index" \n\t"\
/* Expansion wrapper: expands dst/dstw/index before REAL_WRITEBGR16 stringizes them. */
#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
713 #define REAL_WRITEBGR15(dst, dstw, index) \
714 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
715 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
716 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
717 "psrlq $3, %%mm2 \n\t"\
718 "psrlq $1, %%mm5 \n\t"\
720 "movq %%mm2, %%mm1 \n\t"\
721 "movq %%mm4, %%mm3 \n\t"\
723 "punpcklbw %%mm7, %%mm3 \n\t"\
724 "punpcklbw %%mm5, %%mm2 \n\t"\
725 "punpckhbw %%mm7, %%mm4 \n\t"\
726 "punpckhbw %%mm5, %%mm1 \n\t"\
728 "psllq $2, %%mm3 \n\t"\
729 "psllq $2, %%mm4 \n\t"\
731 "por %%mm3, %%mm2 \n\t"\
732 "por %%mm4, %%mm1 \n\t"\
734 MOVNTQ(%%mm2, (dst, index, 2))\
735 MOVNTQ(%%mm1, 8(dst, index, 2))\
737 "add $8, "#index" \n\t"\
738 "cmp "#dstw", "#index" \n\t"\
/* Expansion wrapper: expands dst/dstw/index before REAL_WRITEBGR15 stringizes them. */
#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
742 #define WRITEBGR24OLD(dst, dstw, index) \
743 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
744 "movq %%mm2, %%mm1 \n\t" /* B */\
745 "movq %%mm5, %%mm6 \n\t" /* R */\
746 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
747 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
748 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
749 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
750 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
751 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
752 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
753 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
754 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
755 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
757 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
758 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
759 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
760 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
761 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
762 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
763 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
764 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
766 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
767 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
768 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
769 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
770 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
771 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
772 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
773 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
774 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
775 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
776 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
777 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
778 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
780 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
781 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
782 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
783 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
784 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
785 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
786 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
787 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
789 MOVNTQ(%%mm0, (dst))\
790 MOVNTQ(%%mm2, 8(dst))\
791 MOVNTQ(%%mm3, 16(dst))\
792 "add $24, "#dst" \n\t"\
794 "add $8, "#index" \n\t"\
795 "cmp "#dstw", "#index" \n\t"\
798 #define WRITEBGR24MMX(dst, dstw, index) \
799 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
800 "movq %%mm2, %%mm1 \n\t" /* B */\
801 "movq %%mm5, %%mm6 \n\t" /* R */\
802 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
803 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
804 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
805 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
806 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
807 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
808 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
809 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
810 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
811 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
813 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
814 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
815 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
816 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
818 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
819 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
820 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
821 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
823 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
824 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
825 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
826 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
828 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
829 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
830 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
831 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
832 MOVNTQ(%%mm0, (dst))\
834 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
835 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
836 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
837 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
838 MOVNTQ(%%mm6, 8(dst))\
840 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
841 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
842 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
843 MOVNTQ(%%mm5, 16(dst))\
845 "add $24, "#dst" \n\t"\
847 "add $8, "#index" \n\t"\
848 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24MMX2: pack the planar byte results in mm2=B, mm4=G, mm5=R
   (8 pixels) into 24 bytes of interleaved B,G,R at dst, using MMX2
   pshufw shuffles and the M24A/M24B/M24C byte-select masks, then
   advance dst by 24 and index by 8.
   NOTE(review): the loop back-branch after the final cmp appears to have
   been lost in extraction — confirm against the full source. */
851 #define WRITEBGR24MMX2(dst, dstw, index) \
852 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
853 "movq "MANGLE(M24A)", %%mm0 \n\t"\
854 "movq "MANGLE(M24C)", %%mm7 \n\t"\
855 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
856 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
857 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
859 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
860 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
861 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
863 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
864 "por %%mm1, %%mm6 \n\t"\
865 "por %%mm3, %%mm6 \n\t"\
866 MOVNTQ(%%mm6, (dst))\
868 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
869 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
870 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
871 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
873 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
874 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
875 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
877 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
878 "por %%mm3, %%mm6 \n\t"\
879 MOVNTQ(%%mm6, 8(dst))\
881 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
882 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
883 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
885 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
886 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
887 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
889 "por %%mm1, %%mm3 \n\t"\
890 "por %%mm3, %%mm6 \n\t"\
891 MOVNTQ(%%mm6, 16(dst))\
893 "add $24, "#dst" \n\t"\
895 "add $8, "#index" \n\t"\
896 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24 selects the CPU-specific 24-bit RGB writer: the pshufw-based
   MMX2 variant or the plain-MMX variant.
   NOTE(review): the #if HAVE_MMX2 / #else / #endif directives that select
   between these two alternative definitions were lost in extraction. */
901 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
904 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/* REAL_WRITEYUY2: interleave luma (mm1/mm7 after packing) with the packed
   chroma bytes in mm3 (U) and mm4 (V) to produce YUYV (YUY2) output,
   writing 16 bytes per iteration via MOVNTQ and advancing index by 8.
   NOTE(review): the loop back-branch after the final cmp appears to have
   been lost in extraction — confirm against the full source. */
907 #define REAL_WRITEYUY2(dst, dstw, index) \
908 "packuswb %%mm3, %%mm3 \n\t"\
909 "packuswb %%mm4, %%mm4 \n\t"\
910 "packuswb %%mm7, %%mm1 \n\t"\
911 "punpcklbw %%mm4, %%mm3 \n\t"\
912 "movq %%mm1, %%mm7 \n\t"\
913 "punpcklbw %%mm3, %%mm1 \n\t"\
914 "punpckhbw %%mm3, %%mm7 \n\t"\
916 MOVNTQ(%%mm1, (dst, index, 2))\
917 MOVNTQ(%%mm7, 8(dst, index, 2))\
919 "add $8, "#index" \n\t"\
920 "cmp "#dstw", "#index" \n\t"\
/* Indirection so macro arguments are expanded before stringification. */
922 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* Vertical scaling to planar YV12: applies the luma filter to lumSrc and the
   chroma filter to chrSrc, writing dest/uDest/vDest.  Uses the MMX
   YSCALEYUV2YV12X macros (ACCURATE variants when SWS_ACCURATE_RND is set),
   falling back to AltiVec or plain C.
   NOTE(review): the function braces and the #ifdef HAVE_MMX/#else structure
   separating these paths were lost in extraction — confirm against the
   full source before editing. */
925 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
926 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
927 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
930 if(c->flags & SWS_ACCURATE_RND){
932 YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
933 YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
936 YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
939 YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
940 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
943 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
947 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
948 chrFilter, chrSrc, chrFilterSize,
949 dest, uDest, vDest, dstW, chrDstW);
951 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
952 chrFilter, chrSrc, chrFilterSize,
953 dest, uDest, vDest, dstW, chrDstW);
954 #endif //!HAVE_ALTIVEC
/* Vertical scaling to NV12/NV21 (interleaved chroma): no SIMD variant
   exists, so this simply forwards to the generic C implementation. */
958 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
959 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
960 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
962 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
963 chrFilter, chrSrc, chrFilterSize,
964 dest, uDest, dstW, chrDstW, dstFormat);
/* 1:1 vertical copy to planar YV12 (no filtering): the asm paths copy the
   16-bit intermediate (>>7 with rounding) into the 8-bit planes; the C
   fallback does the same shift with explicit 0..255 clipping.  chrSrc
   holds U at [0..] and V at [2048..].
   NOTE(review): the asm statement bodies, braces, and the #ifdef structure
   around the MMX and C paths were lost in extraction — only the operand
   lists and parts of the C fallback remain visible. */
967 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
968 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
975 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
982 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
990 :: "r" (lumSrc + dstW), "r" (dest + dstW),
996 for(i=0; i<dstW; i++)
998 int val= lumSrc[i]>>7;
1009 for(i=0; i<chrDstW; i++)
1012 int v=chrSrc[i + 2048]>>7;
1016 else if (u>255) u=255;
1018 else if (v>255) v=255;
1029 * vertical scale YV12 to RGB
/* Vertical scale + YUV->RGB in one pass, writing packed pixels (BGR32,
   BGR24, BGR15/16 with ordered dither, or YUY2).  Two parallel switch
   ladders: YSCALEYUV2PACKEDX_ACCURATE when SWS_ACCURATE_RND is set,
   plain YSCALEYUV2PACKEDX otherwise; AltiVec and generic C fallbacks
   follow.
   NOTE(review): braces, case labels, and the #ifdef HAVE_MMX/HAVE_ALTIVEC
   structure were lost in extraction — confirm against the full source. */
1031 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1032 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1033 uint8_t *dest, long dstW, long dstY)
1037 if(c->flags & SWS_ACCURATE_RND){
1038 switch(c->dstFormat){
1040 YSCALEYUV2PACKEDX_ACCURATE
1042 WRITEBGR32(%4, %5, %%REGa)
1044 YSCALEYUV2PACKEDX_END
1047 YSCALEYUV2PACKEDX_ACCURATE
/* BGR24 needs dst*3 addressing: compute dest + 3*index in REG_c. */
1049 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1050 "add %4, %%"REG_c" \n\t"
1051 WRITEBGR24(%%REGc, %5, %%REGa)
1054 :: "r" (&c->redDither),
1055 "m" (dummy), "m" (dummy), "m" (dummy),
1056 "r" (dest), "m" (dstW)
1057 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1061 YSCALEYUV2PACKEDX_ACCURATE
1063 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1065 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1066 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1067 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1070 WRITEBGR15(%4, %5, %%REGa)
1071 YSCALEYUV2PACKEDX_END
1074 YSCALEYUV2PACKEDX_ACCURATE
1076 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1078 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1079 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1080 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1083 WRITEBGR16(%4, %5, %%REGa)
1084 YSCALEYUV2PACKEDX_END
1087 YSCALEYUV2PACKEDX_ACCURATE
1088 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1090 "psraw $3, %%mm3 \n\t"
1091 "psraw $3, %%mm4 \n\t"
1092 "psraw $3, %%mm1 \n\t"
1093 "psraw $3, %%mm7 \n\t"
1094 WRITEYUY2(%4, %5, %%REGa)
1095 YSCALEYUV2PACKEDX_END
1099 switch(c->dstFormat)
1104 WRITEBGR32(%4, %5, %%REGa)
1105 YSCALEYUV2PACKEDX_END
1110 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1111 "add %4, %%"REG_c" \n\t"
1112 WRITEBGR24(%%REGc, %5, %%REGa)
1114 :: "r" (&c->redDither),
1115 "m" (dummy), "m" (dummy), "m" (dummy),
1116 "r" (dest), "m" (dstW)
1117 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1123 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1125 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1126 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1127 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1130 WRITEBGR15(%4, %5, %%REGa)
1131 YSCALEYUV2PACKEDX_END
1136 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1138 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1139 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1140 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1143 WRITEBGR16(%4, %5, %%REGa)
1144 YSCALEYUV2PACKEDX_END
1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1150 "psraw $3, %%mm3 \n\t"
1151 "psraw $3, %%mm4 \n\t"
1152 "psraw $3, %%mm1 \n\t"
1153 "psraw $3, %%mm7 \n\t"
1154 WRITEYUY2(%4, %5, %%REGa)
1155 YSCALEYUV2PACKEDX_END
1161 /* The following list of supported dstFormat values should
1162 match what's found in the body of altivec_yuv2packedX() */
1163 if(c->dstFormat==IMGFMT_ABGR || c->dstFormat==IMGFMT_BGRA ||
1164 c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
1165 c->dstFormat==IMGFMT_RGBA || c->dstFormat==IMGFMT_ARGB)
1166 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1167 chrFilter, chrSrc, chrFilterSize,
1171 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1172 chrFilter, chrSrc, chrFilterSize,
1177 * vertical bilinear scale YV12 to RGB
/* Vertical bilinear interpolation between two source lines (buf0/buf1 for
   luma, uvbuf0/uvbuf1 for chroma) combined with YUV->RGB conversion.
   yalpha/uvalpha are 0..4095 blend weights; yalpha1/uvalpha1 are their
   complements.  Contains a SWS_FULL_CHR_H_INT path (full-precision chroma,
   MMX for BGR32/BGR24 plus table-driven C fallbacks) and a faster
   switch-based MMX path using YSCALEYUV2RGB, ending in the generic
   YSCALE_YUV_2_ANYRGB_C fallback.
   NOTE(review): braces, asm-statement wrappers, case labels, and #ifdef
   lines were lost in extraction — confirm against the full source. */
1179 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1180 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1182 int yalpha1=yalpha^4095;
1183 int uvalpha1=uvalpha^4095;
1187 if(flags&SWS_FULL_CHR_H_INT)
1197 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1198 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1200 "movq %%mm3, %%mm1 \n\t"
1201 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1202 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1204 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1205 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1207 "add $4, %%"REG_a" \n\t"
1208 "cmp %5, %%"REG_a" \n\t"
1212 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1213 "m" (yalpha1), "m" (uvalpha1)
1223 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1224 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1226 "movq %%mm3, %%mm1 \n\t"
1227 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1228 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
/* Repack two BGR0 quads into 12 contiguous BGR bytes (24-bit output). */
1230 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1231 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1232 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1233 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1234 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1235 "movq %%mm1, %%mm2 \n\t"
1236 "psllq $48, %%mm1 \n\t" // 000000BG
1237 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1239 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1240 "psrld $16, %%mm2 \n\t" // R000R000
1241 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1242 "por %%mm2, %%mm1 \n\t" // RBGRR000
1244 "mov %4, %%"REG_b" \n\t"
1245 "add %%"REG_a", %%"REG_b" \n\t"
1249 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1250 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1252 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1253 "psrlq $32, %%mm3 \n\t"
1254 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1255 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1257 "add $4, %%"REG_a" \n\t"
1258 "cmp %5, %%"REG_a" \n\t"
1261 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1262 "m" (yalpha1), "m" (uvalpha1)
1263 : "%"REG_a, "%"REG_b
/* BGR15 path: dither, widen to words, shift into 5-5-5 fields, OR together. */
1271 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1272 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1273 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1275 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1276 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1277 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1279 "psrlw $3, %%mm3 \n\t"
1280 "psllw $2, %%mm1 \n\t"
1281 "psllw $7, %%mm0 \n\t"
1282 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1283 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1285 "por %%mm3, %%mm1 \n\t"
1286 "por %%mm1, %%mm0 \n\t"
1288 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1290 "add $4, %%"REG_a" \n\t"
1291 "cmp %5, %%"REG_a" \n\t"
1294 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1295 "m" (yalpha1), "m" (uvalpha1)
/* BGR16 path: same as BGR15 but 5-6-5 field widths and g6 dither. */
1304 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1305 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1306 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1308 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1309 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1310 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1312 "psrlw $3, %%mm3 \n\t"
1313 "psllw $3, %%mm1 \n\t"
1314 "psllw $8, %%mm0 \n\t"
1315 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1316 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1318 "por %%mm3, %%mm1 \n\t"
1319 "por %%mm1, %%mm0 \n\t"
1321 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1323 "add $4, %%"REG_a" \n\t"
1324 "cmp %5, %%"REG_a" \n\t"
1327 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1328 "m" (yalpha1), "m" (uvalpha1)
/* Non-MMX full-chroma fallbacks: table-driven blend + clip per pixel. */
1337 if(dstFormat==IMGFMT_BGR32)
1340 #ifdef WORDS_BIGENDIAN
1343 for(i=0;i<dstW;i++){
1344 // vertical linear interpolation && yuv2rgb in a single step:
1345 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1346 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1347 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1348 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1349 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1350 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1354 else if(dstFormat==IMGFMT_BGR24)
1357 for(i=0;i<dstW;i++){
1358 // vertical linear interpolation && yuv2rgb in a single step:
1359 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1360 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1361 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1362 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1363 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1364 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1368 else if(dstFormat==IMGFMT_BGR16)
1371 for(i=0;i<dstW;i++){
1372 // vertical linear interpolation && yuv2rgb in a single step:
1373 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1374 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1375 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1377 ((uint16_t*)dest)[i] =
1378 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1379 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1380 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1383 else if(dstFormat==IMGFMT_BGR15)
1386 for(i=0;i<dstW;i++){
1387 // vertical linear interpolation && yuv2rgb in a single step:
1388 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1389 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1390 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1392 ((uint16_t*)dest)[i] =
1393 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1394 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1395 clip_table15r[(Y + yuvtab_3343[V]) >>13];
/* Fast (non-full-chroma) MMX path: YSCALEYUV2RGB per format.  REG_b is
   saved to the context because it is the PIC/frame register on some ABIs. */
1403 switch(c->dstFormat)
1405 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1408 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1409 "mov %4, %%"REG_b" \n\t"
1410 "push %%"REG_BP" \n\t"
1411 YSCALEYUV2RGB(%%REGBP, %5)
1412 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1413 "pop %%"REG_BP" \n\t"
1414 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1416 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1422 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1423 "mov %4, %%"REG_b" \n\t"
1424 "push %%"REG_BP" \n\t"
1425 YSCALEYUV2RGB(%%REGBP, %5)
1426 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1427 "pop %%"REG_BP" \n\t"
1428 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1429 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1435 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1436 "mov %4, %%"REG_b" \n\t"
1437 "push %%"REG_BP" \n\t"
1438 YSCALEYUV2RGB(%%REGBP, %5)
1439 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1441 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1442 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1443 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1446 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1447 "pop %%"REG_BP" \n\t"
1448 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1450 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1456 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1457 "mov %4, %%"REG_b" \n\t"
1458 "push %%"REG_BP" \n\t"
1459 YSCALEYUV2RGB(%%REGBP, %5)
1460 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1462 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1463 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1464 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1467 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1468 "pop %%"REG_BP" \n\t"
1469 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1470 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1476 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1477 "mov %4, %%"REG_b" \n\t"
1478 "push %%"REG_BP" \n\t"
1479 YSCALEYUV2PACKED(%%REGBP, %5)
1480 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1481 "pop %%"REG_BP" \n\t"
1482 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1483 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1490 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1494 * YV12 to RGB without scaling or interpolating
/* YV12 -> packed RGB/YUY2 with no vertical scaling or interpolation.
   Full-chroma requests are delegated to yuv2packed2 with buf1==buf0.
   Otherwise two MMX ladders: uvalpha < 2048 uses the single-line
   YSCALEYUV2RGB1/PACKED1 macros (chroma shifted by 0.5 pixel, faster);
   uvalpha >= 2048 uses the ...1b variants which average uvbuf0/uvbuf1.
   C fallbacks via YSCALE_YUV_2_ANYRGB_C close the function.
   NOTE(review): braces, case labels, asm wrappers, and #ifdef lines were
   lost in extraction — confirm against the full source. */
1496 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1497 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1499 const int yalpha1=0;
1502 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1503 const int yalpha= 4096; //FIXME ...
1505 if(flags&SWS_FULL_CHR_H_INT)
1507 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1512 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1518 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1519 "mov %4, %%"REG_b" \n\t"
1520 "push %%"REG_BP" \n\t"
1521 YSCALEYUV2RGB1(%%REGBP, %5)
1522 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1523 "pop %%"REG_BP" \n\t"
1524 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1526 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1532 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1533 "mov %4, %%"REG_b" \n\t"
1534 "push %%"REG_BP" \n\t"
1535 YSCALEYUV2RGB1(%%REGBP, %5)
1536 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1537 "pop %%"REG_BP" \n\t"
1538 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1540 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1546 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1547 "mov %4, %%"REG_b" \n\t"
1548 "push %%"REG_BP" \n\t"
1549 YSCALEYUV2RGB1(%%REGBP, %5)
1550 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1552 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1553 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1554 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1556 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1557 "pop %%"REG_BP" \n\t"
1558 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1560 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1566 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1567 "mov %4, %%"REG_b" \n\t"
1568 "push %%"REG_BP" \n\t"
1569 YSCALEYUV2RGB1(%%REGBP, %5)
1570 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1572 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1573 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1574 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1577 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1578 "pop %%"REG_BP" \n\t"
1579 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1581 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1587 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1588 "mov %4, %%"REG_b" \n\t"
1589 "push %%"REG_BP" \n\t"
1590 YSCALEYUV2PACKED1(%%REGBP, %5)
1591 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1592 "pop %%"REG_BP" \n\t"
1593 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1595 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
/* uvalpha >= 2048: average the two chroma lines (YSCALEYUV2RGB1b). */
1607 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1608 "mov %4, %%"REG_b" \n\t"
1609 "push %%"REG_BP" \n\t"
1610 YSCALEYUV2RGB1b(%%REGBP, %5)
1611 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1612 "pop %%"REG_BP" \n\t"
1613 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1615 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1621 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1622 "mov %4, %%"REG_b" \n\t"
1623 "push %%"REG_BP" \n\t"
1624 YSCALEYUV2RGB1b(%%REGBP, %5)
1625 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1626 "pop %%"REG_BP" \n\t"
1627 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1629 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1635 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1636 "mov %4, %%"REG_b" \n\t"
1637 "push %%"REG_BP" \n\t"
1638 YSCALEYUV2RGB1b(%%REGBP, %5)
1639 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1641 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1642 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1643 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1645 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1646 "pop %%"REG_BP" \n\t"
1647 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1649 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1655 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1656 "mov %4, %%"REG_b" \n\t"
1657 "push %%"REG_BP" \n\t"
1658 YSCALEYUV2RGB1b(%%REGBP, %5)
1659 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1661 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1662 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1663 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1666 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1667 "pop %%"REG_BP" \n\t"
1668 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1670 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1676 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1677 "mov %4, %%"REG_b" \n\t"
1678 "push %%"REG_BP" \n\t"
1679 YSCALEYUV2PACKED1b(%%REGBP, %5)
1680 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1681 "pop %%"REG_BP" \n\t"
1682 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1684 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1691 if( uvalpha < 2048 )
1693 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1695 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1699 //FIXME yuy2* can read upto 7 samples to much
/* Extract the Y (even) bytes from a YUYV line into dst.  The MMX path masks
   with bm01010101 and packs 16 source bytes -> 8 luma bytes per iteration,
   counting a negative index up to zero; the C fallback copies src[2*i]. */
1701 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1705 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1706 "mov %0, %%"REG_a" \n\t"
1708 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1709 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1710 "pand %%mm2, %%mm0 \n\t"
1711 "pand %%mm2, %%mm1 \n\t"
1712 "packuswb %%mm1, %%mm0 \n\t"
1713 "movq %%mm0, (%2, %%"REG_a") \n\t"
1714 "add $8, %%"REG_a" \n\t"
1716 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1721 for(i=0; i<width; i++)
/* Extract and vertically average U/V from two YUYV lines (src1, src2).
   The MMX2/3DNow path uses PAVGB-style averaging (elided lines) then
   separates U (bytes 1,5,...) from V (bytes 3,7,...); the C fallback
   averages the two lines per component. */
1726 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1728 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1730 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1731 "mov %0, %%"REG_a" \n\t"
1733 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1734 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1735 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1736 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1739 "psrlw $8, %%mm0 \n\t"
1740 "psrlw $8, %%mm1 \n\t"
1741 "packuswb %%mm1, %%mm0 \n\t"
1742 "movq %%mm0, %%mm1 \n\t"
1743 "psrlw $8, %%mm0 \n\t"
1744 "pand %%mm4, %%mm1 \n\t"
1745 "packuswb %%mm0, %%mm0 \n\t"
1746 "packuswb %%mm1, %%mm1 \n\t"
1747 "movd %%mm0, (%4, %%"REG_a") \n\t"
1748 "movd %%mm1, (%3, %%"REG_a") \n\t"
1749 "add $4, %%"REG_a" \n\t"
1751 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1756 for(i=0; i<width; i++)
1758 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1759 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1764 //this is allmost identical to the previous, end exists only cuz yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses
/* Extract the Y (odd) bytes from a UYVY line into dst; mirror of yuy2ToY
   but selecting the high byte of each word via psrlw $8. */
1765 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1769 "mov %0, %%"REG_a" \n\t"
1771 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1772 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1773 "psrlw $8, %%mm0 \n\t"
1774 "psrlw $8, %%mm1 \n\t"
1775 "packuswb %%mm1, %%mm0 \n\t"
1776 "movq %%mm0, (%2, %%"REG_a") \n\t"
1777 "add $8, %%"REG_a" \n\t"
1779 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1784 for(i=0; i<width; i++)
/* Extract and vertically average U/V from two UYVY lines; mirror of
   yuy2ToUV with chroma in the even bytes (pand bm01010101 instead of
   psrlw for the first separation step). */
1789 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1791 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1793 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1794 "mov %0, %%"REG_a" \n\t"
1796 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1797 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1798 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1799 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1802 "pand %%mm4, %%mm0 \n\t"
1803 "pand %%mm4, %%mm1 \n\t"
1804 "packuswb %%mm1, %%mm0 \n\t"
1805 "movq %%mm0, %%mm1 \n\t"
1806 "psrlw $8, %%mm0 \n\t"
1807 "pand %%mm4, %%mm1 \n\t"
1808 "packuswb %%mm0, %%mm0 \n\t"
1809 "packuswb %%mm1, %%mm1 \n\t"
1810 "movd %%mm0, (%4, %%"REG_a") \n\t"
1811 "movd %%mm1, (%3, %%"REG_a") \n\t"
1812 "add $4, %%"REG_a" \n\t"
1814 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1819 for(i=0; i<width; i++)
1821 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1822 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
1827 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1830 for(i=0; i<width; i++)
1832 int b= ((uint32_t*)src)[i]&0xFF;
1833 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1834 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1836 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1840 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1843 for(i=0; i<width; i++)
1845 const int a= ((uint32_t*)src1)[2*i+0];
1846 const int e= ((uint32_t*)src1)[2*i+1];
1847 const int c= ((uint32_t*)src2)[2*i+0];
1848 const int d= ((uint32_t*)src2)[2*i+1];
1849 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1850 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1851 const int b= l&0x3FF;
1855 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1856 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/* BGR24 -> 8-bit luma.  MMX path: per iteration, load 8 pixels as 3-byte
   groups, widen to words, pmaddwd against bgr2YCoeff, reduce with w1111,
   pack and add bgr2YOffset; C fallback computes the same weighted sum per
   pixel.
   NOTE(review): the asm() wrapper, braces, loop label, and some #else/#endif
   lines were lost in extraction — confirm against the full source. */
1860 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1864 "mov %2, %%"REG_a" \n\t"
1865 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1866 "movq "MANGLE(w1111)", %%mm5 \n\t"
1867 "pxor %%mm7, %%mm7 \n\t"
1868 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
1871 PREFETCH" 64(%0, %%"REG_d") \n\t"
1872 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1873 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1874 "punpcklbw %%mm7, %%mm0 \n\t"
1875 "punpcklbw %%mm7, %%mm1 \n\t"
1876 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1877 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1878 "punpcklbw %%mm7, %%mm2 \n\t"
1879 "punpcklbw %%mm7, %%mm3 \n\t"
1880 "pmaddwd %%mm6, %%mm0 \n\t"
1881 "pmaddwd %%mm6, %%mm1 \n\t"
1882 "pmaddwd %%mm6, %%mm2 \n\t"
1883 "pmaddwd %%mm6, %%mm3 \n\t"
1884 #ifndef FAST_BGR2YV12
1885 "psrad $8, %%mm0 \n\t"
1886 "psrad $8, %%mm1 \n\t"
1887 "psrad $8, %%mm2 \n\t"
1888 "psrad $8, %%mm3 \n\t"
1890 "packssdw %%mm1, %%mm0 \n\t"
1891 "packssdw %%mm3, %%mm2 \n\t"
1892 "pmaddwd %%mm5, %%mm0 \n\t"
1893 "pmaddwd %%mm5, %%mm2 \n\t"
1894 "packssdw %%mm2, %%mm0 \n\t"
1895 "psraw $7, %%mm0 \n\t"
1897 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1898 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1899 "punpcklbw %%mm7, %%mm4 \n\t"
1900 "punpcklbw %%mm7, %%mm1 \n\t"
1901 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1902 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1903 "punpcklbw %%mm7, %%mm2 \n\t"
1904 "punpcklbw %%mm7, %%mm3 \n\t"
1905 "pmaddwd %%mm6, %%mm4 \n\t"
1906 "pmaddwd %%mm6, %%mm1 \n\t"
1907 "pmaddwd %%mm6, %%mm2 \n\t"
1908 "pmaddwd %%mm6, %%mm3 \n\t"
1909 #ifndef FAST_BGR2YV12
1910 "psrad $8, %%mm4 \n\t"
1911 "psrad $8, %%mm1 \n\t"
1912 "psrad $8, %%mm2 \n\t"
1913 "psrad $8, %%mm3 \n\t"
1915 "packssdw %%mm1, %%mm4 \n\t"
1916 "packssdw %%mm3, %%mm2 \n\t"
1917 "pmaddwd %%mm5, %%mm4 \n\t"
1918 "pmaddwd %%mm5, %%mm2 \n\t"
1919 "add $24, %%"REG_d" \n\t"
1920 "packssdw %%mm2, %%mm4 \n\t"
1921 "psraw $7, %%mm4 \n\t"
1923 "packuswb %%mm4, %%mm0 \n\t"
1924 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1926 "movq %%mm0, (%1, %%"REG_a") \n\t"
1927 "add $8, %%"REG_a" \n\t"
1929 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1930 : "%"REG_a, "%"REG_d
1934 for(i=0; i<width; i++)
1940 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/* BGR24 -> chroma (U/V), averaging 2x2 blocks across src1/src2.  MMX path:
   sums pairs of 3-byte pixels from both lines (movq+shift variant on
   MMX2/3DNow, movd variant otherwise), pmaddwd against bgr2UCoeff (mm6) and
   bgr2VCoeff, reduces with w1111, packs U and V nibbles apart and adds
   bgr2UVOffset; C fallback computes the same 4-sample weighted sums.
   NOTE(review): the asm() wrapper, braces, loop label, and several
   #if/#else/#endif lines were lost in extraction — confirm against the
   full source. */
1945 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1949 "mov %4, %%"REG_a" \n\t"
1950 "movq "MANGLE(w1111)", %%mm5 \n\t"
1951 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1952 "pxor %%mm7, %%mm7 \n\t"
1953 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1954 "add %%"REG_d", %%"REG_d" \n\t"
1957 PREFETCH" 64(%0, %%"REG_d") \n\t"
1958 PREFETCH" 64(%1, %%"REG_d") \n\t"
1959 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1960 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1961 "movq (%1, %%"REG_d"), %%mm1 \n\t"
1962 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1963 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1966 "movq %%mm0, %%mm1 \n\t"
1967 "movq %%mm2, %%mm3 \n\t"
1968 "psrlq $24, %%mm0 \n\t"
1969 "psrlq $24, %%mm2 \n\t"
1972 "punpcklbw %%mm7, %%mm0 \n\t"
1973 "punpcklbw %%mm7, %%mm2 \n\t"
1975 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1976 "movd (%1, %%"REG_d"), %%mm1 \n\t"
1977 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1978 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1979 "punpcklbw %%mm7, %%mm0 \n\t"
1980 "punpcklbw %%mm7, %%mm1 \n\t"
1981 "punpcklbw %%mm7, %%mm2 \n\t"
1982 "punpcklbw %%mm7, %%mm3 \n\t"
1983 "paddw %%mm1, %%mm0 \n\t"
1984 "paddw %%mm3, %%mm2 \n\t"
1985 "paddw %%mm2, %%mm0 \n\t"
1986 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1987 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1988 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1989 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1990 "punpcklbw %%mm7, %%mm4 \n\t"
1991 "punpcklbw %%mm7, %%mm1 \n\t"
1992 "punpcklbw %%mm7, %%mm2 \n\t"
1993 "punpcklbw %%mm7, %%mm3 \n\t"
1994 "paddw %%mm1, %%mm4 \n\t"
1995 "paddw %%mm3, %%mm2 \n\t"
1996 "paddw %%mm4, %%mm2 \n\t"
1997 "psrlw $2, %%mm0 \n\t"
1998 "psrlw $2, %%mm2 \n\t"
2000 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2001 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2003 "pmaddwd %%mm0, %%mm1 \n\t"
2004 "pmaddwd %%mm2, %%mm3 \n\t"
2005 "pmaddwd %%mm6, %%mm0 \n\t"
2006 "pmaddwd %%mm6, %%mm2 \n\t"
2007 #ifndef FAST_BGR2YV12
2008 "psrad $8, %%mm0 \n\t"
2009 "psrad $8, %%mm1 \n\t"
2010 "psrad $8, %%mm2 \n\t"
2011 "psrad $8, %%mm3 \n\t"
2013 "packssdw %%mm2, %%mm0 \n\t"
2014 "packssdw %%mm3, %%mm1 \n\t"
2015 "pmaddwd %%mm5, %%mm0 \n\t"
2016 "pmaddwd %%mm5, %%mm1 \n\t"
2017 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2018 "psraw $7, %%mm0 \n\t"
2020 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2021 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2022 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2023 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2024 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2027 "movq %%mm4, %%mm1 \n\t"
2028 "movq %%mm2, %%mm3 \n\t"
2029 "psrlq $24, %%mm4 \n\t"
2030 "psrlq $24, %%mm2 \n\t"
2033 "punpcklbw %%mm7, %%mm4 \n\t"
2034 "punpcklbw %%mm7, %%mm2 \n\t"
2036 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2037 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2038 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2039 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2040 "punpcklbw %%mm7, %%mm4 \n\t"
2041 "punpcklbw %%mm7, %%mm1 \n\t"
2042 "punpcklbw %%mm7, %%mm2 \n\t"
2043 "punpcklbw %%mm7, %%mm3 \n\t"
2044 "paddw %%mm1, %%mm4 \n\t"
2045 "paddw %%mm3, %%mm2 \n\t"
2046 "paddw %%mm2, %%mm4 \n\t"
2047 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2048 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2049 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2050 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2051 "punpcklbw %%mm7, %%mm5 \n\t"
2052 "punpcklbw %%mm7, %%mm1 \n\t"
2053 "punpcklbw %%mm7, %%mm2 \n\t"
2054 "punpcklbw %%mm7, %%mm3 \n\t"
2055 "paddw %%mm1, %%mm5 \n\t"
2056 "paddw %%mm3, %%mm2 \n\t"
2057 "paddw %%mm5, %%mm2 \n\t"
2058 "movq "MANGLE(w1111)", %%mm5 \n\t"
2059 "psrlw $2, %%mm4 \n\t"
2060 "psrlw $2, %%mm2 \n\t"
2062 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2063 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2065 "pmaddwd %%mm4, %%mm1 \n\t"
2066 "pmaddwd %%mm2, %%mm3 \n\t"
2067 "pmaddwd %%mm6, %%mm4 \n\t"
2068 "pmaddwd %%mm6, %%mm2 \n\t"
2069 #ifndef FAST_BGR2YV12
2070 "psrad $8, %%mm4 \n\t"
2071 "psrad $8, %%mm1 \n\t"
2072 "psrad $8, %%mm2 \n\t"
2073 "psrad $8, %%mm3 \n\t"
2075 "packssdw %%mm2, %%mm4 \n\t"
2076 "packssdw %%mm3, %%mm1 \n\t"
2077 "pmaddwd %%mm5, %%mm4 \n\t"
2078 "pmaddwd %%mm5, %%mm1 \n\t"
2079 "add $24, %%"REG_d" \n\t"
2080 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2081 "psraw $7, %%mm4 \n\t"
2083 "movq %%mm0, %%mm1 \n\t"
2084 "punpckldq %%mm4, %%mm0 \n\t"
2085 "punpckhdq %%mm4, %%mm1 \n\t"
2086 "packsswb %%mm1, %%mm0 \n\t"
2087 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2089 "movd %%mm0, (%2, %%"REG_a") \n\t"
2090 "punpckhdq %%mm0, %%mm0 \n\t"
2091 "movd %%mm0, (%3, %%"REG_a") \n\t"
2092 "add $4, %%"REG_a" \n\t"
2094 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2095 : "%"REG_a, "%"REG_d
2099 for(i=0; i<width; i++)
2101 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2102 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2103 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2105 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2106 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2111 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2114 for(i=0; i<width; i++)
2116 int d= ((uint16_t*)src)[i];
2119 int r= (d>>11)&0x1F;
2121 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2125 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2128 for(i=0; i<width; i++)
2130 int d0= ((uint32_t*)src1)[i];
2131 int d1= ((uint32_t*)src2)[i];
2133 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
2134 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
2136 int dh2= (dh>>11) + (dh<<21);
2140 int r= (d>>11)&0x7F;
2142 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
2143 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
2147 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2150 for(i=0; i<width; i++)
2152 int d= ((uint16_t*)src)[i];
2155 int r= (d>>10)&0x1F;
2157 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2161 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2164 for(i=0; i<width; i++)
2166 int d0= ((uint32_t*)src1)[i];
2167 int d1= ((uint32_t*)src2)[i];
2169 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
2170 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
2172 int dh2= (dh>>11) + (dh<<21);
2176 int r= (d>>10)&0x7F;
2178 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
2179 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
2184 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2187 for(i=0; i<width; i++)
2189 int r= ((uint32_t*)src)[i]&0xFF;
2190 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2191 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2193 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2197 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2200 for(i=0; i<width; i++)
2202 const int a= ((uint32_t*)src1)[2*i+0];
2203 const int e= ((uint32_t*)src1)[2*i+1];
2204 const int c= ((uint32_t*)src2)[2*i+0];
2205 const int d= ((uint32_t*)src2)[2*i+1];
2206 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
2207 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
2208 const int r= l&0x3FF;
2212 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2213 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2217 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2220 for(i=0; i<width; i++)
2226 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2230 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2233 for(i=0; i<width; i++)
2235 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2236 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2237 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2239 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2240 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2245 // Bilinear / Bicubic scaling
/* hScale: generic horizontal scaler — convolves each output sample with a
   filterSize-tap filter at position filterPos[i], writing clipped 15-bit
   intermediates to dst.  Three MMX fast paths (filterSize 4, 8, generic)
   plus an AltiVec call and a plain-C fallback.
   NOTE(review): this chunk is a sampled extract — asm statement openers,
   braces, #ifdef guards and loop labels are missing, and every line carries
   a stray leading number from the extraction.  Code below is byte-identical;
   only comments were added. */
2246 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2247 int16_t *filter, int16_t *filterPos, long filterSize)
2250 assert(filterSize % 4 == 0 && filterSize>0);
/* --- MMX path, 4-tap filters: two output samples per iteration.
   pmaddwd does the 16x16->32 multiply-accumulate of src bytes (zero-
   extended via punpcklbw with mm7==0) against the filter coefficients. */
2251 if(filterSize==4) // allways true for upscaling, sometimes for down too
2253 long counter= -2*dstW;
2255 filterPos-= counter/2;
2259 "push %%"REG_b" \n\t"
2261 "pxor %%mm7, %%mm7 \n\t"
2262 "movq "MANGLE(w02)", %%mm6 \n\t"
2263 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2264 "mov %%"REG_a", %%"REG_BP" \n\t"
/* loop body: load two filter positions, two 4-tap coefficient quads,
   multiply-accumulate, shift down by 8 and pairwise-sum via w02 */
2267 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2268 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2269 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2270 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2271 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2272 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2273 "punpcklbw %%mm7, %%mm0 \n\t"
2274 "punpcklbw %%mm7, %%mm2 \n\t"
2275 "pmaddwd %%mm1, %%mm0 \n\t"
2276 "pmaddwd %%mm2, %%mm3 \n\t"
2277 "psrad $8, %%mm0 \n\t"
2278 "psrad $8, %%mm3 \n\t"
2279 "packssdw %%mm3, %%mm0 \n\t"
2280 "pmaddwd %%mm6, %%mm0 \n\t"
2281 "packssdw %%mm0, %%mm0 \n\t"
2282 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2283 "add $4, %%"REG_BP" \n\t"
2286 "pop %%"REG_BP" \n\t"
2288 "pop %%"REG_b" \n\t"
2291 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* --- MMX path, 8-tap filters: same scheme, two 4-tap halves summed */
2297 else if(filterSize==8)
2299 long counter= -2*dstW;
2301 filterPos-= counter/2;
2305 "push %%"REG_b" \n\t"
2307 "pxor %%mm7, %%mm7 \n\t"
2308 "movq "MANGLE(w02)", %%mm6 \n\t"
2309 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2310 "mov %%"REG_a", %%"REG_BP" \n\t"
2313 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2314 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2315 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2316 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2317 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2318 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2319 "punpcklbw %%mm7, %%mm0 \n\t"
2320 "punpcklbw %%mm7, %%mm2 \n\t"
2321 "pmaddwd %%mm1, %%mm0 \n\t"
2322 "pmaddwd %%mm2, %%mm3 \n\t"
/* second half of the 8-tap filter, accumulated into mm0/mm3 */
2324 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2325 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2326 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2327 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2328 "punpcklbw %%mm7, %%mm4 \n\t"
2329 "punpcklbw %%mm7, %%mm2 \n\t"
2330 "pmaddwd %%mm1, %%mm4 \n\t"
2331 "pmaddwd %%mm2, %%mm5 \n\t"
2332 "paddd %%mm4, %%mm0 \n\t"
2333 "paddd %%mm5, %%mm3 \n\t"
2335 "psrad $8, %%mm0 \n\t"
2336 "psrad $8, %%mm3 \n\t"
2337 "packssdw %%mm3, %%mm0 \n\t"
2338 "pmaddwd %%mm6, %%mm0 \n\t"
2339 "packssdw %%mm0, %%mm0 \n\t"
2340 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2341 "add $4, %%"REG_BP" \n\t"
2344 "pop %%"REG_BP" \n\t"
2346 "pop %%"REG_b" \n\t"
2349 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* --- generic MMX path: inner loop over the filter 4 taps at a time,
   accumulating into mm4/mm5 until REG_c reaches `offset` (src+filterSize,
   used as the loop bound marker) */
2357 uint8_t *offset = src+filterSize;
2358 long counter= -2*dstW;
2359 // filter-= counter*filterSize/2;
2360 filterPos-= counter/2;
2363 "pxor %%mm7, %%mm7 \n\t"
2364 "movq "MANGLE(w02)", %%mm6 \n\t"
2367 "mov %2, %%"REG_c" \n\t"
2368 "movzwl (%%"REG_c", %0), %%eax \n\t"
2369 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2370 "mov %5, %%"REG_c" \n\t"
2371 "pxor %%mm4, %%mm4 \n\t"
2372 "pxor %%mm5, %%mm5 \n\t"
2374 "movq (%1), %%mm1 \n\t"
2375 "movq (%1, %6), %%mm3 \n\t"
2376 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2377 "movd (%%"REG_c", %%"REG_d"), %%mm2\n\t"
2378 "punpcklbw %%mm7, %%mm0 \n\t"
2379 "punpcklbw %%mm7, %%mm2 \n\t"
2380 "pmaddwd %%mm1, %%mm0 \n\t"
2381 "pmaddwd %%mm2, %%mm3 \n\t"
2382 "paddd %%mm3, %%mm5 \n\t"
2383 "paddd %%mm0, %%mm4 \n\t"
2385 "add $4, %%"REG_c" \n\t"
2386 "cmp %4, %%"REG_c" \n\t"
2389 "psrad $8, %%mm4 \n\t"
2390 "psrad $8, %%mm5 \n\t"
2391 "packssdw %%mm5, %%mm4 \n\t"
2392 "pmaddwd %%mm6, %%mm4 \n\t"
2393 "packssdw %%mm4, %%mm4 \n\t"
2394 "mov %3, %%"REG_a" \n\t"
2395 "movd %%mm4, (%%"REG_a", %0) \n\t"
2399 : "+r" (counter), "+r" (filter)
2400 : "m" (filterPos), "m" (dst), "m"(offset),
2401 "m" (src), "r" (filterSize*2)
2402 : "%"REG_a, "%"REG_c, "%"REG_d
/* --- AltiVec build: delegate entirely */
2407 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
/* --- plain-C reference fallback: straightforward dot product per output,
   result scaled down by 7 and clipped to the 15-bit intermediate range */
2410 for(i=0; i<dstW; i++)
2413 int srcPos= filterPos[i];
2415 // printf("filterPos: %d\n", filterPos[i]);
2416 for(j=0; j<filterSize; j++)
2418 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2419 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2421 // filter += hFilterSize;
2422 dst[i] = FFMIN(FFMAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2428 // *** horizontal scale Y line to temp buffer
/* hyscale: horizontally scale one luma line into the 16-bit temp buffer.
   Step 1: if the source is a packed/RGB format, convert it to 8-bit gray
   via the matching *ToY helper into formatConvBuffer.  Step 2: scale with
   either the generic hScale (custom filters / no MMX2), the MMX2 "funny
   code" (runtime-generated, pointed to by funnyYCode), or a scalar x86
   bilinear loop; plain-C bilinear as the portable fallback.
   NOTE(review): sampled extract — asm openers, braces, #if/#else/#endif
   lines are missing and each line carries a stray leading number.  Code is
   byte-identical; only comments were added. */
2429 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2430 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2431 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2432 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2433 int32_t *mmx2FilterPos)
/* input-format dispatch: normalize everything to an 8-bit luma line */
2435 if(srcFormat==IMGFMT_YUY2)
2437 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2438 src= formatConvBuffer;
2440 else if(srcFormat==IMGFMT_UYVY)
2442 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2443 src= formatConvBuffer;
2445 else if(srcFormat==IMGFMT_BGR32)
2447 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2448 src= formatConvBuffer;
2450 else if(srcFormat==IMGFMT_BGR24)
2452 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2453 src= formatConvBuffer;
2455 else if(srcFormat==IMGFMT_BGR16)
2457 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2458 src= formatConvBuffer;
2460 else if(srcFormat==IMGFMT_BGR15)
2462 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2463 src= formatConvBuffer;
2465 else if(srcFormat==IMGFMT_RGB32)
2467 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2468 src= formatConvBuffer;
2470 else if(srcFormat==IMGFMT_RGB24)
2472 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2473 src= formatConvBuffer;
2477 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2478 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2480 if(!(flags&SWS_FAST_BILINEAR))
/* exact filter path */
2483 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2485 else // Fast Bilinear upscale / crap downscale
2487 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/* MMX2 path: jump into runtime-generated scaler code (funnyYCode);
   ebx must be preserved manually around the call (PIC/frame register) */
2491 uint64_t ebxsave __attribute__((aligned(8)));
2497 "mov %%"REG_b", %5 \n\t"
2499 "pxor %%mm7, %%mm7 \n\t"
2500 "mov %0, %%"REG_c" \n\t"
2501 "mov %1, %%"REG_D" \n\t"
2502 "mov %2, %%"REG_d" \n\t"
2503 "mov %3, %%"REG_b" \n\t"
2504 "xor %%"REG_a", %%"REG_a" \n\t" // i
2505 PREFETCH" (%%"REG_c") \n\t"
2506 PREFETCH" 32(%%"REG_c") \n\t"
2507 PREFETCH" 64(%%"REG_c") \n\t"
/* two FUNNY_Y_CODE variants: 64-bit uses a register add for the src
   advance, 32-bit adds the table entry directly to REG_c */
2511 #define FUNNY_Y_CODE \
2512 "movl (%%"REG_b"), %%esi \n\t"\
2514 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2515 "add %%"REG_S", %%"REG_c" \n\t"\
2516 "add %%"REG_a", %%"REG_D" \n\t"\
2517 "xor %%"REG_a", %%"REG_a" \n\t"\
2521 #define FUNNY_Y_CODE \
2522 "movl (%%"REG_b"), %%esi \n\t"\
2524 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2525 "add %%"REG_a", %%"REG_D" \n\t"\
2526 "xor %%"REG_a", %%"REG_a" \n\t"\
2540 "mov %5, %%"REG_b" \n\t"
2542 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2547 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* edge fixup: outputs that would read past srcW-1 get the last pixel */
2552 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
/* scalar x86 bilinear: 16.16 fixed point, carry from the fractional
   add (addw/adc) advances the integer source index */
2557 long xInc_shr16 = xInc >> 16;
2558 uint16_t xInc_mask = xInc & 0xffff;
2559 //NO MMX just normal asm ...
2561 "xor %%"REG_a", %%"REG_a" \n\t" // i
2562 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2563 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2566 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2567 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2568 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2569 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2570 "shll $16, %%edi \n\t"
2571 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2572 "mov %1, %%"REG_D" \n\t"
2573 "shrl $9, %%esi \n\t"
2574 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2575 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2576 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
/* second output sample of the unrolled-by-2 loop body */
2578 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2579 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2580 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2581 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2582 "shll $16, %%edi \n\t"
2583 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2584 "mov %1, %%"REG_D" \n\t"
2585 "shrl $9, %%esi \n\t"
2586 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2587 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2588 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2591 "add $2, %%"REG_a" \n\t"
2592 "cmp %2, %%"REG_a" \n\t"
2596 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2597 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2600 } //if MMX2 can't be used
/* portable C bilinear fallback: 16.16 fixed-point source position */
2604 unsigned int xpos=0;
2605 for(i=0;i<dstWidth;i++)
2607 register unsigned int xx=xpos>>16;
2608 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2609 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
/* hcscale: horizontally scale one chroma line pair (U into dst[0..],
   V into dst[2048..] — the 2048/4096 offsets are the fixed U/V plane
   split in the temp buffer).  Mirrors hyscale: format-normalize via the
   *ToUV helpers, then exact hScale, MMX2 funny code, scalar x86
   bilinear, or plain-C bilinear.
   NOTE(review): sampled extract — asm openers, braces and preprocessor
   guard lines are missing and each line carries a stray leading number.
   Code is byte-identical; only comments were added. */
2616 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2617 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2618 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2619 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2620 int32_t *mmx2FilterPos)
/* input-format dispatch: produce separate 8-bit U and V lines */
2622 if(srcFormat==IMGFMT_YUY2)
2624 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2625 src1= formatConvBuffer;
2626 src2= formatConvBuffer+2048;
2628 else if(srcFormat==IMGFMT_UYVY)
2630 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2631 src1= formatConvBuffer;
2632 src2= formatConvBuffer+2048;
2634 else if(srcFormat==IMGFMT_BGR32)
2636 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2637 src1= formatConvBuffer;
2638 src2= formatConvBuffer+2048;
2640 else if(srcFormat==IMGFMT_BGR24)
2642 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2643 src1= formatConvBuffer;
2644 src2= formatConvBuffer+2048;
2646 else if(srcFormat==IMGFMT_BGR16)
2648 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2649 src1= formatConvBuffer;
2650 src2= formatConvBuffer+2048;
2652 else if(srcFormat==IMGFMT_BGR15)
2654 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2655 src1= formatConvBuffer;
2656 src2= formatConvBuffer+2048;
2658 else if(srcFormat==IMGFMT_RGB32)
2660 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2661 src1= formatConvBuffer;
2662 src2= formatConvBuffer+2048;
2664 else if(srcFormat==IMGFMT_RGB24)
2666 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2667 src1= formatConvBuffer;
2668 src2= formatConvBuffer+2048;
/* gray input has no chroma — nothing to do */
2670 else if(isGray(srcFormat))
2676 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2677 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2679 if(!(flags&SWS_FAST_BILINEAR))
/* exact filter path: U then V, V offset by 2048 int16 samples */
2682 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2683 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2685 else // Fast Bilinear upscale / crap downscale
2687 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/* MMX2 funny-code path; ebx saved/restored manually as in hyscale */
2691 uint64_t ebxsave __attribute__((aligned(8)));
2697 "mov %%"REG_b", %6 \n\t"
2699 "pxor %%mm7, %%mm7 \n\t"
2700 "mov %0, %%"REG_c" \n\t"
2701 "mov %1, %%"REG_D" \n\t"
2702 "mov %2, %%"REG_d" \n\t"
2703 "mov %3, %%"REG_b" \n\t"
2704 "xor %%"REG_a", %%"REG_a" \n\t" // i
2705 PREFETCH" (%%"REG_c") \n\t"
2706 PREFETCH" 32(%%"REG_c") \n\t"
2707 PREFETCH" 64(%%"REG_c") \n\t"
/* two FUNNY_UV_CODE variants (64-bit vs 32-bit addressing) */
2711 #define FUNNY_UV_CODE \
2712 "movl (%%"REG_b"), %%esi \n\t"\
2714 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2715 "add %%"REG_S", %%"REG_c" \n\t"\
2716 "add %%"REG_a", %%"REG_D" \n\t"\
2717 "xor %%"REG_a", %%"REG_a" \n\t"\
2721 #define FUNNY_UV_CODE \
2722 "movl (%%"REG_b"), %%esi \n\t"\
2724 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2725 "add %%"REG_a", %%"REG_D" \n\t"\
2726 "xor %%"REG_a", %%"REG_a" \n\t"\
/* second pass for the V plane: src2, destination offset +4096 bytes */
2734 "xor %%"REG_a", %%"REG_a" \n\t" // i
2735 "mov %5, %%"REG_c" \n\t" // src
2736 "mov %1, %%"REG_D" \n\t" // buf1
2737 "add $4096, %%"REG_D" \n\t"
2738 PREFETCH" (%%"REG_c") \n\t"
2739 PREFETCH" 32(%%"REG_c") \n\t"
2740 PREFETCH" 64(%%"REG_c") \n\t"
2748 "mov %6, %%"REG_b" \n\t"
2750 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2751 "m" (funnyUVCode), "m" (src2)
2755 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* edge fixup for both planes */
2760 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2762 // printf("%d %d %d\n", dstWidth, i, srcW);
2763 dst[i] = src1[srcW-1]*128;
2764 dst[i+2048] = src2[srcW-1]*128;
/* scalar x86 bilinear: interleaves U and V per iteration, carry from
   the 16.16 fractional add advances the shared source index */
2770 long xInc_shr16 = (long) (xInc >> 16);
2771 uint16_t xInc_mask = xInc & 0xffff;
2773 "xor %%"REG_a", %%"REG_a" \n\t" // i
2774 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2775 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2778 "mov %0, %%"REG_S" \n\t"
2779 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2780 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2781 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2782 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2783 "shll $16, %%edi \n\t"
2784 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2785 "mov %1, %%"REG_D" \n\t"
2786 "shrl $9, %%esi \n\t"
2787 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
/* same interpolation for the V plane, stored 4096 bytes further */
2789 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2790 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2791 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2792 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2793 "shll $16, %%edi \n\t"
2794 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2795 "mov %1, %%"REG_D" \n\t"
2796 "shrl $9, %%esi \n\t"
2797 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2799 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2800 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2801 "add $1, %%"REG_a" \n\t"
2802 "cmp %2, %%"REG_a" \n\t"
2805 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2806 which is needed to support GCC-4.0 */
2807 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2808 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2810 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2813 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2816 } //if MMX2 can't be used
/* portable C bilinear fallback; the first dst[i] pair looks like an
   alternative rounding variant kept under a (missing) #ifdef — the
   second pair matches hyscale's formula.  TODO confirm which guard
   selected them in the unsampled file. */
2820 unsigned int xpos=0;
2821 for(i=0;i<dstWidth;i++)
2823 register unsigned int xx=xpos>>16;
2824 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2825 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2826 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2828 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2829 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2837 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2838 int srcSliceH, uint8_t* dst[], int dstStride[]){
2840 /* load a few things into local vars to make the code more readable? and faster */
2841 const int srcW= c->srcW;
2842 const int dstW= c->dstW;
2843 const int dstH= c->dstH;
2844 const int chrDstW= c->chrDstW;
2845 const int chrSrcW= c->chrSrcW;
2846 const int lumXInc= c->lumXInc;
2847 const int chrXInc= c->chrXInc;
2848 const int dstFormat= c->dstFormat;
2849 const int srcFormat= c->srcFormat;
2850 const int flags= c->flags;
2851 const int canMMX2BeUsed= c->canMMX2BeUsed;
2852 int16_t *vLumFilterPos= c->vLumFilterPos;
2853 int16_t *vChrFilterPos= c->vChrFilterPos;
2854 int16_t *hLumFilterPos= c->hLumFilterPos;
2855 int16_t *hChrFilterPos= c->hChrFilterPos;
2856 int16_t *vLumFilter= c->vLumFilter;
2857 int16_t *vChrFilter= c->vChrFilter;
2858 int16_t *hLumFilter= c->hLumFilter;
2859 int16_t *hChrFilter= c->hChrFilter;
2860 int32_t *lumMmxFilter= c->lumMmxFilter;
2861 int32_t *chrMmxFilter= c->chrMmxFilter;
2862 const int vLumFilterSize= c->vLumFilterSize;
2863 const int vChrFilterSize= c->vChrFilterSize;
2864 const int hLumFilterSize= c->hLumFilterSize;
2865 const int hChrFilterSize= c->hChrFilterSize;
2866 int16_t **lumPixBuf= c->lumPixBuf;
2867 int16_t **chrPixBuf= c->chrPixBuf;
2868 const int vLumBufSize= c->vLumBufSize;
2869 const int vChrBufSize= c->vChrBufSize;
2870 uint8_t *funnyYCode= c->funnyYCode;
2871 uint8_t *funnyUVCode= c->funnyUVCode;
2872 uint8_t *formatConvBuffer= c->formatConvBuffer;
2873 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2874 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2877 /* vars whch will change and which we need to storw back in the context */
2879 int lumBufIndex= c->lumBufIndex;
2880 int chrBufIndex= c->chrBufIndex;
2881 int lastInLumBuf= c->lastInLumBuf;
2882 int lastInChrBuf= c->lastInChrBuf;
2884 if(isPacked(c->srcFormat)){
2890 srcStride[2]= srcStride[0];
2892 srcStride[1]<<= c->vChrDrop;
2893 srcStride[2]<<= c->vChrDrop;
2895 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2896 // (int)dst[0], (int)dst[1], (int)dst[2]);
2898 #if 0 //self test FIXME move to a vfilter or something
2900 static volatile int i=0;
2902 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2903 selfTest(src, srcStride, c->srcW, c->srcH);
2908 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2909 //dstStride[0],dstStride[1],dstStride[2]);
2911 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2913 static int firstTime=1; //FIXME move this into the context perhaps
2914 if(flags & SWS_PRINT_INFO && firstTime)
2916 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2917 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2922 /* Note the user might start scaling the picture in the middle so this will not get executed
2923 this is not really intended but works currently, so ppl might do it */
2934 for(;dstY < dstH; dstY++){
2935 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2936 const int chrDstY= dstY>>c->chrDstVSubSample;
2937 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2938 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2940 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2941 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2942 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2943 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2945 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2946 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2947 //handle holes (FAST_BILINEAR & weird filters)
2948 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2949 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2950 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2951 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2952 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2954 // Do we have enough lines in this slice to output the dstY line
2955 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2957 //Do horizontal scaling
2958 while(lastInLumBuf < lastLumSrcY)
2960 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2962 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2963 ASSERT(lumBufIndex < 2*vLumBufSize)
2964 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2965 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2966 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2967 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2968 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2969 funnyYCode, c->srcFormat, formatConvBuffer,
2970 c->lumMmx2Filter, c->lumMmx2FilterPos);
2973 while(lastInChrBuf < lastChrSrcY)
2975 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2976 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2978 ASSERT(chrBufIndex < 2*vChrBufSize)
2979 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2980 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2981 //FIXME replace parameters through context struct (some at least)
2983 if(!(isGray(srcFormat) || isGray(dstFormat)))
2984 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2985 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2986 funnyUVCode, c->srcFormat, formatConvBuffer,
2987 c->chrMmx2Filter, c->chrMmx2FilterPos);
2990 //wrap buf index around to stay inside the ring buffer
2991 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2992 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2994 else // not enough lines left in this slice -> load the rest in the buffer
2996 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2997 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2998 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2999 vChrBufSize, vLumBufSize);*/
3001 //Do horizontal scaling
3002 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3004 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3006 ASSERT(lumBufIndex < 2*vLumBufSize)
3007 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3008 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3009 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3010 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3011 funnyYCode, c->srcFormat, formatConvBuffer,
3012 c->lumMmx2Filter, c->lumMmx2FilterPos);
3015 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3017 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3018 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3020 ASSERT(chrBufIndex < 2*vChrBufSize)
3021 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
3022 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3024 if(!(isGray(srcFormat) || isGray(dstFormat)))
3025 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3026 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3027 funnyUVCode, c->srcFormat, formatConvBuffer,
3028 c->chrMmx2Filter, c->chrMmx2FilterPos);
3031 //wrap buf index around to stay inside the ring buffer
3032 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
3033 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
3034 break; //we can't output a dstY line so let's try with the next slice
3038 b5Dither= dither8[dstY&1];
3039 g6Dither= dither4[dstY&1];
3040 g5Dither= dither8[dstY&1];
3041 r5Dither= dither8[(dstY+1)&1];
3045 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3046 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3049 if(flags & SWS_ACCURATE_RND){
3050 for(i=0; i<vLumFilterSize; i+=2){
3051 lumMmxFilter[2*i+0]= lumSrcPtr[i ];
3052 lumMmxFilter[2*i+1]= lumSrcPtr[i+(vLumFilterSize>1)];
3053 lumMmxFilter[2*i+2]=
3054 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ]
3055 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3057 for(i=0; i<vChrFilterSize; i+=2){
3058 chrMmxFilter[2*i+0]= chrSrcPtr[i ];
3059 chrMmxFilter[2*i+1]= chrSrcPtr[i+(vChrFilterSize>1)];
3060 chrMmxFilter[2*i+2]=
3061 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ]
3062 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3065 for(i=0; i<vLumFilterSize; i++)
3067 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3068 lumMmxFilter[4*i+2]=
3069 lumMmxFilter[4*i+3]=
3070 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3072 for(i=0; i<vChrFilterSize; i++)
3074 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3075 chrMmxFilter[4*i+2]=
3076 chrMmxFilter[4*i+3]=
3077 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3081 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
3082 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3083 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3084 RENAME(yuv2nv12X)(c,
3085 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3086 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3087 dest, uDest, dstW, chrDstW, dstFormat);
3089 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3091 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3092 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3093 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
3095 int16_t *lumBuf = lumPixBuf[0];
3096 int16_t *chrBuf= chrPixBuf[0];
3097 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3102 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3103 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3104 dest, uDest, vDest, dstW, chrDstW);
3109 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3110 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3111 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
3113 int chrAlpha= vChrFilter[2*dstY+1];
3114 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3115 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3117 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
3119 int lumAlpha= vLumFilter[2*dstY+1];
3120 int chrAlpha= vChrFilter[2*dstY+1];
3122 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3124 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3125 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3126 dest, dstW, lumAlpha, chrAlpha, dstY);
3130 RENAME(yuv2packedX)(c,
3131 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3132 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3137 else // hmm looks like we can't use MMX here without overwriting this array's tail
3139 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3140 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3141 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
3142 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3143 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3145 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3146 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3147 dest, uDest, dstW, chrDstW, dstFormat);
3149 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3151 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3152 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3154 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3155 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3156 dest, uDest, vDest, dstW, chrDstW);
3160 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3161 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3163 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3164 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3171 __asm __volatile(SFENCE:::"memory");
3172 __asm __volatile(EMMS:::"memory");
3174 /* store changed local vars back in the context */
3176 c->lumBufIndex= lumBufIndex;
3177 c->chrBufIndex= chrBufIndex;
3178 c->lastInLumBuf= lastInLumBuf;
3179 c->lastInChrBuf= lastInChrBuf;
3181 return dstY - lastDstY;