2 // Software scaling and colorspace conversion routines for MPlayer
4 // Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
5 // current version mostly by Michael Niedermayer (michaelni@gmx.at)
6 // the parts written by michael are under GNU GPL
16 /* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */
23 #define PREFETCH "prefetch"
24 #define PREFETCHW "prefetchw"
25 #elif defined ( HAVE_MMX2 )
26 #define PREFETCH "prefetchnta"
27 #define PREFETCHW "prefetcht0"
29 #define PREFETCH "/nop"
30 #define PREFETCHW "/nop"
34 #define SFENCE "sfence"
40 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
41 #elif defined (HAVE_3DNOW)
42 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
46 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
48 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
52 #define YSCALEYUV2YV12X(x) \
53 "xorl %%eax, %%eax \n\t"\
54 "pxor %%mm3, %%mm3 \n\t"\
55 "pxor %%mm4, %%mm4 \n\t"\
56 "movl %0, %%edx \n\t"\
57 ".balign 16 \n\t" /* FIXME Unroll? */\
59 "movl (%1, %%edx, 4), %%esi \n\t"\
60 "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
61 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
62 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
63 "pmulhw %%mm0, %%mm2 \n\t"\
64 "pmulhw %%mm0, %%mm5 \n\t"\
65 "paddw %%mm2, %%mm3 \n\t"\
66 "paddw %%mm5, %%mm4 \n\t"\
67 "addl $1, %%edx \n\t"\
69 "psraw $3, %%mm3 \n\t"\
70 "psraw $3, %%mm4 \n\t"\
71 "packuswb %%mm4, %%mm3 \n\t"\
72 MOVNTQ(%%mm3, (%3, %%eax))\
73 "addl $8, %%eax \n\t"\
74 "cmpl %4, %%eax \n\t"\
75 "pxor %%mm3, %%mm3 \n\t"\
76 "pxor %%mm4, %%mm4 \n\t"\
77 "movl %0, %%edx \n\t"\
80 #define YSCALEYUV2YV121 \
81 "movl %2, %%eax \n\t"\
82 ".balign 16 \n\t" /* FIXME Unroll? */\
84 "movq (%0, %%eax, 2), %%mm0 \n\t"\
85 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
86 "psraw $7, %%mm0 \n\t"\
87 "psraw $7, %%mm1 \n\t"\
88 "packuswb %%mm1, %%mm0 \n\t"\
89 MOVNTQ(%%mm0, (%1, %%eax))\
90 "addl $8, %%eax \n\t"\
94 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
95 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
96 "r" (dest), "m" (dstW),
97 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
98 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
100 #define YSCALEYUV2RGBX \
101 "xorl %%eax, %%eax \n\t"\
104 "movl %1, %%edx \n\t" /* -chrFilterSize */\
105 "movl %3, %%ebx \n\t" /* chrMmxFilter+lumFilterSize */\
106 "movl %7, %%ecx \n\t" /* chrSrc+lumFilterSize */\
107 "pxor %%mm3, %%mm3 \n\t"\
108 "pxor %%mm4, %%mm4 \n\t"\
110 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
111 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
112 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
113 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
114 "pmulhw %%mm0, %%mm2 \n\t"\
115 "pmulhw %%mm0, %%mm5 \n\t"\
116 "paddw %%mm2, %%mm3 \n\t"\
117 "paddw %%mm5, %%mm4 \n\t"\
118 "addl $1, %%edx \n\t"\
121 "movl %0, %%edx \n\t" /* -lumFilterSize */\
122 "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
123 "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
124 "pxor %%mm1, %%mm1 \n\t"\
125 "pxor %%mm7, %%mm7 \n\t"\
127 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
128 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
129 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
130 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
131 "pmulhw %%mm0, %%mm2 \n\t"\
132 "pmulhw %%mm0, %%mm5 \n\t"\
133 "paddw %%mm2, %%mm1 \n\t"\
134 "paddw %%mm5, %%mm7 \n\t"\
135 "addl $1, %%edx \n\t"\
138 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
139 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
140 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
141 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
142 "pmulhw ugCoeff, %%mm3 \n\t"\
143 "pmulhw vgCoeff, %%mm4 \n\t"\
144 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
145 "pmulhw ubCoeff, %%mm2 \n\t"\
146 "pmulhw vrCoeff, %%mm5 \n\t"\
147 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
148 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
149 "pmulhw yCoeff, %%mm1 \n\t"\
150 "pmulhw yCoeff, %%mm7 \n\t"\
151 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
152 "paddw %%mm3, %%mm4 \n\t"\
153 "movq %%mm2, %%mm0 \n\t"\
154 "movq %%mm5, %%mm6 \n\t"\
155 "movq %%mm4, %%mm3 \n\t"\
156 "punpcklwd %%mm2, %%mm2 \n\t"\
157 "punpcklwd %%mm5, %%mm5 \n\t"\
158 "punpcklwd %%mm4, %%mm4 \n\t"\
159 "paddw %%mm1, %%mm2 \n\t"\
160 "paddw %%mm1, %%mm5 \n\t"\
161 "paddw %%mm1, %%mm4 \n\t"\
162 "punpckhwd %%mm0, %%mm0 \n\t"\
163 "punpckhwd %%mm6, %%mm6 \n\t"\
164 "punpckhwd %%mm3, %%mm3 \n\t"\
165 "paddw %%mm7, %%mm0 \n\t"\
166 "paddw %%mm7, %%mm6 \n\t"\
167 "paddw %%mm7, %%mm3 \n\t"\
168 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
169 "packuswb %%mm0, %%mm2 \n\t"\
170 "packuswb %%mm6, %%mm5 \n\t"\
171 "packuswb %%mm3, %%mm4 \n\t"\
172 "pxor %%mm7, %%mm7 \n\t"
174 #define FULL_YSCALEYUV2RGB \
175 "pxor %%mm7, %%mm7 \n\t"\
176 "movd %6, %%mm6 \n\t" /*yalpha1*/\
177 "punpcklwd %%mm6, %%mm6 \n\t"\
178 "punpcklwd %%mm6, %%mm6 \n\t"\
179 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
180 "punpcklwd %%mm5, %%mm5 \n\t"\
181 "punpcklwd %%mm5, %%mm5 \n\t"\
182 "xorl %%eax, %%eax \n\t"\
185 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
186 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
187 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
188 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
189 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
190 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
191 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
192 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
193 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
194 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
195 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
196 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
197 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
198 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
199 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
200 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
201 "psubw w400, %%mm3 \n\t" /* 8(U-128)*/\
202 "pmulhw yCoeff, %%mm1 \n\t"\
205 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
206 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
207 "pmulhw ubCoeff, %%mm3 \n\t"\
208 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
209 "pmulhw ugCoeff, %%mm2 \n\t"\
210 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
211 "psubw w400, %%mm0 \n\t" /* (V-128)8*/\
214 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
215 "pmulhw vrCoeff, %%mm0 \n\t"\
216 "pmulhw vgCoeff, %%mm4 \n\t"\
217 "paddw %%mm1, %%mm3 \n\t" /* B*/\
218 "paddw %%mm1, %%mm0 \n\t" /* R*/\
219 "packuswb %%mm3, %%mm3 \n\t"\
221 "packuswb %%mm0, %%mm0 \n\t"\
222 "paddw %%mm4, %%mm2 \n\t"\
223 "paddw %%mm2, %%mm1 \n\t" /* G*/\
225 "packuswb %%mm1, %%mm1 \n\t"
227 #define YSCALEYUV2RGB \
228 "movd %6, %%mm6 \n\t" /*yalpha1*/\
229 "punpcklwd %%mm6, %%mm6 \n\t"\
230 "punpcklwd %%mm6, %%mm6 \n\t"\
231 "movq %%mm6, asm_yalpha1 \n\t"\
232 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
233 "punpcklwd %%mm5, %%mm5 \n\t"\
234 "punpcklwd %%mm5, %%mm5 \n\t"\
235 "movq %%mm5, asm_uvalpha1 \n\t"\
236 "xorl %%eax, %%eax \n\t"\
239 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
240 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
241 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
242 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
243 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
244 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
245 "movq asm_uvalpha1, %%mm0 \n\t"\
246 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
247 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
248 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
249 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
250 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
251 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
252 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
253 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
254 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
255 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
256 "pmulhw ugCoeff, %%mm3 \n\t"\
257 "pmulhw vgCoeff, %%mm4 \n\t"\
258 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
259 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
260 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
261 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
262 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
263 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
264 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
265 "pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
266 "pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
267 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
268 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
269 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
270 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
271 "pmulhw ubCoeff, %%mm2 \n\t"\
272 "pmulhw vrCoeff, %%mm5 \n\t"\
273 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
274 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
275 "pmulhw yCoeff, %%mm1 \n\t"\
276 "pmulhw yCoeff, %%mm7 \n\t"\
277 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
278 "paddw %%mm3, %%mm4 \n\t"\
279 "movq %%mm2, %%mm0 \n\t"\
280 "movq %%mm5, %%mm6 \n\t"\
281 "movq %%mm4, %%mm3 \n\t"\
282 "punpcklwd %%mm2, %%mm2 \n\t"\
283 "punpcklwd %%mm5, %%mm5 \n\t"\
284 "punpcklwd %%mm4, %%mm4 \n\t"\
285 "paddw %%mm1, %%mm2 \n\t"\
286 "paddw %%mm1, %%mm5 \n\t"\
287 "paddw %%mm1, %%mm4 \n\t"\
288 "punpckhwd %%mm0, %%mm0 \n\t"\
289 "punpckhwd %%mm6, %%mm6 \n\t"\
290 "punpckhwd %%mm3, %%mm3 \n\t"\
291 "paddw %%mm7, %%mm0 \n\t"\
292 "paddw %%mm7, %%mm6 \n\t"\
293 "paddw %%mm7, %%mm3 \n\t"\
294 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
295 "packuswb %%mm0, %%mm2 \n\t"\
296 "packuswb %%mm6, %%mm5 \n\t"\
297 "packuswb %%mm3, %%mm4 \n\t"\
298 "pxor %%mm7, %%mm7 \n\t"
300 #define YSCALEYUV2RGB1 \
301 "xorl %%eax, %%eax \n\t"\
304 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
305 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
306 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
307 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
308 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
309 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
310 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
311 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
312 "pmulhw ugCoeff, %%mm3 \n\t"\
313 "pmulhw vgCoeff, %%mm4 \n\t"\
314 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
316 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
317 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
318 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
319 "pmulhw ubCoeff, %%mm2 \n\t"\
320 "pmulhw vrCoeff, %%mm5 \n\t"\
321 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
322 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
323 "pmulhw yCoeff, %%mm1 \n\t"\
324 "pmulhw yCoeff, %%mm7 \n\t"\
325 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
326 "paddw %%mm3, %%mm4 \n\t"\
327 "movq %%mm2, %%mm0 \n\t"\
328 "movq %%mm5, %%mm6 \n\t"\
329 "movq %%mm4, %%mm3 \n\t"\
330 "punpcklwd %%mm2, %%mm2 \n\t"\
331 "punpcklwd %%mm5, %%mm5 \n\t"\
332 "punpcklwd %%mm4, %%mm4 \n\t"\
333 "paddw %%mm1, %%mm2 \n\t"\
334 "paddw %%mm1, %%mm5 \n\t"\
335 "paddw %%mm1, %%mm4 \n\t"\
336 "punpckhwd %%mm0, %%mm0 \n\t"\
337 "punpckhwd %%mm6, %%mm6 \n\t"\
338 "punpckhwd %%mm3, %%mm3 \n\t"\
339 "paddw %%mm7, %%mm0 \n\t"\
340 "paddw %%mm7, %%mm6 \n\t"\
341 "paddw %%mm7, %%mm3 \n\t"\
342 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
343 "packuswb %%mm0, %%mm2 \n\t"\
344 "packuswb %%mm6, %%mm5 \n\t"\
345 "packuswb %%mm3, %%mm4 \n\t"\
346 "pxor %%mm7, %%mm7 \n\t"
348 // do vertical chrominance interpolation
349 #define YSCALEYUV2RGB1b \
350 "xorl %%eax, %%eax \n\t"\
353 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
354 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
355 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
356 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
357 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
358 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
359 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
360 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
361 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
362 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
363 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
364 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
365 "pmulhw ugCoeff, %%mm3 \n\t"\
366 "pmulhw vgCoeff, %%mm4 \n\t"\
367 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
368 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
369 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
370 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
371 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
372 "pmulhw ubCoeff, %%mm2 \n\t"\
373 "pmulhw vrCoeff, %%mm5 \n\t"\
374 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
375 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
376 "pmulhw yCoeff, %%mm1 \n\t"\
377 "pmulhw yCoeff, %%mm7 \n\t"\
378 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
379 "paddw %%mm3, %%mm4 \n\t"\
380 "movq %%mm2, %%mm0 \n\t"\
381 "movq %%mm5, %%mm6 \n\t"\
382 "movq %%mm4, %%mm3 \n\t"\
383 "punpcklwd %%mm2, %%mm2 \n\t"\
384 "punpcklwd %%mm5, %%mm5 \n\t"\
385 "punpcklwd %%mm4, %%mm4 \n\t"\
386 "paddw %%mm1, %%mm2 \n\t"\
387 "paddw %%mm1, %%mm5 \n\t"\
388 "paddw %%mm1, %%mm4 \n\t"\
389 "punpckhwd %%mm0, %%mm0 \n\t"\
390 "punpckhwd %%mm6, %%mm6 \n\t"\
391 "punpckhwd %%mm3, %%mm3 \n\t"\
392 "paddw %%mm7, %%mm0 \n\t"\
393 "paddw %%mm7, %%mm6 \n\t"\
394 "paddw %%mm7, %%mm3 \n\t"\
395 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
396 "packuswb %%mm0, %%mm2 \n\t"\
397 "packuswb %%mm6, %%mm5 \n\t"\
398 "packuswb %%mm3, %%mm4 \n\t"\
399 "pxor %%mm7, %%mm7 \n\t"
402 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
403 "movq %%mm2, %%mm1 \n\t" /* B */\
404 "movq %%mm5, %%mm6 \n\t" /* R */\
405 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
406 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
407 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
408 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
409 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
410 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
411 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
412 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
413 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
414 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
416 MOVNTQ(%%mm0, (%4, %%eax, 4))\
417 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
418 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
419 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
421 "addl $8, %%eax \n\t"\
422 "cmpl %5, %%eax \n\t"\
426 "pand bF8, %%mm2 \n\t" /* B */\
427 "pand bFC, %%mm4 \n\t" /* G */\
428 "pand bF8, %%mm5 \n\t" /* R */\
429 "psrlq $3, %%mm2 \n\t"\
431 "movq %%mm2, %%mm1 \n\t"\
432 "movq %%mm4, %%mm3 \n\t"\
434 "punpcklbw %%mm7, %%mm3 \n\t"\
435 "punpcklbw %%mm5, %%mm2 \n\t"\
436 "punpckhbw %%mm7, %%mm4 \n\t"\
437 "punpckhbw %%mm5, %%mm1 \n\t"\
439 "psllq $3, %%mm3 \n\t"\
440 "psllq $3, %%mm4 \n\t"\
442 "por %%mm3, %%mm2 \n\t"\
443 "por %%mm4, %%mm1 \n\t"\
445 MOVNTQ(%%mm2, (%4, %%eax, 2))\
446 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
448 "addl $8, %%eax \n\t"\
449 "cmpl %5, %%eax \n\t"\
453 "pand bF8, %%mm2 \n\t" /* B */\
454 "pand bF8, %%mm4 \n\t" /* G */\
455 "pand bF8, %%mm5 \n\t" /* R */\
456 "psrlq $3, %%mm2 \n\t"\
457 "psrlq $1, %%mm5 \n\t"\
459 "movq %%mm2, %%mm1 \n\t"\
460 "movq %%mm4, %%mm3 \n\t"\
462 "punpcklbw %%mm7, %%mm3 \n\t"\
463 "punpcklbw %%mm5, %%mm2 \n\t"\
464 "punpckhbw %%mm7, %%mm4 \n\t"\
465 "punpckhbw %%mm5, %%mm1 \n\t"\
467 "psllq $2, %%mm3 \n\t"\
468 "psllq $2, %%mm4 \n\t"\
470 "por %%mm3, %%mm2 \n\t"\
471 "por %%mm4, %%mm1 \n\t"\
473 MOVNTQ(%%mm2, (%4, %%eax, 2))\
474 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
476 "addl $8, %%eax \n\t"\
477 "cmpl %5, %%eax \n\t"\
480 #define WRITEBGR24OLD \
481 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
482 "movq %%mm2, %%mm1 \n\t" /* B */\
483 "movq %%mm5, %%mm6 \n\t" /* R */\
484 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
485 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
486 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
487 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
488 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
489 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
490 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
491 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
492 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
493 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
495 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
496 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
497 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\
498 "pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\
499 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
500 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
501 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
502 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
504 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
505 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
506 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
507 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
508 "pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\
509 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
510 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
511 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\
512 "pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\
513 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
514 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
515 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
516 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
518 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
519 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
520 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
521 "pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\
522 "pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\
523 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
524 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
525 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
527 MOVNTQ(%%mm0, (%%ebx))\
528 MOVNTQ(%%mm2, 8(%%ebx))\
529 MOVNTQ(%%mm3, 16(%%ebx))\
530 "addl $24, %%ebx \n\t"\
532 "addl $8, %%eax \n\t"\
533 "cmpl %5, %%eax \n\t"\
536 #define WRITEBGR24MMX \
537 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
538 "movq %%mm2, %%mm1 \n\t" /* B */\
539 "movq %%mm5, %%mm6 \n\t" /* R */\
540 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
541 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
542 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
543 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
544 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
545 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
546 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
547 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
548 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
549 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
551 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
552 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
553 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
554 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
556 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
557 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
558 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
559 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
561 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
562 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
563 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
564 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
566 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
567 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
568 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
569 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
570 MOVNTQ(%%mm0, (%%ebx))\
572 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
573 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
574 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
575 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
576 MOVNTQ(%%mm6, 8(%%ebx))\
578 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
579 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
580 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
581 MOVNTQ(%%mm5, 16(%%ebx))\
583 "addl $24, %%ebx \n\t"\
585 "addl $8, %%eax \n\t"\
586 "cmpl %5, %%eax \n\t"\
589 #define WRITEBGR24MMX2 \
590 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
591 "movq M24A, %%mm0 \n\t"\
592 "movq M24C, %%mm7 \n\t"\
593 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
594 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
595 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
597 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
598 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
599 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
601 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
602 "por %%mm1, %%mm6 \n\t"\
603 "por %%mm3, %%mm6 \n\t"\
604 MOVNTQ(%%mm6, (%%ebx))\
606 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
607 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
608 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
609 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
611 "pand M24B, %%mm1 \n\t" /* B5 B4 B3 */\
612 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
613 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
615 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
616 "por %%mm3, %%mm6 \n\t"\
617 MOVNTQ(%%mm6, 8(%%ebx))\
619 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
620 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
621 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
623 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
624 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
625 "pand M24B, %%mm6 \n\t" /* R7 R6 R5 */\
627 "por %%mm1, %%mm3 \n\t"\
628 "por %%mm3, %%mm6 \n\t"\
629 MOVNTQ(%%mm6, 16(%%ebx))\
631 "addl $24, %%ebx \n\t"\
633 "addl $8, %%eax \n\t"\
634 "cmpl %5, %%eax \n\t"\
639 #define WRITEBGR24 WRITEBGR24MMX2
642 #define WRITEBGR24 WRITEBGR24MMX
645 static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
646 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
647 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
648 int16_t * lumMmxFilter, int16_t * chrMmxFilter)
655 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
656 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
657 : "%eax", "%edx", "%esi"
661 YSCALEYUV2YV12X(4096)
662 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
663 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
664 : "%eax", "%edx", "%esi"
670 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
671 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
672 : "%eax", "%edx", "%esi"
675 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
676 chrFilter, chrSrc, chrFilterSize,
677 dest, uDest, vDest, dstW);
681 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
682 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
689 :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
696 :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
704 :: "r" (lumSrc + dstW), "r" (dest + dstW),
709 //FIXME Optimize (just quickly written, not optimized)
710 //FIXME replace MINMAX with LUTs
712 for(i=0; i<dstW; i++)
714 int val= lumSrc[i]>>7;
716 dest[i]= MIN(MAX(val>>19, 0), 255);
720 for(i=0; i<(dstW>>1); i++)
723 int v=chrSrc[i + 2048]>>7;
725 uDest[i]= MIN(MAX(u>>19, 0), 255);
726 vDest[i]= MIN(MAX(v>>19, 0), 255);
733 * vertical scale YV12 to RGB
735 static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
736 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
737 uint8_t *dest, int dstW, int dstbpp, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
746 if(dstbpp == 32) //FIXME untested
752 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
753 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
754 "r" (dest), "m" (dstW),
755 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
756 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
759 else if(dstbpp==24) //FIXME untested
763 "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
764 "addl %4, %%ebx \n\t"
767 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
768 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
769 "r" (dest), "m" (dstW),
770 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
771 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
778 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
780 "paddusb b5Dither, %%mm2 \n\t"
781 "paddusb g5Dither, %%mm4 \n\t"
782 "paddusb r5Dither, %%mm5 \n\t"
787 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
788 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
789 "r" (dest), "m" (dstW),
790 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
791 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
798 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
800 "paddusb b5Dither, %%mm2 \n\t"
801 "paddusb g6Dither, %%mm4 \n\t"
802 "paddusb r5Dither, %%mm5 \n\t"
807 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
808 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
809 "r" (dest), "m" (dstW),
810 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
811 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
815 yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
816 chrFilter, chrSrc, chrFilterSize,
825 * vertical bilinear scale YV12 to RGB
827 static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
828 uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstbpp)
830 int yalpha1=yalpha^4095;
831 int uvalpha1=uvalpha^4095;
843 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
844 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
846 "movq %%mm3, %%mm1 \n\t"
847 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
848 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
850 MOVNTQ(%%mm3, (%4, %%eax, 4))
851 MOVNTQ(%%mm1, 8(%4, %%eax, 4))
853 "addl $4, %%eax \n\t"
854 "cmpl %5, %%eax \n\t"
858 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
859 "m" (yalpha1), "m" (uvalpha1)
870 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
871 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
873 "movq %%mm3, %%mm1 \n\t"
874 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
875 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
877 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
878 "psrlq $8, %%mm3 \n\t" // GR0BGR00
879 "pand bm00000111, %%mm2 \n\t" // BGR00000
880 "pand bm11111000, %%mm3 \n\t" // 000BGR00
881 "por %%mm2, %%mm3 \n\t" // BGRBGR00
882 "movq %%mm1, %%mm2 \n\t"
883 "psllq $48, %%mm1 \n\t" // 000000BG
884 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
886 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
887 "psrld $16, %%mm2 \n\t" // R000R000
888 "psrlq $24, %%mm1 \n\t" // 0BGR0000
889 "por %%mm2, %%mm1 \n\t" // RBGRR000
891 "movl %4, %%ebx \n\t"
892 "addl %%eax, %%ebx \n\t"
896 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
897 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
899 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
900 "psrlq $32, %%mm3 \n\t"
901 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
902 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
904 "addl $4, %%eax \n\t"
905 "cmpl %5, %%eax \n\t"
908 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
909 "m" (yalpha1), "m" (uvalpha1)
919 "paddusb g5Dither, %%mm1 \n\t"
920 "paddusb r5Dither, %%mm0 \n\t"
921 "paddusb b5Dither, %%mm3 \n\t"
923 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
924 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
925 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
927 "psrlw $3, %%mm3 \n\t"
928 "psllw $2, %%mm1 \n\t"
929 "psllw $7, %%mm0 \n\t"
930 "pand g15Mask, %%mm1 \n\t"
931 "pand r15Mask, %%mm0 \n\t"
933 "por %%mm3, %%mm1 \n\t"
934 "por %%mm1, %%mm0 \n\t"
936 MOVNTQ(%%mm0, (%4, %%eax, 2))
938 "addl $4, %%eax \n\t"
939 "cmpl %5, %%eax \n\t"
942 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
943 "m" (yalpha1), "m" (uvalpha1)
953 "paddusb g6Dither, %%mm1 \n\t"
954 "paddusb r5Dither, %%mm0 \n\t"
955 "paddusb b5Dither, %%mm3 \n\t"
957 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
958 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
959 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
961 "psrlw $3, %%mm3 \n\t"
962 "psllw $3, %%mm1 \n\t"
963 "psllw $8, %%mm0 \n\t"
964 "pand g16Mask, %%mm1 \n\t"
965 "pand r16Mask, %%mm0 \n\t"
967 "por %%mm3, %%mm1 \n\t"
968 "por %%mm1, %%mm0 \n\t"
970 MOVNTQ(%%mm0, (%4, %%eax, 2))
972 "addl $4, %%eax \n\t"
973 "cmpl %5, %%eax \n\t"
976 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
977 "m" (yalpha1), "m" (uvalpha1)
982 if(dstbpp==32 || dstbpp==24)
986 // vertical linear interpolation && yuv2rgb in a single step:
987 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
988 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
989 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
990 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
991 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
992 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1000 // vertical linear interpolation && yuv2rgb in a single step:
1001 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1002 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1003 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1005 ((uint16_t*)dest)[i] =
1006 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1007 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1008 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1014 for(i=0;i<dstW;i++){
1015 // vertical linear interpolation && yuv2rgb in a single step:
1016 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1017 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1018 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1020 ((uint16_t*)dest)[i] =
1021 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1022 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1023 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1037 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1038 "m" (yalpha1), "m" (uvalpha1)
1045 "movl %4, %%ebx \n\t"
1049 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1050 "m" (yalpha1), "m" (uvalpha1)
1058 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1060 "paddusb b5Dither, %%mm2 \n\t"
1061 "paddusb g5Dither, %%mm4 \n\t"
1062 "paddusb r5Dither, %%mm5 \n\t"
1067 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1068 "m" (yalpha1), "m" (uvalpha1)
1076 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1078 "paddusb b5Dither, %%mm2 \n\t"
1079 "paddusb g6Dither, %%mm4 \n\t"
1080 "paddusb r5Dither, %%mm5 \n\t"
1085 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1086 "m" (yalpha1), "m" (uvalpha1)
1094 for(i=0; i<dstW-1; i+=2){
1095 // vertical linear interpolation && yuv2rgb in a single step:
1096 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1097 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1098 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1099 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1101 int Cb= yuvtab_40cf[U];
1102 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1103 int Cr= yuvtab_3343[V];
1105 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1106 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1107 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1109 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1110 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1111 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1117 for(i=0; i<dstW-1; i+=2){
1118 // vertical linear interpolation && yuv2rgb in a single step:
1119 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1120 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1121 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1122 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1124 int Cb= yuvtab_40cf[U];
1125 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1126 int Cr= yuvtab_3343[V];
1128 dest[0]=clip_table[((Y1 + Cb) >>13)];
1129 dest[1]=clip_table[((Y1 + Cg) >>13)];
1130 dest[2]=clip_table[((Y1 + Cr) >>13)];
1132 dest[3]=clip_table[((Y2 + Cb) >>13)];
1133 dest[4]=clip_table[((Y2 + Cg) >>13)];
1134 dest[5]=clip_table[((Y2 + Cr) >>13)];
1141 for(i=0; i<dstW-1; i+=2){
1142 // vertical linear interpolation && yuv2rgb in a single step:
1143 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1144 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1145 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1146 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1148 int Cb= yuvtab_40cf[U];
1149 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1150 int Cr= yuvtab_3343[V];
1152 ((uint16_t*)dest)[i] =
1153 clip_table16b[(Y1 + Cb) >>13] |
1154 clip_table16g[(Y1 + Cg) >>13] |
1155 clip_table16r[(Y1 + Cr) >>13];
1157 ((uint16_t*)dest)[i+1] =
1158 clip_table16b[(Y2 + Cb) >>13] |
1159 clip_table16g[(Y2 + Cg) >>13] |
1160 clip_table16r[(Y2 + Cr) >>13];
1166 for(i=0; i<dstW-1; i+=2){
1167 // vertical linear interpolation && yuv2rgb in a single step:
1168 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1169 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1170 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1171 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1173 int Cb= yuvtab_40cf[U];
1174 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1175 int Cr= yuvtab_3343[V];
1177 ((uint16_t*)dest)[i] =
1178 clip_table15b[(Y1 + Cb) >>13] |
1179 clip_table15g[(Y1 + Cg) >>13] |
1180 clip_table15r[(Y1 + Cr) >>13];
1182 ((uint16_t*)dest)[i+1] =
1183 clip_table15b[(Y2 + Cb) >>13] |
1184 clip_table15g[(Y2 + Cg) >>13] |
1185 clip_table15r[(Y2 + Cr) >>13];
// NOTE(review): this is an elided numbered listing — preprocessor
// conditionals, braces and portions of the asm statements between the
// numbered lines are not visible here; comments only describe what is shown.
//
// yuv2rgb1: convert one line of YV12 to packed RGB with NO vertical luma
// interpolation (yalpha1 is forced to 0); chroma may still be blended
// between uvbuf0/uvbuf1 via uvalpha. dstbpp selects the output format.
1193 * YV12 to RGB without scaling or interpolating
1195 static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1196 uint8_t *dest, int dstW, int uvalpha, int dstbpp)
// uvalpha1 is the complementary chroma weight so the two line weights sum
// to ~4096 (12-bit blend) — presumably; confirm against the callers.
1198 int uvalpha1=uvalpha^4095;
1199 const int yalpha1=0;
// Full chroma interpolation requested: delegate to the 2-line blender,
// passing buf0 twice so luma is effectively un-interpolated.
1201 if(fullUVIpol || allwaysIpol)
1203 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp);
// Fast path: use only the nearer chroma line instead of blending.
1208 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
1215 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1216 "m" (yalpha1), "m" (uvalpha1)
1223 "movl %4, %%ebx \n\t"
1226 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1227 "m" (yalpha1), "m" (uvalpha1)
// Dithered 15bpp packing (b5/g5/r5 dither tables) — the surrounding
// #ifdef DITHER1XBPP block is elided from this listing.
1235 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1237 "paddusb b5Dither, %%mm2 \n\t"
1238 "paddusb g5Dither, %%mm4 \n\t"
1239 "paddusb r5Dither, %%mm5 \n\t"
1242 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1243 "m" (yalpha1), "m" (uvalpha1)
// Dithered 16bpp packing: note the 6-bit green dither table (g6Dither).
1251 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1253 "paddusb b5Dither, %%mm2 \n\t"
1254 "paddusb g6Dither, %%mm4 \n\t"
1255 "paddusb r5Dither, %%mm5 \n\t"
1259 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1260 "m" (yalpha1), "m" (uvalpha1)
// Second branch (uvalpha >= 2048): the elided asm presumably averages the
// two chroma lines; the operand lists below mirror the fast path.
1272 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1273 "m" (yalpha1), "m" (uvalpha1)
1280 "movl %4, %%ebx \n\t"
1283 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1284 "m" (yalpha1), "m" (uvalpha1)
1292 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1294 "paddusb b5Dither, %%mm2 \n\t"
1295 "paddusb g5Dither, %%mm4 \n\t"
1296 "paddusb r5Dither, %%mm5 \n\t"
1299 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1300 "m" (yalpha1), "m" (uvalpha1)
1308 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1310 "paddusb b5Dither, %%mm2 \n\t"
1311 "paddusb g6Dither, %%mm4 \n\t"
1312 "paddusb r5Dither, %%mm5 \n\t"
1316 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1317 "m" (yalpha1), "m" (uvalpha1)
1323 //FIXME write 2 versions (for even & odd lines)
// ---- C fallback, 4 bytes/pixel (BGR32). Luma is taken straight from
// buf0 (>>7 undoes the 15-bit intermediate scale); chroma is still blended.
// Note dest[4*i+3] / dest[4*i+7] (the 4th byte) are never written.
1328 for(i=0; i<dstW-1; i+=2){
1329 // vertical linear interpolation && yuv2rgb in a single step:
1330 int Y1=yuvtab_2568[buf0[i]>>7];
1331 int Y2=yuvtab_2568[buf0[i+1]>>7];
1332 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1333 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1335 int Cb= yuvtab_40cf[U];
1336 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1337 int Cr= yuvtab_3343[V];
1339 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1340 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1341 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1343 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1344 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1345 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
// ---- C fallback, 3 bytes/pixel (BGR24); the `dest += 6` style advance
// between iterations is elided from this listing.
1351 for(i=0; i<dstW-1; i+=2){
1352 // vertical linear interpolation && yuv2rgb in a single step:
1353 int Y1=yuvtab_2568[buf0[i]>>7];
1354 int Y2=yuvtab_2568[buf0[i+1]>>7];
1355 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1356 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1358 int Cb= yuvtab_40cf[U];
1359 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1360 int Cr= yuvtab_3343[V];
1362 dest[0]=clip_table[((Y1 + Cb) >>13)];
1363 dest[1]=clip_table[((Y1 + Cg) >>13)];
1364 dest[2]=clip_table[((Y1 + Cr) >>13)];
1366 dest[3]=clip_table[((Y2 + Cb) >>13)];
1367 dest[4]=clip_table[((Y2 + Cg) >>13)];
1368 dest[5]=clip_table[((Y2 + Cr) >>13)];
// ---- C fallback, 16bpp: the pre-shifted/masked clip_table16{b,g,r}
// entries are OR'd into a 5-6-5 pixel.
1375 for(i=0; i<dstW-1; i+=2){
1376 // vertical linear interpolation && yuv2rgb in a single step:
1377 int Y1=yuvtab_2568[buf0[i]>>7];
1378 int Y2=yuvtab_2568[buf0[i+1]>>7];
1379 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1380 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1382 int Cb= yuvtab_40cf[U];
1383 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1384 int Cr= yuvtab_3343[V];
1386 ((uint16_t*)dest)[i] =
1387 clip_table16b[(Y1 + Cb) >>13] |
1388 clip_table16g[(Y1 + Cg) >>13] |
1389 clip_table16r[(Y1 + Cr) >>13];
1391 ((uint16_t*)dest)[i+1] =
1392 clip_table16b[(Y2 + Cb) >>13] |
1393 clip_table16g[(Y2 + Cg) >>13] |
1394 clip_table16r[(Y2 + Cr) >>13];
// ---- C fallback, 15bpp: same scheme with the 5-5-5 clip tables.
1400 for(i=0; i<dstW-1; i+=2){
1401 // vertical linear interpolation && yuv2rgb in a single step:
1402 int Y1=yuvtab_2568[buf0[i]>>7];
1403 int Y2=yuvtab_2568[buf0[i+1]>>7];
1404 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1405 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1407 int Cb= yuvtab_40cf[U];
1408 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1409 int Cr= yuvtab_3343[V];
1411 ((uint16_t*)dest)[i] =
1412 clip_table15b[(Y1 + Cb) >>13] |
1413 clip_table15g[(Y1 + Cg) >>13] |
1414 clip_table15r[(Y1 + Cr) >>13];
1416 ((uint16_t*)dest)[i+1] =
1417 clip_table15b[(Y2 + Cb) >>13] |
1418 clip_table15g[(Y2 + Cg) >>13] |
1419 clip_table15r[(Y2 + Cr) >>13];
// hScale: generic horizontal polyphase scaler. For each output pixel i it
// reads filterSize source bytes starting at filterPos[i], multiplies by the
// 16-bit coefficients in filter[], and stores a clipped 15-bit result in
// dst[]. Specialized MMX paths exist for 4- and 8-tap filters; an n-tap MMX
// loop and a plain C loop handle everything else (the #ifdef structure
// selecting among them is elided from this listing).
1425 // Bilinear / Bicubic scaling
1426 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1427 int16_t *filter, int16_t *filterPos, int filterSize)
1430 if(filterSize==4) // always true for upscaling, sometimes for down too
// counter runs from -2*dstW up to 0 so the loop can test with a single
// add/jnc; filterPos is rebased so it can be indexed by the counter.
1432 int counter= -2*dstW;
1434 filterPos-= counter/2;
1437 "pxor %%mm7, %%mm7 \n\t"
1438 "movq w02, %%mm6 \n\t"
// %ebp is used as the loop counter, so it must be saved/restored by hand.
1439 "pushl %%ebp \n\t" // we use 7 regs here ...
1440 "movl %%eax, %%ebp \n\t"
1443 "movzwl (%2, %%ebp), %%eax \n\t"
1444 "movzwl 2(%2, %%ebp), %%ebx \n\t"
1445 "movq (%1, %%ebp, 4), %%mm1 \n\t"
1446 "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
1447 "movd (%3, %%eax), %%mm0 \n\t"
1448 "movd (%3, %%ebx), %%mm2 \n\t"
// Widen 4 source bytes to words, then 4-tap multiply-accumulate per pixel.
1449 "punpcklbw %%mm7, %%mm0 \n\t"
1450 "punpcklbw %%mm7, %%mm2 \n\t"
1451 "pmaddwd %%mm1, %%mm0 \n\t"
1452 "pmaddwd %%mm2, %%mm3 \n\t"
// Drop 8 fraction bits, pack the two dword sums, finish the horizontal
// add with w02, and store two 16-bit results.
1453 "psrad $8, %%mm0 \n\t"
1454 "psrad $8, %%mm3 \n\t"
1455 "packssdw %%mm3, %%mm0 \n\t"
1456 "pmaddwd %%mm6, %%mm0 \n\t"
1457 "packssdw %%mm0, %%mm0 \n\t"
1458 "movd %%mm0, (%4, %%ebp) \n\t"
1459 "addl $4, %%ebp \n\t"
1464 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1468 else if(filterSize==8)
1470 int counter= -2*dstW;
1472 filterPos-= counter/2;
1475 "pxor %%mm7, %%mm7 \n\t"
1476 "movq w02, %%mm6 \n\t"
1477 "pushl %%ebp \n\t" // we use 7 regs here ...
1478 "movl %%eax, %%ebp \n\t"
1481 "movzwl (%2, %%ebp), %%eax \n\t"
1482 "movzwl 2(%2, %%ebp), %%ebx \n\t"
// First 4 of the 8 taps for both output pixels ...
1483 "movq (%1, %%ebp, 8), %%mm1 \n\t"
1484 "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
1485 "movd (%3, %%eax), %%mm0 \n\t"
1486 "movd (%3, %%ebx), %%mm2 \n\t"
1487 "punpcklbw %%mm7, %%mm0 \n\t"
1488 "punpcklbw %%mm7, %%mm2 \n\t"
1489 "pmaddwd %%mm1, %%mm0 \n\t"
1490 "pmaddwd %%mm2, %%mm3 \n\t"
// ... then the remaining 4 taps, accumulated into mm0/mm3.
1492 "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
1493 "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
1494 "movd 4(%3, %%eax), %%mm4 \n\t"
1495 "movd 4(%3, %%ebx), %%mm2 \n\t"
1496 "punpcklbw %%mm7, %%mm4 \n\t"
1497 "punpcklbw %%mm7, %%mm2 \n\t"
1498 "pmaddwd %%mm1, %%mm4 \n\t"
1499 "pmaddwd %%mm2, %%mm5 \n\t"
1500 "paddd %%mm4, %%mm0 \n\t"
1501 "paddd %%mm5, %%mm3 \n\t"
1503 "psrad $8, %%mm0 \n\t"
1504 "psrad $8, %%mm3 \n\t"
1505 "packssdw %%mm3, %%mm0 \n\t"
1506 "pmaddwd %%mm6, %%mm0 \n\t"
1507 "packssdw %%mm0, %%mm0 \n\t"
1508 "movd %%mm0, (%4, %%ebp) \n\t"
1509 "addl $4, %%ebp \n\t"
1514 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// n-tap MMX path: inner loop walks the source 4 bytes at a time until
// src+filterSize is reached, accumulating into mm4/mm5.
1520 int counter= -2*dstW;
1521 // filter-= counter*filterSize/2;
1522 filterPos-= counter/2;
1525 "pxor %%mm7, %%mm7 \n\t"
1526 "movq w02, %%mm6 \n\t"
1529 "movl %2, %%ecx \n\t"
1530 "movzwl (%%ecx, %0), %%eax \n\t"
1531 "movzwl 2(%%ecx, %0), %%ebx \n\t"
1532 "movl %5, %%ecx \n\t"
1533 "pxor %%mm4, %%mm4 \n\t"
1534 "pxor %%mm5, %%mm5 \n\t"
1536 "movq (%1), %%mm1 \n\t"
1537 "movq (%1, %6), %%mm3 \n\t"
1538 "movd (%%ecx, %%eax), %%mm0 \n\t"
1539 "movd (%%ecx, %%ebx), %%mm2 \n\t"
1540 "punpcklbw %%mm7, %%mm0 \n\t"
1541 "punpcklbw %%mm7, %%mm2 \n\t"
1542 "pmaddwd %%mm1, %%mm0 \n\t"
1543 "pmaddwd %%mm2, %%mm3 \n\t"
1544 "paddd %%mm3, %%mm5 \n\t"
1545 "paddd %%mm0, %%mm4 \n\t"
1547 "addl $4, %%ecx \n\t"
1548 "cmpl %4, %%ecx \n\t"
1551 "psrad $8, %%mm4 \n\t"
1552 "psrad $8, %%mm5 \n\t"
1553 "packssdw %%mm5, %%mm4 \n\t"
1554 "pmaddwd %%mm6, %%mm4 \n\t"
1555 "packssdw %%mm4, %%mm4 \n\t"
1556 "movl %3, %%eax \n\t"
1557 "movd %%mm4, (%%eax, %0) \n\t"
1561 : "+r" (counter), "+r" (filter)
1562 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
1563 "m" (src), "r" (filterSize*2)
1564 : "%ebx", "%eax", "%ecx"
// Plain C fallback: straightforward dot product per output pixel,
// clipped to [0, 32767] (15-bit unsigned range).
1569 for(i=0; i<dstW; i++)
1572 int srcPos= filterPos[i];
1574 // printf("filterPos: %d\n", filterPos[i]);
1575 for(j=0; j<filterSize; j++)
1577 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
1578 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1580 // filter += hFilterSize;
1581 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
// hyscale: horizontally scale one luma line from src (srcW bytes) into
// dst (dstWidth 16-bit samples, 7 extra fraction bits). Three strategies:
// the generic hScale (non-fast-bilinear or when MMX2 is unusable), the
// MMX2 "funny code" runtime-generated scaler, and a bare x86 asm / C
// bilinear loop. The #ifdef selection between them is elided here.
1586 // *** horizontal scale Y line to temp buffer
1587 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc)
1590 // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
1591 if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
1593 if(sws_flags != SWS_FAST_BILINEAR)
1596 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
1598 else // Fast Bilinear upscale / crap downscale
// MMX2 path: build mm2 = (0, t, 2t, 3t) with t = xInc&0xFFFF — the four
// subpixel phases processed per iteration — and park it in temp0 so the
// generated code can reload it.
1606 "pxor %%mm7, %%mm7 \n\t"
1607 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
1608 "movd %5, %%mm6 \n\t" // xInc&0xFFFF
1609 "punpcklwd %%mm6, %%mm6 \n\t"
1610 "punpcklwd %%mm6, %%mm6 \n\t"
1611 "movq %%mm6, %%mm2 \n\t"
1612 "psllq $16, %%mm2 \n\t"
1613 "paddw %%mm6, %%mm2 \n\t"
1614 "psllq $16, %%mm2 \n\t"
1615 "paddw %%mm6, %%mm2 \n\t"
1616 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
1617 "movq %%mm2, temp0 \n\t"
1618 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
1619 "punpcklwd %%mm6, %%mm6 \n\t"
1620 "punpcklwd %%mm6, %%mm6 \n\t"
1621 "xorl %%eax, %%eax \n\t" // i
1622 "movl %0, %%esi \n\t" // src
1623 "movl %1, %%edi \n\t" // buf1
1624 "movl %3, %%edx \n\t" // (xInc*4)>>16
1625 "xorl %%ecx, %%ecx \n\t"
1626 "xorl %%ebx, %%ebx \n\t"
1627 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
// Prefetch ahead, then jump into the generated scaler fragment chain;
// mm2 (the phase vector) and ecx are reset for the next call.
1629 #define FUNNY_Y_CODE \
1630 PREFETCH" 1024(%%esi) \n\t"\
1631 PREFETCH" 1056(%%esi) \n\t"\
1632 PREFETCH" 1088(%%esi) \n\t"\
1633 "call funnyYCode \n\t"\
1634 "movq temp0, %%mm2 \n\t"\
1635 "xorl %%ecx, %%ecx \n\t"
1646 :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
1647 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
1648 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
// Edge fixup: output pixels that would read past src[srcW-1] get the
// replicated last sample (<<7 matches the 7-bit fraction of dst).
1650 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
// Plain x86 path: fixed-point DDA — cx holds the subpixel phase,
// ebx the integer source position (advanced via the carry flag).
1655 //NO MMX just normal asm ...
1657 "xorl %%eax, %%eax \n\t" // i
1658 "xorl %%ebx, %%ebx \n\t" // xx
1659 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
1662 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
1663 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
1664 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1665 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1666 "shll $16, %%edi \n\t"
1667 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1668 "movl %1, %%edi \n\t"
1669 "shrl $9, %%esi \n\t"
1670 "movw %%si, (%%edi, %%eax, 2) \n\t"
1671 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
1672 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
// Loop body is unrolled x2 (second copy below writes dst offset +2).
1674 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
1675 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
1676 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1677 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1678 "shll $16, %%edi \n\t"
1679 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1680 "movl %1, %%edi \n\t"
1681 "shrl $9, %%esi \n\t"
1682 "movw %%si, 2(%%edi, %%eax, 2) \n\t"
1683 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
1684 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
1687 "addl $2, %%eax \n\t"
1688 "cmpl %2, %%eax \n\t"
1692 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
1693 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
1696 } //if MMX2 cant be used
// Portable C bilinear: 16.16 fixed-point position, 7-bit blend weight.
1700 unsigned int xpos=0;
1701 for(i=0;i<dstWidth;i++)
1703 register unsigned int xx=xpos>>16;
1704 register unsigned int xalpha=(xpos&0xFFFF)>>9;
1705 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
// hcscale: horizontally scale one chroma line pair. U results go to
// dst[0..dstWidth-1], V results to dst[2048..] (the two planes share one
// buffer with a fixed 2048-sample offset). Same three strategies as
// hyscale; the #ifdef selection between them is elided from this listing.
1712 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth,
1713 uint8_t *src1, uint8_t *src2, int srcW, int xInc)
1716 // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
1717 if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
1719 if(sws_flags != SWS_FAST_BILINEAR)
// U and V are scaled independently with the same chroma filter.
1722 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1723 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1725 else // Fast Bilinear upscale / crap downscale
// MMX2 path: identical phase-vector setup to hyscale (see there).
1733 "pxor %%mm7, %%mm7 \n\t"
1734 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
1735 "movd %5, %%mm6 \n\t" // xInc&0xFFFF
1736 "punpcklwd %%mm6, %%mm6 \n\t"
1737 "punpcklwd %%mm6, %%mm6 \n\t"
1738 "movq %%mm6, %%mm2 \n\t"
1739 "psllq $16, %%mm2 \n\t"
1740 "paddw %%mm6, %%mm2 \n\t"
1741 "psllq $16, %%mm2 \n\t"
1742 "paddw %%mm6, %%mm2 \n\t"
1743 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
1744 "movq %%mm2, temp0 \n\t"
1745 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
1746 "punpcklwd %%mm6, %%mm6 \n\t"
1747 "punpcklwd %%mm6, %%mm6 \n\t"
1748 "xorl %%eax, %%eax \n\t" // i
1749 "movl %0, %%esi \n\t" // src
1750 "movl %1, %%edi \n\t" // buf1
1751 "movl %3, %%edx \n\t" // (xInc*4)>>16
1752 "xorl %%ecx, %%ecx \n\t"
1753 "xorl %%ebx, %%ebx \n\t"
1754 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
1756 #define FUNNYUVCODE \
1757 PREFETCH" 1024(%%esi) \n\t"\
1758 PREFETCH" 1056(%%esi) \n\t"\
1759 PREFETCH" 1088(%%esi) \n\t"\
1760 "call funnyUVCode \n\t"\
1761 "movq temp0, %%mm2 \n\t"\
1762 "xorl %%ecx, %%ecx \n\t"
// Second pass over the V plane: same generated code, dst offset by
// 4096 bytes (= 2048 uint16 samples).
1773 "xorl %%eax, %%eax \n\t" // i
1774 "movl %6, %%esi \n\t" // src
1775 "movl %1, %%edi \n\t" // buf1
1776 "addl $4096, %%edi \n\t"
1788 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
1789 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
1790 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
// Edge fixup, same rationale as in hyscale, applied to both planes.
1792 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
1794 // printf("%d %d %d\n", dstWidth, i, srcW);
1795 dst[i] = src1[srcW-1]*128;
1796 dst[i+2048] = src2[srcW-1]*128;
// Plain x86 path: interleaves U (writes at dst) and V (writes at
// dst+4096 bytes) in one DDA loop; phase/position advance once per pixel.
1803 "xorl %%eax, %%eax \n\t" // i
1804 "xorl %%ebx, %%ebx \n\t" // xx
1805 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
1808 "movl %0, %%esi \n\t"
1809 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
1810 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
1811 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1812 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1813 "shll $16, %%edi \n\t"
1814 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1815 "movl %1, %%edi \n\t"
1816 "shrl $9, %%esi \n\t"
1817 "movw %%si, (%%edi, %%eax, 2) \n\t"
1819 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
1820 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
1821 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1822 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1823 "shll $16, %%edi \n\t"
1824 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1825 "movl %1, %%edi \n\t"
1826 "shrl $9, %%esi \n\t"
1827 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
1829 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
1830 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
1831 "addl $1, %%eax \n\t"
1832 "cmpl %2, %%eax \n\t"
1835 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
1837 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
1840 } //if MMX2 cant be used
// Portable C bilinear for both planes. Two variants are visible here; an
// elided #ifdef presumably selects between them (note the first pair uses
// a 7-bit weight with xalpha^127 rather than the <<7 form).
1844 unsigned int xpos=0;
1845 for(i=0;i<dstWidth;i++)
1847 register unsigned int xx=xpos>>16;
1848 register unsigned int xalpha=(xpos&0xFFFF)>>9;
1849 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1850 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1852 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
1853 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
// initFilter: build the polyphase filter bank for one scaling direction.
// Coefficients are first computed as doubles in filter[], then bounds-fixed
// against [0, srcW), normalized so each tap set sums to `one`, and stored
// as int16 in dstFilter[]. filterPos[i] is the first source sample used by
// output sample i; *filterSize is rounded up to a multiple of filterAlign.
1861 static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
1862 int srcW, int dstW, int filterAlign, int one)
// NOTE(review): fixed-size scratch buffer — dstW*(*filterSize) must stay
// below 8000 or this overflows; presumably guaranteed by the callers.
1865 double filter[8000];
1867 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldn't be required but it IS (even for non mmx versions)
// Case 1: (almost) unity scale — identity filter, a single 1.0 tap.
1870 if(ABS(xInc - 0x10000) <10) // unscaled
1873 *filterSize= (1 +(filterAlign-1)) & (~(filterAlign-1)); // 1 or 4 normally
1874 for(i=0; i<dstW*(*filterSize); i++) filter[i]=0;
1876 for(i=0; i<dstW; i++)
1878 filter[i*(*filterSize)]=1;
// Case 2: upscale — 4-tap bicubic or 2-tap bilinear kernel, evaluated at
// the subpixel distance d of each output sample from its source window.
1883 else if(xInc <= (1<<16) || sws_flags==SWS_FAST_BILINEAR) // upscale
1887 if(sws_flags==SWS_BICUBIC) *filterSize= 4;
1888 else *filterSize= 2;
1889 // printf("%d %d %d\n", filterSize, srcW, dstW);
1890 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
// xDstInSrc: position of output sample i in 16.16 source coordinates,
// offset by half a step so sample centers line up.
1892 xDstInSrc= xInc/2 - 0x8000;
1893 for(i=0; i<dstW; i++)
1895 int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1;
1899 if(sws_flags == SWS_BICUBIC)
1901 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
1904 // Equation is from VirtualDub
1905 y1 = ( + A*d - 2.0*A*d*d + A*d*d*d);
1906 y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
1907 y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
1908 y4 = ( + A*d*d - A*d*d*d);
1910 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1911 filter[i*(*filterSize) + 0]= y1;
1912 filter[i*(*filterSize) + 1]= y2;
1913 filter[i*(*filterSize) + 2]= y3;
1914 filter[i*(*filterSize) + 3]= y4;
1915 // printf("%1.3f %d, %d, %d, %d\n",d , y1, y2, y3, y4);
// Bilinear: triangle kernel, clamped at 0 for taps outside the support.
1919 for(j=0; j<*filterSize; j++)
1921 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
1922 double coeff= 1.0 - d;
1923 if(coeff<0) coeff=0;
1924 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1925 filter[i*(*filterSize) + j]= coeff;
// Case 3 (elided else): downscale — the kernel support widens with the
// scale ratio, so filterSize grows proportionally to srcW/dstW.
1935 if(sws_flags==SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
1936 else *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
1937 // printf("%d %d %d\n", *filterSize, srcW, dstW);
1938 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
1940 xDstInSrc= xInc/2 - 0x8000;
1941 for(i=0; i<dstW; i++)
1943 int xx= (int)((double)xDstInSrc/(double)(1<<16) - ((*filterSize)-1)*0.5 + 0.5);
1947 for(j=0; j<*filterSize; j++)
// Note: d is normalized by xInc here (not 1<<16), stretching the kernel
// over the wider downscale support.
1949 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
1951 if(sws_flags == SWS_BICUBIC)
1955 // Equation is from VirtualDub
1957 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
1959 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
1966 if(coeff<0) coeff=0;
1968 // if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1969 filter[i*(*filterSize) + j]= coeff;
// Bounds fixup: fold out-of-range taps onto the nearest in-range tap so
// the scaler never reads outside [0, srcW).
1977 for(i=0; i<dstW; i++)
1980 if(filterPos[i] < 0)
1982 // Move filter coeffs left to compensate for filterPos
1983 for(j=1; j<*filterSize; j++)
1985 int left= MAX(j + filterPos[i], 0);
1986 filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j];
1987 filter[i*(*filterSize) + j]=0;
1992 if(filterPos[i] + (*filterSize) > srcW)
1994 int shift= filterPos[i] + (*filterSize) - srcW;
1995 // Move filter coeffs right to compensate for filterPos
1996 for(j=(*filterSize)-2; j>=0; j--)
1998 int right= MIN(j + shift, (*filterSize)-1);
1999 filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
2000 filter[i*(*filterSize) +j]=0;
2002 filterPos[i]= srcW - (*filterSize);
2006 //FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end
2007 // and skip these then later
// Normalize each tap set so its integer coefficients sum to `one`
// (the fixed-point unit expected by the scaler), then store as int16.
2010 for(i=0; i<dstW; i++)
2015 for(j=0; j<*filterSize; j++)
2017 sum+= filter[i*(*filterSize) + j];
2020 for(j=0; j<*filterSize; j++)
2022 dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
// initMMX2HScaler: generate the "funny code" at runtime — a template asm
// fragment is copied once per 8 output pixels into an executable buffer,
// with the pshufw shuffle immediates patched per position so each copy
// reads the right source bytes for its fixed-point phase. Finished with a
// RET byte so the scalers can `call` into the buffer.
// NOTE(review): the second loop below references chrXInc/dstUVw/funnyUVCode,
// which are not parameters of this function — presumably file-scope state,
// or this listing merges an older two-plane variant; verify before editing.
2028 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
2037 // create an optimized horizontal scaling routine
// Template fragment: bilinear blend of 4 pixel pairs. The pshufw
// immediates (patched below) select which of the 8 loaded bytes feed
// each of the 4 output lanes.
2045 "movq (%%esi), %%mm0 \n\t" //FIXME Alignment
2046 "movq %%mm0, %%mm1 \n\t"
2047 "psrlq $8, %%mm0 \n\t"
2048 "punpcklbw %%mm7, %%mm1 \n\t"
2049 "movq %%mm2, %%mm3 \n\t"
2050 "punpcklbw %%mm7, %%mm0 \n\t"
2051 "addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
2052 "pshufw $0xFF, %%mm1, %%mm1 \n\t"
2054 "adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
2055 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
2057 "psrlw $9, %%mm3 \n\t"
2058 "psubw %%mm1, %%mm0 \n\t"
2059 "pmullw %%mm3, %%mm0 \n\t"
2060 "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
2061 "psllw $7, %%mm1 \n\t"
2062 "paddw %%mm1, %%mm0 \n\t"
2064 "movq %%mm0, (%%edi, %%eax) \n\t"
2066 "addl $8, %%eax \n\t"
// Output operands capture the fragment's address/length and the byte
// offsets of the two pshufw immediates inside it.
2079 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
2080 "=r" (fragmentLength)
2083 xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
// Luma: one patched fragment copy per 4 output pixels (dstW/8 iterations
// of the elided inner structure), shuffle mask = a|b<<2|c<<4|d<<6.
2085 for(i=0; i<dstW/8; i++)
2092 int b=((xpos+xInc)>>16) - xx;
2093 int c=((xpos+xInc*2)>>16) - xx;
2094 int d=((xpos+xInc*3)>>16) - xx;
2096 memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
2098 funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
2099 funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
2100 a | (b<<2) | (c<<4) | (d<<6);
// Patch movq -> movd (0x6E) when only 4 source bytes are needed.
2102 // if we dont need to read 8 bytes than dont :), reduces the chance of
2103 // crossing a cache line
2104 if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
2106 funnyCode[fragmentLength*(i+4)/4]= RET;
// Chroma: same patching scheme with the chroma increment and buffer.
2111 xpos= 0; //chrXInc/2 - 0x10000; // difference between centers of chrom samples
2112 for(i=0; i<dstUVw/8; i++)
2119 int b=((xpos+chrXInc)>>16) - xx;
2120 int c=((xpos+chrXInc*2)>>16) - xx;
2121 int d=((xpos+chrXInc*3)>>16) - xx;
2123 memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);
2125 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
2126 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
2127 a | (b<<2) | (c<<4) | (d<<6);
2129 // if we dont need to read 8 bytes than dont :), reduces the chance of
2130 // crossing a cache line
2131 if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;
2133 funnyUVCode[fragmentLength*(i+4)/4]= RET;
2138 // funnyCode[0]= RET;
2142 static void RENAME(SwScale_YV12slice)(unsigned char* srcptr[],int stride[], int srcSliceY ,
2143 int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
2144 int srcW, int srcH, int dstW, int dstH){
2147 unsigned int lumXInc= (srcW << 16) / dstW;
2148 unsigned int lumYInc= (srcH << 16) / dstH;
2149 unsigned int chrXInc;
2150 unsigned int chrYInc;
2154 // used to detect a size change
2155 static int oldDstW= -1;
2156 static int oldSrcW= -1;
2157 static int oldDstH= -1;
2158 static int oldSrcH= -1;
2159 static int oldFlags=-1;
2161 static int lastInLumBuf;
2162 static int lastInChrBuf;
2164 int chrDstW, chrDstH;
2166 static int lumBufIndex=0;
2167 static int chrBufIndex=0;
2169 static int firstTime=1;
2171 const int widthAlign= dstbpp==12 ? 16 : 8;
2172 const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
2173 const int over= dstbpp==12 ? (((dstW+15)&(~15))) - dststride
2174 : (((dstW+7)&(~7)))*bytespp - dststride;
2175 if(dststride%widthAlign !=0 )
2178 fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n"
2179 "SwScaler: ->cannot do aligned memory acesses anymore\n",
2186 fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n"
2187 "SwScaler: and dststride is not large enough to handle %d extra bytes\n"
2188 "SwScaler: ->using unoptimized C version for last line(s)\n",
2194 //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH);
2195 //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH);
2198 canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
2199 if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR)
2202 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
2205 canMMX2BeUsed=0; // should be 0 anyway but ...
2210 #if defined (DITHER1XBPP) && defined (HAVE_MMX)
2211 char *dither= " dithered";
2215 if(sws_flags==SWS_FAST_BILINEAR)
2216 fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler ");
2217 else if(sws_flags==SWS_BILINEAR)
2218 fprintf(stderr, "\nSwScaler: BILINEAR scaler ");
2219 else if(sws_flags==SWS_BICUBIC)
2220 fprintf(stderr, "\nSwScaler: BICUBIC scaler ");
2222 fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
2225 fprintf(stderr, "with%s BGR15 output ", dither);
2227 fprintf(stderr, "with%s BGR16 output ", dither);
2229 fprintf(stderr, "with BGR24 output ");
2231 fprintf(stderr, "with BGR32 output ");
2233 fprintf(stderr, "with YV12 output ");
2235 fprintf(stderr, "without output ");
2238 fprintf(stderr, "using MMX2\n");
2239 #elif defined (HAVE_3DNOW)
2240 fprintf(stderr, "using 3DNOW\n");
2241 #elif defined (HAVE_MMX)
2242 fprintf(stderr, "using MMX\n");
2243 #elif defined (ARCH_X86)
2244 fprintf(stderr, "using X86 ASM\n");
2246 fprintf(stderr, "using C\n");
2251 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
2252 // n-2 is the last chrominance sample available
2253 // this is not perfect, but noone shuld notice the difference, the more correct variant
2254 // would be like the vertical one, but that would require some special code for the
2255 // first and last pixel
2256 if(sws_flags==SWS_FAST_BILINEAR)
2258 if(canMMX2BeUsed) lumXInc+= 20;
2259 #ifndef HAVE_MMX //we dont use the x86asm scaler if mmx is available
2260 else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
2264 if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW;
2265 else chrXInc= lumXInc, chrDstW= (dstW+1)>>1;
2267 if(dstbpp==12) chrYInc= lumYInc, chrDstH= (dstH+1)>>1;
2268 else chrYInc= lumYInc>>1, chrDstH= dstH;
2270 // force calculation of the horizontal interpolation of the first line
2273 // printf("dstW %d, srcw %d, mmx2 %d\n", dstW, srcW, canMMX2BeUsed);
2278 //precalculate horizontal scaler filter coefficients
2279 if(oldDstW!=dstW || oldSrcW!=srcW || oldFlags!=sws_flags)
2282 const int filterAlign=4;
2284 const int filterAlign=1;
2286 oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags;
2288 RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc,
2289 srcW , dstW , filterAlign, 1<<14);
2290 RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
2291 (srcW+1)>>1, chrDstW, filterAlign, 1<<14);
2294 // cant downscale !!!
2295 if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR)
2297 initMMX2HScaler(dstW , lumXInc, funnyYCode);
2298 initMMX2HScaler(chrDstW, chrXInc, funnyUVCode);
2301 } // Init Horizontal stuff
2303 if(oldDstH!=dstH || oldSrcH!=srcH || oldFlags!=sws_flags)
2306 oldDstH= dstH; oldSrcH= srcH; oldFlags= sws_flags; //FIXME swsflags conflict with x check
2308 // deallocate pixbufs
2309 for(i=0; i<vLumBufSize; i++) free(lumPixBuf[i]);
2310 for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]);
2312 RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc,
2313 srcH , dstH, 1, (1<<12)-4);
2314 RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc,
2315 (srcH+1)>>1, chrDstH, 1, (1<<12)-4);
2317 // Calculate Buffer Sizes so that they wont run out while handling these damn slices
2318 vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize;
2319 for(i=0; i<dstH; i++)
2321 int chrI= i*chrDstH / dstH;
2322 int nextSlice= MAX(vLumFilterPos[i ] + vLumFilterSize - 1,
2323 ((vChrFilterPos[chrI] + vChrFilterSize - 1)<<1));
2324 nextSlice&= ~1; // Slices start at even boundaries
2325 if(vLumFilterPos[i ] + vLumBufSize < nextSlice)
2326 vLumBufSize= nextSlice - vLumFilterPos[i ];
2327 if(vChrFilterPos[chrI] + vChrBufSize < (nextSlice>>1))
2328 vChrBufSize= (nextSlice>>1) - vChrFilterPos[chrI];
2331 // allocate pixbufs (we use dynamic allocation because otherwise we would need to
2332 // allocate several megabytes to handle all possible cases)
2333 for(i=0; i<vLumBufSize; i++)
2334 lumPixBuf[i]= lumPixBuf[i+vLumBufSize]= (uint16_t*)memalign(8, 4000);
2335 for(i=0; i<vChrBufSize; i++)
2336 chrPixBuf[i]= chrPixBuf[i+vChrBufSize]= (uint16_t*)memalign(8, 8000);
2338 //try to avoid drawing green stuff between the right end and the stride end
2339 for(i=0; i<vLumBufSize; i++) memset(lumPixBuf[i], 0, 4000);
2340 for(i=0; i<vChrBufSize; i++) memset(chrPixBuf[i], 64, 8000);
2342 ASSERT(chrDstH<=dstH)
2343 ASSERT(vLumFilterSize*dstH*4<16000)
2344 ASSERT(vChrFilterSize*chrDstH*4<16000)
2346 // pack filter data for mmx code
2347 for(i=0; i<vLumFilterSize*dstH; i++)
2348 lumMmxFilter[4*i]=lumMmxFilter[4*i+1]=lumMmxFilter[4*i+2]=lumMmxFilter[4*i+3]=
2350 for(i=0; i<vChrFilterSize*chrDstH; i++)
2351 chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]=
2356 if(firstTime && verbose)
2370 if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR)
2371 printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
2374 if(hLumFilterSize==4)
2375 printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
2376 else if(hLumFilterSize==8)
2377 printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
2379 printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
2381 if(hChrFilterSize==4)
2382 printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
2383 else if(hChrFilterSize==8)
2384 printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
2386 printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
2388 #elif defined (ARCH_X86)
2389 printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
2391 if(sws_flags==SWS_FAST_BILINEAR)
2392 printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
2394 printf("SwScaler: using C scaler for horizontal scaling\n");
2399 if(vLumFilterSize==1)
2400 printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
2402 printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
2406 if(vLumFilterSize==1 && vChrFilterSize==2)
2407 printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
2408 "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C");
2409 else if(vLumFilterSize==2 && vChrFilterSize==2)
2410 printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
2412 printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
2416 printf("SwScaler: using %s YV12->BGR24 Converter\n",
2417 mmx2 ? "MMX2" : (mmx ? "MMX" : "C"));
2419 printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp);
2421 printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
2428 for(;dstY < dstH; dstY++){
2429 unsigned char *dest =dstptr[0]+dststride*dstY;
2430 unsigned char *uDest=dstptr[1]+(dststride>>1)*(dstY>>1);
2431 unsigned char *vDest=dstptr[2]+(dststride>>1)*(dstY>>1);
2432 const int chrDstY= dstbpp==12 ? (dstY>>1) : dstY;
2434 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2435 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2436 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2437 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2439 if(sws_flags == SWS_FAST_BILINEAR)
2442 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2443 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2446 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2447 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2449 // Do we have enough lines in this slice to output the dstY line
2450 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
2452 //Do horizontal scaling
2453 while(lastInLumBuf < lastLumSrcY)
2455 uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
2457 ASSERT(lumBufIndex < 2*vLumBufSize)
2458 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2459 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2460 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2461 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
2464 while(lastInChrBuf < lastChrSrcY)
2466 uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
2467 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
2469 ASSERT(chrBufIndex < 2*vChrBufSize)
2470 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2471 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2472 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
2475 //wrap buf index around to stay inside the ring buffer
2476 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2477 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2479 else // not enough lines left in this slice -> load the rest in the buffer
2481 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2482 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2483 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2484 vChrBufSize, vLumBufSize);
2486 //Do horizontal scaling
2487 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2489 uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
2491 ASSERT(lumBufIndex < 2*vLumBufSize)
2492 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2493 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2494 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
2497 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
2499 uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
2500 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
2502 ASSERT(chrBufIndex < 2*vChrBufSize)
2503 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2504 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2505 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
2508 //wrap buf index around to stay inside the ring buffer
2509 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2510 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2511 break; // we can't output a dstY line, so let's try with the next slice
2515 b5Dither= dither8[dstY&1];
2516 g6Dither= dither4[dstY&1];
2517 g5Dither= dither8[dstY&1];
2518 r5Dither= dither8[(dstY+1)&1];
2520 if(dstY < dstH-2 || over<=0)
2522 if(dstbpp==12) //YV12
2524 if(dstY&1) uDest=vDest= NULL; //FIXME: split functions into luma / chroma variants
2525 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2527 int16_t *lumBuf = lumPixBuf[0];
2528 int16_t *chrBuf= chrPixBuf[0];
2529 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
2533 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2534 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2536 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2537 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2538 dest, uDest, vDest, dstW,
2539 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
2544 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2545 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2547 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2548 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2549 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2551 int chrAlpha= vChrFilter[2*dstY+1];
2553 RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2554 dest, dstW, chrAlpha, dstbpp);
2556 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2558 int lumAlpha= vLumFilter[2*dstY+1];
2559 int chrAlpha= vChrFilter[2*dstY+1];
2561 RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2562 dest, dstW, lumAlpha, chrAlpha, dstbpp);
2567 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2568 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2570 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
2574 else // hmm, looks like we can't use MMX here without overwriting this array's tail
2576 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2577 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2578 if(dstbpp==12) //YV12
2580 if(dstY&1) uDest=vDest= NULL; //FIXME: split functions into luma / chroma variants
2582 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2583 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2584 dest, uDest, vDest, dstW);
2588 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2589 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2591 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2592 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2593 dest, dstW, dstbpp);
2599 __asm __volatile(SFENCE:::"memory");
2600 __asm __volatile(EMMS:::"memory");