Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #elif defined ( HAVE_MMX2 )
37 #define PREFETCH "prefetchnta"
38 #define PREFETCHW "prefetcht0"
40 #define PREFETCH "/nop"
41 #define PREFETCHW "/nop"
45 #define SFENCE "sfence"
51 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
52 #elif defined (HAVE_3DNOW)
53 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
57 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
59 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
63 #include "swscale_altivec_template.c"
66 #define YSCALEYUV2YV12X(x, offset) \
67 "xorl %%eax, %%eax \n\t"\
68 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
69 "movq %%mm3, %%mm4 \n\t"\
70 "leal " offset "(%0), %%edx \n\t"\
71 "movl (%%edx), %%esi \n\t"\
72 ".balign 16 \n\t" /* FIXME Unroll? */\
74 "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\
75 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
76 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
77 "addl $16, %%edx \n\t"\
78 "movl (%%edx), %%esi \n\t"\
79 "testl %%esi, %%esi \n\t"\
80 "pmulhw %%mm0, %%mm2 \n\t"\
81 "pmulhw %%mm0, %%mm5 \n\t"\
82 "paddw %%mm2, %%mm3 \n\t"\
83 "paddw %%mm5, %%mm4 \n\t"\
85 "psraw $3, %%mm3 \n\t"\
86 "psraw $3, %%mm4 \n\t"\
87 "packuswb %%mm4, %%mm3 \n\t"\
88 MOVNTQ(%%mm3, (%1, %%eax))\
89 "addl $8, %%eax \n\t"\
90 "cmpl %2, %%eax \n\t"\
91 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
92 "movq %%mm3, %%mm4 \n\t"\
93 "leal " offset "(%0), %%edx \n\t"\
94 "movl (%%edx), %%esi \n\t"\
97 #define YSCALEYUV2YV121 \
98 "movl %2, %%eax \n\t"\
99 ".balign 16 \n\t" /* FIXME Unroll? */\
101 "movq (%0, %%eax, 2), %%mm0 \n\t"\
102 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
103 "psraw $7, %%mm0 \n\t"\
104 "psraw $7, %%mm1 \n\t"\
105 "packuswb %%mm1, %%mm0 \n\t"\
106 MOVNTQ(%%mm0, (%1, %%eax))\
107 "addl $8, %%eax \n\t"\
111 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
112 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
113 "r" (dest), "m" (dstW),
114 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
115 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
117 #define YSCALEYUV2PACKEDX \
118 "xorl %%eax, %%eax \n\t"\
122 "leal "CHR_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\
123 "movl (%%edx), %%esi \n\t"\
124 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
125 "movq %%mm3, %%mm4 \n\t"\
128 "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\
129 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
130 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
131 "addl $16, %%edx \n\t"\
132 "movl (%%edx), %%esi \n\t"\
133 "pmulhw %%mm0, %%mm2 \n\t"\
134 "pmulhw %%mm0, %%mm5 \n\t"\
135 "paddw %%mm2, %%mm3 \n\t"\
136 "paddw %%mm5, %%mm4 \n\t"\
137 "testl %%esi, %%esi \n\t"\
140 "leal "LUM_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\
141 "movl (%%edx), %%esi \n\t"\
142 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
143 "movq %%mm1, %%mm7 \n\t"\
146 "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\
147 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
148 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
149 "addl $16, %%edx \n\t"\
150 "movl (%%edx), %%esi \n\t"\
151 "pmulhw %%mm0, %%mm2 \n\t"\
152 "pmulhw %%mm0, %%mm5 \n\t"\
153 "paddw %%mm2, %%mm1 \n\t"\
154 "paddw %%mm5, %%mm7 \n\t"\
155 "testl %%esi, %%esi \n\t"\
159 #define YSCALEYUV2RGBX \
161 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
162 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
163 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
164 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
165 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
166 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
167 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
168 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
169 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
170 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
171 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
172 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
173 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
174 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
175 "paddw %%mm3, %%mm4 \n\t"\
176 "movq %%mm2, %%mm0 \n\t"\
177 "movq %%mm5, %%mm6 \n\t"\
178 "movq %%mm4, %%mm3 \n\t"\
179 "punpcklwd %%mm2, %%mm2 \n\t"\
180 "punpcklwd %%mm5, %%mm5 \n\t"\
181 "punpcklwd %%mm4, %%mm4 \n\t"\
182 "paddw %%mm1, %%mm2 \n\t"\
183 "paddw %%mm1, %%mm5 \n\t"\
184 "paddw %%mm1, %%mm4 \n\t"\
185 "punpckhwd %%mm0, %%mm0 \n\t"\
186 "punpckhwd %%mm6, %%mm6 \n\t"\
187 "punpckhwd %%mm3, %%mm3 \n\t"\
188 "paddw %%mm7, %%mm0 \n\t"\
189 "paddw %%mm7, %%mm6 \n\t"\
190 "paddw %%mm7, %%mm3 \n\t"\
191 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
192 "packuswb %%mm0, %%mm2 \n\t"\
193 "packuswb %%mm6, %%mm5 \n\t"\
194 "packuswb %%mm3, %%mm4 \n\t"\
195 "pxor %%mm7, %%mm7 \n\t"
197 #define FULL_YSCALEYUV2RGB \
198 "pxor %%mm7, %%mm7 \n\t"\
199 "movd %6, %%mm6 \n\t" /*yalpha1*/\
200 "punpcklwd %%mm6, %%mm6 \n\t"\
201 "punpcklwd %%mm6, %%mm6 \n\t"\
202 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
203 "punpcklwd %%mm5, %%mm5 \n\t"\
204 "punpcklwd %%mm5, %%mm5 \n\t"\
205 "xorl %%eax, %%eax \n\t"\
208 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
209 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
210 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
211 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
212 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
213 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
214 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
215 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
216 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
217 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
218 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
219 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
220 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
221 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
222 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
223 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
224 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
225 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
228 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
229 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
230 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
231 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
232 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
233 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
234 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
237 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
238 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
239 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
240 "paddw %%mm1, %%mm3 \n\t" /* B*/\
241 "paddw %%mm1, %%mm0 \n\t" /* R*/\
242 "packuswb %%mm3, %%mm3 \n\t"\
244 "packuswb %%mm0, %%mm0 \n\t"\
245 "paddw %%mm4, %%mm2 \n\t"\
246 "paddw %%mm2, %%mm1 \n\t" /* G*/\
248 "packuswb %%mm1, %%mm1 \n\t"
251 #define YSCALEYUV2PACKED(index, c) \
252 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
253 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
254 "psraw $3, %%mm0 \n\t"\
255 "psraw $3, %%mm1 \n\t"\
256 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
257 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
258 "xorl "#index", "#index" \n\t"\
261 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
262 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
263 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
264 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
265 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
266 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
267 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
268 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
269 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
270 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
271 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
272 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
273 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
274 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
275 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
276 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
277 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
278 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
279 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
280 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
281 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
282 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
283 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
284 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
285 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287 #define YSCALEYUV2RGB(index, c) \
288 "xorl "#index", "#index" \n\t"\
291 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
292 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
293 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
294 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
295 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
296 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
297 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
298 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
299 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
300 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
301 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
302 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
303 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
304 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
305 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
306 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
307 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
308 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
309 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
310 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
311 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
312 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
313 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
314 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
315 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
316 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
317 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
318 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
319 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
320 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
321 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
322 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
323 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
324 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
325 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
326 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
327 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
328 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
329 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
330 "paddw %%mm3, %%mm4 \n\t"\
331 "movq %%mm2, %%mm0 \n\t"\
332 "movq %%mm5, %%mm6 \n\t"\
333 "movq %%mm4, %%mm3 \n\t"\
334 "punpcklwd %%mm2, %%mm2 \n\t"\
335 "punpcklwd %%mm5, %%mm5 \n\t"\
336 "punpcklwd %%mm4, %%mm4 \n\t"\
337 "paddw %%mm1, %%mm2 \n\t"\
338 "paddw %%mm1, %%mm5 \n\t"\
339 "paddw %%mm1, %%mm4 \n\t"\
340 "punpckhwd %%mm0, %%mm0 \n\t"\
341 "punpckhwd %%mm6, %%mm6 \n\t"\
342 "punpckhwd %%mm3, %%mm3 \n\t"\
343 "paddw %%mm7, %%mm0 \n\t"\
344 "paddw %%mm7, %%mm6 \n\t"\
345 "paddw %%mm7, %%mm3 \n\t"\
346 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
347 "packuswb %%mm0, %%mm2 \n\t"\
348 "packuswb %%mm6, %%mm5 \n\t"\
349 "packuswb %%mm3, %%mm4 \n\t"\
350 "pxor %%mm7, %%mm7 \n\t"
352 #define YSCALEYUV2PACKED1(index, c) \
353 "xorl "#index", "#index" \n\t"\
356 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
357 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
358 "psraw $7, %%mm3 \n\t" \
359 "psraw $7, %%mm4 \n\t" \
360 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
361 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
362 "psraw $7, %%mm1 \n\t" \
363 "psraw $7, %%mm7 \n\t" \
365 #define YSCALEYUV2RGB1(index, c) \
366 "xorl "#index", "#index" \n\t"\
369 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
370 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
371 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
372 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
373 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
374 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
375 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
376 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
377 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
378 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
379 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
380 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
381 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
382 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
383 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
384 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
385 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
386 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
387 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
388 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
389 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
390 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
391 "paddw %%mm3, %%mm4 \n\t"\
392 "movq %%mm2, %%mm0 \n\t"\
393 "movq %%mm5, %%mm6 \n\t"\
394 "movq %%mm4, %%mm3 \n\t"\
395 "punpcklwd %%mm2, %%mm2 \n\t"\
396 "punpcklwd %%mm5, %%mm5 \n\t"\
397 "punpcklwd %%mm4, %%mm4 \n\t"\
398 "paddw %%mm1, %%mm2 \n\t"\
399 "paddw %%mm1, %%mm5 \n\t"\
400 "paddw %%mm1, %%mm4 \n\t"\
401 "punpckhwd %%mm0, %%mm0 \n\t"\
402 "punpckhwd %%mm6, %%mm6 \n\t"\
403 "punpckhwd %%mm3, %%mm3 \n\t"\
404 "paddw %%mm7, %%mm0 \n\t"\
405 "paddw %%mm7, %%mm6 \n\t"\
406 "paddw %%mm7, %%mm3 \n\t"\
407 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
408 "packuswb %%mm0, %%mm2 \n\t"\
409 "packuswb %%mm6, %%mm5 \n\t"\
410 "packuswb %%mm3, %%mm4 \n\t"\
411 "pxor %%mm7, %%mm7 \n\t"
413 #define YSCALEYUV2PACKED1b(index, c) \
414 "xorl "#index", "#index" \n\t"\
417 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
418 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
419 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
420 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
421 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
422 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
423 "psrlw $8, %%mm3 \n\t" \
424 "psrlw $8, %%mm4 \n\t" \
425 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
426 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
427 "psraw $7, %%mm1 \n\t" \
428 "psraw $7, %%mm7 \n\t"
430 // do vertical chrominance interpolation
431 #define YSCALEYUV2RGB1b(index, c) \
432 "xorl "#index", "#index" \n\t"\
435 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
436 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
437 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
439 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
440 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
441 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
442 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
443 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
444 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
445 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
446 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
447 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
448 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
449 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
450 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
451 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
452 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
453 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
454 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
455 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
456 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
457 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
458 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
459 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
460 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
461 "paddw %%mm3, %%mm4 \n\t"\
462 "movq %%mm2, %%mm0 \n\t"\
463 "movq %%mm5, %%mm6 \n\t"\
464 "movq %%mm4, %%mm3 \n\t"\
465 "punpcklwd %%mm2, %%mm2 \n\t"\
466 "punpcklwd %%mm5, %%mm5 \n\t"\
467 "punpcklwd %%mm4, %%mm4 \n\t"\
468 "paddw %%mm1, %%mm2 \n\t"\
469 "paddw %%mm1, %%mm5 \n\t"\
470 "paddw %%mm1, %%mm4 \n\t"\
471 "punpckhwd %%mm0, %%mm0 \n\t"\
472 "punpckhwd %%mm6, %%mm6 \n\t"\
473 "punpckhwd %%mm3, %%mm3 \n\t"\
474 "paddw %%mm7, %%mm0 \n\t"\
475 "paddw %%mm7, %%mm6 \n\t"\
476 "paddw %%mm7, %%mm3 \n\t"\
477 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
478 "packuswb %%mm0, %%mm2 \n\t"\
479 "packuswb %%mm6, %%mm5 \n\t"\
480 "packuswb %%mm3, %%mm4 \n\t"\
481 "pxor %%mm7, %%mm7 \n\t"
483 #define WRITEBGR32(dst, dstw, index) \
484 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
485 "movq %%mm2, %%mm1 \n\t" /* B */\
486 "movq %%mm5, %%mm6 \n\t" /* R */\
487 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
488 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
489 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
490 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
491 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
492 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
493 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
494 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
495 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
496 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
498 MOVNTQ(%%mm0, (dst, index, 4))\
499 MOVNTQ(%%mm2, 8(dst, index, 4))\
500 MOVNTQ(%%mm1, 16(dst, index, 4))\
501 MOVNTQ(%%mm3, 24(dst, index, 4))\
503 "addl $8, "#index" \n\t"\
504 "cmpl "#dstw", "#index" \n\t"\
507 #define WRITEBGR16(dst, dstw, index) \
508 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
509 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
510 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
511 "psrlq $3, %%mm2 \n\t"\
513 "movq %%mm2, %%mm1 \n\t"\
514 "movq %%mm4, %%mm3 \n\t"\
516 "punpcklbw %%mm7, %%mm3 \n\t"\
517 "punpcklbw %%mm5, %%mm2 \n\t"\
518 "punpckhbw %%mm7, %%mm4 \n\t"\
519 "punpckhbw %%mm5, %%mm1 \n\t"\
521 "psllq $3, %%mm3 \n\t"\
522 "psllq $3, %%mm4 \n\t"\
524 "por %%mm3, %%mm2 \n\t"\
525 "por %%mm4, %%mm1 \n\t"\
527 MOVNTQ(%%mm2, (dst, index, 2))\
528 MOVNTQ(%%mm1, 8(dst, index, 2))\
530 "addl $8, "#index" \n\t"\
531 "cmpl "#dstw", "#index" \n\t"\
534 #define WRITEBGR15(dst, dstw, index) \
535 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
536 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
537 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
538 "psrlq $3, %%mm2 \n\t"\
539 "psrlq $1, %%mm5 \n\t"\
541 "movq %%mm2, %%mm1 \n\t"\
542 "movq %%mm4, %%mm3 \n\t"\
544 "punpcklbw %%mm7, %%mm3 \n\t"\
545 "punpcklbw %%mm5, %%mm2 \n\t"\
546 "punpckhbw %%mm7, %%mm4 \n\t"\
547 "punpckhbw %%mm5, %%mm1 \n\t"\
549 "psllq $2, %%mm3 \n\t"\
550 "psllq $2, %%mm4 \n\t"\
552 "por %%mm3, %%mm2 \n\t"\
553 "por %%mm4, %%mm1 \n\t"\
555 MOVNTQ(%%mm2, (dst, index, 2))\
556 MOVNTQ(%%mm1, 8(dst, index, 2))\
558 "addl $8, "#index" \n\t"\
559 "cmpl "#dstw", "#index" \n\t"\
562 #define WRITEBGR24OLD(dst, dstw, index) \
563 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
564 "movq %%mm2, %%mm1 \n\t" /* B */\
565 "movq %%mm5, %%mm6 \n\t" /* R */\
566 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
567 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
568 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
569 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
570 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
571 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
572 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
573 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
574 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
575 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
577 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
578 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
579 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
580 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
581 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
582 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
583 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
584 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
586 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
587 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
588 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
589 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
590 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
591 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
592 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
593 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
594 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
595 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
596 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
597 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
598 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
600 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
601 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
602 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
603 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
604 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
605 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
606 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
607 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
609 MOVNTQ(%%mm0, (dst))\
610 MOVNTQ(%%mm2, 8(dst))\
611 MOVNTQ(%%mm3, 16(dst))\
612 "addl $24, "#dst" \n\t"\
614 "addl $8, "#index" \n\t"\
615 "cmpl "#dstw", "#index" \n\t"\
618 #define WRITEBGR24MMX(dst, dstw, index) \
619 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
620 "movq %%mm2, %%mm1 \n\t" /* B */\
621 "movq %%mm5, %%mm6 \n\t" /* R */\
622 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
623 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
624 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
625 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
626 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
627 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
628 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
629 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
630 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
631 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
633 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
634 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
635 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
636 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
638 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
639 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
640 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
641 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
643 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
644 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
645 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
646 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
648 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
649 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
650 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
651 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
652 MOVNTQ(%%mm0, (dst))\
654 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
655 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
656 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
657 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
658 MOVNTQ(%%mm6, 8(dst))\
660 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
661 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
662 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
663 MOVNTQ(%%mm5, 16(dst))\
665 "addl $24, "#dst" \n\t"\
667 "addl $8, "#index" \n\t"\
668 "cmpl "#dstw", "#index" \n\t"\
671 #define WRITEBGR24MMX2(dst, dstw, index) \
672 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
673 "movq "MANGLE(M24A)", %%mm0 \n\t"\
674 "movq "MANGLE(M24C)", %%mm7 \n\t"\
675 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
676 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
677 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
679 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
680 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
681 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
683 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
684 "por %%mm1, %%mm6 \n\t"\
685 "por %%mm3, %%mm6 \n\t"\
686 MOVNTQ(%%mm6, (dst))\
688 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
689 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
690 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
691 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
693 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
694 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
695 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
697 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
698 "por %%mm3, %%mm6 \n\t"\
699 MOVNTQ(%%mm6, 8(dst))\
701 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
702 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
703 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
705 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
706 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
707 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
709 "por %%mm1, %%mm3 \n\t"\
710 "por %%mm3, %%mm6 \n\t"\
711 MOVNTQ(%%mm6, 16(dst))\
713 "addl $24, "#dst" \n\t"\
715 "addl $8, "#index" \n\t"\
716 "cmpl "#dstw", "#index" \n\t"\
721 #define WRITEBGR24 WRITEBGR24MMX2
724 #define WRITEBGR24 WRITEBGR24MMX
727 #define WRITEYUY2(dst, dstw, index) \
728 "packuswb %%mm3, %%mm3 \n\t"\
729 "packuswb %%mm4, %%mm4 \n\t"\
730 "packuswb %%mm7, %%mm1 \n\t"\
731 "punpcklbw %%mm4, %%mm3 \n\t"\
732 "movq %%mm1, %%mm7 \n\t"\
733 "punpcklbw %%mm3, %%mm1 \n\t"\
734 "punpckhbw %%mm3, %%mm7 \n\t"\
736 MOVNTQ(%%mm1, (dst, index, 2))\
737 MOVNTQ(%%mm7, 8(dst, index, 2))\
739 "addl $8, "#index" \n\t"\
740 "cmpl "#dstw", "#index" \n\t"\
744 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
745 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
746 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
752 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
753 :: "r" (&c->redDither),
754 "r" (uDest), "m" (chrDstW)
755 : "%eax", "%edx", "%esi"
759 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
760 :: "r" (&c->redDither),
761 "r" (vDest), "m" (chrDstW)
762 : "%eax", "%edx", "%esi"
767 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
768 :: "r" (&c->redDither),
769 "r" (dest), "m" (dstW)
770 : "%eax", "%edx", "%esi"
774 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
775 chrFilter, chrSrc, chrFilterSize,
776 dest, uDest, vDest, dstW, chrDstW);
778 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
779 chrFilter, chrSrc, chrFilterSize,
780 dest, uDest, vDest, dstW, chrDstW);
781 #endif //!HAVE_ALTIVEC
785 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
786 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
793 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
800 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
808 :: "r" (lumSrc + dstW), "r" (dest + dstW),
814 for(i=0; i<dstW; i++)
816 int val= lumSrc[i]>>7;
827 for(i=0; i<chrDstW; i++)
830 int v=chrSrc[i + 2048]>>7;
834 else if (u>255) u=255;
836 else if (v>255) v=255;
847 * vertical scale YV12 to RGB
849 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
850 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
851 uint8_t *dest, int dstW, int dstY)
861 WRITEBGR32(%4, %5, %%eax)
863 :: "r" (&c->redDither),
864 "m" (dummy), "m" (dummy), "m" (dummy),
865 "r" (dest), "m" (dstW)
866 : "%eax", "%edx", "%esi"
874 "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
875 "addl %4, %%ebx \n\t"
876 WRITEBGR24(%%ebx, %5, %%eax)
878 :: "r" (&c->redDither),
879 "m" (dummy), "m" (dummy), "m" (dummy),
880 "r" (dest), "m" (dstW)
881 : "%eax", "%ebx", "%edx", "%esi" //FIXME ebx
889 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
891 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
892 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
893 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
896 WRITEBGR15(%4, %5, %%eax)
898 :: "r" (&c->redDither),
899 "m" (dummy), "m" (dummy), "m" (dummy),
900 "r" (dest), "m" (dstW)
901 : "%eax", "%edx", "%esi"
909 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
911 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
912 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
913 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
916 WRITEBGR16(%4, %5, %%eax)
918 :: "r" (&c->redDither),
919 "m" (dummy), "m" (dummy), "m" (dummy),
920 "r" (dest), "m" (dstW)
921 : "%eax", "%edx", "%esi"
929 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
931 "psraw $3, %%mm3 \n\t"
932 "psraw $3, %%mm4 \n\t"
933 "psraw $3, %%mm1 \n\t"
934 "psraw $3, %%mm7 \n\t"
935 WRITEYUY2(%4, %5, %%eax)
937 :: "r" (&c->redDither),
938 "m" (dummy), "m" (dummy), "m" (dummy),
939 "r" (dest), "m" (dstW)
940 : "%eax", "%edx", "%esi"
946 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
947 chrFilter, chrSrc, chrFilterSize,
954 * vertical bilinear scale YV12 to RGB
// Vertical bilinear blend of two luma lines (buf0/buf1, weights yalpha1/yalpha)
// and two chroma line pairs (uvbuf0/uvbuf1 with the V plane at offset +2048,
// weights uvalpha1/uvalpha), followed by YUV -> packed-pixel conversion into
// dest (dstW pixels). Per-format MMX/MMX2 fast paths plus a table-driven C
// fallback at the bottom.
// NOTE(review): this extract is lossy — braces, asm()/#ifdef wrappers and some
// statements are missing between the visible lines; code is kept verbatim.
956 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
957 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
// x^4095 == 4095-x for x in [0,4095]: cheap "one minus alpha" complement
959 int yalpha1=yalpha^4095;
960 int uvalpha1=uvalpha^4095;
// full horizontal chroma interpolation path — per-destination-format asm below
964 if(flags&SWS_FULL_CHR_H_INT)
// BGR32: interleave B,G,R,0 bytes into dwords and stream out two quadwords
974 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
975 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
977 "movq %%mm3, %%mm1 \n\t"
978 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
979 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
981 MOVNTQ(%%mm3, (%4, %%eax, 4))
982 MOVNTQ(%%mm1, 8(%4, %%eax, 4))
984 "addl $4, %%eax \n\t"
985 "cmpl %5, %%eax \n\t"
989 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
990 "m" (yalpha1), "m" (uvalpha1)
// BGR24 variant: same BGR0 interleave, then shuffle the 4-byte groups down
// to 3 bytes per pixel before storing
1000 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1001 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1003 "movq %%mm3, %%mm1 \n\t"
1004 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1005 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1007 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1008 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1009 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1010 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1011 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1012 "movq %%mm1, %%mm2 \n\t"
1013 "psllq $48, %%mm1 \n\t" // 000000BG
1014 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1016 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1017 "psrld $16, %%mm2 \n\t" // R000R000
1018 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1019 "por %%mm2, %%mm1 \n\t" // RBGRR000
// dest is a memory operand here; %ebx = dest+i so that (%ebx,%eax,2)
// addresses dest+3*i — 3 bytes per pixel
1021 "movl %4, %%ebx \n\t"
1022 "addl %%eax, %%ebx \n\t"
1026 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
1027 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
1029 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
1030 "psrlq $32, %%mm3 \n\t"
1031 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
1032 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
1034 "addl $4, %%eax \n\t"
1035 "cmpl %5, %%eax \n\t"
1038 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1039 "m" (yalpha1), "m" (uvalpha1)
// BGR15: add per-channel dither, shift/mask each channel to 5 bits, OR together
1048 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1049 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1050 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1052 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1053 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1054 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1056 "psrlw $3, %%mm3 \n\t"
1057 "psllw $2, %%mm1 \n\t"
1058 "psllw $7, %%mm0 \n\t"
1059 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1060 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1062 "por %%mm3, %%mm1 \n\t"
1063 "por %%mm1, %%mm0 \n\t"
1065 MOVNTQ(%%mm0, (%4, %%eax, 2))
1067 "addl $4, %%eax \n\t"
1068 "cmpl %5, %%eax \n\t"
1071 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1072 "m" (yalpha1), "m" (uvalpha1)
// BGR16: same but 5:6:5 packing (6-bit green dither table, different shifts)
1081 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1082 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1083 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1085 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1086 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1087 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1089 "psrlw $3, %%mm3 \n\t"
1090 "psllw $3, %%mm1 \n\t"
1091 "psllw $8, %%mm0 \n\t"
1092 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1093 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1095 "por %%mm3, %%mm1 \n\t"
1096 "por %%mm1, %%mm0 \n\t"
1098 MOVNTQ(%%mm0, (%4, %%eax, 2))
1100 "addl $4, %%eax \n\t"
1101 "cmpl %5, %%eax \n\t"
1104 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1105 "m" (yalpha1), "m" (uvalpha1)
// scalar fallback for the full-chroma path: per-pixel vertical blend (>>19)
// then lookup-table based YUV->RGB (yuvtab_* / clip_table)
1114 if(dstFormat==IMGFMT_BGR32)
1117 #ifdef WORDS_BIGENDIAN
1120 for(i=0;i<dstW;i++){
1121 // vertical linear interpolation && yuv2rgb in a single step:
1122 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1123 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1124 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1125 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1126 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1127 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1131 else if(dstFormat==IMGFMT_BGR24)
1134 for(i=0;i<dstW;i++){
1135 // vertical linear interpolation && yuv2rgb in a single step:
1136 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1137 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1138 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1139 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1140 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1141 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1145 else if(dstFormat==IMGFMT_BGR16)
1148 for(i=0;i<dstW;i++){
1149 // vertical linear interpolation && yuv2rgb in a single step:
1150 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1151 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1152 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
// clip_table16* already position each channel at its 5:6:5 bit offset
1154 ((uint16_t*)dest)[i] =
1155 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1156 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1157 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1160 else if(dstFormat==IMGFMT_BGR15)
1163 for(i=0;i<dstW;i++){
1164 // vertical linear interpolation && yuv2rgb in a single step:
1165 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1166 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1167 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1169 ((uint16_t*)dest)[i] =
1170 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1171 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1172 clip_table15r[(Y + yuvtab_3343[V]) >>13];
// regular (non-full-chroma) MMX path: %esp is saved into the context at
// ESP_OFFSET and temporarily reused as the dest pointer (the asm needs more
// pointers than the remaining GPRs provide), then restored after each block
1180 switch(c->dstFormat)
1182 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1185 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1186 "movl %4, %%esp \n\t"
1187 YSCALEYUV2RGB(%%eax, %5)
1188 WRITEBGR32(%%esp, 8280(%5), %%eax)
1189 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1191 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1198 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1199 "movl %4, %%esp \n\t"
1200 YSCALEYUV2RGB(%%eax, %5)
1201 WRITEBGR24(%%esp, 8280(%5), %%eax)
1202 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1203 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1210 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1211 "movl %4, %%esp \n\t"
1212 YSCALEYUV2RGB(%%eax, %5)
1213 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1215 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1216 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1217 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1220 WRITEBGR15(%%esp, 8280(%5), %%eax)
1221 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1223 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1230 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1231 "movl %4, %%esp \n\t"
1232 YSCALEYUV2RGB(%%eax, %5)
1233 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1235 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1236 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1237 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1240 WRITEBGR16(%%esp, 8280(%5), %%eax)
1241 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1242 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1249 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1250 "movl %4, %%esp \n\t"
1251 YSCALEYUV2PACKED(%%eax, %5)
1252 WRITEYUY2(%%esp, 8280(%5), %%eax)
1253 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1254 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
// C fallback (presumably the non-MMX build; the #ifdef guards are in dropped lines)
1262 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1266 * YV12 to RGB without scaling or interpolating
// Convert a single source line to packed RGB/YUY2 without vertical
// interpolation (the "1-tap" variant of yuv2packed2 above). Luma always comes
// from buf0 (yalpha1=0); chroma either takes the nearest line (uvalpha<2048,
// YSCALEYUV2*1 macros) or averages uvbuf0/uvbuf1 (YSCALEYUV2*1b macros).
// Same %esp save/reuse trick and 8280==DSTW_OFFSET constant as yuv2packed2.
// NOTE(review): lossy extract — wrappers/braces missing; code kept verbatim.
1268 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1269 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1271 const int yalpha1=0;
1274 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1275 const int yalpha= 4096; //FIXME ...
// full chroma interpolation: delegate to the 2-tap routine with zero luma blend
1277 if(flags&SWS_FULL_CHR_H_INT)
1279 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1284 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1290 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1291 "movl %4, %%esp \n\t"
1292 YSCALEYUV2RGB1(%%eax, %5)
1293 WRITEBGR32(%%esp, 8280(%5), %%eax)
1294 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1296 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1303 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1304 "movl %4, %%esp \n\t"
1305 YSCALEYUV2RGB1(%%eax, %5)
1306 WRITEBGR24(%%esp, 8280(%5), %%eax)
1307 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1309 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1316 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1317 "movl %4, %%esp \n\t"
1318 YSCALEYUV2RGB1(%%eax, %5)
1319 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1321 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1322 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1323 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1325 WRITEBGR15(%%esp, 8280(%5), %%eax)
1326 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1328 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1335 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1336 "movl %4, %%esp \n\t"
1337 YSCALEYUV2RGB1(%%eax, %5)
1338 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1340 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1341 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1342 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1345 WRITEBGR16(%%esp, 8280(%5), %%eax)
1346 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1348 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1355 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1356 "movl %4, %%esp \n\t"
1357 YSCALEYUV2PACKED1(%%eax, %5)
1358 WRITEYUY2(%%esp, 8280(%5), %%eax)
1359 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1361 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
// uvalpha >= 2048: average the two chroma lines (the "1b" macro variants)
1374 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1375 "movl %4, %%esp \n\t"
1376 YSCALEYUV2RGB1b(%%eax, %5)
1377 WRITEBGR32(%%esp, 8280(%5), %%eax)
1378 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1380 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1387 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1388 "movl %4, %%esp \n\t"
1389 YSCALEYUV2RGB1b(%%eax, %5)
1390 WRITEBGR24(%%esp, 8280(%5), %%eax)
1391 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1393 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1400 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1401 "movl %4, %%esp \n\t"
1402 YSCALEYUV2RGB1b(%%eax, %5)
1403 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1405 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1406 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1407 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1409 WRITEBGR15(%%esp, 8280(%5), %%eax)
1410 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1412 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1419 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1420 "movl %4, %%esp \n\t"
1421 YSCALEYUV2RGB1b(%%eax, %5)
1422 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1424 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1425 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1426 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1429 WRITEBGR16(%%esp, 8280(%5), %%eax)
1430 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1432 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1439 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1440 "movl %4, %%esp \n\t"
1441 YSCALEYUV2PACKED1b(%%eax, %5)
1442 WRITEYUY2(%%esp, 8280(%5), %%eax)
1443 "movl "ESP_OFFSET"(%5), %%esp \n\t"
1445 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
// C fallback mirrors the same nearest-vs-averaged chroma split
1453 if( uvalpha < 2048 )
1455 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1457 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1461 //FIXME yuy2* can read up to 7 samples too many
// Extract the luma bytes from a YUY2 (Y U Y V) line into dst, width pixels.
// MMX path: bm01010101 masks the even (Y) bytes of each word, then packuswb
// merges 16 source bytes into 8 luma bytes per iteration. The pointers are
// biased by +width so %eax can run from -width up to 0 (negative-index loop).
// A scalar fallback loop follows (its body is in dropped lines; presumably
// dst[i] = src[2*i] — TODO confirm).
1463 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1467 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1468 "movl %0, %%eax \n\t"
1470 "movq (%1, %%eax,2), %%mm0 \n\t"
1471 "movq 8(%1, %%eax,2), %%mm1 \n\t"
1472 "pand %%mm2, %%mm0 \n\t"
1473 "pand %%mm2, %%mm1 \n\t"
1474 "packuswb %%mm1, %%mm0 \n\t"
1475 "movq %%mm0, (%2, %%eax) \n\t"
1476 "addl $8, %%eax \n\t"
1478 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1483 for(i=0; i<width; i++)
// Deinterleave and average the chroma from two YUY2 lines into dstU/dstV
// (width chroma samples). psrlw $8 takes the odd (chroma) bytes; after the
// pack, a second psrlw/pand split V (high) from U (low). The instructions
// that combine mm2/mm3 (the src2 line) with mm0/mm1 fall in dropped lines —
// presumably PAVGB averaging, matching the (a+b)>>1 of the C fallback below.
// Same -width negative-index loop trick as yuy2ToY.
1488 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1490 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1492 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1493 "movl %0, %%eax \n\t"
1495 "movq (%1, %%eax,4), %%mm0 \n\t"
1496 "movq 8(%1, %%eax,4), %%mm1 \n\t"
1497 "movq (%2, %%eax,4), %%mm2 \n\t"
1498 "movq 8(%2, %%eax,4), %%mm3 \n\t"
1501 "psrlw $8, %%mm0 \n\t"
1502 "psrlw $8, %%mm1 \n\t"
1503 "packuswb %%mm1, %%mm0 \n\t"
1504 "movq %%mm0, %%mm1 \n\t"
1505 "psrlw $8, %%mm0 \n\t"
1506 "pand %%mm4, %%mm1 \n\t"
1507 "packuswb %%mm0, %%mm0 \n\t"
1508 "packuswb %%mm1, %%mm1 \n\t"
1509 "movd %%mm0, (%4, %%eax) \n\t"
1510 "movd %%mm1, (%3, %%eax) \n\t"
1511 "addl $4, %%eax \n\t"
1513 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
// scalar fallback: average the chroma of the two lines explicitly
1518 for(i=0; i<width; i++)
1520 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1521 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1526 //this is almost identical to the previous function, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
// Extract luma from a UYVY (U Y V Y) line: Y sits in the high byte of each
// 16-bit word, hence psrlw $8 here instead of the bm01010101 mask used for
// YUY2. Same 8-pixels-per-iteration, negative-index loop as yuy2ToY; scalar
// fallback body is in dropped lines (presumably dst[i] = src[2*i+1]).
1531 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
1531 "movl %0, %%eax \n\t"
1533 "movq (%1, %%eax,2), %%mm0 \n\t"
1534 "movq 8(%1, %%eax,2), %%mm1 \n\t"
1535 "psrlw $8, %%mm0 \n\t"
1536 "psrlw $8, %%mm1 \n\t"
1537 "packuswb %%mm1, %%mm0 \n\t"
1538 "movq %%mm0, (%2, %%eax) \n\t"
1539 "addl $8, %%eax \n\t"
1541 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1546 for(i=0; i<width; i++)
// Chroma from two UYVY lines into dstU/dstV. Mirror image of yuy2ToUV:
// pand bm01010101 keeps the even (chroma) bytes since U/V occupy the low byte
// of each word in UYVY. The src2-averaging instructions (consumers of
// mm2/mm3) fall in dropped lines — presumably PAVGB, matching the (a+b)>>1
// C fallback at the bottom.
1551 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1553 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1555 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1556 "movl %0, %%eax \n\t"
1558 "movq (%1, %%eax,4), %%mm0 \n\t"
1559 "movq 8(%1, %%eax,4), %%mm1 \n\t"
1560 "movq (%2, %%eax,4), %%mm2 \n\t"
1561 "movq 8(%2, %%eax,4), %%mm3 \n\t"
1564 "pand %%mm4, %%mm0 \n\t"
1565 "pand %%mm4, %%mm1 \n\t"
1566 "packuswb %%mm1, %%mm0 \n\t"
1567 "movq %%mm0, %%mm1 \n\t"
1568 "psrlw $8, %%mm0 \n\t"
1569 "pand %%mm4, %%mm1 \n\t"
1570 "packuswb %%mm0, %%mm0 \n\t"
1571 "packuswb %%mm1, %%mm1 \n\t"
1572 "movd %%mm0, (%4, %%eax) \n\t"
1573 "movd %%mm1, (%3, %%eax) \n\t"
1574 "addl $4, %%eax \n\t"
1576 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1581 for(i=0; i<width; i++)
1583 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1584 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
// BGR32 (B in the low byte of each dword) -> luma. C only: the
// HAVE_MMXFIXME guard marks an MMX version that was never written.
// (33<<(RGB2YUV_SHIFT-1)) == 16.5<<RGB2YUV_SHIFT, i.e. the +16 luma offset
// plus 0.5 for rounding, folded into a single constant.
1589 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1591 #ifdef HAVE_MMXFIXME
1594 for(i=0; i<width; i++)
1596 int b= ((uint32_t*)src)[i]&0xFF;
1597 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1598 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1600 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
// BGR32 -> subsampled chroma: each output sample is a 2x2 box average over
// two rows (src1/src2) and two columns (pixels 2*i and 2*i+1). SWAR trick:
// l sums the B and R bytes of all 4 pixels in parallel (mask 0xFF00FF leaves
// 10 bits of headroom per field), h sums the G bytes (0x00FF00). b = l&0x3FF
// is the 4x blue sum; the matching g/r extractions sit in dropped lines.
// >>(RGB2YUV_SHIFT+2) divides out the extra factor of 4.
1605 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1607 #ifdef HAVE_MMXFIXME
1610 for(i=0; i<width; i++)
1612 const int a= ((uint32_t*)src1)[2*i+0];
1613 const int e= ((uint32_t*)src1)[2*i+1];
1614 const int c= ((uint32_t*)src2)[2*i+0];
1615 const int d= ((uint32_t*)src2)[2*i+1];
1616 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1617 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1618 const int b= l&0x3FF;
1622 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1623 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
// 24-bit BGR -> luma, 8 pixels per MMX iteration. Each pixel's 4 loaded bytes
// are widened (punpcklbw with zeroed mm7) and dot-multiplied against the
// bgr2YCoeff weights (pmaddwd); the pairwise sums are then reduced with a
// pmaddwd against w1111 (all-ones horizontal add). FAST_BGR2YV12 skips the
// psrad $8 precision step. paddusb bgr2YOffset adds the luma offset at the
// end. %ebx = 3*%eax (leal (%eax,%eax,2)) is the byte offset for 3 bpp.
// A scalar fallback with the same weights follows.
1628 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1632 "movl %2, %%eax \n\t"
1633 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1634 "movq "MANGLE(w1111)", %%mm5 \n\t"
1635 "pxor %%mm7, %%mm7 \n\t"
1636 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1639 PREFETCH" 64(%0, %%ebx) \n\t"
// first 4 pixels -> mm0
1640 "movd (%0, %%ebx), %%mm0 \n\t"
1641 "movd 3(%0, %%ebx), %%mm1 \n\t"
1642 "punpcklbw %%mm7, %%mm0 \n\t"
1643 "punpcklbw %%mm7, %%mm1 \n\t"
1644 "movd 6(%0, %%ebx), %%mm2 \n\t"
1645 "movd 9(%0, %%ebx), %%mm3 \n\t"
1646 "punpcklbw %%mm7, %%mm2 \n\t"
1647 "punpcklbw %%mm7, %%mm3 \n\t"
1648 "pmaddwd %%mm6, %%mm0 \n\t"
1649 "pmaddwd %%mm6, %%mm1 \n\t"
1650 "pmaddwd %%mm6, %%mm2 \n\t"
1651 "pmaddwd %%mm6, %%mm3 \n\t"
1652 #ifndef FAST_BGR2YV12
1653 "psrad $8, %%mm0 \n\t"
1654 "psrad $8, %%mm1 \n\t"
1655 "psrad $8, %%mm2 \n\t"
1656 "psrad $8, %%mm3 \n\t"
1658 "packssdw %%mm1, %%mm0 \n\t"
1659 "packssdw %%mm3, %%mm2 \n\t"
1660 "pmaddwd %%mm5, %%mm0 \n\t"
1661 "pmaddwd %%mm5, %%mm2 \n\t"
1662 "packssdw %%mm2, %%mm0 \n\t"
1663 "psraw $7, %%mm0 \n\t"
// second 4 pixels -> mm4
1665 "movd 12(%0, %%ebx), %%mm4 \n\t"
1666 "movd 15(%0, %%ebx), %%mm1 \n\t"
1667 "punpcklbw %%mm7, %%mm4 \n\t"
1668 "punpcklbw %%mm7, %%mm1 \n\t"
1669 "movd 18(%0, %%ebx), %%mm2 \n\t"
1670 "movd 21(%0, %%ebx), %%mm3 \n\t"
1671 "punpcklbw %%mm7, %%mm2 \n\t"
1672 "punpcklbw %%mm7, %%mm3 \n\t"
1673 "pmaddwd %%mm6, %%mm4 \n\t"
1674 "pmaddwd %%mm6, %%mm1 \n\t"
1675 "pmaddwd %%mm6, %%mm2 \n\t"
1676 "pmaddwd %%mm6, %%mm3 \n\t"
1677 #ifndef FAST_BGR2YV12
1678 "psrad $8, %%mm4 \n\t"
1679 "psrad $8, %%mm1 \n\t"
1680 "psrad $8, %%mm2 \n\t"
1681 "psrad $8, %%mm3 \n\t"
1683 "packssdw %%mm1, %%mm4 \n\t"
1684 "packssdw %%mm3, %%mm2 \n\t"
1685 "pmaddwd %%mm5, %%mm4 \n\t"
1686 "pmaddwd %%mm5, %%mm2 \n\t"
1687 "addl $24, %%ebx \n\t"
1688 "packssdw %%mm2, %%mm4 \n\t"
1689 "psraw $7, %%mm4 \n\t"
1691 "packuswb %%mm4, %%mm0 \n\t"
1692 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1694 "movq %%mm0, (%1, %%eax) \n\t"
1695 "addl $8, %%eax \n\t"
1697 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
// scalar fallback (r/g/b loads are in dropped lines); same rounding constant
// as bgr32ToY: (33<<(RGB2YUV_SHIFT-1)) == +16 offset plus 0.5 rounding
1702 for(i=0; i<width; i++)
1708 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
// 24-bit BGR -> subsampled chroma (U into %2, V into %3), 4 output samples
// per MMX iteration. Each chroma sample averages a 2x2 pixel block: pairs of
// adjacent pixels from both rows are summed (paddw) and divided by 4
// (psrlw $2), then dot-multiplied against bgr2UCoeff (kept in mm6) and
// bgr2VCoeff (reloaded per group). Results are packed as V1V0U1U0 / V3V2U3U2,
// recombined with punpckl/hdq, offset by bgr2UVOffset, and the low/high
// dwords stored to dstU and dstV. The MMX2/3DNow variant loads with
// movq+psrlq; plain MMX uses movd pairs. FAST_BGR2YV12 again skips psrad $8.
// NOTE(review): lossy extract — asm()/#else/#endif wrappers are in dropped
// lines; code below is kept verbatim.
1713 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1717 "movl %4, %%eax \n\t"
1718 "movq "MANGLE(w1111)", %%mm5 \n\t"
1719 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1720 "pxor %%mm7, %%mm7 \n\t"
// %ebx = 6*%eax: byte offset for two 3-byte pixels per chroma sample
1721 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1722 "addl %%ebx, %%ebx \n\t"
1725 PREFETCH" 64(%0, %%ebx) \n\t"
1726 PREFETCH" 64(%1, %%ebx) \n\t"
1727 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1728 "movq (%0, %%ebx), %%mm0 \n\t"
1729 "movq (%1, %%ebx), %%mm1 \n\t"
1730 "movq 6(%0, %%ebx), %%mm2 \n\t"
1731 "movq 6(%1, %%ebx), %%mm3 \n\t"
1734 "movq %%mm0, %%mm1 \n\t"
1735 "movq %%mm2, %%mm3 \n\t"
1736 "psrlq $24, %%mm0 \n\t"
1737 "psrlq $24, %%mm2 \n\t"
1740 "punpcklbw %%mm7, %%mm0 \n\t"
1741 "punpcklbw %%mm7, %%mm2 \n\t"
// plain-MMX load path: movd per pixel
1743 "movd (%0, %%ebx), %%mm0 \n\t"
1744 "movd (%1, %%ebx), %%mm1 \n\t"
1745 "movd 3(%0, %%ebx), %%mm2 \n\t"
1746 "movd 3(%1, %%ebx), %%mm3 \n\t"
1747 "punpcklbw %%mm7, %%mm0 \n\t"
1748 "punpcklbw %%mm7, %%mm1 \n\t"
1749 "punpcklbw %%mm7, %%mm2 \n\t"
1750 "punpcklbw %%mm7, %%mm3 \n\t"
1751 "paddw %%mm1, %%mm0 \n\t"
1752 "paddw %%mm3, %%mm2 \n\t"
1753 "paddw %%mm2, %%mm0 \n\t"
1754 "movd 6(%0, %%ebx), %%mm4 \n\t"
1755 "movd 6(%1, %%ebx), %%mm1 \n\t"
1756 "movd 9(%0, %%ebx), %%mm2 \n\t"
1757 "movd 9(%1, %%ebx), %%mm3 \n\t"
1758 "punpcklbw %%mm7, %%mm4 \n\t"
1759 "punpcklbw %%mm7, %%mm1 \n\t"
1760 "punpcklbw %%mm7, %%mm2 \n\t"
1761 "punpcklbw %%mm7, %%mm3 \n\t"
1762 "paddw %%mm1, %%mm4 \n\t"
1763 "paddw %%mm3, %%mm2 \n\t"
1764 "paddw %%mm4, %%mm2 \n\t"
// divide the 2x2 sums by 4
1765 "psrlw $2, %%mm0 \n\t"
1766 "psrlw $2, %%mm2 \n\t"
1768 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1769 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
// U via mm6 (bgr2UCoeff), V via mm1/mm3 (bgr2VCoeff)
1771 "pmaddwd %%mm0, %%mm1 \n\t"
1772 "pmaddwd %%mm2, %%mm3 \n\t"
1773 "pmaddwd %%mm6, %%mm0 \n\t"
1774 "pmaddwd %%mm6, %%mm2 \n\t"
1775 #ifndef FAST_BGR2YV12
1776 "psrad $8, %%mm0 \n\t"
1777 "psrad $8, %%mm1 \n\t"
1778 "psrad $8, %%mm2 \n\t"
1779 "psrad $8, %%mm3 \n\t"
1781 "packssdw %%mm2, %%mm0 \n\t"
1782 "packssdw %%mm3, %%mm1 \n\t"
1783 "pmaddwd %%mm5, %%mm0 \n\t"
1784 "pmaddwd %%mm5, %%mm1 \n\t"
1785 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1786 "psraw $7, %%mm0 \n\t"
// second pair of chroma samples (source bytes 12..23)
1788 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1789 "movq 12(%0, %%ebx), %%mm4 \n\t"
1790 "movq 12(%1, %%ebx), %%mm1 \n\t"
1791 "movq 18(%0, %%ebx), %%mm2 \n\t"
1792 "movq 18(%1, %%ebx), %%mm3 \n\t"
1795 "movq %%mm4, %%mm1 \n\t"
1796 "movq %%mm2, %%mm3 \n\t"
1797 "psrlq $24, %%mm4 \n\t"
1798 "psrlq $24, %%mm2 \n\t"
1801 "punpcklbw %%mm7, %%mm4 \n\t"
1802 "punpcklbw %%mm7, %%mm2 \n\t"
1804 "movd 12(%0, %%ebx), %%mm4 \n\t"
1805 "movd 12(%1, %%ebx), %%mm1 \n\t"
1806 "movd 15(%0, %%ebx), %%mm2 \n\t"
1807 "movd 15(%1, %%ebx), %%mm3 \n\t"
1808 "punpcklbw %%mm7, %%mm4 \n\t"
1809 "punpcklbw %%mm7, %%mm1 \n\t"
1810 "punpcklbw %%mm7, %%mm2 \n\t"
1811 "punpcklbw %%mm7, %%mm3 \n\t"
1812 "paddw %%mm1, %%mm4 \n\t"
1813 "paddw %%mm3, %%mm2 \n\t"
1814 "paddw %%mm2, %%mm4 \n\t"
1815 "movd 18(%0, %%ebx), %%mm5 \n\t"
1816 "movd 18(%1, %%ebx), %%mm1 \n\t"
1817 "movd 21(%0, %%ebx), %%mm2 \n\t"
1818 "movd 21(%1, %%ebx), %%mm3 \n\t"
1819 "punpcklbw %%mm7, %%mm5 \n\t"
1820 "punpcklbw %%mm7, %%mm1 \n\t"
1821 "punpcklbw %%mm7, %%mm2 \n\t"
1822 "punpcklbw %%mm7, %%mm3 \n\t"
1823 "paddw %%mm1, %%mm5 \n\t"
1824 "paddw %%mm3, %%mm2 \n\t"
1825 "paddw %%mm5, %%mm2 \n\t"
// mm5 was clobbered above as a scratch register; reload the w1111 constant
1826 "movq "MANGLE(w1111)", %%mm5 \n\t"
1827 "psrlw $2, %%mm4 \n\t"
1828 "psrlw $2, %%mm2 \n\t"
1830 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1831 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1833 "pmaddwd %%mm4, %%mm1 \n\t"
1834 "pmaddwd %%mm2, %%mm3 \n\t"
1835 "pmaddwd %%mm6, %%mm4 \n\t"
1836 "pmaddwd %%mm6, %%mm2 \n\t"
1837 #ifndef FAST_BGR2YV12
1838 "psrad $8, %%mm4 \n\t"
1839 "psrad $8, %%mm1 \n\t"
1840 "psrad $8, %%mm2 \n\t"
1841 "psrad $8, %%mm3 \n\t"
1843 "packssdw %%mm2, %%mm4 \n\t"
1844 "packssdw %%mm3, %%mm1 \n\t"
1845 "pmaddwd %%mm5, %%mm4 \n\t"
1846 "pmaddwd %%mm5, %%mm1 \n\t"
1847 "addl $24, %%ebx \n\t"
1848 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1849 "psraw $7, %%mm4 \n\t"
// merge U0..U3 (low dwords) and V0..V3 (high dwords), bias, and store
1851 "movq %%mm0, %%mm1 \n\t"
1852 "punpckldq %%mm4, %%mm0 \n\t"
1853 "punpckhdq %%mm4, %%mm1 \n\t"
1854 "packsswb %%mm1, %%mm0 \n\t"
1855 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1857 "movd %%mm0, (%2, %%eax) \n\t"
1858 "punpckhdq %%mm0, %%mm0 \n\t"
1859 "movd %%mm0, (%3, %%eax) \n\t"
1860 "addl $4, %%eax \n\t"
1862 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
// scalar fallback: explicit 2x2 sums (divided by 4 via the +2 in the shift)
1867 for(i=0; i<width; i++)
1869 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1870 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1871 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1873 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1874 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
// Packed 5:6:5 16-bit pixel -> luma. R occupies bits 11-15; the b (bits 0-4)
// and g (bits 5-10) extractions are among the dropped lines. The 2* weights
// on R and B compensate for their 5-bit range versus G's 6 bits, and the
// reduced shift (RGB2YUV_SHIFT-2) rescales the narrower channels —
// TODO(review): confirm the exact scaling against the table setup.
1879 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1882 for(i=0; i<width; i++)
1884 int d= ((uint16_t*)src)[i];
1887 int r= (d>>11)&0x1F;
1889 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
// 5:6:5 -> subsampled chroma, averaging two horizontal pixels from each of
// two rows. SWAR trick on pairs of packed pixels: dl sums the B and R fields
// of both pixels in each dword (mask 0x07E0F81F keeps disjoint fields so the
// per-field sums can't carry into each other), dh does the same for G after
// a >>5 alignment; dh2 recombines the halves. The masks below use 0x7F
// because each field now holds a sum of multiple 5/6-bit values — presumably
// a 2-pixel sum; the d/g/b extraction lines are dropped, so verify against
// the full source. The +2-2 in the shift cancels: sum-of-2 vs the /4 of the
// 2x2 average used elsewhere.
1893 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1896 for(i=0; i<width; i++)
1898 int d0= ((uint32_t*)src1)[i];
1899 int d1= ((uint32_t*)src2)[i];
1901 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1902 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1904 int dh2= (dh>>11) + (dh<<21);
1908 int r= (d>>11)&0x7F;
1910 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1911 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
// Packed 5:5:5 15-bit pixel -> luma. R occupies bits 10-14; b/g extractions
// are in dropped lines. All three channels are 5-bit here, so no 2* weights
// are needed (contrast bgr16ToY) and the shift is RGB2YUV_SHIFT-3.
1915 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1918 for(i=0; i<width; i++)
1920 int d= ((uint16_t*)src)[i];
1923 int r= (d>>10)&0x1F;
1925 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
// 5:5:5 -> subsampled chroma. Same SWAR pairwise-sum scheme as bgr16ToUV,
// with 15-bit field masks (0x03E07C1F / 0x03E0F81F) and a >>10 R position.
// The d/g/b extraction lines are dropped — presumably mirror the 565 case
// with 5-bit green; verify against the full source.
1929 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1932 for(i=0; i<width; i++)
1934 int d0= ((uint32_t*)src1)[i];
1935 int d1= ((uint32_t*)src2)[i];
1937 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1938 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1940 int dh2= (dh>>11) + (dh<<21);
1944 int r= (d>>10)&0x7F;
1946 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1947 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
// RGB32 (R in the low byte of each dword) -> luma. Identical formula to
// bgr32ToY with the r/b byte positions swapped; same folded rounding/offset
// constant (33<<(RGB2YUV_SHIFT-1)) == 16.5<<RGB2YUV_SHIFT.
1952 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1955 for(i=0; i<width; i++)
1957 int r= ((uint32_t*)src)[i]&0xFF;
1958 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1959 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1961 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
// RGB32 -> subsampled chroma: 2x2 box average, same SWAR parallel-sum trick
// as bgr32ToUV (l sums the two channels at byte offsets 0 and 2, h sums the
// middle channel). Here r = l&0x3FF since R sits in the low byte; the g/b
// extractions are among the dropped lines.
1965 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1968 for(i=0; i<width; i++)
1970 const int a= ((uint32_t*)src1)[2*i+0];
1971 const int e= ((uint32_t*)src1)[2*i+1];
1972 const int c= ((uint32_t*)src2)[2*i+0];
1973 const int d= ((uint32_t*)src2)[2*i+1];
1974 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1975 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1976 const int r= l&0x3FF;
1980 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1981 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
// 24-bit RGB -> luma, C only (no MMX variant for the RGB byte order). The
// r/g/b loads are among the dropped lines — presumably src[3*i+0..2]; the
// weighted sum matches bgr24ToY, with the usual folded +16/rounding constant.
1985 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
1988 for(i=0; i<width; i++)
1994 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
// 24-bit RGB -> subsampled chroma, C only: explicit 2x2 sums over two rows
// and two columns (byte order r,g,b — mirror of bgr24ToUV's fallback); the
// extra /4 is folded into the +2 of the shift.
1998 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2001 for(i=0; i<width; i++)
2003 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2004 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2005 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2007 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2008 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2013 // Bilinear / Bicubic scaling
// Generic horizontal FIR scaler: for each output pixel i,
//   dst[i] = clip( sum_j src[filterPos[i]+j] * filter[i*filterSize+j] >> 7,
//                  0, (1<<15)-1 )
// (the exact shift path in the MMX versions goes through psrad $8 / pmaddwd
// w02 / psraw; the C fallback at the bottom shows the >>7 + clamp contract).
// Three MMX specializations — filterSize==4, ==8, and a generic inner loop —
// each computing two output pixels per iteration, plus an AltiVec hook and
// the scalar fallback. counter=-2*dstW with filterPos-=counter/2 is the
// usual negative-offset loop trick (loop ends when the counter reaches 0).
2014 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2015 int16_t *filter, int16_t *filterPos, int filterSize)
2018 assert(filterSize % 4 == 0 && filterSize>0);
2019 if(filterSize==4) // allways true for upscaling, sometimes for down too
2021 int counter= -2*dstW;
2023 filterPos-= counter/2;
2026 "pxor %%mm7, %%mm7 \n\t"
2027 "movq "MANGLE(w02)", %%mm6 \n\t"
// %ebp is commandeered as the loop counter: the asm needs 7 registers
2028 "pushl %%ebp \n\t" // we use 7 regs here ...
2029 "movl %%eax, %%ebp \n\t"
// %eax/%ebx hold filterPos for two adjacent output pixels
2032 "movzwl (%2, %%ebp), %%eax \n\t"
2033 "movzwl 2(%2, %%ebp), %%ebx \n\t"
2034 "movq (%1, %%ebp, 4), %%mm1 \n\t"
2035 "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
2036 "movd (%3, %%eax), %%mm0 \n\t"
2037 "movd (%3, %%ebx), %%mm2 \n\t"
2038 "punpcklbw %%mm7, %%mm0 \n\t"
2039 "punpcklbw %%mm7, %%mm2 \n\t"
2040 "pmaddwd %%mm1, %%mm0 \n\t"
2041 "pmaddwd %%mm2, %%mm3 \n\t"
2042 "psrad $8, %%mm0 \n\t"
2043 "psrad $8, %%mm3 \n\t"
2044 "packssdw %%mm3, %%mm0 \n\t"
2045 "pmaddwd %%mm6, %%mm0 \n\t"
2046 "packssdw %%mm0, %%mm0 \n\t"
2047 "movd %%mm0, (%4, %%ebp) \n\t"
2048 "addl $4, %%ebp \n\t"
2053 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// filterSize==8: same scheme, two pmaddwd accumulations per output pixel
2057 else if(filterSize==8)
2059 int counter= -2*dstW;
2061 filterPos-= counter/2;
2064 "pxor %%mm7, %%mm7 \n\t"
2065 "movq "MANGLE(w02)", %%mm6 \n\t"
2066 "pushl %%ebp \n\t" // we use 7 regs here ...
2067 "movl %%eax, %%ebp \n\t"
2070 "movzwl (%2, %%ebp), %%eax \n\t"
2071 "movzwl 2(%2, %%ebp), %%ebx \n\t"
2072 "movq (%1, %%ebp, 8), %%mm1 \n\t"
2073 "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
2074 "movd (%3, %%eax), %%mm0 \n\t"
2075 "movd (%3, %%ebx), %%mm2 \n\t"
2076 "punpcklbw %%mm7, %%mm0 \n\t"
2077 "punpcklbw %%mm7, %%mm2 \n\t"
2078 "pmaddwd %%mm1, %%mm0 \n\t"
2079 "pmaddwd %%mm2, %%mm3 \n\t"
2081 "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
2082 "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
2083 "movd 4(%3, %%eax), %%mm4 \n\t"
2084 "movd 4(%3, %%ebx), %%mm2 \n\t"
2085 "punpcklbw %%mm7, %%mm4 \n\t"
2086 "punpcklbw %%mm7, %%mm2 \n\t"
2087 "pmaddwd %%mm1, %%mm4 \n\t"
2088 "pmaddwd %%mm2, %%mm5 \n\t"
2089 "paddd %%mm4, %%mm0 \n\t"
2090 "paddd %%mm5, %%mm3 \n\t"
2092 "psrad $8, %%mm0 \n\t"
2093 "psrad $8, %%mm3 \n\t"
2094 "packssdw %%mm3, %%mm0 \n\t"
2095 "pmaddwd %%mm6, %%mm0 \n\t"
2096 "packssdw %%mm0, %%mm0 \n\t"
2097 "movd %%mm0, (%4, %%ebp) \n\t"
2098 "addl $4, %%ebp \n\t"
2103 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// arbitrary filterSize: inner loop walks the filter 4 taps at a time,
// accumulating into mm4/mm5 for the two output pixels
2109 int counter= -2*dstW;
2110 // filter-= counter*filterSize/2;
2111 filterPos-= counter/2;
2114 "pxor %%mm7, %%mm7 \n\t"
2115 "movq "MANGLE(w02)", %%mm6 \n\t"
2118 "movl %2, %%ecx \n\t"
2119 "movzwl (%%ecx, %0), %%eax \n\t"
2120 "movzwl 2(%%ecx, %0), %%ebx \n\t"
2121 "movl %5, %%ecx \n\t"
2122 "pxor %%mm4, %%mm4 \n\t"
2123 "pxor %%mm5, %%mm5 \n\t"
2125 "movq (%1), %%mm1 \n\t"
2126 "movq (%1, %6), %%mm3 \n\t"
2127 "movd (%%ecx, %%eax), %%mm0 \n\t"
2128 "movd (%%ecx, %%ebx), %%mm2 \n\t"
2129 "punpcklbw %%mm7, %%mm0 \n\t"
2130 "punpcklbw %%mm7, %%mm2 \n\t"
2131 "pmaddwd %%mm1, %%mm0 \n\t"
2132 "pmaddwd %%mm2, %%mm3 \n\t"
2133 "paddd %%mm3, %%mm5 \n\t"
2134 "paddd %%mm0, %%mm4 \n\t"
2136 "addl $4, %%ecx \n\t"
2137 "cmpl %4, %%ecx \n\t"
2140 "psrad $8, %%mm4 \n\t"
2141 "psrad $8, %%mm5 \n\t"
2142 "packssdw %%mm5, %%mm4 \n\t"
2143 "pmaddwd %%mm6, %%mm4 \n\t"
2144 "packssdw %%mm4, %%mm4 \n\t"
2145 "movl %3, %%eax \n\t"
2146 "movd %%mm4, (%%eax, %0) \n\t"
2150 : "+r" (counter), "+r" (filter)
2151 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2152 "m" (src), "r" (filterSize*2)
2153 : "%ebx", "%eax", "%ecx"
// AltiVec build delegates to the template in swscale_altivec_template.c
2158 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
// scalar fallback: plain multiply-accumulate with the documented >>7 + clamp
2161 for(i=0; i<dstW; i++)
2164 int srcPos= filterPos[i];
2166 // printf("filterPos: %d\n", filterPos[i]);
2167 for(j=0; j<filterSize; j++)
2169 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2170 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2172 // filter += hFilterSize;
2173 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2179 // *** horizontal scale Y line to temp buffer
// Horizontally scale one luma line (srcW -> dstWidth) into the 16-bit
// intermediate buffer dst (values scaled by 128, i.e. 7 fractional bits —
// see the <<7 in the C fallback). Non-planar sources are first converted to
// a plain luma line in formatConvBuffer via the *ToY helpers above. Path
// selection: accurate FIR (hScale) unless SWS_FAST_BILINEAR && canMMX2BeUsed;
// then either the pre-generated MMX2 scaler blob (funnyYCode + mmx2Filter /
// mmx2FilterPos) or a hand-written x86 bilinear DDA loop; plain C last.
2180 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2181 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2182 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2183 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2184 int32_t *mmx2FilterPos)
// convert packed/RGB inputs to a bare luma line first
2186 if(srcFormat==IMGFMT_YUY2)
2188 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2189 src= formatConvBuffer;
2191 else if(srcFormat==IMGFMT_UYVY)
2193 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2194 src= formatConvBuffer;
2196 else if(srcFormat==IMGFMT_BGR32)
2198 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2199 src= formatConvBuffer;
2201 else if(srcFormat==IMGFMT_BGR24)
2203 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2204 src= formatConvBuffer;
2206 else if(srcFormat==IMGFMT_BGR16)
2208 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2209 src= formatConvBuffer;
2211 else if(srcFormat==IMGFMT_BGR15)
2213 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2214 src= formatConvBuffer;
2216 else if(srcFormat==IMGFMT_RGB32)
2218 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2219 src= formatConvBuffer;
2221 else if(srcFormat==IMGFMT_RGB24)
2223 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2224 src= formatConvBuffer;
2228 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2229 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2231 if(!(flags&SWS_FAST_BILINEAR))
2234 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2236 else // Fast Bilinear upscale / crap downscale
// MMX2 path: jump into the run-time generated scaler code (funnyYCode);
// mmx2FilterPos supplies per-chunk source advances
2244 "pxor %%mm7, %%mm7 \n\t"
2245 "movl %0, %%ecx \n\t"
2246 "movl %1, %%edi \n\t"
2247 "movl %2, %%edx \n\t"
2248 "movl %3, %%ebx \n\t"
2249 "xorl %%eax, %%eax \n\t" // i
2250 PREFETCH" (%%ecx) \n\t"
2251 PREFETCH" 32(%%ecx) \n\t"
2252 PREFETCH" 64(%%ecx) \n\t"
2254 #define FUNNY_Y_CODE \
2255 "movl (%%ebx), %%esi \n\t"\
2257 "addl (%%ebx, %%eax), %%ecx \n\t"\
2258 "addl %%eax, %%edi \n\t"\
2259 "xorl %%eax, %%eax \n\t"\
2270 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2272 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
// pad output pixels whose source index would pass srcW-1 with the last
// source pixel; *128 matches the 7-fractional-bit output scale
2274 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2279 //NO MMX just normal asm ...
// 16.16 fixed-point DDA, unrolled x2: %cx accumulates the fractional step
// (xInc&0xFFFF) and the carry bumps the integer position in %ebx
2281 "xorl %%eax, %%eax \n\t" // i
2282 "xorl %%ebx, %%ebx \n\t" // xx
2283 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2286 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2287 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2288 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2289 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2290 "shll $16, %%edi \n\t"
2291 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2292 "movl %1, %%edi \n\t"
2293 "shrl $9, %%esi \n\t"
2294 "movw %%si, (%%edi, %%eax, 2) \n\t"
2295 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2296 "adcl %3, %%ebx \n\t" //xx+= xInc>>16 + carry
2298 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2299 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2300 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2301 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2302 "shll $16, %%edi \n\t"
2303 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2304 "movl %1, %%edi \n\t"
2305 "shrl $9, %%esi \n\t"
2306 "movw %%si, 2(%%edi, %%eax, 2) \n\t"
2307 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2308 "adcl %3, %%ebx \n\t" //xx+= xInc>>16 + carry
2311 "addl $2, %%eax \n\t"
2312 "cmpl %2, %%eax \n\t"
2316 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2317 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2320 } //if MMX2 can't be used
// portable C bilinear: 16.16 position accumulator, 7-bit blend factor
// ((xpos&0xFFFF)>>9), result in the same <<7 fixed-point scale
2324 unsigned int xpos=0;
2325 for(i=0;i<dstWidth;i++)
2327 register unsigned int xx=xpos>>16;
2328 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2329 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
/*
 * RENAME(hcscale): horizontally scale one pair of chroma lines.
 *
 * Scales the two chroma inputs src1/src2 (srcW bytes each) to dstWidth
 * 16-bit samples each.  Output layout: first plane at dst[0..dstWidth-1],
 * second plane at dst[2048..2048+dstWidth-1] (2048 int16 elements = 4096
 * bytes is the fixed per-plane stride of the intermediate buffer).
 * xInc is the 16.16 fixed-point horizontal step per output sample.
 *
 * Packed-YUV / RGB sources are first converted to planar chroma inside
 * formatConvBuffer (second plane at byte offset 2048) and scaled from there.
 *
 * Three scaling paths are visible, selected by flags / canMMX2BeUsed:
 *   1. filtered hScale()                      (not SWS_FAST_BILINEAR, or no MMX2)
 *   2. MMX2 runtime-generated "funny" code + x86 asm bilinear loop
 *   3. plain C fixed-point bilinear fallback
 *
 * NOTE(review): this extract is missing interleaving lines (braces and
 * #if/#else/#endif conditionals); the comments below describe only the
 * code that is visible here.
 */
2336 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2337 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2338 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2339 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2340 int32_t *mmx2FilterPos)
/* ---- input format conversion: produce planar U/V in formatConvBuffer ---- */
2342 if(srcFormat==IMGFMT_YUY2)
2344 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2345 src1= formatConvBuffer;
2346 src2= formatConvBuffer+2048;
2348 else if(srcFormat==IMGFMT_UYVY)
2350 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2351 src1= formatConvBuffer;
2352 src2= formatConvBuffer+2048;
2354 else if(srcFormat==IMGFMT_BGR32)
2356 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2357 src1= formatConvBuffer;
2358 src2= formatConvBuffer+2048;
2360 else if(srcFormat==IMGFMT_BGR24)
2362 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2363 src1= formatConvBuffer;
2364 src2= formatConvBuffer+2048;
2366 else if(srcFormat==IMGFMT_BGR16)
2368 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2369 src1= formatConvBuffer;
2370 src2= formatConvBuffer+2048;
2372 else if(srcFormat==IMGFMT_BGR15)
2374 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2375 src1= formatConvBuffer;
2376 src2= formatConvBuffer+2048;
2378 else if(srcFormat==IMGFMT_RGB32)
2380 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2381 src1= formatConvBuffer;
2382 src2= formatConvBuffer+2048;
2384 else if(srcFormat==IMGFMT_RGB24)
2386 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2387 src1= formatConvBuffer;
2388 src2= formatConvBuffer+2048;
/* gray formats carry no chroma to scale; the handling body is not visible in this extract */
2390 else if(isGray(srcFormat))
2396 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
2397 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2399 if(!(flags&SWS_FAST_BILINEAR))
/* full filtered scale: first chroma plane, then the second at the +2048 plane offset */
2402 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2403 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2405 else // Fast Bilinear upscale / crap downscale
/* MMX2 path: load operand pointers into registers and prefetch the source,
   then execute the runtime-generated ("funny") scaler code — presumably via
   funnyUVCode, which is passed as an asm operand below; TODO confirm, the
   invocation lines are elided here */
2413 "pxor %%mm7, %%mm7 \n\t"
2414 "movl %0, %%ecx \n\t"
2415 "movl %1, %%edi \n\t"
2416 "movl %2, %%edx \n\t"
2417 "movl %3, %%ebx \n\t"
2418 "xorl %%eax, %%eax \n\t" // i
2419 PREFETCH" (%%ecx) \n\t"
2420 PREFETCH" 32(%%ecx) \n\t"
2421 PREFETCH" 64(%%ecx) \n\t"
/* per-chunk driver for the generated code (macro continuation partly elided) */
2423 #define FUNNY_UV_CODE \
2424 "movl (%%ebx), %%esi \n\t"\
2426 "addl (%%ebx, %%eax), %%ecx \n\t"\
2427 "addl %%eax, %%edi \n\t"\
2428 "xorl %%eax, %%eax \n\t"\
/* second pass: same scaler over the second chroma plane — source from %5,
   destination advanced by 4096 bytes (= 2048 int16 samples) */
2434 "xorl %%eax, %%eax \n\t" // i
2435 "movl %5, %%ecx \n\t" // src
2436 "movl %1, %%edi \n\t" // buf1
2437 "addl $4096, %%edi \n\t"
2438 PREFETCH" (%%ecx) \n\t"
2439 PREFETCH" 32(%%ecx) \n\t"
2440 PREFETCH" 64(%%ecx) \n\t"
2447 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2448 "m" (funnyUVCode), "m" (src2)
2449 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
/* the fast path reads src[xx+1] and can overrun near the right edge: rewrite
   the affected tail pixels with the last source pixel scaled by 128, matching
   the <<7 (7-bit fractional) output scale of the other paths */
2451 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2453 // printf("%d %d %d\n", dstWidth, i, srcW);
2454 dst[i] = src1[srcW-1]*128;
2455 dst[i+2048] = src2[srcW-1]*128;
/* non-MMX2 x86 asm path: bilinear interpolation in 16.16 fixed point.
   %%ebx holds the integer source position xx, %%cx the fractional blend
   factor (2*xalpha); per the trailing comments the position advances by a
   16-bit add whose carry is propagated into %%ebx with adc */
2462 "xorl %%eax, %%eax \n\t" // i
2463 "xorl %%ebx, %%ebx \n\t" // xx
2464 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2467 "movl %0, %%esi \n\t"
2468 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
2469 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
2470 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2471 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2472 "shll $16, %%edi \n\t"
2473 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2474 "movl %1, %%edi \n\t"
2475 "shrl $9, %%esi \n\t"
2476 "movw %%si, (%%edi, %%eax, 2) \n\t"
/* same interpolation for the second plane (%5 — presumably src2; its operand
   line is elided below), stored 4096 bytes into the destination buffer */
2478 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
2479 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
2480 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2481 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2482 "shll $16, %%edi \n\t"
2483 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2484 "movl %1, %%edi \n\t"
2485 "shrl $9, %%esi \n\t"
2486 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
2488 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2489 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2490 "addl $1, %%eax \n\t"
2491 "cmpl %2, %%eax \n\t"
2494 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
2496 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2499 } //if MMX2 can't be used
/* portable C fallback: straightforward 16.16 fixed-point bilinear; xalpha is
   the top 7 bits of the fractional position */
2503 unsigned int xpos=0;
2504 for(i=0;i<dstWidth;i++)
2506 register unsigned int xx=xpos>>16;
2507 register unsigned int xalpha=(xpos&0xFFFF)>>9;
/* two interpolation variants are visible (the selecting #if is elided):
   a (1-a)*s0 + a*s1 blend via xalpha^127, and the s0<<7 + delta*a form */
2508 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2509 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2511 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2512 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2520 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2521 int srcSliceH, uint8_t* dst[], int dstStride[]){
2523 /* load a few things into local vars to make the code more readable? and faster */
2524 const int srcW= c->srcW;
2525 const int dstW= c->dstW;
2526 const int dstH= c->dstH;
2527 const int chrDstW= c->chrDstW;
2528 const int chrSrcW= c->chrSrcW;
2529 const int lumXInc= c->lumXInc;
2530 const int chrXInc= c->chrXInc;
2531 const int dstFormat= c->dstFormat;
2532 const int srcFormat= c->srcFormat;
2533 const int flags= c->flags;
2534 const int canMMX2BeUsed= c->canMMX2BeUsed;
2535 int16_t *vLumFilterPos= c->vLumFilterPos;
2536 int16_t *vChrFilterPos= c->vChrFilterPos;
2537 int16_t *hLumFilterPos= c->hLumFilterPos;
2538 int16_t *hChrFilterPos= c->hChrFilterPos;
2539 int16_t *vLumFilter= c->vLumFilter;
2540 int16_t *vChrFilter= c->vChrFilter;
2541 int16_t *hLumFilter= c->hLumFilter;
2542 int16_t *hChrFilter= c->hChrFilter;
2543 int32_t *lumMmxFilter= c->lumMmxFilter;
2544 int32_t *chrMmxFilter= c->chrMmxFilter;
2545 const int vLumFilterSize= c->vLumFilterSize;
2546 const int vChrFilterSize= c->vChrFilterSize;
2547 const int hLumFilterSize= c->hLumFilterSize;
2548 const int hChrFilterSize= c->hChrFilterSize;
2549 int16_t **lumPixBuf= c->lumPixBuf;
2550 int16_t **chrPixBuf= c->chrPixBuf;
2551 const int vLumBufSize= c->vLumBufSize;
2552 const int vChrBufSize= c->vChrBufSize;
2553 uint8_t *funnyYCode= c->funnyYCode;
2554 uint8_t *funnyUVCode= c->funnyUVCode;
2555 uint8_t *formatConvBuffer= c->formatConvBuffer;
2556 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2557 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2560 /* vars which will change and which we need to store back in the context */
2562 int lumBufIndex= c->lumBufIndex;
2563 int chrBufIndex= c->chrBufIndex;
2564 int lastInLumBuf= c->lastInLumBuf;
2565 int lastInChrBuf= c->lastInChrBuf;
2567 if(isPacked(c->srcFormat)){
2573 srcStride[2]= srcStride[0];
2575 srcStride[1]<<= c->vChrDrop;
2576 srcStride[2]<<= c->vChrDrop;
2578 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2579 // (int)dst[0], (int)dst[1], (int)dst[2]);
2581 #if 0 //self test FIXME move to a vfilter or something
2583 static volatile int i=0;
2585 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2586 selfTest(src, srcStride, c->srcW, c->srcH);
2591 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2592 //dstStride[0],dstStride[1],dstStride[2]);
2594 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2596 static int firstTime=1; //FIXME move this into the context perhaps
2597 if(flags & SWS_PRINT_INFO && firstTime)
2599 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2600 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2605 /* Note the user might start scaling the picture in the middle so this will not get executed
2606 this is not really intended but works currently, so ppl might do it */
2617 for(;dstY < dstH; dstY++){
2618 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2619 const int chrDstY= dstY>>c->chrDstVSubSample;
2620 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2621 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2623 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2624 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2625 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2626 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2628 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2629 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2630 //handle holes (FAST_BILINEAR & weird filters)
2631 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2632 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2633 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2634 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2635 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2637 // Do we have enough lines in this slice to output the dstY line
2638 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2640 //Do horizontal scaling
2641 while(lastInLumBuf < lastLumSrcY)
2643 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2645 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2646 ASSERT(lumBufIndex < 2*vLumBufSize)
2647 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2648 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2649 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2650 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2651 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2652 funnyYCode, c->srcFormat, formatConvBuffer,
2653 c->lumMmx2Filter, c->lumMmx2FilterPos);
2656 while(lastInChrBuf < lastChrSrcY)
2658 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2659 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2661 ASSERT(chrBufIndex < 2*vChrBufSize)
2662 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2663 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2664 //FIXME replace parameters through context struct (some at least)
2666 if(!(isGray(srcFormat) || isGray(dstFormat)))
2667 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2668 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2669 funnyUVCode, c->srcFormat, formatConvBuffer,
2670 c->chrMmx2Filter, c->chrMmx2FilterPos);
2673 //wrap buf index around to stay inside the ring buffer
2674 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2675 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2677 else // not enough lines left in this slice -> load the rest in the buffer
2679 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2680 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2681 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2682 vChrBufSize, vLumBufSize);*/
2684 //Do horizontal scaling
2685 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2687 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2689 ASSERT(lumBufIndex < 2*vLumBufSize)
2690 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2691 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2692 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2693 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2694 funnyYCode, c->srcFormat, formatConvBuffer,
2695 c->lumMmx2Filter, c->lumMmx2FilterPos);
2698 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2700 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2701 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2703 ASSERT(chrBufIndex < 2*vChrBufSize)
2704 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2705 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2707 if(!(isGray(srcFormat) || isGray(dstFormat)))
2708 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2709 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2710 funnyUVCode, c->srcFormat, formatConvBuffer,
2711 c->chrMmx2Filter, c->chrMmx2FilterPos);
2714 //wrap buf index around to stay inside the ring buffer
2715 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2716 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2717 break; //we can't output a dstY line so let's try with the next slice
2721 b5Dither= dither8[dstY&1];
2722 g6Dither= dither4[dstY&1];
2723 g5Dither= dither8[dstY&1];
2724 r5Dither= dither8[(dstY+1)&1];
2728 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2729 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2732 for(i=0; i<vLumFilterSize; i++)
2734 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2735 lumMmxFilter[4*i+2]=
2736 lumMmxFilter[4*i+3]=
2737 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2739 for(i=0; i<vChrFilterSize; i++)
2741 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2742 chrMmxFilter[4*i+2]=
2743 chrMmxFilter[4*i+3]=
2744 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2747 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2749 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2750 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2751 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2753 int16_t *lumBuf = lumPixBuf[0];
2754 int16_t *chrBuf= chrPixBuf[0];
2755 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2760 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2761 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2762 dest, uDest, vDest, dstW, chrDstW);
2767 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2768 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2769 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2771 int chrAlpha= vChrFilter[2*dstY+1];
2772 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2773 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2775 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2777 int lumAlpha= vLumFilter[2*dstY+1];
2778 int chrAlpha= vChrFilter[2*dstY+1];
2779 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2780 dest, dstW, lumAlpha, chrAlpha, dstY);
2784 RENAME(yuv2packedX)(c,
2785 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2786 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2791 else // hmm looks like we can't use MMX here without overwriting this array's tail
2793 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2794 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2795 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2797 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2798 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2800 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2801 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2802 dest, uDest, vDest, dstW, chrDstW);
2806 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2807 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2809 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2810 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2817 __asm __volatile(SFENCE:::"memory");
2818 __asm __volatile(EMMS:::"memory");
2820 /* store changed local vars back in the context */
2822 c->lumBufIndex= lumBufIndex;
2823 c->chrBufIndex= chrBufIndex;
2824 c->lastInLumBuf= lastInLumBuf;
2825 c->lastInChrBuf= lastInChrBuf;
2827 return dstY - lastDstY;