2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #elif defined ( HAVE_MMX2 )
37 #define PREFETCH "prefetchnta"
38 #define PREFETCHW "prefetcht0"
40 #define PREFETCH "/nop"
41 #define PREFETCHW "/nop"
45 #define SFENCE "sfence"
51 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
52 #elif defined (HAVE_3DNOW)
53 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
57 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
59 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
/* YSCALEYUV2YV12X(x): MMX vertical-filter kernel for one YV12 plane.
 * Accumulates pmulhw(srcLine, filterCoeff) into mm3/mm4 over the filter
 * taps (edx counts up from -filterSize), shifts down by 3, packs to bytes
 * and stores 8 pixels with MOVNTQ.  NOTE(review): the loop labels/branches
 * are not visible in this listing -- confirm against the full file. */
62 #define YSCALEYUV2YV12X(x) \
63 "xorl %%eax, %%eax \n\t"\
64 "pxor %%mm3, %%mm3 \n\t"\
65 "pxor %%mm4, %%mm4 \n\t"\
66 "movl %0, %%edx \n\t"\
67 ".balign 16 \n\t" /* FIXME Unroll? */\
69 "movl (%1, %%edx, 4), %%esi \n\t"\
70 "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
71 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
72 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
73 "pmulhw %%mm0, %%mm2 \n\t"\
74 "pmulhw %%mm0, %%mm5 \n\t"\
75 "paddw %%mm2, %%mm3 \n\t"\
76 "paddw %%mm5, %%mm4 \n\t"\
77 "addl $1, %%edx \n\t"\
79 "psraw $3, %%mm3 \n\t"\
80 "psraw $3, %%mm4 \n\t"\
81 "packuswb %%mm4, %%mm3 \n\t"\
82 MOVNTQ(%%mm3, (%3, %%eax))\
83 "addl $8, %%eax \n\t"\
84 "cmpl %4, %%eax \n\t"\
85 "pxor %%mm3, %%mm3 \n\t"\
86 "pxor %%mm4, %%mm4 \n\t"\
87 "movl %0, %%edx \n\t"\
/* YSCALEYUV2YV121: 1-tap fast path -- shift the 16-bit intermediate down \
 * by 7 and pack two quadwords of samples to bytes per iteration. */\
90 #define YSCALEYUV2YV121 \
91 "movl %2, %%eax \n\t"\
92 ".balign 16 \n\t" /* FIXME Unroll? */\
94 "movq (%0, %%eax, 2), %%mm0 \n\t"\
95 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
96 "psraw $7, %%mm0 \n\t"\
97 "psraw $7, %%mm1 \n\t"\
98 "packuswb %%mm1, %%mm0 \n\t"\
99 MOVNTQ(%%mm0, (%1, %%eax))\
100 "addl $8, %%eax \n\t"\
104 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
105 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
106 "r" (dest), "m" (dstW),
107 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
108 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* YSCALEYUV2PACKEDX: vertical filtering for packed output.  First pass
 * accumulates chroma (U into mm3, V into mm4; the V plane sits at byte
 * offset 4096 from the U pointer), second pass accumulates luma (Y1 into
 * mm1, Y2 into mm7).  NOTE(review): loop labels/branches elided here. */
110 #define YSCALEYUV2PACKEDX \
111 "xorl %%eax, %%eax \n\t"\
114 "movl %1, %%edx \n\t" /* -chrFilterSize */\
115 "movl %3, %%ebx \n\t" /* chrMmxFilter+chrFilterSize */\
116 "movl %7, %%ecx \n\t" /* chrSrc+chrFilterSize */\
117 "pxor %%mm3, %%mm3 \n\t"\
118 "pxor %%mm4, %%mm4 \n\t"\
120 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
121 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
122 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
123 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
124 "pmulhw %%mm0, %%mm2 \n\t"\
125 "pmulhw %%mm0, %%mm5 \n\t"\
126 "paddw %%mm2, %%mm3 \n\t"\
127 "paddw %%mm5, %%mm4 \n\t"\
128 "addl $1, %%edx \n\t"\
131 "movl %0, %%edx \n\t" /* -lumFilterSize */\
132 "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
133 "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
134 "pxor %%mm1, %%mm1 \n\t"\
135 "pxor %%mm7, %%mm7 \n\t"\
137 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
138 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
139 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
140 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
141 "pmulhw %%mm0, %%mm2 \n\t"\
142 "pmulhw %%mm0, %%mm5 \n\t"\
143 "paddw %%mm2, %%mm1 \n\t"\
144 "paddw %%mm5, %%mm7 \n\t"\
145 "addl $1, %%edx \n\t"\
/* YSCALEYUV2RGBX: converts the filtered Y/U/V accumulators (mm1/mm7 = Y1/Y2,\
 * mm3 = U, mm4 = V) to RGB via pmulhw with the yCoeff/u*/\
/* /v* coefficient constants, ending with packed B in mm2/mm0, G in mm4/mm3,\
 * R in mm5/mm6 and mm7 zeroed for the WRITE* macros. */\
149 #define YSCALEYUV2RGBX \
151 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
152 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
153 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
154 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
155 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
156 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
157 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
158 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
159 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
160 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
161 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
162 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
163 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
164 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
165 "paddw %%mm3, %%mm4 \n\t"\
166 "movq %%mm2, %%mm0 \n\t"\
167 "movq %%mm5, %%mm6 \n\t"\
168 "movq %%mm4, %%mm3 \n\t"\
169 "punpcklwd %%mm2, %%mm2 \n\t"\
170 "punpcklwd %%mm5, %%mm5 \n\t"\
171 "punpcklwd %%mm4, %%mm4 \n\t"\
172 "paddw %%mm1, %%mm2 \n\t"\
173 "paddw %%mm1, %%mm5 \n\t"\
174 "paddw %%mm1, %%mm4 \n\t"\
175 "punpckhwd %%mm0, %%mm0 \n\t"\
176 "punpckhwd %%mm6, %%mm6 \n\t"\
177 "punpckhwd %%mm3, %%mm3 \n\t"\
178 "paddw %%mm7, %%mm0 \n\t"\
179 "paddw %%mm7, %%mm6 \n\t"\
180 "paddw %%mm7, %%mm3 \n\t"\
181 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
182 "packuswb %%mm0, %%mm2 \n\t"\
183 "packuswb %%mm6, %%mm5 \n\t"\
184 "packuswb %%mm3, %%mm4 \n\t"\
185 "pxor %%mm7, %%mm7 \n\t"
/* FULL_YSCALEYUV2RGB: two-line vertical bilinear interpolation of luma and
 * chroma (yalpha1/uvalpha1 blend factors in %6/%7) followed by YUV->RGB,
 * at full horizontal chroma resolution (one chroma sample per pixel).
 * Leaves packed B in mm3, G in mm1, R in mm0. */
187 #define FULL_YSCALEYUV2RGB \
188 "pxor %%mm7, %%mm7 \n\t"\
189 "movd %6, %%mm6 \n\t" /*yalpha1*/\
190 "punpcklwd %%mm6, %%mm6 \n\t"\
191 "punpcklwd %%mm6, %%mm6 \n\t"\
192 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
193 "punpcklwd %%mm5, %%mm5 \n\t"\
194 "punpcklwd %%mm5, %%mm5 \n\t"\
195 "xorl %%eax, %%eax \n\t"\
198 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
199 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
200 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
201 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
202 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
203 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
204 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
205 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
206 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
207 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
208 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
209 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
210 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
211 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
212 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
213 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
214 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
215 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
218 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
219 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
220 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
221 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
222 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
223 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
224 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
227 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
228 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
229 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
230 "paddw %%mm1, %%mm3 \n\t" /* B*/\
231 "paddw %%mm1, %%mm0 \n\t" /* R*/\
232 "packuswb %%mm3, %%mm3 \n\t"\
234 "packuswb %%mm0, %%mm0 \n\t"\
235 "paddw %%mm4, %%mm2 \n\t"\
236 "paddw %%mm2, %%mm1 \n\t" /* G*/\
238 "packuswb %%mm1, %%mm1 \n\t"
/* YSCALEYUV2PACKED: two-line vertical bilinear interpolation producing
 * 16-bit Y (mm1/mm7) and U/V (mm3/mm4) for packed-YUV output.  The blend
 * factors are pre-shifted by 3 and cached in scratch at 3968(%2)/3976(%2). */
240 #define YSCALEYUV2PACKED \
241 "movd %6, %%mm6 \n\t" /*yalpha1*/\
242 "punpcklwd %%mm6, %%mm6 \n\t"\
243 "punpcklwd %%mm6, %%mm6 \n\t"\
244 "psraw $3, %%mm6 \n\t"\
245 "movq %%mm6, 3968(%2) \n\t"\
246 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
247 "punpcklwd %%mm5, %%mm5 \n\t"\
248 "punpcklwd %%mm5, %%mm5 \n\t"\
249 "psraw $3, %%mm5 \n\t"\
250 "movq %%mm5, 3976(%2) \n\t"\
251 "xorl %%eax, %%eax \n\t"\
254 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
255 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
256 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
257 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
258 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
259 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
260 "movq 3976(%2), %%mm0 \n\t"\
261 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
262 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
263 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
264 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
265 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
266 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
267 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
268 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
269 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
270 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
271 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
272 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
273 "pmulhw 3968(%2), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
274 "pmulhw 3968(%2), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
275 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
276 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
277 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
278 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/* YSCALEYUV2RGB: two-line vertical bilinear interpolation (blend factors\
 * cached at 3968/3976(%2)) followed by the same YUV->RGB matrix as\
 * YSCALEYUV2RGBX; ends with packed B/G/R in mm2/mm4/mm5 (+mm0/mm3/mm6). */\
280 #define YSCALEYUV2RGB \
281 "movd %6, %%mm6 \n\t" /*yalpha1*/\
282 "punpcklwd %%mm6, %%mm6 \n\t"\
283 "punpcklwd %%mm6, %%mm6 \n\t"\
284 "movq %%mm6, 3968(%2) \n\t"\
285 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
286 "punpcklwd %%mm5, %%mm5 \n\t"\
287 "punpcklwd %%mm5, %%mm5 \n\t"\
288 "movq %%mm5, 3976(%2) \n\t"\
289 "xorl %%eax, %%eax \n\t"\
292 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
293 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
294 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
295 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
296 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
297 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
298 "movq 3976(%2), %%mm0 \n\t"\
299 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
300 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
301 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
302 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
303 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
304 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
305 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
306 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
307 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
308 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
309 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
310 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
311 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
312 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
313 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
314 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
315 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
316 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
317 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
318 "pmulhw 3968(%2), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
319 "pmulhw 3968(%2), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
320 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
321 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
322 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
323 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
324 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
325 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
326 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
327 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
328 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
329 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
330 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
331 "paddw %%mm3, %%mm4 \n\t"\
332 "movq %%mm2, %%mm0 \n\t"\
333 "movq %%mm5, %%mm6 \n\t"\
334 "movq %%mm4, %%mm3 \n\t"\
335 "punpcklwd %%mm2, %%mm2 \n\t"\
336 "punpcklwd %%mm5, %%mm5 \n\t"\
337 "punpcklwd %%mm4, %%mm4 \n\t"\
338 "paddw %%mm1, %%mm2 \n\t"\
339 "paddw %%mm1, %%mm5 \n\t"\
340 "paddw %%mm1, %%mm4 \n\t"\
341 "punpckhwd %%mm0, %%mm0 \n\t"\
342 "punpckhwd %%mm6, %%mm6 \n\t"\
343 "punpckhwd %%mm3, %%mm3 \n\t"\
344 "paddw %%mm7, %%mm0 \n\t"\
345 "paddw %%mm7, %%mm6 \n\t"\
346 "paddw %%mm7, %%mm3 \n\t"\
347 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
348 "packuswb %%mm0, %%mm2 \n\t"\
349 "packuswb %%mm6, %%mm5 \n\t"\
350 "packuswb %%mm3, %%mm4 \n\t"\
351 "pxor %%mm7, %%mm7 \n\t"
/* YSCALEYUV2PACKED1: single-source-line fast path (no vertical
 * interpolation) for packed-YUV output; just >>7 the intermediates. */
353 #define YSCALEYUV2PACKED1 \
354 "xorl %%eax, %%eax \n\t"\
357 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
358 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
359 "psraw $7, %%mm3 \n\t" \
360 "psraw $7, %%mm4 \n\t" \
361 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
362 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
363 "psraw $7, %%mm1 \n\t" \
364 "psraw $7, %%mm7 \n\t" \
/* YSCALEYUV2RGB1: single-source-line fast path followed by the shared\
 * YUV->RGB matrix sequence (same register layout as YSCALEYUV2RGB). */\
366 #define YSCALEYUV2RGB1 \
367 "xorl %%eax, %%eax \n\t"\
370 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
371 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
372 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
373 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
374 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
375 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
376 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
377 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
378 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
379 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
380 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
381 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
382 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
383 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
384 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
385 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
386 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
387 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
388 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
389 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
390 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
391 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
392 "paddw %%mm3, %%mm4 \n\t"\
393 "movq %%mm2, %%mm0 \n\t"\
394 "movq %%mm5, %%mm6 \n\t"\
395 "movq %%mm4, %%mm3 \n\t"\
396 "punpcklwd %%mm2, %%mm2 \n\t"\
397 "punpcklwd %%mm5, %%mm5 \n\t"\
398 "punpcklwd %%mm4, %%mm4 \n\t"\
399 "paddw %%mm1, %%mm2 \n\t"\
400 "paddw %%mm1, %%mm5 \n\t"\
401 "paddw %%mm1, %%mm4 \n\t"\
402 "punpckhwd %%mm0, %%mm0 \n\t"\
403 "punpckhwd %%mm6, %%mm6 \n\t"\
404 "punpckhwd %%mm3, %%mm3 \n\t"\
405 "paddw %%mm7, %%mm0 \n\t"\
406 "paddw %%mm7, %%mm6 \n\t"\
407 "paddw %%mm7, %%mm3 \n\t"\
408 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
409 "packuswb %%mm0, %%mm2 \n\t"\
410 "packuswb %%mm6, %%mm5 \n\t"\
411 "packuswb %%mm3, %%mm4 \n\t"\
412 "pxor %%mm7, %%mm7 \n\t"
/* YSCALEYUV2PACKED1b: uvalpha ~0.5 path -- averages the two chroma lines
 * (paddw then >>8) instead of a weighted blend; luma from one line. */
414 #define YSCALEYUV2PACKED1b \
415 "xorl %%eax, %%eax \n\t"\
418 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
419 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
420 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
421 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
422 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
423 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
424 "psrlw $8, %%mm3 \n\t" \
425 "psrlw $8, %%mm4 \n\t" \
426 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
427 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
428 "psraw $7, %%mm1 \n\t" \
429 "psraw $7, %%mm7 \n\t"
431 // do vertical chrominance interpolation
/* YSCALEYUV2RGB1b: like YSCALEYUV2RGB1 but averages the two chroma lines
 * (paddw then >>5 -- flagged below as possibly overflowing) before the
 * shared YUV->RGB matrix sequence. */
432 #define YSCALEYUV2RGB1b \
433 "xorl %%eax, %%eax \n\t"\
436 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
437 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
438 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
439 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
440 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
441 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
442 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
443 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
444 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
445 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
446 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
447 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
448 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
449 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
450 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
451 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
452 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
453 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
454 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
455 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
456 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
457 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
458 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
459 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
460 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
461 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
462 "paddw %%mm3, %%mm4 \n\t"\
463 "movq %%mm2, %%mm0 \n\t"\
464 "movq %%mm5, %%mm6 \n\t"\
465 "movq %%mm4, %%mm3 \n\t"\
466 "punpcklwd %%mm2, %%mm2 \n\t"\
467 "punpcklwd %%mm5, %%mm5 \n\t"\
468 "punpcklwd %%mm4, %%mm4 \n\t"\
469 "paddw %%mm1, %%mm2 \n\t"\
470 "paddw %%mm1, %%mm5 \n\t"\
471 "paddw %%mm1, %%mm4 \n\t"\
472 "punpckhwd %%mm0, %%mm0 \n\t"\
473 "punpckhwd %%mm6, %%mm6 \n\t"\
474 "punpckhwd %%mm3, %%mm3 \n\t"\
475 "paddw %%mm7, %%mm0 \n\t"\
476 "paddw %%mm7, %%mm6 \n\t"\
477 "paddw %%mm7, %%mm3 \n\t"\
478 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
479 "packuswb %%mm0, %%mm2 \n\t"\
480 "packuswb %%mm6, %%mm5 \n\t"\
481 "packuswb %%mm3, %%mm4 \n\t"\
482 "pxor %%mm7, %%mm7 \n\t"
485 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
486 "movq %%mm2, %%mm1 \n\t" /* B */\
487 "movq %%mm5, %%mm6 \n\t" /* R */\
488 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
489 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
490 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
491 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
492 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
493 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
494 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
495 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
496 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
497 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
499 MOVNTQ(%%mm0, (%4, %%eax, 4))\
500 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
501 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
502 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
504 "addl $8, %%eax \n\t"\
505 "cmpl %5, %%eax \n\t"\
509 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
510 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
511 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
512 "psrlq $3, %%mm2 \n\t"\
514 "movq %%mm2, %%mm1 \n\t"\
515 "movq %%mm4, %%mm3 \n\t"\
517 "punpcklbw %%mm7, %%mm3 \n\t"\
518 "punpcklbw %%mm5, %%mm2 \n\t"\
519 "punpckhbw %%mm7, %%mm4 \n\t"\
520 "punpckhbw %%mm5, %%mm1 \n\t"\
522 "psllq $3, %%mm3 \n\t"\
523 "psllq $3, %%mm4 \n\t"\
525 "por %%mm3, %%mm2 \n\t"\
526 "por %%mm4, %%mm1 \n\t"\
528 MOVNTQ(%%mm2, (%4, %%eax, 2))\
529 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
531 "addl $8, %%eax \n\t"\
532 "cmpl %5, %%eax \n\t"\
536 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
537 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
538 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
539 "psrlq $3, %%mm2 \n\t"\
540 "psrlq $1, %%mm5 \n\t"\
542 "movq %%mm2, %%mm1 \n\t"\
543 "movq %%mm4, %%mm3 \n\t"\
545 "punpcklbw %%mm7, %%mm3 \n\t"\
546 "punpcklbw %%mm5, %%mm2 \n\t"\
547 "punpckhbw %%mm7, %%mm4 \n\t"\
548 "punpckhbw %%mm5, %%mm1 \n\t"\
550 "psllq $2, %%mm3 \n\t"\
551 "psllq $2, %%mm4 \n\t"\
553 "por %%mm3, %%mm2 \n\t"\
554 "por %%mm4, %%mm1 \n\t"\
556 MOVNTQ(%%mm2, (%4, %%eax, 2))\
557 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
559 "addl $8, %%eax \n\t"\
560 "cmpl %5, %%eax \n\t"\
/* WRITEBGR24OLD: interleave the packed B/G/R byte vectors (mm2/mm4/mm5,\
 * mm7=0) into 24bpp pixels via shift/mask/or on the 0RGB dwords, storing\
 * 3 quadwords per 8 pixels at (%%ebx).  Superseded by WRITEBGR24MMX. */\
563 #define WRITEBGR24OLD \
564 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
565 "movq %%mm2, %%mm1 \n\t" /* B */\
566 "movq %%mm5, %%mm6 \n\t" /* R */\
567 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
568 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
569 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
570 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
571 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
572 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
573 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
574 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
575 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
576 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
578 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
579 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
580 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
581 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
582 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
583 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
584 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
585 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
587 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
588 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
589 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
590 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
591 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
592 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
593 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
594 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
595 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
596 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
597 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
598 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
599 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
601 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
602 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
603 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
604 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
605 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
606 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
607 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
608 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
610 MOVNTQ(%%mm0, (%%ebx))\
611 MOVNTQ(%%mm2, 8(%%ebx))\
612 MOVNTQ(%%mm3, 16(%%ebx))\
613 "addl $24, %%ebx \n\t"\
615 "addl $8, %%eax \n\t"\
616 "cmpl %5, %%eax \n\t"\
/* WRITEBGR24MMX: plain-MMX 24bpp writer -- builds four 0RGBRGB0 quadwords\
 * with punpckhdq, then shifts/ors them into three packed output quadwords\
 * (24 bytes = 8 pixels) stored at (%%ebx). */\
619 #define WRITEBGR24MMX \
620 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
621 "movq %%mm2, %%mm1 \n\t" /* B */\
622 "movq %%mm5, %%mm6 \n\t" /* R */\
623 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
624 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
625 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
626 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
627 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
628 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
629 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
630 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
631 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
632 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
634 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
635 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
636 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
637 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
639 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
640 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
641 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
642 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
644 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
645 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
646 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
647 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
649 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
650 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
651 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
652 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
653 MOVNTQ(%%mm0, (%%ebx))\
655 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
656 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
657 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
658 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
659 MOVNTQ(%%mm6, 8(%%ebx))\
661 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
662 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
663 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
664 MOVNTQ(%%mm5, 16(%%ebx))\
666 "addl $24, %%ebx \n\t"\
668 "addl $8, %%eax \n\t"\
669 "cmpl %5, %%eax \n\t"\
/* WRITEBGR24MMX2: MMX2 24bpp writer using pshufw to replicate/reorder\
 * bytes, masked by the M24A/M24B/M24C constants, 3 output quadwords per\
 * 8 pixels at (%%ebx).  Requires MMX2 (pshufw). */\
672 #define WRITEBGR24MMX2 \
673 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
674 "movq "MANGLE(M24A)", %%mm0 \n\t"\
675 "movq "MANGLE(M24C)", %%mm7 \n\t"\
676 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
677 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
678 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
680 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
681 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
682 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
684 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
685 "por %%mm1, %%mm6 \n\t"\
686 "por %%mm3, %%mm6 \n\t"\
687 MOVNTQ(%%mm6, (%%ebx))\
689 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
690 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
691 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
692 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
694 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
695 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
696 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
698 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
699 "por %%mm3, %%mm6 \n\t"\
700 MOVNTQ(%%mm6, 8(%%ebx))\
702 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
703 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
704 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
706 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
707 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
708 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
710 "por %%mm1, %%mm3 \n\t"\
711 "por %%mm3, %%mm6 \n\t"\
712 MOVNTQ(%%mm6, 16(%%ebx))\
714 "addl $24, %%ebx \n\t"\
716 "addl $8, %%eax \n\t"\
717 "cmpl %5, %%eax \n\t"\
722 #define WRITEBGR24 WRITEBGR24MMX2
725 #define WRITEBGR24 WRITEBGR24MMX
729 "packuswb %%mm3, %%mm3 \n\t"\
730 "packuswb %%mm4, %%mm4 \n\t"\
731 "packuswb %%mm7, %%mm1 \n\t"\
732 "punpcklbw %%mm4, %%mm3 \n\t"\
733 "movq %%mm1, %%mm7 \n\t"\
734 "punpcklbw %%mm3, %%mm1 \n\t"\
735 "punpckhbw %%mm3, %%mm7 \n\t"\
737 MOVNTQ(%%mm1, (%4, %%eax, 2))\
738 MOVNTQ(%%mm7, 8(%4, %%eax, 2))\
740 "addl $8, %%eax \n\t"\
741 "cmpl %5, %%eax \n\t"\
/* yuv2yuvX: vertically scales/filters planar YV12 -- runs the MMX\
 * YSCALEYUV2YV12X kernel per plane (chroma uses the 4096-byte V offset)\
 * and falls back to yuv2yuvXinC.  NOTE(review): large parts of this\
 * function (asm statement openings, #ifdef structure, closing brace) are\
 * elided in this listing -- do not edit logic from this view alone. */\
745 static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
746 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
747 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW,
748 int16_t * lumMmxFilter, int16_t * chrMmxFilter)
755 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
756 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (chrDstW)
757 : "%eax", "%edx", "%esi"
761 YSCALEYUV2YV12X(4096)
762 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
763 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (chrDstW)
764 : "%eax", "%edx", "%esi"
770 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
771 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
772 : "%eax", "%edx", "%esi"
775 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
776 chrFilter, chrSrc, chrFilterSize,
777 dest, uDest, vDest, dstW, chrDstW);
/* yuv2yuv1: unscaled (1-tap) plane copy with >>7 and clipping; MMX path
 * uses YSCALEYUV2YV121, C fallback shifts and clamps to 0..255 (the V
 * plane sits at chrSrc+2048).  NOTE(review): asm openings, clip branches
 * and the closing brace are elided in this listing. */
781 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
782 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
789 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
796 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
804 :: "r" (lumSrc + dstW), "r" (dest + dstW),
810 for(i=0; i<dstW; i++)
812 int val= lumSrc[i]>>7;
823 for(i=0; i<chrDstW; i++)
826 int v=chrSrc[i + 2048]>>7;
830 else if (u>255) u=255;
832 else if (v>255) v=255;
843 * vertical scale YV12 to RGB
/* yuv2packedX: vertical scaling straight to a packed RGB/YUY2 format --
 * dispatches YSCALEYUV2PACKEDX (+RGBX and the per-depth WRITE* macro,
 * with dithering for 15/16 bpp) per output format, C fallback is
 * yuv2packedXinC.  NOTE(review): the asm statement openings, the format
 * switch, and the function's closing brace are elided in this listing. */
845 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
846 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
847 uint8_t *dest, int dstW, int16_t * lumMmxFilter, int16_t * chrMmxFilter, int dstY)
858 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
859 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
860 "r" (dest), "m" (dstW),
861 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
862 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
870 "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
871 "addl %4, %%ebx \n\t"
874 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
875 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
876 "r" (dest), "m" (dstW),
877 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
878 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
886 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
888 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
889 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
890 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
895 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
896 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
897 "r" (dest), "m" (dstW),
898 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
899 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
907 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
909 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
910 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
911 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
916 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
917 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
918 "r" (dest), "m" (dstW),
919 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
920 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
928 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
930 "psraw $3, %%mm3 \n\t"
931 "psraw $3, %%mm4 \n\t"
932 "psraw $3, %%mm1 \n\t"
933 "psraw $3, %%mm7 \n\t"
936 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
937 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
938 "r" (dest), "m" (dstW),
939 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
940 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
946 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
947 chrFilter, chrSrc, chrFilterSize,
954 * vertical bilinear scale YV12 to RGB
/* yuv2packed2: vertical bilinear blend of two line pairs straight to a
 * packed format.  yalpha1/uvalpha1 are the complementary blend weights
 * (x^4095).  MMX paths use FULL_YSCALEYUV2RGB + per-format packing
 * (BGR32/BGR24/15/16 with dithering); the C fallback does the blend and
 * table-based YUV->RGB per pixel.  NOTE(review): #ifdef structure, several
 * branches and the function tail are elided in this listing; this chunk
 * ends mid-function. */
956 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
957 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
959 int yalpha1=yalpha^4095;
960 int uvalpha1=uvalpha^4095;
964 if(flags&SWS_FULL_CHR_H_INT)
974 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
975 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
977 "movq %%mm3, %%mm1 \n\t"
978 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
979 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
981 MOVNTQ(%%mm3, (%4, %%eax, 4))
982 MOVNTQ(%%mm1, 8(%4, %%eax, 4))
984 "addl $4, %%eax \n\t"
985 "cmpl %5, %%eax \n\t"
989 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
990 "m" (yalpha1), "m" (uvalpha1)
1000 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1001 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1003 "movq %%mm3, %%mm1 \n\t"
1004 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1005 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1007 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1008 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1009 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1010 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1011 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1012 "movq %%mm1, %%mm2 \n\t"
1013 "psllq $48, %%mm1 \n\t" // 000000BG
1014 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1016 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1017 "psrld $16, %%mm2 \n\t" // R000R000
1018 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1019 "por %%mm2, %%mm1 \n\t" // RBGRR000
1021 "movl %4, %%ebx \n\t"
1022 "addl %%eax, %%ebx \n\t"
1026 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
1027 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
1029 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
1030 "psrlq $32, %%mm3 \n\t"
1031 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
1032 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
1034 "addl $4, %%eax \n\t"
1035 "cmpl %5, %%eax \n\t"
1038 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1039 "m" (yalpha1), "m" (uvalpha1)
1048 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1049 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1050 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1052 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1053 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1054 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1056 "psrlw $3, %%mm3 \n\t"
1057 "psllw $2, %%mm1 \n\t"
1058 "psllw $7, %%mm0 \n\t"
1059 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1060 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1062 "por %%mm3, %%mm1 \n\t"
1063 "por %%mm1, %%mm0 \n\t"
1065 MOVNTQ(%%mm0, (%4, %%eax, 2))
1067 "addl $4, %%eax \n\t"
1068 "cmpl %5, %%eax \n\t"
1071 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1072 "m" (yalpha1), "m" (uvalpha1)
1081 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1082 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1083 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1085 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1086 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1087 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1089 "psrlw $3, %%mm3 \n\t"
1090 "psllw $3, %%mm1 \n\t"
1091 "psllw $8, %%mm0 \n\t"
1092 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1093 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1095 "por %%mm3, %%mm1 \n\t"
1096 "por %%mm1, %%mm0 \n\t"
1098 MOVNTQ(%%mm0, (%4, %%eax, 2))
1100 "addl $4, %%eax \n\t"
1101 "cmpl %5, %%eax \n\t"
1104 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1105 "m" (yalpha1), "m" (uvalpha1)
1114 if(dstFormat==IMGFMT_BGR32)
1117 #ifdef WORDS_BIGENDIAN
1120 for(i=0;i<dstW;i++){
1121 // vertical linear interpolation && yuv2rgb in a single step:
1122 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1123 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1124 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1125 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1126 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1127 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1131 else if(dstFormat==IMGFMT_BGR24)
1134 for(i=0;i<dstW;i++){
1135 // vertical linear interpolation && yuv2rgb in a single step:
1136 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1137 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1138 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1139 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1140 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1141 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1145 else if(dstFormat==IMGFMT_BGR16)
1148 for(i=0;i<dstW;i++){
1149 // vertical linear interpolation && yuv2rgb in a single step:
1150 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1151 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1152 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1154 ((uint16_t*)dest)[i] =
1155 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1156 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1157 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1160 else if(dstFormat==IMGFMT_BGR15)
1163 for(i=0;i<dstW;i++){
1164 // vertical linear interpolation && yuv2rgb in a single step:
1165 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1166 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1167 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1169 ((uint16_t*)dest)[i] =
1170 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1171 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1172 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1180 switch(c->dstFormat)
1187 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1188 "m" (yalpha1), "m" (uvalpha1)
1194 "movl %4, %%ebx \n\t"
1198 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1199 "m" (yalpha1), "m" (uvalpha1)
1206 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1208 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1209 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1210 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1215 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1216 "m" (yalpha1), "m" (uvalpha1)
1223 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1225 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1226 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1227 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1232 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1233 "m" (yalpha1), "m" (uvalpha1)
1242 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1243 "m" (yalpha1), "m" (uvalpha1)
1250 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1254 * YV12 to RGB without scaling or interpolating
/*
 * yuv2packed1 - YV12 -> packed RGB for a single source line (no vertical
 * scaling/interpolation of luma). Chroma may still be blended: when
 * uvalpha < 2048 the nearer chroma line is used directly (documented as a
 * deliberate 0.5-pixel chroma shift for speed), otherwise uvbuf0/uvbuf1 are
 * averaged. The full-chroma case is delegated to yuv2packed2 with buf0 used
 * for both luma inputs.
 * NOTE(review): this extract is missing interior lines (asm heads, #ifdefs,
 * braces); code kept byte-identical, comments only.
 */
1256 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1257 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1259 int uvalpha1=uvalpha^4095;
1260 const int yalpha1=0;
1263 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1264 const int yalpha= 4096; //FIXME ...
1266 if(flags&SWS_FULL_CHR_H_INT)
1268 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
/* Fast branch: nearest chroma line (no blend). */
1273 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
1281 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1282 "m" (yalpha1), "m" (uvalpha1)
1288 "movl %4, %%ebx \n\t"
1291 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1292 "m" (yalpha1), "m" (uvalpha1)
1299 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1301 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1302 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1303 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1306 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1307 "m" (yalpha1), "m" (uvalpha1)
1314 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1316 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1317 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1318 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1322 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1323 "m" (yalpha1), "m" (uvalpha1)
1331 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1332 "m" (yalpha1), "m" (uvalpha1)
/* Blended branch: average the two chroma lines before conversion. */
1346 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1347 "m" (yalpha1), "m" (uvalpha1)
1353 "movl %4, %%ebx \n\t"
1356 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1357 "m" (yalpha1), "m" (uvalpha1)
1364 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1366 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1367 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1368 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1371 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1372 "m" (yalpha1), "m" (uvalpha1)
1379 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1381 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1382 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1383 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1387 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1388 "m" (yalpha1), "m" (uvalpha1)
1396 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1397 "m" (yalpha1), "m" (uvalpha1)
/* C fallback: nearest vs averaged chroma variants of the generic macro. */
1404 if( uvalpha < 2048 )
1406 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1408 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1412 //FIXME yuy2* can read up to 7 samples too much
/*
 * yuy2ToY - extract the luma plane from a packed YUY2 line.
 * YUY2 stores Y at even byte offsets; the MMX path masks them out with
 * bm01010101 and packs 16 source bytes into 8 Y bytes per iteration,
 * iterating with a negative index so the loop ends at eax==0.
 * A scalar C fallback follows (its body line is missing from this extract;
 * presumably dst[i]=src[2*i] - confirm against the full file).
 */
1414 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1418 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1419 "movl %0, %%eax \n\t"
1421 "movq (%1, %%eax,2), %%mm0 \n\t"
1422 "movq 8(%1, %%eax,2), %%mm1 \n\t"
1423 "pand %%mm2, %%mm0 \n\t"
1424 "pand %%mm2, %%mm1 \n\t"
1425 "packuswb %%mm1, %%mm0 \n\t"
1426 "movq %%mm0, (%2, %%eax) \n\t"
1427 "addl $8, %%eax \n\t"
/* Operands point at the buffer ends; eax runs from -width up to 0. */
1429 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1434 for(i=0; i<width; i++)
/*
 * yuy2ToUV - extract and deinterleave the U and V planes from two packed
 * YUY2 lines (src1/src2), averaging them vertically (C fallback: >>1 of the
 * byte sum). Output goes to dstU/dstV, one chroma sample per 4 input bytes.
 * The MMX2/3DNow path loads both lines; the averaging instructions between
 * the loads and the psrlw are missing from this extract (presumably PAVGB -
 * confirm against the full file). After packing, mm1 holds the U bytes
 * (masked with bm01010101) and mm0 the V bytes, stored to (%3)/(%4).
 */
1439 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1441 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1443 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1444 "movl %0, %%eax \n\t"
1446 "movq (%1, %%eax,4), %%mm0 \n\t"
1447 "movq 8(%1, %%eax,4), %%mm1 \n\t"
1448 "movq (%2, %%eax,4), %%mm2 \n\t"
1449 "movq 8(%2, %%eax,4), %%mm3 \n\t"
1452 "psrlw $8, %%mm0 \n\t"
1453 "psrlw $8, %%mm1 \n\t"
1454 "packuswb %%mm1, %%mm0 \n\t"
1455 "movq %%mm0, %%mm1 \n\t"
1456 "psrlw $8, %%mm0 \n\t"
1457 "pand %%mm4, %%mm1 \n\t"
1458 "packuswb %%mm0, %%mm0 \n\t"
1459 "packuswb %%mm1, %%mm1 \n\t"
1460 "movd %%mm0, (%4, %%eax) \n\t"
1461 "movd %%mm1, (%3, %%eax) \n\t"
1462 "addl $4, %%eax \n\t"
/* Negative-index loop: pointers at buffer ends, eax from -width to 0. */
1464 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1469 for(i=0; i<width; i++)
1471 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1472 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
/*
 * bgr32ToY - convert a BGR32 line to luma: Y = (RY*r + GY*g + BY*b)
 * >> RGB2YUV_SHIFT, plus the 16 offset of limited-range YUV.
 * HAVE_MMXFIXME marks an MMX version that was never finished, so only the
 * scalar loop is active. The r/g/b extraction lines are missing from this
 * extract (presumably indexed reads of src[4*i+..] - confirm in full file).
 */
1477 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1479 #ifdef HAVE_MMXFIXME
1482 for(i=0; i<width; i++)
1488 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/*
 * bgr32ToUV - convert two BGR32 lines to subsampled chroma. Each output
 * sample sums a 2x2 block of pixels (two horizontal, two vertical), so the
 * shift is RGB2YUV_SHIFT+2, with the 128 chroma offset added.
 * HAVE_MMXFIXME again marks an unfinished MMX path; the scalar loop is used.
 */
1493 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1495 #ifdef HAVE_MMXFIXME
1498 for(i=0; i<width; i++)
1500 int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1501 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1502 int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1504 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1505 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * bgr24ToY - convert a packed BGR24 line to luma with MMX.
 * Per iteration 8 pixels (24 bytes) are processed: each pixel's B,G,0 bytes
 * are unpacked to words and dotted against bgr2YCoeff via pmaddwd; pairs are
 * then combined with w1111, shifted down (psraw $7 after an optional psrad $8
 * precision step guarded by FAST_BGR2YV12), packed to bytes and offset by
 * bgr2YOffset. ebx indexes bytes (3*pixel), eax indexes pixels; both are
 * negative and count up to 0. A scalar fallback follows.
 * NOTE(review): interior lines (asm head, #endif partners, fallback body)
 * are missing from this extract; code kept byte-identical, comments only.
 */
1510 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1514 "movl %2, %%eax \n\t"
1515 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1516 "movq "MANGLE(w1111)", %%mm5 \n\t"
1517 "pxor %%mm7, %%mm7 \n\t"
/* ebx = 3*eax: byte offset for 24bpp addressing. */
1518 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1521 PREFETCH" 64(%0, %%ebx) \n\t"
1522 "movd (%0, %%ebx), %%mm0 \n\t"
1523 "movd 3(%0, %%ebx), %%mm1 \n\t"
1524 "punpcklbw %%mm7, %%mm0 \n\t"
1525 "punpcklbw %%mm7, %%mm1 \n\t"
1526 "movd 6(%0, %%ebx), %%mm2 \n\t"
1527 "movd 9(%0, %%ebx), %%mm3 \n\t"
1528 "punpcklbw %%mm7, %%mm2 \n\t"
1529 "punpcklbw %%mm7, %%mm3 \n\t"
1530 "pmaddwd %%mm6, %%mm0 \n\t"
1531 "pmaddwd %%mm6, %%mm1 \n\t"
1532 "pmaddwd %%mm6, %%mm2 \n\t"
1533 "pmaddwd %%mm6, %%mm3 \n\t"
1534 #ifndef FAST_BGR2YV12
1535 "psrad $8, %%mm0 \n\t"
1536 "psrad $8, %%mm1 \n\t"
1537 "psrad $8, %%mm2 \n\t"
1538 "psrad $8, %%mm3 \n\t"
1540 "packssdw %%mm1, %%mm0 \n\t"
1541 "packssdw %%mm3, %%mm2 \n\t"
1542 "pmaddwd %%mm5, %%mm0 \n\t"
1543 "pmaddwd %%mm5, %%mm2 \n\t"
1544 "packssdw %%mm2, %%mm0 \n\t"
1545 "psraw $7, %%mm0 \n\t"
/* Second group of 4 pixels (bytes 12..23). */
1547 "movd 12(%0, %%ebx), %%mm4 \n\t"
1548 "movd 15(%0, %%ebx), %%mm1 \n\t"
1549 "punpcklbw %%mm7, %%mm4 \n\t"
1550 "punpcklbw %%mm7, %%mm1 \n\t"
1551 "movd 18(%0, %%ebx), %%mm2 \n\t"
1552 "movd 21(%0, %%ebx), %%mm3 \n\t"
1553 "punpcklbw %%mm7, %%mm2 \n\t"
1554 "punpcklbw %%mm7, %%mm3 \n\t"
1555 "pmaddwd %%mm6, %%mm4 \n\t"
1556 "pmaddwd %%mm6, %%mm1 \n\t"
1557 "pmaddwd %%mm6, %%mm2 \n\t"
1558 "pmaddwd %%mm6, %%mm3 \n\t"
1559 #ifndef FAST_BGR2YV12
1560 "psrad $8, %%mm4 \n\t"
1561 "psrad $8, %%mm1 \n\t"
1562 "psrad $8, %%mm2 \n\t"
1563 "psrad $8, %%mm3 \n\t"
1565 "packssdw %%mm1, %%mm4 \n\t"
1566 "packssdw %%mm3, %%mm2 \n\t"
1567 "pmaddwd %%mm5, %%mm4 \n\t"
1568 "pmaddwd %%mm5, %%mm2 \n\t"
1569 "addl $24, %%ebx \n\t"
1570 "packssdw %%mm2, %%mm4 \n\t"
1571 "psraw $7, %%mm4 \n\t"
1573 "packuswb %%mm4, %%mm0 \n\t"
1574 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1576 "movq %%mm0, (%1, %%eax) \n\t"
1577 "addl $8, %%eax \n\t"
/* Pointers at buffer ends; eax counts -width..0. */
1579 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1584 for(i=0; i<width; i++)
1590 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/*
 * bgr24ToUV - convert two packed BGR24 lines to subsampled U and V with MMX.
 * Pairs of horizontally adjacent pixels from both lines are summed (2x2
 * block), averaged with psrlw $2, then dotted against bgr2UCoeff (mm6) and
 * bgr2VCoeff; after combination with w1111 and psraw $7, U and V results are
 * interleaved via punpckldq/punpckhdq, packed signed, offset by bgr2UVOffset
 * and stored 4 samples at a time to dstU (%2) and dstV (%3).
 * The MMX2/3DNow branch loads whole quadwords; the plain-MMX branch loads
 * dwords and adds them. The #else/#endif partners of several conditionals
 * are missing from this extract. A scalar 2x2-averaging fallback follows.
 */
1595 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1599 "movl %4, %%eax \n\t"
1600 "movq "MANGLE(w1111)", %%mm5 \n\t"
1601 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1602 "pxor %%mm7, %%mm7 \n\t"
/* ebx = 6*eax: byte offset, 2 pixels per chroma sample at 3 bytes each. */
1603 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1604 "addl %%ebx, %%ebx \n\t"
1607 PREFETCH" 64(%0, %%ebx) \n\t"
1608 PREFETCH" 64(%1, %%ebx) \n\t"
1609 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1610 "movq (%0, %%ebx), %%mm0 \n\t"
1611 "movq (%1, %%ebx), %%mm1 \n\t"
1612 "movq 6(%0, %%ebx), %%mm2 \n\t"
1613 "movq 6(%1, %%ebx), %%mm3 \n\t"
1616 "movq %%mm0, %%mm1 \n\t"
1617 "movq %%mm2, %%mm3 \n\t"
1618 "psrlq $24, %%mm0 \n\t"
1619 "psrlq $24, %%mm2 \n\t"
1622 "punpcklbw %%mm7, %%mm0 \n\t"
1623 "punpcklbw %%mm7, %%mm2 \n\t"
/* Plain-MMX branch: dword loads + word adds instead of averaged quads. */
1625 "movd (%0, %%ebx), %%mm0 \n\t"
1626 "movd (%1, %%ebx), %%mm1 \n\t"
1627 "movd 3(%0, %%ebx), %%mm2 \n\t"
1628 "movd 3(%1, %%ebx), %%mm3 \n\t"
1629 "punpcklbw %%mm7, %%mm0 \n\t"
1630 "punpcklbw %%mm7, %%mm1 \n\t"
1631 "punpcklbw %%mm7, %%mm2 \n\t"
1632 "punpcklbw %%mm7, %%mm3 \n\t"
1633 "paddw %%mm1, %%mm0 \n\t"
1634 "paddw %%mm3, %%mm2 \n\t"
1635 "paddw %%mm2, %%mm0 \n\t"
1636 "movd 6(%0, %%ebx), %%mm4 \n\t"
1637 "movd 6(%1, %%ebx), %%mm1 \n\t"
1638 "movd 9(%0, %%ebx), %%mm2 \n\t"
1639 "movd 9(%1, %%ebx), %%mm3 \n\t"
1640 "punpcklbw %%mm7, %%mm4 \n\t"
1641 "punpcklbw %%mm7, %%mm1 \n\t"
1642 "punpcklbw %%mm7, %%mm2 \n\t"
1643 "punpcklbw %%mm7, %%mm3 \n\t"
1644 "paddw %%mm1, %%mm4 \n\t"
1645 "paddw %%mm3, %%mm2 \n\t"
1646 "paddw %%mm4, %%mm2 \n\t"
/* Divide 2x2 sums by 4 to get the block average. */
1647 "psrlw $2, %%mm0 \n\t"
1648 "psrlw $2, %%mm2 \n\t"
1650 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1651 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1653 "pmaddwd %%mm0, %%mm1 \n\t"
1654 "pmaddwd %%mm2, %%mm3 \n\t"
1655 "pmaddwd %%mm6, %%mm0 \n\t"
1656 "pmaddwd %%mm6, %%mm2 \n\t"
1657 #ifndef FAST_BGR2YV12
1658 "psrad $8, %%mm0 \n\t"
1659 "psrad $8, %%mm1 \n\t"
1660 "psrad $8, %%mm2 \n\t"
1661 "psrad $8, %%mm3 \n\t"
1663 "packssdw %%mm2, %%mm0 \n\t"
1664 "packssdw %%mm3, %%mm1 \n\t"
1665 "pmaddwd %%mm5, %%mm0 \n\t"
1666 "pmaddwd %%mm5, %%mm1 \n\t"
1667 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1668 "psraw $7, %%mm0 \n\t"
/* Second pair of chroma samples (bytes 12..23 of each line). */
1670 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1671 "movq 12(%0, %%ebx), %%mm4 \n\t"
1672 "movq 12(%1, %%ebx), %%mm1 \n\t"
1673 "movq 18(%0, %%ebx), %%mm2 \n\t"
1674 "movq 18(%1, %%ebx), %%mm3 \n\t"
1677 "movq %%mm4, %%mm1 \n\t"
1678 "movq %%mm2, %%mm3 \n\t"
1679 "psrlq $24, %%mm4 \n\t"
1680 "psrlq $24, %%mm2 \n\t"
1683 "punpcklbw %%mm7, %%mm4 \n\t"
1684 "punpcklbw %%mm7, %%mm2 \n\t"
1686 "movd 12(%0, %%ebx), %%mm4 \n\t"
1687 "movd 12(%1, %%ebx), %%mm1 \n\t"
1688 "movd 15(%0, %%ebx), %%mm2 \n\t"
1689 "movd 15(%1, %%ebx), %%mm3 \n\t"
1690 "punpcklbw %%mm7, %%mm4 \n\t"
1691 "punpcklbw %%mm7, %%mm1 \n\t"
1692 "punpcklbw %%mm7, %%mm2 \n\t"
1693 "punpcklbw %%mm7, %%mm3 \n\t"
1694 "paddw %%mm1, %%mm4 \n\t"
1695 "paddw %%mm3, %%mm2 \n\t"
1696 "paddw %%mm2, %%mm4 \n\t"
1697 "movd 18(%0, %%ebx), %%mm5 \n\t"
1698 "movd 18(%1, %%ebx), %%mm1 \n\t"
1699 "movd 21(%0, %%ebx), %%mm2 \n\t"
1700 "movd 21(%1, %%ebx), %%mm3 \n\t"
1701 "punpcklbw %%mm7, %%mm5 \n\t"
1702 "punpcklbw %%mm7, %%mm1 \n\t"
1703 "punpcklbw %%mm7, %%mm2 \n\t"
1704 "punpcklbw %%mm7, %%mm3 \n\t"
1705 "paddw %%mm1, %%mm5 \n\t"
1706 "paddw %%mm3, %%mm2 \n\t"
1707 "paddw %%mm5, %%mm2 \n\t"
/* mm5 was clobbered above; reload the w1111 constant. */
1708 "movq "MANGLE(w1111)", %%mm5 \n\t"
1709 "psrlw $2, %%mm4 \n\t"
1710 "psrlw $2, %%mm2 \n\t"
1712 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1713 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1715 "pmaddwd %%mm4, %%mm1 \n\t"
1716 "pmaddwd %%mm2, %%mm3 \n\t"
1717 "pmaddwd %%mm6, %%mm4 \n\t"
1718 "pmaddwd %%mm6, %%mm2 \n\t"
1719 #ifndef FAST_BGR2YV12
1720 "psrad $8, %%mm4 \n\t"
1721 "psrad $8, %%mm1 \n\t"
1722 "psrad $8, %%mm2 \n\t"
1723 "psrad $8, %%mm3 \n\t"
1725 "packssdw %%mm2, %%mm4 \n\t"
1726 "packssdw %%mm3, %%mm1 \n\t"
1727 "pmaddwd %%mm5, %%mm4 \n\t"
1728 "pmaddwd %%mm5, %%mm1 \n\t"
1729 "addl $24, %%ebx \n\t"
1730 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1731 "psraw $7, %%mm4 \n\t"
/* Separate the interleaved U/V words and store 4 samples of each. */
1733 "movq %%mm0, %%mm1 \n\t"
1734 "punpckldq %%mm4, %%mm0 \n\t"
1735 "punpckhdq %%mm4, %%mm1 \n\t"
1736 "packsswb %%mm1, %%mm0 \n\t"
1737 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1739 "movd %%mm0, (%2, %%eax) \n\t"
1740 "punpckhdq %%mm0, %%mm0 \n\t"
1741 "movd %%mm0, (%3, %%eax) \n\t"
1742 "addl $4, %%eax \n\t"
1744 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1749 for(i=0; i<width; i++)
1751 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1752 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1753 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1755 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1756 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * bgr16ToY - convert a BGR16 (5-6-5) line to luma. Each pixel is read as a
 * little-endian 16-bit word; r is bits 15..11, g bits 10..5 (extraction line
 * missing from this extract), b bits 4..0. The 2* factors on RY/BY and the
 * RGB2YUV_SHIFT-2 compensate for the narrower 5/6-bit channels.
 */
1761 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1764 for(i=0; i<width; i++)
1766 int d= src[i*2] + (src[i*2+1]<<8);
1769 int r= (d>>11)&0x1F;
1771 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/*
 * bgr16ToUV - convert two BGR16 lines to subsampled chroma, summing a 2x2
 * pixel block per output sample. Two variants are visible: a branch that
 * sums pairs of 565 pixels in parallel using field masks on 32-bit loads
 * (0x07E0F81F masks G|B|R fields spread so sums cannot carry across), and a
 * straightforward per-pixel decode. The #if/#else selecting between them is
 * missing from this extract, as are the b0..b3 extraction lines.
 */
1775 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1778 for(i=0; i<width; i++)
1781 int d0= le2me_32( ((uint32_t*)src1)[i] );
1782 int d1= le2me_32( ((uint32_t*)src2)[i] );
1784 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1785 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1787 int dh2= (dh>>11) + (dh<<21);
1791 int r= (d>>11)&0x7F;
1794 int d0= src1[i*4] + (src1[i*4+1]<<8);
1796 int g0= (d0>>5)&0x3F;
1797 int r0= (d0>>11)&0x1F;
1799 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1801 int g1= (d1>>5)&0x3F;
1802 int r1= (d1>>11)&0x1F;
1804 int d2= src2[i*4] + (src2[i*4+1]<<8);
1806 int g2= (d2>>5)&0x3F;
1807 int r2= (d2>>11)&0x1F;
1809 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1811 int g3= (d3>>5)&0x3F;
1812 int r3= (d3>>11)&0x1F;
1814 int b= b0 + b1 + b2 + b3;
1815 int g= g0 + g1 + g2 + g3;
1816 int r= r0 + r1 + r2 + r3;
1818 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1819 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
/*
 * bgr15ToY - convert a BGR15 (5-5-5) line to luma. r is bits 14..10; the
 * g/b extraction lines are missing from this extract. All channels are
 * 5-bit, hence the uniform RGB2YUV_SHIFT-3 scaling.
 */
1823 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1826 for(i=0; i<width; i++)
1828 int d= src[i*2] + (src[i*2+1]<<8);
1831 int r= (d>>10)&0x1F;
1833 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/*
 * bgr15ToUV - convert two BGR15 lines to subsampled chroma, one output
 * sample per 2x2 pixel block. As in bgr16ToUV, a masked-parallel-sum branch
 * (masks 0x03E07C1F / 0x03E0F81F for the 555 layout) and a per-pixel decode
 * branch are both visible; the selecting #if/#else and the b0..b3 extraction
 * lines are missing from this extract.
 */
1837 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1840 for(i=0; i<width; i++)
1843 int d0= le2me_32( ((uint32_t*)src1)[i] );
1844 int d1= le2me_32( ((uint32_t*)src2)[i] );
1846 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1847 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1849 int dh2= (dh>>11) + (dh<<21);
1853 int r= (d>>10)&0x7F;
1856 int d0= src1[i*4] + (src1[i*4+1]<<8);
1858 int g0= (d0>>5)&0x1F;
1859 int r0= (d0>>10)&0x1F;
1861 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1863 int g1= (d1>>5)&0x1F;
1864 int r1= (d1>>10)&0x1F;
1866 int d2= src2[i*4] + (src2[i*4+1]<<8);
1868 int g2= (d2>>5)&0x1F;
1869 int r2= (d2>>10)&0x1F;
1871 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1873 int g3= (d3>>5)&0x1F;
1874 int r3= (d3>>10)&0x1F;
1876 int b= b0 + b1 + b2 + b3;
1877 int g= g0 + g1 + g2 + g3;
1878 int r= r0 + r1 + r2 + r3;
1880 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1881 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
/*
 * rgb32ToY - RGB32 line to luma; same formula as bgr32ToY, with the channel
 * byte order swapped (extraction lines missing from this extract).
 */
1886 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1889 for(i=0; i<width; i++)
1895 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/*
 * rgb32ToUV - two RGB32 lines to subsampled chroma; mirror image of
 * bgr32ToUV with r at byte offset 0 and b at offset 2.
 */
1899 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1902 for(i=0; i<width; i++)
1904 int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1905 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1906 int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1908 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1909 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * rgb24ToY - RGB24 line to luma; channel extraction lines are missing from
 * this extract (mirror of bgr24ToY's scalar fallback).
 */
1913 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
1916 for(i=0; i<width; i++)
1922 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/*
 * rgb24ToUV - two RGB24 lines to subsampled chroma; mirror image of
 * bgr24ToUV's scalar fallback with r first and b last within each pixel.
 */
1926 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1929 for(i=0; i<width; i++)
1931 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1932 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1933 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1935 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1936 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1941 // Bilinear / Bicubic scaling
/*
 * hScale - generic horizontal scaler: for each output sample i,
 * dst[i] = sum_j src[filterPos[i]+j] * filter[i*filterSize+j], scaled to
 * a clipped 15-bit fixed-point value (C fallback: >>7, clamp to 0..32767).
 * Three MMX fast paths are specialized on filterSize (4, 8, generic loop);
 * each uses pmaddwd dot products, psrad $8 precision reduction, and a w02
 * pmaddwd to sum the lane pairs. The size-4 and size-8 paths push/pop ebp
 * to get a 7th register, indexing dst by a negative counter (-2*dstW).
 * NOTE(review): asm statement heads, loop labels and pop/emms epilogues are
 * missing from this extract; code kept byte-identical, comments only.
 */
1942 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1943 int16_t *filter, int16_t *filterPos, int filterSize)
1946 if(filterSize==4) // always true for upscaling, sometimes for down too
1948 int counter= -2*dstW;
/* Bias filterPos so (filterPos + counter/2) indexes from the start. */
1950 filterPos-= counter/2;
1953 "pxor %%mm7, %%mm7 \n\t"
1954 "movq "MANGLE(w02)", %%mm6 \n\t"
1955 "pushl %%ebp \n\t" // we use 7 regs here ...
1956 "movl %%eax, %%ebp \n\t"
1959 "movzwl (%2, %%ebp), %%eax \n\t"
1960 "movzwl 2(%2, %%ebp), %%ebx \n\t"
1961 "movq (%1, %%ebp, 4), %%mm1 \n\t"
1962 "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
1963 "movd (%3, %%eax), %%mm0 \n\t"
1964 "movd (%3, %%ebx), %%mm2 \n\t"
1965 "punpcklbw %%mm7, %%mm0 \n\t"
1966 "punpcklbw %%mm7, %%mm2 \n\t"
1967 "pmaddwd %%mm1, %%mm0 \n\t"
1968 "pmaddwd %%mm2, %%mm3 \n\t"
1969 "psrad $8, %%mm0 \n\t"
1970 "psrad $8, %%mm3 \n\t"
1971 "packssdw %%mm3, %%mm0 \n\t"
1972 "pmaddwd %%mm6, %%mm0 \n\t"
1973 "packssdw %%mm0, %%mm0 \n\t"
1974 "movd %%mm0, (%4, %%ebp) \n\t"
1975 "addl $4, %%ebp \n\t"
1980 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1984 else if(filterSize==8)
1986 int counter= -2*dstW;
1988 filterPos-= counter/2;
1991 "pxor %%mm7, %%mm7 \n\t"
1992 "movq "MANGLE(w02)", %%mm6 \n\t"
1993 "pushl %%ebp \n\t" // we use 7 regs here ...
1994 "movl %%eax, %%ebp \n\t"
1997 "movzwl (%2, %%ebp), %%eax \n\t"
1998 "movzwl 2(%2, %%ebp), %%ebx \n\t"
1999 "movq (%1, %%ebp, 8), %%mm1 \n\t"
2000 "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
2001 "movd (%3, %%eax), %%mm0 \n\t"
2002 "movd (%3, %%ebx), %%mm2 \n\t"
2003 "punpcklbw %%mm7, %%mm0 \n\t"
2004 "punpcklbw %%mm7, %%mm2 \n\t"
2005 "pmaddwd %%mm1, %%mm0 \n\t"
2006 "pmaddwd %%mm2, %%mm3 \n\t"
/* Second half of the 8-tap filter (taps 4..7). */
2008 "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
2009 "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
2010 "movd 4(%3, %%eax), %%mm4 \n\t"
2011 "movd 4(%3, %%ebx), %%mm2 \n\t"
2012 "punpcklbw %%mm7, %%mm4 \n\t"
2013 "punpcklbw %%mm7, %%mm2 \n\t"
2014 "pmaddwd %%mm1, %%mm4 \n\t"
2015 "pmaddwd %%mm2, %%mm5 \n\t"
2016 "paddd %%mm4, %%mm0 \n\t"
2017 "paddd %%mm5, %%mm3 \n\t"
2019 "psrad $8, %%mm0 \n\t"
2020 "psrad $8, %%mm3 \n\t"
2021 "packssdw %%mm3, %%mm0 \n\t"
2022 "pmaddwd %%mm6, %%mm0 \n\t"
2023 "packssdw %%mm0, %%mm0 \n\t"
2024 "movd %%mm0, (%4, %%ebp) \n\t"
2025 "addl $4, %%ebp \n\t"
2030 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* Generic filterSize: inner loop accumulates 4 taps per iteration into
   mm4/mm5 until the filter pointer reaches src+filterSize (%4). */
2036 int counter= -2*dstW;
2037 // filter-= counter*filterSize/2;
2038 filterPos-= counter/2;
2041 "pxor %%mm7, %%mm7 \n\t"
2042 "movq "MANGLE(w02)", %%mm6 \n\t"
2045 "movl %2, %%ecx \n\t"
2046 "movzwl (%%ecx, %0), %%eax \n\t"
2047 "movzwl 2(%%ecx, %0), %%ebx \n\t"
2048 "movl %5, %%ecx \n\t"
2049 "pxor %%mm4, %%mm4 \n\t"
2050 "pxor %%mm5, %%mm5 \n\t"
2052 "movq (%1), %%mm1 \n\t"
2053 "movq (%1, %6), %%mm3 \n\t"
2054 "movd (%%ecx, %%eax), %%mm0 \n\t"
2055 "movd (%%ecx, %%ebx), %%mm2 \n\t"
2056 "punpcklbw %%mm7, %%mm0 \n\t"
2057 "punpcklbw %%mm7, %%mm2 \n\t"
2058 "pmaddwd %%mm1, %%mm0 \n\t"
2059 "pmaddwd %%mm2, %%mm3 \n\t"
2060 "paddd %%mm3, %%mm5 \n\t"
2061 "paddd %%mm0, %%mm4 \n\t"
2063 "addl $4, %%ecx \n\t"
2064 "cmpl %4, %%ecx \n\t"
2067 "psrad $8, %%mm4 \n\t"
2068 "psrad $8, %%mm5 \n\t"
2069 "packssdw %%mm5, %%mm4 \n\t"
2070 "pmaddwd %%mm6, %%mm4 \n\t"
2071 "packssdw %%mm4, %%mm4 \n\t"
2072 "movl %3, %%eax \n\t"
2073 "movd %%mm4, (%%eax, %0) \n\t"
2077 : "+r" (counter), "+r" (filter)
2078 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2079 "m" (src), "r" (filterSize*2)
2080 : "%ebx", "%eax", "%ecx"
/* Portable C fallback. */
2085 for(i=0; i<dstW; i++)
2088 int srcPos= filterPos[i];
2090 // printf("filterPos: %d\n", filterPos[i]);
2091 for(j=0; j<filterSize; j++)
2093 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2094 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2096 // filter += hFilterSize;
2097 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2102 // *** horizontal scale Y line to temp buffer
/*
 * hyscale - horizontally scale one luma line into the 16-bit temp buffer
 * 'dst'. First, packed input formats (YUY2/BGR/RGB variants) are converted
 * to a plain 8-bit luma line in formatConvBuffer. Then one of three paths
 * runs: the generic hScale (non-fast-bilinear, or MMX2 unusable), the MMX2
 * "funny code" path (runtime-generated scaler in funnyYCode, driven by
 * mmx2Filter/mmx2FilterPos, with an edge fix-up loop afterwards), or a
 * plain x86 fixed-point bilinear loop (xInc split into integer step %3 and
 * 16-bit fraction %4, carried via addw/adcl). Output samples are 7.9-style
 * fixed point (<<7 of the 8-bit input).
 * NOTE(review): asm heads, #ifdef partners and the C fallback tail are
 * missing from this extract; code kept byte-identical, comments only.
 */
2103 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2104 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2105 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2106 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2107 int32_t *mmx2FilterPos)
2109 if(srcFormat==IMGFMT_YUY2)
2111 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2112 src= formatConvBuffer;
2114 else if(srcFormat==IMGFMT_BGR32)
2116 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2117 src= formatConvBuffer;
2119 else if(srcFormat==IMGFMT_BGR24)
2121 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2122 src= formatConvBuffer;
2124 else if(srcFormat==IMGFMT_BGR16)
2126 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2127 src= formatConvBuffer;
2129 else if(srcFormat==IMGFMT_BGR15)
2131 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2132 src= formatConvBuffer;
2134 else if(srcFormat==IMGFMT_RGB32)
2136 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2137 src= formatConvBuffer;
2139 else if(srcFormat==IMGFMT_RGB24)
2141 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2142 src= formatConvBuffer;
2146 // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86asm one)
2147 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2149 if(!(flags&SWS_FAST_BILINEAR))
2152 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2154 else // Fast Bilinear upscale / crap downscale
/* MMX2 path: jump into runtime-generated code (funnyYCode). */
2162 "pxor %%mm7, %%mm7 \n\t"
2163 "movl %0, %%ecx \n\t"
2164 "movl %1, %%edi \n\t"
2165 "movl %2, %%edx \n\t"
2166 "movl %3, %%ebx \n\t"
2167 "xorl %%eax, %%eax \n\t" // i
2168 PREFETCH" (%%ecx) \n\t"
2169 PREFETCH" 32(%%ecx) \n\t"
2170 PREFETCH" 64(%%ecx) \n\t"
2172 #define FUNNY_Y_CODE \
2173 "movl (%%ebx), %%esi \n\t"\
2175 "addl (%%ebx, %%eax), %%ecx \n\t"\
2176 "addl %%eax, %%edi \n\t"\
2177 "xorl %%eax, %%eax \n\t"\
2188 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2190 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
/* Fix up the right edge the funny code cannot produce correctly. */
2192 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2197 //NO MMX just normal asm ...
2199 "xorl %%eax, %%eax \n\t" // i
2200 "xorl %%ebx, %%ebx \n\t" // xx
2201 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
/* Unrolled x2: each half interpolates src[xx]..src[xx+1] with the 7-bit
   fraction in ecx and advances xx/fraction via add/adc carry chaining. */
2204 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2205 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2206 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2207 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2208 "shll $16, %%edi \n\t"
2209 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2210 "movl %1, %%edi \n\t"
2211 "shrl $9, %%esi \n\t"
2212 "movw %%si, (%%edi, %%eax, 2) \n\t"
2213 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2214 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2216 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2217 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2218 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2219 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2220 "shll $16, %%edi \n\t"
2221 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2222 "movl %1, %%edi \n\t"
2223 "shrl $9, %%esi \n\t"
2224 "movw %%si, 2(%%edi, %%eax, 2) \n\t"
2225 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2226 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2229 "addl $2, %%eax \n\t"
2230 "cmpl %2, %%eax \n\t"
2234 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2235 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2238 } //if MMX2 can't be used
/* Portable C fallback: 16.16 fixed-point position, 7-bit blend weight. */
2242 unsigned int xpos=0;
2243 for(i=0;i<dstWidth;i++)
2245 register unsigned int xx=xpos>>16;
2246 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2247 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
/*
 * hcscale - horizontally scale one pair of chroma lines (U and V) into the
 * 16-bit temp buffer: U goes to dst[0..], V to dst[2048..] (the MMX2/asm
 * paths address the V half as a +4096 byte offset). Packed input formats
 * are first split into two 8-bit planes in formatConvBuffer (U at +0, V at
 * +2048); gray input leaves chroma untouched. The three scaling paths
 * mirror hyscale: generic hScale per plane, the runtime-generated MMX2
 * "funny" code run once per plane with an edge fix-up, and a plain x86
 * fixed-point bilinear loop handling both planes per iteration.
 * NOTE(review): asm heads, #ifdef partners and the C fallback's enclosing
 * lines are missing from this extract; code kept byte-identical.
 */
2254 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2255 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2256 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2257 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2258 int32_t *mmx2FilterPos)
2260 if(srcFormat==IMGFMT_YUY2)
2262 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2263 src1= formatConvBuffer;
2264 src2= formatConvBuffer+2048;
2266 else if(srcFormat==IMGFMT_BGR32)
2268 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2269 src1= formatConvBuffer;
2270 src2= formatConvBuffer+2048;
2272 else if(srcFormat==IMGFMT_BGR24)
2274 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2275 src1= formatConvBuffer;
2276 src2= formatConvBuffer+2048;
2278 else if(srcFormat==IMGFMT_BGR16)
2280 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2281 src1= formatConvBuffer;
2282 src2= formatConvBuffer+2048;
2284 else if(srcFormat==IMGFMT_BGR15)
2286 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2287 src1= formatConvBuffer;
2288 src2= formatConvBuffer+2048;
2290 else if(srcFormat==IMGFMT_RGB32)
2292 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2293 src1= formatConvBuffer;
2294 src2= formatConvBuffer+2048;
2296 else if(srcFormat==IMGFMT_RGB24)
2298 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2299 src1= formatConvBuffer;
2300 src2= formatConvBuffer+2048;
2302 else if(isGray(srcFormat))
2308 // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86asm one)
2309 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2311 if(!(flags&SWS_FAST_BILINEAR))
2314 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2315 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2317 else // Fast Bilinear upscale / crap downscale
/* MMX2 path: run the generated scaler on the U plane... */
2325 "pxor %%mm7, %%mm7 \n\t"
2326 "movl %0, %%ecx \n\t"
2327 "movl %1, %%edi \n\t"
2328 "movl %2, %%edx \n\t"
2329 "movl %3, %%ebx \n\t"
2330 "xorl %%eax, %%eax \n\t" // i
2331 PREFETCH" (%%ecx) \n\t"
2332 PREFETCH" 32(%%ecx) \n\t"
2333 PREFETCH" 64(%%ecx) \n\t"
2335 #define FUNNY_UV_CODE \
2336 "movl (%%ebx), %%esi \n\t"\
2338 "addl (%%ebx, %%eax), %%ecx \n\t"\
2339 "addl %%eax, %%edi \n\t"\
2340 "xorl %%eax, %%eax \n\t"\
/* ...then again on the V plane (src2, output at +4096 bytes). */
2346 "xorl %%eax, %%eax \n\t" // i
2347 "movl %5, %%ecx \n\t" // src
2348 "movl %1, %%edi \n\t" // buf1
2349 "addl $4096, %%edi \n\t"
2350 PREFETCH" (%%ecx) \n\t"
2351 PREFETCH" 32(%%ecx) \n\t"
2352 PREFETCH" 64(%%ecx) \n\t"
2359 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2360 "m" (funnyUVCode), "m" (src2)
2361 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
/* Edge fix-up for samples the funny code over-reads past srcW-1. */
2363 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2365 // printf("%d %d %d\n", dstWidth, i, srcW);
2366 dst[i] = src1[srcW-1]*128;
2367 dst[i+2048] = src2[srcW-1]*128;
/* Plain x86 path: one iteration produces one U and one V sample. */
2374 "xorl %%eax, %%eax \n\t" // i
2375 "xorl %%ebx, %%ebx \n\t" // xx
2376 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2379 "movl %0, %%esi \n\t"
2380 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
2381 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
2382 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2383 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2384 "shll $16, %%edi \n\t"
2385 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2386 "movl %1, %%edi \n\t"
2387 "shrl $9, %%esi \n\t"
2388 "movw %%si, (%%edi, %%eax, 2) \n\t"
2390 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
2391 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
2392 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2393 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2394 "shll $16, %%edi \n\t"
2395 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2396 "movl %1, %%edi \n\t"
2397 "shrl $9, %%esi \n\t"
2398 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
2400 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2401 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2402 "addl $1, %%eax \n\t"
2403 "cmpl %2, %%eax \n\t"
2406 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
2408 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2411 } //if MMX2 can't be used
/* Portable C fallback: two variants visible - the (xalpha^127) weighting
   and the (<<7)+delta*xalpha form; the selecting #if is missing here. */
2415 unsigned int xpos=0;
2416 for(i=0;i<dstWidth;i++)
2418 register unsigned int xx=xpos>>16;
2419 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2420 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2421 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2423 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2424 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2432 static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
2433 int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
2435 /* load a few things into local vars to make the code more readable and faster */
2436 const int srcW= c->srcW;
2437 const int dstW= c->dstW;
2438 const int dstH= c->dstH;
2439 const int chrDstW= c->chrDstW;
2440 const int chrSrcW= c->chrSrcW;
2441 const int lumXInc= c->lumXInc;
2442 const int chrXInc= c->chrXInc;
2443 const int dstFormat= c->dstFormat;
2444 const int srcFormat= c->srcFormat;
2445 const int flags= c->flags;
2446 const int canMMX2BeUsed= c->canMMX2BeUsed;
2447 int16_t *vLumFilterPos= c->vLumFilterPos;
2448 int16_t *vChrFilterPos= c->vChrFilterPos;
2449 int16_t *hLumFilterPos= c->hLumFilterPos;
2450 int16_t *hChrFilterPos= c->hChrFilterPos;
2451 int16_t *vLumFilter= c->vLumFilter;
2452 int16_t *vChrFilter= c->vChrFilter;
2453 int16_t *hLumFilter= c->hLumFilter;
2454 int16_t *hChrFilter= c->hChrFilter;
2455 int16_t *lumMmxFilter= c->lumMmxFilter;
2456 int16_t *chrMmxFilter= c->chrMmxFilter;
2457 const int vLumFilterSize= c->vLumFilterSize;
2458 const int vChrFilterSize= c->vChrFilterSize;
2459 const int hLumFilterSize= c->hLumFilterSize;
2460 const int hChrFilterSize= c->hChrFilterSize;
2461 int16_t **lumPixBuf= c->lumPixBuf;
2462 int16_t **chrPixBuf= c->chrPixBuf;
2463 const int vLumBufSize= c->vLumBufSize;
2464 const int vChrBufSize= c->vChrBufSize;
2465 uint8_t *funnyYCode= c->funnyYCode;
2466 uint8_t *funnyUVCode= c->funnyUVCode;
2467 uint8_t *formatConvBuffer= c->formatConvBuffer;
2468 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2469 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2471 /* vars which will change and which we need to store back in the context */
2473 int lumBufIndex= c->lumBufIndex;
2474 int chrBufIndex= c->chrBufIndex;
2475 int lastInLumBuf= c->lastInLumBuf;
2476 int lastInChrBuf= c->lastInChrBuf;
2482 orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
2483 orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
2485 if(isPacked(c->srcFormat)){
2488 src[2]= srcParam[0];
2491 srcStride[2]= srcStrideParam[0];
2493 srcStride[1]<<= c->vChrDrop;
2494 srcStride[2]<<= c->vChrDrop;
2496 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2497 // (int)dst[0], (int)dst[1], (int)dst[2]);
2499 #if 0 //self test FIXME move to a vfilter or something
2501 static volatile int i=0;
2503 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2504 selfTest(src, srcStride, c->srcW, c->srcH);
2509 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2510 //dstStride[0],dstStride[1],dstStride[2]);
2512 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2514 static int firstTime=1; //FIXME move this into the context perhaps
2515 if(flags & SWS_PRINT_INFO && firstTime)
2517 mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: Warning: dstStride is not aligned!\n"
2518 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2523 /* Note: the user might start scaling the picture in the middle, so this will not get executed;
2524 this is not really intended but works currently, so people might do it */
2533 for(;dstY < dstH; dstY++){
2534 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2535 const int chrDstY= dstY>>c->chrDstVSubSample;
2536 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2537 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2539 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2540 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2541 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2542 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2544 //handle holes (FAST_BILINEAR & weird filters)
2545 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2546 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2547 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2548 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2549 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2551 // Do we have enough lines in this slice to output the dstY line
2552 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2554 //Do horizontal scaling
2555 while(lastInLumBuf < lastLumSrcY)
2557 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2559 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2560 ASSERT(lumBufIndex < 2*vLumBufSize)
2561 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2562 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2563 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2564 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2565 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2566 funnyYCode, c->srcFormat, formatConvBuffer,
2567 c->lumMmx2Filter, c->lumMmx2FilterPos);
2570 while(lastInChrBuf < lastChrSrcY)
2572 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2573 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2575 ASSERT(chrBufIndex < 2*vChrBufSize)
2576 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2577 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2578 //FIXME replace parameters through context struct (some at least)
2580 if(!(isGray(srcFormat) || isGray(dstFormat)))
2581 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2582 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2583 funnyUVCode, c->srcFormat, formatConvBuffer,
2584 c->chrMmx2Filter, c->chrMmx2FilterPos);
2587 //wrap buf index around to stay inside the ring buffer
2588 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2589 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2591 else // not enough lines left in this slice -> load the rest in the buffer
2593 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2594 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2595 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2596 vChrBufSize, vLumBufSize);*/
2598 //Do horizontal scaling
2599 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2601 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2603 ASSERT(lumBufIndex < 2*vLumBufSize)
2604 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2605 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2606 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2607 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2608 funnyYCode, c->srcFormat, formatConvBuffer,
2609 c->lumMmx2Filter, c->lumMmx2FilterPos);
2612 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2614 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2615 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2617 ASSERT(chrBufIndex < 2*vChrBufSize)
2618 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2619 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2621 if(!(isGray(srcFormat) || isGray(dstFormat)))
2622 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2623 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2624 funnyUVCode, c->srcFormat, formatConvBuffer,
2625 c->chrMmx2Filter, c->chrMmx2FilterPos);
2628 //wrap buf index around to stay inside the ring buffer
2629 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2630 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2631 break; //we cant output a dstY line so lets try with the next slice
2635 b5Dither= dither8[dstY&1];
2636 g6Dither= dither4[dstY&1];
2637 g5Dither= dither8[dstY&1];
2638 r5Dither= dither8[(dstY+1)&1];
2642 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2644 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2645 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2646 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2648 int16_t *lumBuf = lumPixBuf[0];
2649 int16_t *chrBuf= chrPixBuf[0];
2650 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2654 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2655 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2657 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2658 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2659 dest, uDest, vDest, dstW, chrDstW,
2660 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+chrDstY*vChrFilterSize*4);
2665 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2666 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2668 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2669 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2670 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2672 int chrAlpha= vChrFilter[2*dstY+1];
2674 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2675 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2677 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2679 int lumAlpha= vLumFilter[2*dstY+1];
2680 int chrAlpha= vChrFilter[2*dstY+1];
2682 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2683 dest, dstW, lumAlpha, chrAlpha, dstY);
2687 RENAME(yuv2packedX)(c,
2688 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2689 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2691 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4, dstY);
2695 else // hmm looks like we cant use MMX here without overwriting this arrays tail
2697 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2698 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2699 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2701 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2702 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2704 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2705 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2706 dest, uDest, vDest, dstW, chrDstW);
2710 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2711 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2713 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2714 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2721 __asm __volatile(SFENCE:::"memory");
2722 __asm __volatile(EMMS:::"memory");
2724 /* store changed local vars back in the context */
2726 c->lumBufIndex= lumBufIndex;
2727 c->chrBufIndex= chrBufIndex;
2728 c->lastInLumBuf= lastInLumBuf;
2729 c->lastInChrBuf= lastInChrBuf;