2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 /* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */
/*
 * Instruction-name string macros for the CPU-dispatched inline asm below:
 * 3DNow, MMX2 and plain-MMX variants of prefetch, sfence, byte-average
 * (PAVGB) and non-temporal store (MOVNTQ).
 * NOTE(review): the #if/#elif/#else/#endif scaffolding that selects among
 * these alternatives is missing from this extract, so several of the
 * #define lines below appear as conflicting redefinitions.
 */
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #elif defined ( HAVE_MMX2 )
37 #define PREFETCH "prefetchnta"
38 #define PREFETCHW "prefetcht0"
40 #define PREFETCH "/nop"
41 #define PREFETCHW "/nop"
45 #define SFENCE "sfence"
51 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
52 #elif defined (HAVE_3DNOW)
53 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
57 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
59 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
/*
 * YSCALEYUV2YV12X(x): vertical-filter loop for one YV12 plane, emitted as
 * an inline-asm fragment.  %%edx walks a (negative) tap counter up toward
 * zero; for each tap, the 16-bit samples at byte offset `x` (and x+8) of
 * the current source row are multiplied by the 16-bit coefficient with
 * pmulhw and accumulated in mm3/mm4.  The sums are then >>3, packed to
 * unsigned bytes (packuswb) and streamed to the destination with MOVNTQ.
 * A caller below passes x=4096 for the second chroma plane — presumably
 * V stored 2048 int16 entries after U; confirm against the full file.
 * NOTE(review): the loop labels and conditional branches that the
 * "addl $1, %%edx" / "cmpl %4, %%eax" pairs feed are missing from this
 * extract.
 */
62 #define YSCALEYUV2YV12X(x) \
63 "xorl %%eax, %%eax \n\t"\
64 "pxor %%mm3, %%mm3 \n\t"\
65 "pxor %%mm4, %%mm4 \n\t"\
66 "movl %0, %%edx \n\t"\
67 ".balign 16 \n\t" /* FIXME Unroll? */\
69 "movl (%1, %%edx, 4), %%esi \n\t"\
70 "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
71 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
72 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
73 "pmulhw %%mm0, %%mm2 \n\t"\
74 "pmulhw %%mm0, %%mm5 \n\t"\
75 "paddw %%mm2, %%mm3 \n\t"\
76 "paddw %%mm5, %%mm4 \n\t"\
77 "addl $1, %%edx \n\t"\
79 "psraw $3, %%mm3 \n\t"\
80 "psraw $3, %%mm4 \n\t"\
81 "packuswb %%mm4, %%mm3 \n\t"\
82 MOVNTQ(%%mm3, (%3, %%eax))\
83 "addl $8, %%eax \n\t"\
84 "cmpl %4, %%eax \n\t"\
85 "pxor %%mm3, %%mm3 \n\t"\
86 "pxor %%mm4, %%mm4 \n\t"\
87 "movl %0, %%edx \n\t"\
/* YSCALEYUV2YV121: one-tap fast path — each 16-bit sample is simply  */\
/* shifted down by 7, packed to unsigned bytes (packuswb saturates)   */\
/* and streamed out with MOVNTQ; no coefficient multiply is needed.   */\
/* NOTE(review): the loop label and terminating branch are missing    */\
/* from this extract.                                                 */\
90 #define YSCALEYUV2YV121 \
91 "movl %2, %%eax \n\t"\
92 ".balign 16 \n\t" /* FIXME Unroll? */\
94 "movq (%0, %%eax, 2), %%mm0 \n\t"\
95 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
96 "psraw $7, %%mm0 \n\t"\
97 "psraw $7, %%mm1 \n\t"\
98 "packuswb %%mm1, %%mm0 \n\t"\
99 MOVNTQ(%%mm0, (%1, %%eax))\
100 "addl $8, %%eax \n\t"\
104 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
105 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
106 "r" (dest), "m" (dstW),
107 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
108 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/*
 * YSCALEYUV2RGBX: vertical scale + YUV->RGB in one pass.  First loop sums
 * the chroma taps into mm3 (U) / mm4 (V) — V samples are read 4096 bytes
 * past U; second loop sums the luma taps into mm1 (Y1) / mm7 (Y2).  Then
 * the yuv2rgb matrix is applied via pmulhw against the MANGLE()d constant
 * tables (w400/w80 biases, yCoeff/ubCoeff/ugCoeff/vgCoeff/vrCoeff) and the
 * results are interleaved (punpcklwd/punpckhwd) and packed so that on exit
 * mm2/mm4/mm5 hold packed B/G/R bytes (mm0/mm3/mm6 carry the high pixels
 * before packing) and mm7 is zeroed, as a WRITEBGR* macro expects.
 * NOTE(review): loop labels/branches between the sections are missing
 * from this extract.
 */
110 #define YSCALEYUV2RGBX \
111 "xorl %%eax, %%eax \n\t"\
114 "movl %1, %%edx \n\t" /* -chrFilterSize */\
115 "movl %3, %%ebx \n\t" /* chrMmxFilter+chrFilterSize */\
116 "movl %7, %%ecx \n\t" /* chrSrc+chrFilterSize */\
117 "pxor %%mm3, %%mm3 \n\t"\
118 "pxor %%mm4, %%mm4 \n\t"\
120 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
121 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
122 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
123 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
124 "pmulhw %%mm0, %%mm2 \n\t"\
125 "pmulhw %%mm0, %%mm5 \n\t"\
126 "paddw %%mm2, %%mm3 \n\t"\
127 "paddw %%mm5, %%mm4 \n\t"\
128 "addl $1, %%edx \n\t"\
131 "movl %0, %%edx \n\t" /* -lumFilterSize */\
132 "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
133 "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
134 "pxor %%mm1, %%mm1 \n\t"\
135 "pxor %%mm7, %%mm7 \n\t"\
137 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
138 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
139 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
140 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
141 "pmulhw %%mm0, %%mm2 \n\t"\
142 "pmulhw %%mm0, %%mm5 \n\t"\
143 "paddw %%mm2, %%mm1 \n\t"\
144 "paddw %%mm5, %%mm7 \n\t"\
145 "addl $1, %%edx \n\t"\
148 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
149 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
150 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
151 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
152 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
153 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
154 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
155 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
156 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
157 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
158 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
159 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
160 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
161 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
162 "paddw %%mm3, %%mm4 \n\t"\
163 "movq %%mm2, %%mm0 \n\t"\
164 "movq %%mm5, %%mm6 \n\t"\
165 "movq %%mm4, %%mm3 \n\t"\
166 "punpcklwd %%mm2, %%mm2 \n\t"\
167 "punpcklwd %%mm5, %%mm5 \n\t"\
168 "punpcklwd %%mm4, %%mm4 \n\t"\
169 "paddw %%mm1, %%mm2 \n\t"\
170 "paddw %%mm1, %%mm5 \n\t"\
171 "paddw %%mm1, %%mm4 \n\t"\
172 "punpckhwd %%mm0, %%mm0 \n\t"\
173 "punpckhwd %%mm6, %%mm6 \n\t"\
174 "punpckhwd %%mm3, %%mm3 \n\t"\
175 "paddw %%mm7, %%mm0 \n\t"\
176 "paddw %%mm7, %%mm6 \n\t"\
177 "paddw %%mm7, %%mm3 \n\t"\
178 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
179 "packuswb %%mm0, %%mm2 \n\t"\
180 "packuswb %%mm6, %%mm5 \n\t"\
181 "packuswb %%mm3, %%mm4 \n\t"\
182 "pxor %%mm7, %%mm7 \n\t"
/*
 * FULL_YSCALEYUV2RGB: vertical bilinear blend of two luma rows (%0/%1,
 * weight %6 = yalpha1) and two chroma rows (%2/%3, weight %7 = uvalpha1),
 * followed by the yuv2rgb matrix.  The blend uses the identity
 * a*w + b*(1-w) = (a-b)*w + b, computed as psubw + pmulhw + paddw.
 * V chroma is read 4096 bytes past U.  On exit mm3/mm1/mm0 hold B/G/R,
 * each packuswb'd against itself (same bytes in both halves).
 * NOTE(review): the loop label and branch are missing from this extract.
 */
184 #define FULL_YSCALEYUV2RGB \
185 "pxor %%mm7, %%mm7 \n\t"\
186 "movd %6, %%mm6 \n\t" /*yalpha1*/\
187 "punpcklwd %%mm6, %%mm6 \n\t"\
188 "punpcklwd %%mm6, %%mm6 \n\t"\
189 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
190 "punpcklwd %%mm5, %%mm5 \n\t"\
191 "punpcklwd %%mm5, %%mm5 \n\t"\
192 "xorl %%eax, %%eax \n\t"\
195 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
196 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
197 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
198 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
199 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
200 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
201 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
202 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
203 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
204 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
205 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
206 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
207 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
208 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
209 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
210 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
211 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
212 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
215 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
216 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
217 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
218 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
219 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
220 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
221 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
224 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
225 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
226 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
227 "paddw %%mm1, %%mm3 \n\t" /* B*/\
228 "paddw %%mm1, %%mm0 \n\t" /* R*/\
229 "packuswb %%mm3, %%mm3 \n\t"\
231 "packuswb %%mm0, %%mm0 \n\t"\
232 "paddw %%mm4, %%mm2 \n\t"\
233 "paddw %%mm2, %%mm1 \n\t" /* G*/\
235 "packuswb %%mm1, %%mm1 \n\t"
/*
 * YSCALEYUV2RGB: vertical bilinear interpolation between two luma rows
 * (%0/%1) and two chroma rows (%2/%3) followed by yuv2rgb, two luma
 * octets (Y1/Y2) per iteration.  The broadcast yalpha1/uvalpha1 weights
 * are spilled to scratch space at byte offsets 3968/3976 past %2 —
 * presumably spare room at the end of the uvbuf0 allocation; confirm
 * against the full file.  Exit register layout matches YSCALEYUV2RGBX:
 * packed B/G/R bytes in mm2/mm4/mm5, mm7 zeroed for the WRITEBGR* macros.
 * NOTE(review): the loop label and branch are missing from this extract.
 */
237 #define YSCALEYUV2RGB \
238 "movd %6, %%mm6 \n\t" /*yalpha1*/\
239 "punpcklwd %%mm6, %%mm6 \n\t"\
240 "punpcklwd %%mm6, %%mm6 \n\t"\
241 "movq %%mm6, 3968(%2) \n\t"\
242 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
243 "punpcklwd %%mm5, %%mm5 \n\t"\
244 "punpcklwd %%mm5, %%mm5 \n\t"\
245 "movq %%mm5, 3976(%2) \n\t"\
246 "xorl %%eax, %%eax \n\t"\
249 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
250 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
251 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
252 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
253 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
254 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
255 "movq 3976(%2), %%mm0 \n\t"\
256 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
257 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
258 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
259 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
260 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
261 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
262 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
263 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
264 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
265 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
266 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
267 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
268 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
269 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
270 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
271 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
272 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
273 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
274 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
275 "pmulhw 3968(%2), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
276 "pmulhw 3968(%2), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
277 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
278 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
279 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
280 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
281 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
282 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
283 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
284 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
285 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
286 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
287 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
288 "paddw %%mm3, %%mm4 \n\t"\
289 "movq %%mm2, %%mm0 \n\t"\
290 "movq %%mm5, %%mm6 \n\t"\
291 "movq %%mm4, %%mm3 \n\t"\
292 "punpcklwd %%mm2, %%mm2 \n\t"\
293 "punpcklwd %%mm5, %%mm5 \n\t"\
294 "punpcklwd %%mm4, %%mm4 \n\t"\
295 "paddw %%mm1, %%mm2 \n\t"\
296 "paddw %%mm1, %%mm5 \n\t"\
297 "paddw %%mm1, %%mm4 \n\t"\
298 "punpckhwd %%mm0, %%mm0 \n\t"\
299 "punpckhwd %%mm6, %%mm6 \n\t"\
300 "punpckhwd %%mm3, %%mm3 \n\t"\
301 "paddw %%mm7, %%mm0 \n\t"\
302 "paddw %%mm7, %%mm6 \n\t"\
303 "paddw %%mm7, %%mm3 \n\t"\
304 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
305 "packuswb %%mm0, %%mm2 \n\t"\
306 "packuswb %%mm6, %%mm5 \n\t"\
307 "packuswb %%mm3, %%mm4 \n\t"\
308 "pxor %%mm7, %%mm7 \n\t"
/*
 * YSCALEYUV2RGB1: single-row variant — no vertical interpolation; the one
 * luma row (%0) and one chroma row (%2, V at +4096 bytes) are just >>4
 * and fed through the same yuv2rgb matrix as YSCALEYUV2RGB.  Same exit
 * register layout (packed B/G/R in mm2/mm4/mm5, mm7 = 0).
 * NOTE(review): the loop label and branch are missing from this extract.
 */
310 #define YSCALEYUV2RGB1 \
311 "xorl %%eax, %%eax \n\t"\
314 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
315 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
316 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
317 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
318 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
319 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
320 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
321 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
322 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
323 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
324 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
325 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
326 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
327 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
328 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
329 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
330 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
331 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
332 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
333 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
334 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
335 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
336 "paddw %%mm3, %%mm4 \n\t"\
337 "movq %%mm2, %%mm0 \n\t"\
338 "movq %%mm5, %%mm6 \n\t"\
339 "movq %%mm4, %%mm3 \n\t"\
340 "punpcklwd %%mm2, %%mm2 \n\t"\
341 "punpcklwd %%mm5, %%mm5 \n\t"\
342 "punpcklwd %%mm4, %%mm4 \n\t"\
343 "paddw %%mm1, %%mm2 \n\t"\
344 "paddw %%mm1, %%mm5 \n\t"\
345 "paddw %%mm1, %%mm4 \n\t"\
346 "punpckhwd %%mm0, %%mm0 \n\t"\
347 "punpckhwd %%mm6, %%mm6 \n\t"\
348 "punpckhwd %%mm3, %%mm3 \n\t"\
349 "paddw %%mm7, %%mm0 \n\t"\
350 "paddw %%mm7, %%mm6 \n\t"\
351 "paddw %%mm7, %%mm3 \n\t"\
352 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
353 "packuswb %%mm0, %%mm2 \n\t"\
354 "packuswb %%mm6, %%mm5 \n\t"\
355 "packuswb %%mm3, %%mm4 \n\t"\
356 "pxor %%mm7, %%mm7 \n\t"
358 // do vertical chrominance interpolation
/*
 * YSCALEYUV2RGB1b: like YSCALEYUV2RGB1 but with vertical chroma
 * interpolation — the two chroma rows are averaged (paddw + psrlw $5,
 * flagged FIXME for possible overflow) while luma still comes from the
 * single row %0.  Same exit register layout as the other YSCALEYUV2RGB*
 * macros (packed B/G/R in mm2/mm4/mm5, mm7 = 0).
 * NOTE(review): the loop label and branch are missing from this extract.
 */
359 #define YSCALEYUV2RGB1b \
360 "xorl %%eax, %%eax \n\t"\
363 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
364 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
365 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
366 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
367 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
368 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
369 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
370 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
371 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
372 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
373 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
374 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
375 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
376 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
377 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
378 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
379 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
380 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
381 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
382 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
383 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
384 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
385 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
386 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
387 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
388 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
389 "paddw %%mm3, %%mm4 \n\t"\
390 "movq %%mm2, %%mm0 \n\t"\
391 "movq %%mm5, %%mm6 \n\t"\
392 "movq %%mm4, %%mm3 \n\t"\
393 "punpcklwd %%mm2, %%mm2 \n\t"\
394 "punpcklwd %%mm5, %%mm5 \n\t"\
395 "punpcklwd %%mm4, %%mm4 \n\t"\
396 "paddw %%mm1, %%mm2 \n\t"\
397 "paddw %%mm1, %%mm5 \n\t"\
398 "paddw %%mm1, %%mm4 \n\t"\
399 "punpckhwd %%mm0, %%mm0 \n\t"\
400 "punpckhwd %%mm6, %%mm6 \n\t"\
401 "punpckhwd %%mm3, %%mm3 \n\t"\
402 "paddw %%mm7, %%mm0 \n\t"\
403 "paddw %%mm7, %%mm6 \n\t"\
404 "paddw %%mm7, %%mm3 \n\t"\
405 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
406 "packuswb %%mm0, %%mm2 \n\t"\
407 "packuswb %%mm6, %%mm5 \n\t"\
408 "packuswb %%mm3, %%mm4 \n\t"\
409 "pxor %%mm7, %%mm7 \n\t"
/*
 * Body of the 32bpp writer (its "#define WRITEBGR32" line is missing from
 * this extract): interleaves the packed B/G/R byte registers into 0RGB
 * dwords (punpcklbw/punpckhbw, then punpcklwd/punpckhwd) and streams four
 * quadwords — eight pixels — per iteration via MOVNTQ.
 * NOTE(review): the loop branch fed by "cmpl %5, %%eax" is also missing.
 */
412 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
413 "movq %%mm2, %%mm1 \n\t" /* B */\
414 "movq %%mm5, %%mm6 \n\t" /* R */\
415 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
416 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
417 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
418 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
419 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
420 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
421 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
422 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
423 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
424 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
426 MOVNTQ(%%mm0, (%4, %%eax, 4))\
427 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
428 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
429 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
431 "addl $8, %%eax \n\t"\
432 "cmpl %5, %%eax \n\t"\
/* Body of the 16bpp (565) writer — its #define line is missing from    */\
/* this extract.  Masks B and R to 5 bits (bF8) and G to 6 bits (bFC),  */\
/* shifts the fields into place and ORs them into 16-bit pixels, two    */\
/* quadwords (8 pixels) per iteration.  The loop branch is missing too. */\
436 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
437 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
438 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
439 "psrlq $3, %%mm2 \n\t"\
441 "movq %%mm2, %%mm1 \n\t"\
442 "movq %%mm4, %%mm3 \n\t"\
444 "punpcklbw %%mm7, %%mm3 \n\t"\
445 "punpcklbw %%mm5, %%mm2 \n\t"\
446 "punpckhbw %%mm7, %%mm4 \n\t"\
447 "punpckhbw %%mm5, %%mm1 \n\t"\
449 "psllq $3, %%mm3 \n\t"\
450 "psllq $3, %%mm4 \n\t"\
452 "por %%mm3, %%mm2 \n\t"\
453 "por %%mm4, %%mm1 \n\t"\
455 MOVNTQ(%%mm2, (%4, %%eax, 2))\
456 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
458 "addl $8, %%eax \n\t"\
459 "cmpl %5, %%eax \n\t"\
/* Body of the 15bpp (555) writer — its #define line is missing from    */\
/* this extract.  Masks B/G/R each to 5 bits (bF8), drops R's low bit   */\
/* (psrlq $1), shifts fields into place and ORs them into 16-bit        */\
/* pixels.  The loop branch is missing too.                             */\
463 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
464 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
465 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
466 "psrlq $3, %%mm2 \n\t"\
467 "psrlq $1, %%mm5 \n\t"\
469 "movq %%mm2, %%mm1 \n\t"\
470 "movq %%mm4, %%mm3 \n\t"\
472 "punpcklbw %%mm7, %%mm3 \n\t"\
473 "punpcklbw %%mm5, %%mm2 \n\t"\
474 "punpckhbw %%mm7, %%mm4 \n\t"\
475 "punpckhbw %%mm5, %%mm1 \n\t"\
477 "psllq $2, %%mm3 \n\t"\
478 "psllq $2, %%mm4 \n\t"\
480 "por %%mm3, %%mm2 \n\t"\
481 "por %%mm4, %%mm1 \n\t"\
483 MOVNTQ(%%mm2, (%4, %%eax, 2))\
484 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
486 "addl $8, %%eax \n\t"\
487 "cmpl %5, %%eax \n\t"\
/* WRITEBGR24OLD: original 24bpp writer — builds 0RGB dwords, then packs */\
/* them down to a contiguous 24bpp stream with shift/mask/por sequences,  */\
/* writing three quadwords (8 pixels) to %%ebx per iteration.  Kept for   */\
/* reference; superseded by WRITEBGR24MMX / WRITEBGR24MMX2 below.         */\
/* NOTE(review): the loop branch after "cmpl %5, %%eax" is missing from   */\
/* this extract.                                                          */\
490 #define WRITEBGR24OLD \
491 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
492 "movq %%mm2, %%mm1 \n\t" /* B */\
493 "movq %%mm5, %%mm6 \n\t" /* R */\
494 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
495 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
496 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
497 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
498 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
499 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
500 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
501 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
502 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
503 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
505 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
506 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
507 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
508 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
509 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
510 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
511 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
512 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
514 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
515 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
516 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
517 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
518 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
519 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
520 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
521 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
522 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
523 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
524 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
525 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
526 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
528 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
529 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
530 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
531 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
532 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
533 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
534 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
535 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
537 MOVNTQ(%%mm0, (%%ebx))\
538 MOVNTQ(%%mm2, 8(%%ebx))\
539 MOVNTQ(%%mm3, 16(%%ebx))\
540 "addl $24, %%ebx \n\t"\
542 "addl $8, %%eax \n\t"\
543 "cmpl %5, %%eax \n\t"\
/* WRITEBGR24MMX: plain-MMX 24bpp writer.  Builds four 0RGB dwords, then */\
/* merges adjacent pixels with punpckhdq + shift/por into three packed    */\
/* quadwords (8 pixels, 24 bytes) streamed to %%ebx per iteration.        */\
/* NOTE(review): the loop branch after "cmpl %5, %%eax" is missing from   */\
/* this extract.                                                          */\
546 #define WRITEBGR24MMX \
547 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
548 "movq %%mm2, %%mm1 \n\t" /* B */\
549 "movq %%mm5, %%mm6 \n\t" /* R */\
550 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
551 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
552 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
553 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
554 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
555 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
556 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
557 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
558 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
559 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
561 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
562 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
563 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
564 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
566 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
567 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
568 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
569 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
571 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
572 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
573 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
574 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
576 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
577 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
578 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
579 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
580 MOVNTQ(%%mm0, (%%ebx))\
582 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
583 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
584 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
585 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
586 MOVNTQ(%%mm6, 8(%%ebx))\
588 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
589 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
590 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
591 MOVNTQ(%%mm5, 16(%%ebx))\
593 "addl $24, %%ebx \n\t"\
595 "addl $8, %%eax \n\t"\
596 "cmpl %5, %%eax \n\t"\
/* WRITEBGR24MMX2: MMX2 24bpp writer using pshufw to replicate byte      */\
/* pairs and the M24A/M24B/M24C masks to select the B/G/R lanes of each  */\
/* output quadword; three quadwords (8 pixels) go to %%ebx per           */\
/* iteration.  NOTE(review): the loop branch after "cmpl %5, %%eax" is   */\
/* missing from this extract.                                            */\
599 #define WRITEBGR24MMX2 \
600 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
601 "movq "MANGLE(M24A)", %%mm0 \n\t"\
602 "movq "MANGLE(M24C)", %%mm7 \n\t"\
603 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
604 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
605 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
607 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
608 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
609 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
611 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
612 "por %%mm1, %%mm6 \n\t"\
613 "por %%mm3, %%mm6 \n\t"\
614 MOVNTQ(%%mm6, (%%ebx))\
616 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
617 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
618 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
619 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
621 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
622 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
623 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
625 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
626 "por %%mm3, %%mm6 \n\t"\
627 MOVNTQ(%%mm6, 8(%%ebx))\
629 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
630 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
631 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
633 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
634 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
635 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
637 "por %%mm1, %%mm3 \n\t"\
638 "por %%mm3, %%mm6 \n\t"\
639 MOVNTQ(%%mm6, 16(%%ebx))\
641 "addl $24, %%ebx \n\t"\
643 "addl $8, %%eax \n\t"\
644 "cmpl %5, %%eax \n\t"\
/* Select the 24bpp writer: the pshufw-based MMX2 version when          */\
/* available, the plain-MMX version otherwise.  NOTE(review): the       */\
/* #ifdef/#else/#endif around these two lines is missing from this      */\
/* extract, which is why they appear as a redefinition here.            */\
649 #define WRITEBGR24 WRITEBGR24MMX2
652 #define WRITEBGR24 WRITEBGR24MMX
/*
 * Vertical scaling to planar YV12: applies the lum/chr filter taps to the
 * 16-bit intermediate rows in lumSrc/chrSrc and writes 8-bit planes to
 * dest (Y), uDest (U) and vDest (V).  The MMX path drives the
 * YSCALEYUV2YV12X asm — note the YSCALEYUV2YV12X(4096) invocation for the
 * V plane; the scalar fallback calls yuv2yuvXinC().
 * NOTE(review): the #ifdef scaffolding, function braces and the
 * asm volatile(...) openers for the constraint lists below are missing
 * from this extract — the ":: ..." lines are operand/clobber lists of
 * asm statements whose bodies were dropped.
 */
655 static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
656 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
657 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW,
658 int16_t * lumMmxFilter, int16_t * chrMmxFilter)
665 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
666 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (chrDstW)
667 : "%eax", "%edx", "%esi"
671 YSCALEYUV2YV12X(4096)
672 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
673 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (chrDstW)
674 : "%eax", "%edx", "%esi"
680 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
681 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
682 : "%eax", "%edx", "%esi"
685 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
686 chrFilter, chrSrc, chrFilterSize,
687 dest, uDest, vDest, dstW, chrDstW);
/*
 * 1:1 vertical path (no filtering): every 16-bit sample is just >>7 to
 * produce the 8-bit output.  V chroma is read at chrSrc + 2048 (int16
 * units), matching the 4096-byte offset used by the asm macros.  The C
 * fallback clamps u and v to 0..255 (the symmetric "<0" clamps and the
 * store lines are among the lines missing from this extract); presumably
 * the luma path clamps `val` the same way — confirm against the full
 * file.  The asm fast path (YSCALEYUV2YV121) openers are also missing.
 */
691 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
692 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
699 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
706 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
714 :: "r" (lumSrc + dstW), "r" (dest + dstW),
720 for(i=0; i<dstW; i++)
722 int val= lumSrc[i]>>7;
733 for(i=0; i<chrDstW; i++)
736 int v=chrSrc[i + 2048]>>7;
740 else if (u>255) u=255;
742 else if (v>255) v=255;
753 * vertical scale YV12 to RGB
/*
 * Vertical scale + YUV->RGB to a packed destination.  Dispatches on the
 * destination pixel format (the switch statement itself is among the
 * missing lines): the BGR24 path computes dest + eax*3 via
 * "leal (%%eax, %%eax, 2), %%ebx"; the BGR15/BGR16 paths add the
 * ordered-dither constants (b5Dither / g5Dither or g6Dither / r5Dither)
 * to the packed B/G/R registers before the WRITEBGR* packing.  The
 * scalar fallback calls yuv2rgbXinC().
 * NOTE(review): function braces, asm volatile(...) openers and the
 * YSCALEYUV2RGBX/WRITEBGR* macro invocations are missing from this
 * extract; only the operand/clobber lists survive.
 */
755 static inline void RENAME(yuv2rgbX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
756 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
757 uint8_t *dest, int dstW, int16_t * lumMmxFilter, int16_t * chrMmxFilter, int dstY)
768 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
769 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
770 "r" (dest), "m" (dstW),
771 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
772 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
780 "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
781 "addl %4, %%ebx \n\t"
784 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
785 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
786 "r" (dest), "m" (dstW),
787 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
788 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
796 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
798 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
799 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
800 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
805 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
806 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
807 "r" (dest), "m" (dstW),
808 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
809 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
817 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
819 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
820 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
821 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
826 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
827 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
828 "r" (dest), "m" (dstW),
829 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
830 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
836 yuv2rgbXinC(c, lumFilter, lumSrc, lumFilterSize,
837 chrFilter, chrSrc, chrFilterSize,
844 * vertical bilinear scale YV12 to RGB
/*
 * Vertical bilinear scale YV12 -> RGB: blends two luma rows (buf0/buf1,
 * weight yalpha) and two chroma rows (uvbuf0/uvbuf1, weight uvalpha),
 * then converts to the destination RGB format.  yalpha1 = yalpha ^ 4095
 * (and likewise uvalpha1) is the complementary 12-bit blend weight.
 * Two code paths: the SWS_FULL_CHR_H_INT branch uses FULL_YSCALEYUV2RGB
 * (full horizontal chroma resolution) with per-format packing, including
 * a BGR24 variant that stores via movntq or movd depending on a missing
 * #ifdef; the normal branch switches on c->dstFormat using YSCALEYUV2RGB
 * plus WRITEBGR*, with dither constants added for BGR15/BGR16.  The C
 * fallback interpolates and converts through the yuvtab_* lookup tables
 * and clip_table* for BGR32/BGR24/BGR16/BGR15.
 * NOTE(review): braces, asm openers, labels, several #ifdefs and the
 * closing of the function are missing from this extract.
 */
846 static inline void RENAME(yuv2rgb2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
847 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
849 int yalpha1=yalpha^4095;
850 int uvalpha1=uvalpha^4095;
854 if(flags&SWS_FULL_CHR_H_INT)
864 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
865 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
867 "movq %%mm3, %%mm1 \n\t"
868 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
869 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
871 MOVNTQ(%%mm3, (%4, %%eax, 4))
872 MOVNTQ(%%mm1, 8(%4, %%eax, 4))
874 "addl $4, %%eax \n\t"
875 "cmpl %5, %%eax \n\t"
879 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
880 "m" (yalpha1), "m" (uvalpha1)
890 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
891 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
893 "movq %%mm3, %%mm1 \n\t"
894 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
895 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
897 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
898 "psrlq $8, %%mm3 \n\t" // GR0BGR00
899 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
900 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
901 "por %%mm2, %%mm3 \n\t" // BGRBGR00
902 "movq %%mm1, %%mm2 \n\t"
903 "psllq $48, %%mm1 \n\t" // 000000BG
904 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
906 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
907 "psrld $16, %%mm2 \n\t" // R000R000
908 "psrlq $24, %%mm1 \n\t" // 0BGR0000
909 "por %%mm2, %%mm1 \n\t" // RBGRR000
911 "movl %4, %%ebx \n\t"
912 "addl %%eax, %%ebx \n\t"
916 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
917 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
919 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
920 "psrlq $32, %%mm3 \n\t"
921 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
922 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
924 "addl $4, %%eax \n\t"
925 "cmpl %5, %%eax \n\t"
928 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
929 "m" (yalpha1), "m" (uvalpha1)
938 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
939 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
940 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
942 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
943 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
944 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
946 "psrlw $3, %%mm3 \n\t"
947 "psllw $2, %%mm1 \n\t"
948 "psllw $7, %%mm0 \n\t"
949 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
950 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
952 "por %%mm3, %%mm1 \n\t"
953 "por %%mm1, %%mm0 \n\t"
955 MOVNTQ(%%mm0, (%4, %%eax, 2))
957 "addl $4, %%eax \n\t"
958 "cmpl %5, %%eax \n\t"
961 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
962 "m" (yalpha1), "m" (uvalpha1)
971 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
972 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
973 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
975 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
976 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
977 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
979 "psrlw $3, %%mm3 \n\t"
980 "psllw $3, %%mm1 \n\t"
981 "psllw $8, %%mm0 \n\t"
982 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
983 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
985 "por %%mm3, %%mm1 \n\t"
986 "por %%mm1, %%mm0 \n\t"
988 MOVNTQ(%%mm0, (%4, %%eax, 2))
990 "addl $4, %%eax \n\t"
991 "cmpl %5, %%eax \n\t"
994 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
995 "m" (yalpha1), "m" (uvalpha1)
1004 if(dstFormat==IMGFMT_BGR32)
1007 #ifdef WORDS_BIGENDIAN
1010 for(i=0;i<dstW;i++){
1011 // vertical linear interpolation && yuv2rgb in a single step:
1012 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1013 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1014 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1015 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1016 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1017 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1021 else if(dstFormat==IMGFMT_BGR24)
1024 for(i=0;i<dstW;i++){
1025 // vertical linear interpolation && yuv2rgb in a single step:
1026 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1027 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1028 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1029 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1030 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1031 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1035 else if(dstFormat==IMGFMT_BGR16)
1038 for(i=0;i<dstW;i++){
1039 // vertical linear interpolation && yuv2rgb in a single step:
1040 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1041 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1042 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1044 ((uint16_t*)dest)[i] =
1045 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1046 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1047 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1050 else if(dstFormat==IMGFMT_BGR15)
1053 for(i=0;i<dstW;i++){
1054 // vertical linear interpolation && yuv2rgb in a single step:
1055 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1056 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1057 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1059 ((uint16_t*)dest)[i] =
1060 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1061 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1062 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1070 switch(c->dstFormat)
1077 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1078 "m" (yalpha1), "m" (uvalpha1)
1084 "movl %4, %%ebx \n\t"
1088 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1089 "m" (yalpha1), "m" (uvalpha1)
1096 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1098 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1099 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1100 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1105 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1106 "m" (yalpha1), "m" (uvalpha1)
1113 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1115 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1116 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1117 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1122 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1123 "m" (yalpha1), "m" (uvalpha1)
1130 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C)
1134 * YV12 to RGB without scaling or interpolating
1136 static inline void RENAME(yuv2rgb1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1137 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1139 int uvalpha1=uvalpha^4095;
1140 const int yalpha1=0;
1143 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1144 const int yalpha= 4096; //FIXME ...
1146 if(flags&SWS_FULL_CHR_H_INT)
1148 RENAME(yuv2rgb2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1153 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1161 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1162 "m" (yalpha1), "m" (uvalpha1)
1168 "movl %4, %%ebx \n\t"
1171 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1172 "m" (yalpha1), "m" (uvalpha1)
1179 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1181 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1182 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1183 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1186 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1187 "m" (yalpha1), "m" (uvalpha1)
1194 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1196 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1197 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1198 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1202 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1203 "m" (yalpha1), "m" (uvalpha1)
1217 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1218 "m" (yalpha1), "m" (uvalpha1)
1224 "movl %4, %%ebx \n\t"
1227 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1228 "m" (yalpha1), "m" (uvalpha1)
1235 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1237 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1238 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1239 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1242 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1243 "m" (yalpha1), "m" (uvalpha1)
1250 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1252 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1253 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1254 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1258 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1259 "m" (yalpha1), "m" (uvalpha1)
1266 if( uvalpha < 2048 )
1268 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C)
1270 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C)
1274 //FIXME the yuy2* functions can read up to 7 samples too much
1276 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1280 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1281 "movl %0, %%eax \n\t"
1283 "movq (%1, %%eax,2), %%mm0 \n\t"
1284 "movq 8(%1, %%eax,2), %%mm1 \n\t"
1285 "pand %%mm2, %%mm0 \n\t"
1286 "pand %%mm2, %%mm1 \n\t"
1287 "packuswb %%mm1, %%mm0 \n\t"
1288 "movq %%mm0, (%2, %%eax) \n\t"
1289 "addl $8, %%eax \n\t"
1291 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1296 for(i=0; i<width; i++)
1301 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1303 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1305 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1306 "movl %0, %%eax \n\t"
1308 "movq (%1, %%eax,4), %%mm0 \n\t"
1309 "movq 8(%1, %%eax,4), %%mm1 \n\t"
1310 "movq (%2, %%eax,4), %%mm2 \n\t"
1311 "movq 8(%2, %%eax,4), %%mm3 \n\t"
1314 "psrlw $8, %%mm0 \n\t"
1315 "psrlw $8, %%mm1 \n\t"
1316 "packuswb %%mm1, %%mm0 \n\t"
1317 "movq %%mm0, %%mm1 \n\t"
1318 "psrlw $8, %%mm0 \n\t"
1319 "pand %%mm4, %%mm1 \n\t"
1320 "packuswb %%mm0, %%mm0 \n\t"
1321 "packuswb %%mm1, %%mm1 \n\t"
1322 "movd %%mm0, (%4, %%eax) \n\t"
1323 "movd %%mm1, (%3, %%eax) \n\t"
1324 "addl $4, %%eax \n\t"
1326 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1331 for(i=0; i<width; i++)
1333 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1334 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1339 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1341 #ifdef HAVE_MMXFIXME
1344 for(i=0; i<width; i++)
1350 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1355 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1357 #ifdef HAVE_MMXFIXME
1360 for(i=0; i<width; i++)
1362 int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1363 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1364 int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1366 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1367 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1372 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1376 "movl %2, %%eax \n\t"
1377 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1378 "movq "MANGLE(w1111)", %%mm5 \n\t"
1379 "pxor %%mm7, %%mm7 \n\t"
1380 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1383 PREFETCH" 64(%0, %%ebx) \n\t"
1384 "movd (%0, %%ebx), %%mm0 \n\t"
1385 "movd 3(%0, %%ebx), %%mm1 \n\t"
1386 "punpcklbw %%mm7, %%mm0 \n\t"
1387 "punpcklbw %%mm7, %%mm1 \n\t"
1388 "movd 6(%0, %%ebx), %%mm2 \n\t"
1389 "movd 9(%0, %%ebx), %%mm3 \n\t"
1390 "punpcklbw %%mm7, %%mm2 \n\t"
1391 "punpcklbw %%mm7, %%mm3 \n\t"
1392 "pmaddwd %%mm6, %%mm0 \n\t"
1393 "pmaddwd %%mm6, %%mm1 \n\t"
1394 "pmaddwd %%mm6, %%mm2 \n\t"
1395 "pmaddwd %%mm6, %%mm3 \n\t"
1396 #ifndef FAST_BGR2YV12
1397 "psrad $8, %%mm0 \n\t"
1398 "psrad $8, %%mm1 \n\t"
1399 "psrad $8, %%mm2 \n\t"
1400 "psrad $8, %%mm3 \n\t"
1402 "packssdw %%mm1, %%mm0 \n\t"
1403 "packssdw %%mm3, %%mm2 \n\t"
1404 "pmaddwd %%mm5, %%mm0 \n\t"
1405 "pmaddwd %%mm5, %%mm2 \n\t"
1406 "packssdw %%mm2, %%mm0 \n\t"
1407 "psraw $7, %%mm0 \n\t"
1409 "movd 12(%0, %%ebx), %%mm4 \n\t"
1410 "movd 15(%0, %%ebx), %%mm1 \n\t"
1411 "punpcklbw %%mm7, %%mm4 \n\t"
1412 "punpcklbw %%mm7, %%mm1 \n\t"
1413 "movd 18(%0, %%ebx), %%mm2 \n\t"
1414 "movd 21(%0, %%ebx), %%mm3 \n\t"
1415 "punpcklbw %%mm7, %%mm2 \n\t"
1416 "punpcklbw %%mm7, %%mm3 \n\t"
1417 "pmaddwd %%mm6, %%mm4 \n\t"
1418 "pmaddwd %%mm6, %%mm1 \n\t"
1419 "pmaddwd %%mm6, %%mm2 \n\t"
1420 "pmaddwd %%mm6, %%mm3 \n\t"
1421 #ifndef FAST_BGR2YV12
1422 "psrad $8, %%mm4 \n\t"
1423 "psrad $8, %%mm1 \n\t"
1424 "psrad $8, %%mm2 \n\t"
1425 "psrad $8, %%mm3 \n\t"
1427 "packssdw %%mm1, %%mm4 \n\t"
1428 "packssdw %%mm3, %%mm2 \n\t"
1429 "pmaddwd %%mm5, %%mm4 \n\t"
1430 "pmaddwd %%mm5, %%mm2 \n\t"
1431 "addl $24, %%ebx \n\t"
1432 "packssdw %%mm2, %%mm4 \n\t"
1433 "psraw $7, %%mm4 \n\t"
1435 "packuswb %%mm4, %%mm0 \n\t"
1436 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1438 "movq %%mm0, (%1, %%eax) \n\t"
1439 "addl $8, %%eax \n\t"
1441 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1446 for(i=0; i<width; i++)
1452 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1457 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1461 "movl %4, %%eax \n\t"
1462 "movq "MANGLE(w1111)", %%mm5 \n\t"
1463 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1464 "pxor %%mm7, %%mm7 \n\t"
1465 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1466 "addl %%ebx, %%ebx \n\t"
1469 PREFETCH" 64(%0, %%ebx) \n\t"
1470 PREFETCH" 64(%1, %%ebx) \n\t"
1471 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1472 "movq (%0, %%ebx), %%mm0 \n\t"
1473 "movq (%1, %%ebx), %%mm1 \n\t"
1474 "movq 6(%0, %%ebx), %%mm2 \n\t"
1475 "movq 6(%1, %%ebx), %%mm3 \n\t"
1478 "movq %%mm0, %%mm1 \n\t"
1479 "movq %%mm2, %%mm3 \n\t"
1480 "psrlq $24, %%mm0 \n\t"
1481 "psrlq $24, %%mm2 \n\t"
1484 "punpcklbw %%mm7, %%mm0 \n\t"
1485 "punpcklbw %%mm7, %%mm2 \n\t"
1487 "movd (%0, %%ebx), %%mm0 \n\t"
1488 "movd (%1, %%ebx), %%mm1 \n\t"
1489 "movd 3(%0, %%ebx), %%mm2 \n\t"
1490 "movd 3(%1, %%ebx), %%mm3 \n\t"
1491 "punpcklbw %%mm7, %%mm0 \n\t"
1492 "punpcklbw %%mm7, %%mm1 \n\t"
1493 "punpcklbw %%mm7, %%mm2 \n\t"
1494 "punpcklbw %%mm7, %%mm3 \n\t"
1495 "paddw %%mm1, %%mm0 \n\t"
1496 "paddw %%mm3, %%mm2 \n\t"
1497 "paddw %%mm2, %%mm0 \n\t"
1498 "movd 6(%0, %%ebx), %%mm4 \n\t"
1499 "movd 6(%1, %%ebx), %%mm1 \n\t"
1500 "movd 9(%0, %%ebx), %%mm2 \n\t"
1501 "movd 9(%1, %%ebx), %%mm3 \n\t"
1502 "punpcklbw %%mm7, %%mm4 \n\t"
1503 "punpcklbw %%mm7, %%mm1 \n\t"
1504 "punpcklbw %%mm7, %%mm2 \n\t"
1505 "punpcklbw %%mm7, %%mm3 \n\t"
1506 "paddw %%mm1, %%mm4 \n\t"
1507 "paddw %%mm3, %%mm2 \n\t"
1508 "paddw %%mm4, %%mm2 \n\t"
1509 "psrlw $2, %%mm0 \n\t"
1510 "psrlw $2, %%mm2 \n\t"
1512 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1513 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1515 "pmaddwd %%mm0, %%mm1 \n\t"
1516 "pmaddwd %%mm2, %%mm3 \n\t"
1517 "pmaddwd %%mm6, %%mm0 \n\t"
1518 "pmaddwd %%mm6, %%mm2 \n\t"
1519 #ifndef FAST_BGR2YV12
1520 "psrad $8, %%mm0 \n\t"
1521 "psrad $8, %%mm1 \n\t"
1522 "psrad $8, %%mm2 \n\t"
1523 "psrad $8, %%mm3 \n\t"
1525 "packssdw %%mm2, %%mm0 \n\t"
1526 "packssdw %%mm3, %%mm1 \n\t"
1527 "pmaddwd %%mm5, %%mm0 \n\t"
1528 "pmaddwd %%mm5, %%mm1 \n\t"
1529 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1530 "psraw $7, %%mm0 \n\t"
1532 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1533 "movq 12(%0, %%ebx), %%mm4 \n\t"
1534 "movq 12(%1, %%ebx), %%mm1 \n\t"
1535 "movq 18(%0, %%ebx), %%mm2 \n\t"
1536 "movq 18(%1, %%ebx), %%mm3 \n\t"
1539 "movq %%mm4, %%mm1 \n\t"
1540 "movq %%mm2, %%mm3 \n\t"
1541 "psrlq $24, %%mm4 \n\t"
1542 "psrlq $24, %%mm2 \n\t"
1545 "punpcklbw %%mm7, %%mm4 \n\t"
1546 "punpcklbw %%mm7, %%mm2 \n\t"
1548 "movd 12(%0, %%ebx), %%mm4 \n\t"
1549 "movd 12(%1, %%ebx), %%mm1 \n\t"
1550 "movd 15(%0, %%ebx), %%mm2 \n\t"
1551 "movd 15(%1, %%ebx), %%mm3 \n\t"
1552 "punpcklbw %%mm7, %%mm4 \n\t"
1553 "punpcklbw %%mm7, %%mm1 \n\t"
1554 "punpcklbw %%mm7, %%mm2 \n\t"
1555 "punpcklbw %%mm7, %%mm3 \n\t"
1556 "paddw %%mm1, %%mm4 \n\t"
1557 "paddw %%mm3, %%mm2 \n\t"
1558 "paddw %%mm2, %%mm4 \n\t"
1559 "movd 18(%0, %%ebx), %%mm5 \n\t"
1560 "movd 18(%1, %%ebx), %%mm1 \n\t"
1561 "movd 21(%0, %%ebx), %%mm2 \n\t"
1562 "movd 21(%1, %%ebx), %%mm3 \n\t"
1563 "punpcklbw %%mm7, %%mm5 \n\t"
1564 "punpcklbw %%mm7, %%mm1 \n\t"
1565 "punpcklbw %%mm7, %%mm2 \n\t"
1566 "punpcklbw %%mm7, %%mm3 \n\t"
1567 "paddw %%mm1, %%mm5 \n\t"
1568 "paddw %%mm3, %%mm2 \n\t"
1569 "paddw %%mm5, %%mm2 \n\t"
1570 "movq "MANGLE(w1111)", %%mm5 \n\t"
1571 "psrlw $2, %%mm4 \n\t"
1572 "psrlw $2, %%mm2 \n\t"
1574 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1575 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1577 "pmaddwd %%mm4, %%mm1 \n\t"
1578 "pmaddwd %%mm2, %%mm3 \n\t"
1579 "pmaddwd %%mm6, %%mm4 \n\t"
1580 "pmaddwd %%mm6, %%mm2 \n\t"
1581 #ifndef FAST_BGR2YV12
1582 "psrad $8, %%mm4 \n\t"
1583 "psrad $8, %%mm1 \n\t"
1584 "psrad $8, %%mm2 \n\t"
1585 "psrad $8, %%mm3 \n\t"
1587 "packssdw %%mm2, %%mm4 \n\t"
1588 "packssdw %%mm3, %%mm1 \n\t"
1589 "pmaddwd %%mm5, %%mm4 \n\t"
1590 "pmaddwd %%mm5, %%mm1 \n\t"
1591 "addl $24, %%ebx \n\t"
1592 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1593 "psraw $7, %%mm4 \n\t"
1595 "movq %%mm0, %%mm1 \n\t"
1596 "punpckldq %%mm4, %%mm0 \n\t"
1597 "punpckhdq %%mm4, %%mm1 \n\t"
1598 "packsswb %%mm1, %%mm0 \n\t"
1599 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1601 "movd %%mm0, (%2, %%eax) \n\t"
1602 "punpckhdq %%mm0, %%mm0 \n\t"
1603 "movd %%mm0, (%3, %%eax) \n\t"
1604 "addl $4, %%eax \n\t"
1606 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1611 for(i=0; i<width; i++)
1613 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1614 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1615 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1617 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1618 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1623 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1626 for(i=0; i<width; i++)
1628 int d= src[i*2] + (src[i*2+1]<<8);
1631 int r= (d>>11)&0x1F;
1633 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1637 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1640 for(i=0; i<width; i++)
1643 int d0= le2me_32( ((uint32_t*)src1)[i] );
1644 int d1= le2me_32( ((uint32_t*)src2)[i] );
1646 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1647 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1649 int dh2= (dh>>11) + (dh<<21);
1653 int r= (d>>11)&0x7F;
1656 int d0= src1[i*4] + (src1[i*4+1]<<8);
1658 int g0= (d0>>5)&0x3F;
1659 int r0= (d0>>11)&0x1F;
1661 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1663 int g1= (d1>>5)&0x3F;
1664 int r1= (d1>>11)&0x1F;
1666 int d2= src2[i*4] + (src2[i*4+1]<<8);
1668 int g2= (d2>>5)&0x3F;
1669 int r2= (d2>>11)&0x1F;
1671 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1673 int g3= (d3>>5)&0x3F;
1674 int r3= (d3>>11)&0x1F;
1676 int b= b0 + b1 + b2 + b3;
1677 int g= g0 + g1 + g2 + g3;
1678 int r= r0 + r1 + r2 + r3;
1680 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1681 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1685 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1688 for(i=0; i<width; i++)
1690 int d= src[i*2] + (src[i*2+1]<<8);
1693 int r= (d>>10)&0x1F;
1695 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1699 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1702 for(i=0; i<width; i++)
1705 int d0= le2me_32( ((uint32_t*)src1)[i] );
1706 int d1= le2me_32( ((uint32_t*)src2)[i] );
1708 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1709 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1711 int dh2= (dh>>11) + (dh<<21);
1715 int r= (d>>10)&0x7F;
1718 int d0= src1[i*4] + (src1[i*4+1]<<8);
1720 int g0= (d0>>5)&0x1F;
1721 int r0= (d0>>10)&0x1F;
1723 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1725 int g1= (d1>>5)&0x1F;
1726 int r1= (d1>>10)&0x1F;
1728 int d2= src2[i*4] + (src2[i*4+1]<<8);
1730 int g2= (d2>>5)&0x1F;
1731 int r2= (d2>>10)&0x1F;
1733 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1735 int g3= (d3>>5)&0x1F;
1736 int r3= (d3>>10)&0x1F;
1738 int b= b0 + b1 + b2 + b3;
1739 int g= g0 + g1 + g2 + g3;
1740 int r= r0 + r1 + r2 + r3;
1742 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1743 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1748 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1751 for(i=0; i<width; i++)
1757 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1761 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1764 for(i=0; i<width; i++)
1766 int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1767 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1768 int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1770 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1771 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1775 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
1778 for(i=0; i<width; i++)
1784 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1788 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1791 for(i=0; i<width; i++)
1793 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1794 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1795 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1797 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1798 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1803 // Bilinear / Bicubic scaling
1804 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1805 int16_t *filter, int16_t *filterPos, int filterSize)
1808 if(filterSize==4) // allways true for upscaling, sometimes for down too
1810 int counter= -2*dstW;
1812 filterPos-= counter/2;
1815 "pxor %%mm7, %%mm7 \n\t"
1816 "movq "MANGLE(w02)", %%mm6 \n\t"
1817 "pushl %%ebp \n\t" // we use 7 regs here ...
1818 "movl %%eax, %%ebp \n\t"
1821 "movzwl (%2, %%ebp), %%eax \n\t"
1822 "movzwl 2(%2, %%ebp), %%ebx \n\t"
1823 "movq (%1, %%ebp, 4), %%mm1 \n\t"
1824 "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
1825 "movd (%3, %%eax), %%mm0 \n\t"
1826 "movd (%3, %%ebx), %%mm2 \n\t"
1827 "punpcklbw %%mm7, %%mm0 \n\t"
1828 "punpcklbw %%mm7, %%mm2 \n\t"
1829 "pmaddwd %%mm1, %%mm0 \n\t"
1830 "pmaddwd %%mm2, %%mm3 \n\t"
1831 "psrad $8, %%mm0 \n\t"
1832 "psrad $8, %%mm3 \n\t"
1833 "packssdw %%mm3, %%mm0 \n\t"
1834 "pmaddwd %%mm6, %%mm0 \n\t"
1835 "packssdw %%mm0, %%mm0 \n\t"
1836 "movd %%mm0, (%4, %%ebp) \n\t"
1837 "addl $4, %%ebp \n\t"
1842 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1846 else if(filterSize==8)
1848 int counter= -2*dstW;
1850 filterPos-= counter/2;
1853 "pxor %%mm7, %%mm7 \n\t"
1854 "movq "MANGLE(w02)", %%mm6 \n\t"
1855 "pushl %%ebp \n\t" // we use 7 regs here ...
1856 "movl %%eax, %%ebp \n\t"
1859 "movzwl (%2, %%ebp), %%eax \n\t"
1860 "movzwl 2(%2, %%ebp), %%ebx \n\t"
1861 "movq (%1, %%ebp, 8), %%mm1 \n\t"
1862 "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
1863 "movd (%3, %%eax), %%mm0 \n\t"
1864 "movd (%3, %%ebx), %%mm2 \n\t"
1865 "punpcklbw %%mm7, %%mm0 \n\t"
1866 "punpcklbw %%mm7, %%mm2 \n\t"
1867 "pmaddwd %%mm1, %%mm0 \n\t"
1868 "pmaddwd %%mm2, %%mm3 \n\t"
1870 "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
1871 "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
1872 "movd 4(%3, %%eax), %%mm4 \n\t"
1873 "movd 4(%3, %%ebx), %%mm2 \n\t"
1874 "punpcklbw %%mm7, %%mm4 \n\t"
1875 "punpcklbw %%mm7, %%mm2 \n\t"
1876 "pmaddwd %%mm1, %%mm4 \n\t"
1877 "pmaddwd %%mm2, %%mm5 \n\t"
1878 "paddd %%mm4, %%mm0 \n\t"
1879 "paddd %%mm5, %%mm3 \n\t"
1881 "psrad $8, %%mm0 \n\t"
1882 "psrad $8, %%mm3 \n\t"
1883 "packssdw %%mm3, %%mm0 \n\t"
1884 "pmaddwd %%mm6, %%mm0 \n\t"
1885 "packssdw %%mm0, %%mm0 \n\t"
1886 "movd %%mm0, (%4, %%ebp) \n\t"
1887 "addl $4, %%ebp \n\t"
1892 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1898 int counter= -2*dstW;
1899 // filter-= counter*filterSize/2;
1900 filterPos-= counter/2;
1903 "pxor %%mm7, %%mm7 \n\t"
1904 "movq "MANGLE(w02)", %%mm6 \n\t"
1907 "movl %2, %%ecx \n\t"
1908 "movzwl (%%ecx, %0), %%eax \n\t"
1909 "movzwl 2(%%ecx, %0), %%ebx \n\t"
1910 "movl %5, %%ecx \n\t"
1911 "pxor %%mm4, %%mm4 \n\t"
1912 "pxor %%mm5, %%mm5 \n\t"
1914 "movq (%1), %%mm1 \n\t"
1915 "movq (%1, %6), %%mm3 \n\t"
1916 "movd (%%ecx, %%eax), %%mm0 \n\t"
1917 "movd (%%ecx, %%ebx), %%mm2 \n\t"
1918 "punpcklbw %%mm7, %%mm0 \n\t"
1919 "punpcklbw %%mm7, %%mm2 \n\t"
1920 "pmaddwd %%mm1, %%mm0 \n\t"
1921 "pmaddwd %%mm2, %%mm3 \n\t"
1922 "paddd %%mm3, %%mm5 \n\t"
1923 "paddd %%mm0, %%mm4 \n\t"
1925 "addl $4, %%ecx \n\t"
1926 "cmpl %4, %%ecx \n\t"
1929 "psrad $8, %%mm4 \n\t"
1930 "psrad $8, %%mm5 \n\t"
1931 "packssdw %%mm5, %%mm4 \n\t"
1932 "pmaddwd %%mm6, %%mm4 \n\t"
1933 "packssdw %%mm4, %%mm4 \n\t"
1934 "movl %3, %%eax \n\t"
1935 "movd %%mm4, (%%eax, %0) \n\t"
1939 : "+r" (counter), "+r" (filter)
1940 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
1941 "m" (src), "r" (filterSize*2)
1942 : "%ebx", "%eax", "%ecx"
1947 for(i=0; i<dstW; i++)
1950 int srcPos= filterPos[i];
1952 // printf("filterPos: %d\n", filterPos[i]);
1953 for(j=0; j<filterSize; j++)
1955 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
1956 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1958 // filter += hFilterSize;
1959 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
1964 // *** horizontal scale Y line to temp buffer
1965 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
1966 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
1967 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
1968 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
1969 int32_t *mmx2FilterPos)
1971 if(srcFormat==IMGFMT_YUY2)
1973 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
1974 src= formatConvBuffer;
1976 else if(srcFormat==IMGFMT_BGR32)
1978 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
1979 src= formatConvBuffer;
1981 else if(srcFormat==IMGFMT_BGR24)
1983 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
1984 src= formatConvBuffer;
1986 else if(srcFormat==IMGFMT_BGR16)
1988 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
1989 src= formatConvBuffer;
1991 else if(srcFormat==IMGFMT_BGR15)
1993 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
1994 src= formatConvBuffer;
1996 else if(srcFormat==IMGFMT_RGB32)
1998 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
1999 src= formatConvBuffer;
2001 else if(srcFormat==IMGFMT_RGB24)
2003 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2004 src= formatConvBuffer;
2008 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
2009 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2011 if(!(flags&SWS_FAST_BILINEAR))
2014 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2016 else // Fast Bilinear upscale / crap downscale
2024 "pxor %%mm7, %%mm7 \n\t"
2025 "movl %0, %%ecx \n\t"
2026 "movl %1, %%edi \n\t"
2027 "movl %2, %%edx \n\t"
2028 "movl %3, %%ebx \n\t"
2029 "xorl %%eax, %%eax \n\t" // i
2030 PREFETCH" (%%ecx) \n\t"
2031 PREFETCH" 32(%%ecx) \n\t"
2032 PREFETCH" 64(%%ecx) \n\t"
2034 #define FUNNY_Y_CODE \
2035 "movl (%%ebx), %%esi \n\t"\
2037 "addl (%%ebx, %%eax), %%ecx \n\t"\
2038 "addl %%eax, %%edi \n\t"\
2039 "xorl %%eax, %%eax \n\t"\
2050 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2052 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2054 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2059 //NO MMX just normal asm ...
2061 "xorl %%eax, %%eax \n\t" // i
2062 "xorl %%ebx, %%ebx \n\t" // xx
2063 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2066 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2067 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2068 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2069 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2070 "shll $16, %%edi \n\t"
2071 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2072 "movl %1, %%edi \n\t"
2073 "shrl $9, %%esi \n\t"
2074 "movw %%si, (%%edi, %%eax, 2) \n\t"
2075 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2076 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2078 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2079 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2080 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2081 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2082 "shll $16, %%edi \n\t"
2083 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2084 "movl %1, %%edi \n\t"
2085 "shrl $9, %%esi \n\t"
2086 "movw %%si, 2(%%edi, %%eax, 2) \n\t"
2087 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2088 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2091 "addl $2, %%eax \n\t"
2092 "cmpl %2, %%eax \n\t"
2096 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2097 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2100 } //if MMX2 cant be used
2104 unsigned int xpos=0;
2105 for(i=0;i<dstWidth;i++)
2107 register unsigned int xx=xpos>>16;
2108 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2109 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2116 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2117 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2118 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2119 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2120 int32_t *mmx2FilterPos)
2122 if(srcFormat==IMGFMT_YUY2)
2124 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2125 src1= formatConvBuffer;
2126 src2= formatConvBuffer+2048;
2128 else if(srcFormat==IMGFMT_BGR32)
2130 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2131 src1= formatConvBuffer;
2132 src2= formatConvBuffer+2048;
2134 else if(srcFormat==IMGFMT_BGR24)
2136 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2137 src1= formatConvBuffer;
2138 src2= formatConvBuffer+2048;
2140 else if(srcFormat==IMGFMT_BGR16)
2142 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2143 src1= formatConvBuffer;
2144 src2= formatConvBuffer+2048;
2146 else if(srcFormat==IMGFMT_BGR15)
2148 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2149 src1= formatConvBuffer;
2150 src2= formatConvBuffer+2048;
2152 else if(srcFormat==IMGFMT_RGB32)
2154 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2155 src1= formatConvBuffer;
2156 src2= formatConvBuffer+2048;
2158 else if(srcFormat==IMGFMT_RGB24)
2160 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2161 src1= formatConvBuffer;
2162 src2= formatConvBuffer+2048;
2164 else if(isGray(srcFormat))
2170 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
2171 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2173 if(!(flags&SWS_FAST_BILINEAR))
2176 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2177 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2179 else // Fast Bilinear upscale / crap downscale
2187 "pxor %%mm7, %%mm7 \n\t"
2188 "movl %0, %%ecx \n\t"
2189 "movl %1, %%edi \n\t"
2190 "movl %2, %%edx \n\t"
2191 "movl %3, %%ebx \n\t"
2192 "xorl %%eax, %%eax \n\t" // i
2193 PREFETCH" (%%ecx) \n\t"
2194 PREFETCH" 32(%%ecx) \n\t"
2195 PREFETCH" 64(%%ecx) \n\t"
2197 #define FUNNY_UV_CODE \
2198 "movl (%%ebx), %%esi \n\t"\
2200 "addl (%%ebx, %%eax), %%ecx \n\t"\
2201 "addl %%eax, %%edi \n\t"\
2202 "xorl %%eax, %%eax \n\t"\
2208 "xorl %%eax, %%eax \n\t" // i
2209 "movl %5, %%ecx \n\t" // src
2210 "movl %1, %%edi \n\t" // buf1
2211 "addl $4096, %%edi \n\t"
2212 PREFETCH" (%%ecx) \n\t"
2213 PREFETCH" 32(%%ecx) \n\t"
2214 PREFETCH" 64(%%ecx) \n\t"
2221 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2222 "m" (funnyUVCode), "m" (src2)
2223 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2225 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2227 // printf("%d %d %d\n", dstWidth, i, srcW);
2228 dst[i] = src1[srcW-1]*128;
2229 dst[i+2048] = src2[srcW-1]*128;
2236 "xorl %%eax, %%eax \n\t" // i
2237 "xorl %%ebx, %%ebx \n\t" // xx
2238 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2241 "movl %0, %%esi \n\t"
2242 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
2243 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
2244 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2245 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2246 "shll $16, %%edi \n\t"
2247 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2248 "movl %1, %%edi \n\t"
2249 "shrl $9, %%esi \n\t"
2250 "movw %%si, (%%edi, %%eax, 2) \n\t"
2252 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
2253 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
2254 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2255 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2256 "shll $16, %%edi \n\t"
2257 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2258 "movl %1, %%edi \n\t"
2259 "shrl $9, %%esi \n\t"
2260 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
2262 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2263 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2264 "addl $1, %%eax \n\t"
2265 "cmpl %2, %%eax \n\t"
2268 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
2270 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2273 } //if MMX2 cant be used
2277 unsigned int xpos=0;
2278 for(i=0;i<dstWidth;i++)
2280 register unsigned int xx=xpos>>16;
2281 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2282 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2283 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2285 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2286 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2294 static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
2295 int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
2297 /* load a few things into local vars to make the code more readable? and faster */
2298 const int srcW= c->srcW;
2299 const int dstW= c->dstW;
2300 const int dstH= c->dstH;
2301 const int chrDstW= c->chrDstW;
2302 const int chrSrcW= c->chrSrcW;
2303 const int lumXInc= c->lumXInc;
2304 const int chrXInc= c->chrXInc;
2305 const int dstFormat= c->dstFormat;
2306 const int srcFormat= c->srcFormat;
2307 const int flags= c->flags;
2308 const int canMMX2BeUsed= c->canMMX2BeUsed;
2309 int16_t *vLumFilterPos= c->vLumFilterPos;
2310 int16_t *vChrFilterPos= c->vChrFilterPos;
2311 int16_t *hLumFilterPos= c->hLumFilterPos;
2312 int16_t *hChrFilterPos= c->hChrFilterPos;
2313 int16_t *vLumFilter= c->vLumFilter;
2314 int16_t *vChrFilter= c->vChrFilter;
2315 int16_t *hLumFilter= c->hLumFilter;
2316 int16_t *hChrFilter= c->hChrFilter;
2317 int16_t *lumMmxFilter= c->lumMmxFilter;
2318 int16_t *chrMmxFilter= c->chrMmxFilter;
2319 const int vLumFilterSize= c->vLumFilterSize;
2320 const int vChrFilterSize= c->vChrFilterSize;
2321 const int hLumFilterSize= c->hLumFilterSize;
2322 const int hChrFilterSize= c->hChrFilterSize;
2323 int16_t **lumPixBuf= c->lumPixBuf;
2324 int16_t **chrPixBuf= c->chrPixBuf;
2325 const int vLumBufSize= c->vLumBufSize;
2326 const int vChrBufSize= c->vChrBufSize;
2327 uint8_t *funnyYCode= c->funnyYCode;
2328 uint8_t *funnyUVCode= c->funnyUVCode;
2329 uint8_t *formatConvBuffer= c->formatConvBuffer;
2330 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2331 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2333 /* vars which will change and which we need to store back in the context */
2335 int lumBufIndex= c->lumBufIndex;
2336 int chrBufIndex= c->chrBufIndex;
2337 int lastInLumBuf= c->lastInLumBuf;
2338 int lastInChrBuf= c->lastInChrBuf;
2344 orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
2345 orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
2347 if(isPacked(c->srcFormat)){
2350 src[2]= srcParam[0];
2353 srcStride[2]= srcStrideParam[0];
2355 srcStride[1]<<= c->vChrDrop;
2356 srcStride[2]<<= c->vChrDrop;
2358 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2359 // (int)dst[0], (int)dst[1], (int)dst[2]);
2361 #if 0 //self test FIXME move to a vfilter or something
2363 static volatile int i=0;
2365 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2366 selfTest(src, srcStride, c->srcW, c->srcH);
2371 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2372 //dstStride[0],dstStride[1],dstStride[2]);
2374 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2376 static int firstTime=1; //FIXME move this into the context perhaps
2377 if(flags & SWS_PRINT_INFO && firstTime)
2379 mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: Warning: dstStride is not aligned!\n"
2380 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2385 /* Note the user might start scaling the picture in the middle so this will not get executed
2386 this is not really intended but works currently, so ppl might do it */
2395 for(;dstY < dstH; dstY++){
2396 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2397 const int chrDstY= dstY>>c->chrDstVSubSample;
2398 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2399 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2401 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2402 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2403 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2404 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2406 //handle holes (FAST_BILINEAR & weird filters)
2407 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2408 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2409 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2410 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2411 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2413 // Do we have enough lines in this slice to output the dstY line
2414 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2416 //Do horizontal scaling
2417 while(lastInLumBuf < lastLumSrcY)
2419 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2421 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2422 ASSERT(lumBufIndex < 2*vLumBufSize)
2423 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2424 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2425 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2426 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2427 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2428 funnyYCode, c->srcFormat, formatConvBuffer,
2429 c->lumMmx2Filter, c->lumMmx2FilterPos);
2432 while(lastInChrBuf < lastChrSrcY)
2434 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2435 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2437 ASSERT(chrBufIndex < 2*vChrBufSize)
2438 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2439 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2440 //FIXME replace parameters through context struct (some at least)
2442 if(!(isGray(srcFormat) || isGray(dstFormat)))
2443 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2444 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2445 funnyUVCode, c->srcFormat, formatConvBuffer,
2446 c->chrMmx2Filter, c->chrMmx2FilterPos);
2449 //wrap buf index around to stay inside the ring buffer
2450 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2451 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2453 else // not enough lines left in this slice -> load the rest in the buffer
2455 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2456 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2457 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2458 vChrBufSize, vLumBufSize);*/
2460 //Do horizontal scaling
2461 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2463 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2465 ASSERT(lumBufIndex < 2*vLumBufSize)
2466 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2467 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2468 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2469 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2470 funnyYCode, c->srcFormat, formatConvBuffer,
2471 c->lumMmx2Filter, c->lumMmx2FilterPos);
2474 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2476 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2477 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2479 ASSERT(chrBufIndex < 2*vChrBufSize)
2480 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2481 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2483 if(!(isGray(srcFormat) || isGray(dstFormat)))
2484 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2485 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2486 funnyUVCode, c->srcFormat, formatConvBuffer,
2487 c->chrMmx2Filter, c->chrMmx2FilterPos);
2490 //wrap buf index around to stay inside the ring buffer
2491 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2492 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2493 break; //we cant output a dstY line so lets try with the next slice
2497 b5Dither= dither8[dstY&1];
2498 g6Dither= dither4[dstY&1];
2499 g5Dither= dither8[dstY&1];
2500 r5Dither= dither8[(dstY+1)&1];
2504 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2506 if((dstY&1) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2507 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2509 int16_t *lumBuf = lumPixBuf[0];
2510 int16_t *chrBuf= chrPixBuf[0];
2511 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2515 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2516 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2518 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2519 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2520 dest, uDest, vDest, dstW, chrDstW,
2521 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+chrDstY*vChrFilterSize*4);
2526 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2527 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2529 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2530 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2531 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2533 int chrAlpha= vChrFilter[2*dstY+1];
2535 RENAME(yuv2rgb1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2536 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2538 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2540 int lumAlpha= vLumFilter[2*dstY+1];
2541 int chrAlpha= vChrFilter[2*dstY+1];
2543 RENAME(yuv2rgb2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2544 dest, dstW, lumAlpha, chrAlpha, dstY);
2549 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2550 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2552 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4, dstY);
2556 else // hmm looks like we cant use MMX here without overwriting this arrays tail
2558 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2559 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2560 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2562 if((dstY&1) || isGray(dstFormat)) uDest=vDest= NULL;
2564 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2565 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2566 dest, uDest, vDest, dstW, chrDstW);
2570 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2571 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2573 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2574 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2581 __asm __volatile(SFENCE:::"memory");
2582 __asm __volatile(EMMS:::"memory");
2584 /* store changed local vars back in the context */
2586 c->lumBufIndex= lumBufIndex;
2587 c->chrBufIndex= chrBufIndex;
2588 c->lastInLumBuf= lastInLumBuf;
2589 c->lastInChrBuf= lastInChrBuf;