2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #elif defined ( HAVE_MMX2 )
37 #define PREFETCH "prefetchnta"
38 #define PREFETCHW "prefetcht0"
40 #define PREFETCH "/nop"
41 #define PREFETCHW "/nop"
45 #define SFENCE "sfence"
51 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
52 #elif defined (HAVE_3DNOW)
53 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
57 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
59 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
63 #define YSCALEYUV2YV12X(x) \
64 "xorl %%eax, %%eax \n\t"\
65 "pxor %%mm3, %%mm3 \n\t"\
66 "pxor %%mm4, %%mm4 \n\t"\
67 "movl %0, %%edx \n\t"\
68 ".balign 16 \n\t" /* FIXME Unroll? */\
70 "movl (%1, %%edx, 4), %%esi \n\t"\
71 "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
72 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
73 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
74 "pmulhw %%mm0, %%mm2 \n\t"\
75 "pmulhw %%mm0, %%mm5 \n\t"\
76 "paddw %%mm2, %%mm3 \n\t"\
77 "paddw %%mm5, %%mm4 \n\t"\
78 "addl $1, %%edx \n\t"\
80 "psraw $3, %%mm3 \n\t"\
81 "psraw $3, %%mm4 \n\t"\
82 "packuswb %%mm4, %%mm3 \n\t"\
83 MOVNTQ(%%mm3, (%3, %%eax))\
84 "addl $8, %%eax \n\t"\
85 "cmpl %4, %%eax \n\t"\
86 "pxor %%mm3, %%mm3 \n\t"\
87 "pxor %%mm4, %%mm4 \n\t"\
88 "movl %0, %%edx \n\t"\
91 #define YSCALEYUV2YV121 \
92 "movl %2, %%eax \n\t"\
93 ".balign 16 \n\t" /* FIXME Unroll? */\
95 "movq (%0, %%eax, 2), %%mm0 \n\t"\
96 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
97 "psraw $7, %%mm0 \n\t"\
98 "psraw $7, %%mm1 \n\t"\
99 "packuswb %%mm1, %%mm0 \n\t"\
100 MOVNTQ(%%mm0, (%1, %%eax))\
101 "addl $8, %%eax \n\t"\
105 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
106 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
107 "r" (dest), "m" (dstW),
108 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
109 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
111 #define YSCALEYUV2RGBX \
112 "xorl %%eax, %%eax \n\t"\
115 "movl %1, %%edx \n\t" /* -chrFilterSize */\
116 "movl %3, %%ebx \n\t" /* chrMmxFilter+lumFilterSize */\
117 "movl %7, %%ecx \n\t" /* chrSrc+lumFilterSize */\
118 "pxor %%mm3, %%mm3 \n\t"\
119 "pxor %%mm4, %%mm4 \n\t"\
121 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
122 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
123 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
124 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
125 "pmulhw %%mm0, %%mm2 \n\t"\
126 "pmulhw %%mm0, %%mm5 \n\t"\
127 "paddw %%mm2, %%mm3 \n\t"\
128 "paddw %%mm5, %%mm4 \n\t"\
129 "addl $1, %%edx \n\t"\
132 "movl %0, %%edx \n\t" /* -lumFilterSize */\
133 "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
134 "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
135 "pxor %%mm1, %%mm1 \n\t"\
136 "pxor %%mm7, %%mm7 \n\t"\
138 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
139 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
140 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
141 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
142 "pmulhw %%mm0, %%mm2 \n\t"\
143 "pmulhw %%mm0, %%mm5 \n\t"\
144 "paddw %%mm2, %%mm1 \n\t"\
145 "paddw %%mm5, %%mm7 \n\t"\
146 "addl $1, %%edx \n\t"\
149 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
150 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
151 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
152 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
153 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
154 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
155 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
156 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
157 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
158 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
159 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
160 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
161 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
162 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
163 "paddw %%mm3, %%mm4 \n\t"\
164 "movq %%mm2, %%mm0 \n\t"\
165 "movq %%mm5, %%mm6 \n\t"\
166 "movq %%mm4, %%mm3 \n\t"\
167 "punpcklwd %%mm2, %%mm2 \n\t"\
168 "punpcklwd %%mm5, %%mm5 \n\t"\
169 "punpcklwd %%mm4, %%mm4 \n\t"\
170 "paddw %%mm1, %%mm2 \n\t"\
171 "paddw %%mm1, %%mm5 \n\t"\
172 "paddw %%mm1, %%mm4 \n\t"\
173 "punpckhwd %%mm0, %%mm0 \n\t"\
174 "punpckhwd %%mm6, %%mm6 \n\t"\
175 "punpckhwd %%mm3, %%mm3 \n\t"\
176 "paddw %%mm7, %%mm0 \n\t"\
177 "paddw %%mm7, %%mm6 \n\t"\
178 "paddw %%mm7, %%mm3 \n\t"\
179 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
180 "packuswb %%mm0, %%mm2 \n\t"\
181 "packuswb %%mm6, %%mm5 \n\t"\
182 "packuswb %%mm3, %%mm4 \n\t"\
183 "pxor %%mm7, %%mm7 \n\t"
185 #define FULL_YSCALEYUV2RGB \
186 "pxor %%mm7, %%mm7 \n\t"\
187 "movd %6, %%mm6 \n\t" /*yalpha1*/\
188 "punpcklwd %%mm6, %%mm6 \n\t"\
189 "punpcklwd %%mm6, %%mm6 \n\t"\
190 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
191 "punpcklwd %%mm5, %%mm5 \n\t"\
192 "punpcklwd %%mm5, %%mm5 \n\t"\
193 "xorl %%eax, %%eax \n\t"\
196 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
197 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
198 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
199 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
200 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
201 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
202 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
203 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
204 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
205 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
206 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
207 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
208 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
209 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
210 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
211 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
212 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
213 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
216 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
217 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
218 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
219 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
220 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
221 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
222 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
225 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
226 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
227 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
228 "paddw %%mm1, %%mm3 \n\t" /* B*/\
229 "paddw %%mm1, %%mm0 \n\t" /* R*/\
230 "packuswb %%mm3, %%mm3 \n\t"\
232 "packuswb %%mm0, %%mm0 \n\t"\
233 "paddw %%mm4, %%mm2 \n\t"\
234 "paddw %%mm2, %%mm1 \n\t" /* G*/\
236 "packuswb %%mm1, %%mm1 \n\t"
238 #define YSCALEYUV2RGB \
239 "movd %6, %%mm6 \n\t" /*yalpha1*/\
240 "punpcklwd %%mm6, %%mm6 \n\t"\
241 "punpcklwd %%mm6, %%mm6 \n\t"\
242 "movq %%mm6, "MANGLE(asm_yalpha1)"\n\t"\
243 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
244 "punpcklwd %%mm5, %%mm5 \n\t"\
245 "punpcklwd %%mm5, %%mm5 \n\t"\
246 "movq %%mm5, "MANGLE(asm_uvalpha1)"\n\t"\
247 "xorl %%eax, %%eax \n\t"\
250 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
251 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
252 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
253 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
254 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
255 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
256 "movq "MANGLE(asm_uvalpha1)", %%mm0\n\t"\
257 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
258 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
259 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
260 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
261 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
262 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
263 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
264 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
265 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
266 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
267 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
268 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
269 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
270 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
271 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
272 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
273 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
274 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
275 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
276 "pmulhw "MANGLE(asm_yalpha1)", %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
277 "pmulhw "MANGLE(asm_yalpha1)", %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
278 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
279 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
280 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
281 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
282 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
283 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
284 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
285 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
286 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
287 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
288 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
289 "paddw %%mm3, %%mm4 \n\t"\
290 "movq %%mm2, %%mm0 \n\t"\
291 "movq %%mm5, %%mm6 \n\t"\
292 "movq %%mm4, %%mm3 \n\t"\
293 "punpcklwd %%mm2, %%mm2 \n\t"\
294 "punpcklwd %%mm5, %%mm5 \n\t"\
295 "punpcklwd %%mm4, %%mm4 \n\t"\
296 "paddw %%mm1, %%mm2 \n\t"\
297 "paddw %%mm1, %%mm5 \n\t"\
298 "paddw %%mm1, %%mm4 \n\t"\
299 "punpckhwd %%mm0, %%mm0 \n\t"\
300 "punpckhwd %%mm6, %%mm6 \n\t"\
301 "punpckhwd %%mm3, %%mm3 \n\t"\
302 "paddw %%mm7, %%mm0 \n\t"\
303 "paddw %%mm7, %%mm6 \n\t"\
304 "paddw %%mm7, %%mm3 \n\t"\
305 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
306 "packuswb %%mm0, %%mm2 \n\t"\
307 "packuswb %%mm6, %%mm5 \n\t"\
308 "packuswb %%mm3, %%mm4 \n\t"\
309 "pxor %%mm7, %%mm7 \n\t"
311 #define YSCALEYUV2RGB1 \
312 "xorl %%eax, %%eax \n\t"\
315 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
316 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
317 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
318 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
319 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
320 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
321 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
322 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
323 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
324 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
325 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
326 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
327 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
328 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
329 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
330 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
331 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
332 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
333 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
334 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
335 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
336 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
337 "paddw %%mm3, %%mm4 \n\t"\
338 "movq %%mm2, %%mm0 \n\t"\
339 "movq %%mm5, %%mm6 \n\t"\
340 "movq %%mm4, %%mm3 \n\t"\
341 "punpcklwd %%mm2, %%mm2 \n\t"\
342 "punpcklwd %%mm5, %%mm5 \n\t"\
343 "punpcklwd %%mm4, %%mm4 \n\t"\
344 "paddw %%mm1, %%mm2 \n\t"\
345 "paddw %%mm1, %%mm5 \n\t"\
346 "paddw %%mm1, %%mm4 \n\t"\
347 "punpckhwd %%mm0, %%mm0 \n\t"\
348 "punpckhwd %%mm6, %%mm6 \n\t"\
349 "punpckhwd %%mm3, %%mm3 \n\t"\
350 "paddw %%mm7, %%mm0 \n\t"\
351 "paddw %%mm7, %%mm6 \n\t"\
352 "paddw %%mm7, %%mm3 \n\t"\
353 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
354 "packuswb %%mm0, %%mm2 \n\t"\
355 "packuswb %%mm6, %%mm5 \n\t"\
356 "packuswb %%mm3, %%mm4 \n\t"\
357 "pxor %%mm7, %%mm7 \n\t"
359 // do vertical chrominance interpolation
360 #define YSCALEYUV2RGB1b \
361 "xorl %%eax, %%eax \n\t"\
364 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
365 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
366 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
367 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
368 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
369 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
370 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
371 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
372 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
373 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
374 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
375 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
376 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
377 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
378 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
379 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
380 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
381 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
382 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
383 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
384 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
385 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
386 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
387 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
388 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
389 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
390 "paddw %%mm3, %%mm4 \n\t"\
391 "movq %%mm2, %%mm0 \n\t"\
392 "movq %%mm5, %%mm6 \n\t"\
393 "movq %%mm4, %%mm3 \n\t"\
394 "punpcklwd %%mm2, %%mm2 \n\t"\
395 "punpcklwd %%mm5, %%mm5 \n\t"\
396 "punpcklwd %%mm4, %%mm4 \n\t"\
397 "paddw %%mm1, %%mm2 \n\t"\
398 "paddw %%mm1, %%mm5 \n\t"\
399 "paddw %%mm1, %%mm4 \n\t"\
400 "punpckhwd %%mm0, %%mm0 \n\t"\
401 "punpckhwd %%mm6, %%mm6 \n\t"\
402 "punpckhwd %%mm3, %%mm3 \n\t"\
403 "paddw %%mm7, %%mm0 \n\t"\
404 "paddw %%mm7, %%mm6 \n\t"\
405 "paddw %%mm7, %%mm3 \n\t"\
406 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
407 "packuswb %%mm0, %%mm2 \n\t"\
408 "packuswb %%mm6, %%mm5 \n\t"\
409 "packuswb %%mm3, %%mm4 \n\t"\
410 "pxor %%mm7, %%mm7 \n\t"
413 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
414 "movq %%mm2, %%mm1 \n\t" /* B */\
415 "movq %%mm5, %%mm6 \n\t" /* R */\
416 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
417 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
418 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
419 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
420 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
421 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
422 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
423 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
424 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
425 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
427 MOVNTQ(%%mm0, (%4, %%eax, 4))\
428 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
429 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
430 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
432 "addl $8, %%eax \n\t"\
433 "cmpl %5, %%eax \n\t"\
437 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
438 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
439 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
440 "psrlq $3, %%mm2 \n\t"\
442 "movq %%mm2, %%mm1 \n\t"\
443 "movq %%mm4, %%mm3 \n\t"\
445 "punpcklbw %%mm7, %%mm3 \n\t"\
446 "punpcklbw %%mm5, %%mm2 \n\t"\
447 "punpckhbw %%mm7, %%mm4 \n\t"\
448 "punpckhbw %%mm5, %%mm1 \n\t"\
450 "psllq $3, %%mm3 \n\t"\
451 "psllq $3, %%mm4 \n\t"\
453 "por %%mm3, %%mm2 \n\t"\
454 "por %%mm4, %%mm1 \n\t"\
456 MOVNTQ(%%mm2, (%4, %%eax, 2))\
457 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
459 "addl $8, %%eax \n\t"\
460 "cmpl %5, %%eax \n\t"\
464 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
465 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
466 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
467 "psrlq $3, %%mm2 \n\t"\
468 "psrlq $1, %%mm5 \n\t"\
470 "movq %%mm2, %%mm1 \n\t"\
471 "movq %%mm4, %%mm3 \n\t"\
473 "punpcklbw %%mm7, %%mm3 \n\t"\
474 "punpcklbw %%mm5, %%mm2 \n\t"\
475 "punpckhbw %%mm7, %%mm4 \n\t"\
476 "punpckhbw %%mm5, %%mm1 \n\t"\
478 "psllq $2, %%mm3 \n\t"\
479 "psllq $2, %%mm4 \n\t"\
481 "por %%mm3, %%mm2 \n\t"\
482 "por %%mm4, %%mm1 \n\t"\
484 MOVNTQ(%%mm2, (%4, %%eax, 2))\
485 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
487 "addl $8, %%eax \n\t"\
488 "cmpl %5, %%eax \n\t"\
491 #define WRITEBGR24OLD \
492 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
493 "movq %%mm2, %%mm1 \n\t" /* B */\
494 "movq %%mm5, %%mm6 \n\t" /* R */\
495 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
496 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
497 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
498 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
499 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
500 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
501 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
502 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
503 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
504 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
506 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
507 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
508 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
509 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
510 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
511 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
512 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
513 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
515 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
516 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
517 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
518 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
519 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
520 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
521 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
522 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
523 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
524 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
525 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
526 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
527 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
529 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
530 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
531 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
532 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
533 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
534 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
535 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
536 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
538 MOVNTQ(%%mm0, (%%ebx))\
539 MOVNTQ(%%mm2, 8(%%ebx))\
540 MOVNTQ(%%mm3, 16(%%ebx))\
541 "addl $24, %%ebx \n\t"\
543 "addl $8, %%eax \n\t"\
544 "cmpl %5, %%eax \n\t"\
547 #define WRITEBGR24MMX \
548 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
549 "movq %%mm2, %%mm1 \n\t" /* B */\
550 "movq %%mm5, %%mm6 \n\t" /* R */\
551 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
552 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
553 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
554 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
555 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
556 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
557 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
558 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
559 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
560 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
562 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
563 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
564 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
565 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
567 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
568 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
569 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
570 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
572 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
573 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
574 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
575 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
577 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
578 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
579 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
580 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
581 MOVNTQ(%%mm0, (%%ebx))\
583 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
584 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
585 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
586 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
587 MOVNTQ(%%mm6, 8(%%ebx))\
589 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
590 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
591 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
592 MOVNTQ(%%mm5, 16(%%ebx))\
594 "addl $24, %%ebx \n\t"\
596 "addl $8, %%eax \n\t"\
597 "cmpl %5, %%eax \n\t"\
600 #define WRITEBGR24MMX2 \
601 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
602 "movq "MANGLE(M24A)", %%mm0 \n\t"\
603 "movq "MANGLE(M24C)", %%mm7 \n\t"\
604 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
605 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
606 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
608 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
609 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
610 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
612 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
613 "por %%mm1, %%mm6 \n\t"\
614 "por %%mm3, %%mm6 \n\t"\
615 MOVNTQ(%%mm6, (%%ebx))\
617 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
618 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
619 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
620 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
622 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
623 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
624 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
626 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
627 "por %%mm3, %%mm6 \n\t"\
628 MOVNTQ(%%mm6, 8(%%ebx))\
630 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
631 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
632 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
634 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
635 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
636 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
638 "por %%mm1, %%mm3 \n\t"\
639 "por %%mm3, %%mm6 \n\t"\
640 MOVNTQ(%%mm6, 16(%%ebx))\
642 "addl $24, %%ebx \n\t"\
644 "addl $8, %%eax \n\t"\
645 "cmpl %5, %%eax \n\t"\
650 #define WRITEBGR24 WRITEBGR24MMX2
653 #define WRITEBGR24 WRITEBGR24MMX
656 static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
657 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
658 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
659 int16_t * lumMmxFilter, int16_t * chrMmxFilter)
666 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
667 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
668 : "%eax", "%edx", "%esi"
672 YSCALEYUV2YV12X(4096)
673 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
674 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
675 : "%eax", "%edx", "%esi"
681 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
682 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
683 : "%eax", "%edx", "%esi"
686 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
687 chrFilter, chrSrc, chrFilterSize,
688 dest, uDest, vDest, dstW);
692 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
693 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
700 :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
707 :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
715 :: "r" (lumSrc + dstW), "r" (dest + dstW),
720 //FIXME Optimize (just quickly writen not opti..)
721 //FIXME replace MINMAX with LUTs
723 for(i=0; i<dstW; i++)
725 int val= lumSrc[i]>>7;
727 dest[i]= MIN(MAX(val>>19, 0), 255);
731 for(i=0; i<(dstW>>1); i++)
734 int v=chrSrc[i + 2048]>>7;
736 uDest[i]= MIN(MAX(u>>19, 0), 255);
737 vDest[i]= MIN(MAX(v>>19, 0), 255);
744 * vertical scale YV12 to RGB
746 static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
747 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
748 uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
750 /* if(flags&SWS_FULL_UV_IPOL)
757 if(dstFormat == IMGFMT_BGR32) //FIXME untested
763 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
764 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
765 "r" (dest), "m" (dstW),
766 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
767 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
770 else if(dstFormat == IMGFMT_BGR24) //FIXME untested
774 "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
775 "addl %4, %%ebx \n\t"
778 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
779 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
780 "r" (dest), "m" (dstW),
781 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
782 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
785 else if(dstFormat==IMGFMT_BGR15)
789 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
791 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
792 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
793 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
798 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
799 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
800 "r" (dest), "m" (dstW),
801 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
802 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
805 else if(dstFormat==IMGFMT_BGR16)
809 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
811 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
812 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
813 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
818 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
819 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
820 "r" (dest), "m" (dstW),
821 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
822 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
826 yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
827 chrFilter, chrSrc, chrFilterSize,
828 dest, dstW, dstFormat);
836 * vertical bilinear scale YV12 to RGB
838 static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
839 uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags)
841 int yalpha1=yalpha^4095;
842 int uvalpha1=uvalpha^4095;
844 if(flags&SWS_FULL_CHR_H_INT)
848 if(dstFormat==IMGFMT_BGR32)
854 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
855 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
857 "movq %%mm3, %%mm1 \n\t"
858 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
859 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
861 MOVNTQ(%%mm3, (%4, %%eax, 4))
862 MOVNTQ(%%mm1, 8(%4, %%eax, 4))
864 "addl $4, %%eax \n\t"
865 "cmpl %5, %%eax \n\t"
869 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
870 "m" (yalpha1), "m" (uvalpha1)
874 else if(dstFormat==IMGFMT_BGR24)
881 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
882 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
884 "movq %%mm3, %%mm1 \n\t"
885 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
886 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
888 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
889 "psrlq $8, %%mm3 \n\t" // GR0BGR00
890 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
891 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
892 "por %%mm2, %%mm3 \n\t" // BGRBGR00
893 "movq %%mm1, %%mm2 \n\t"
894 "psllq $48, %%mm1 \n\t" // 000000BG
895 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
897 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
898 "psrld $16, %%mm2 \n\t" // R000R000
899 "psrlq $24, %%mm1 \n\t" // 0BGR0000
900 "por %%mm2, %%mm1 \n\t" // RBGRR000
902 "movl %4, %%ebx \n\t"
903 "addl %%eax, %%ebx \n\t"
907 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
908 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
910 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
911 "psrlq $32, %%mm3 \n\t"
912 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
913 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
915 "addl $4, %%eax \n\t"
916 "cmpl %5, %%eax \n\t"
919 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
920 "m" (yalpha1), "m" (uvalpha1)
924 else if(dstFormat==IMGFMT_BGR15)
930 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
931 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
932 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
934 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
935 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
936 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
938 "psrlw $3, %%mm3 \n\t"
939 "psllw $2, %%mm1 \n\t"
940 "psllw $7, %%mm0 \n\t"
941 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
942 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
944 "por %%mm3, %%mm1 \n\t"
945 "por %%mm1, %%mm0 \n\t"
947 MOVNTQ(%%mm0, (%4, %%eax, 2))
949 "addl $4, %%eax \n\t"
950 "cmpl %5, %%eax \n\t"
953 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
954 "m" (yalpha1), "m" (uvalpha1)
958 else if(dstFormat==IMGFMT_BGR16)
964 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
965 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
966 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
968 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
969 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
970 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
972 "psrlw $3, %%mm3 \n\t"
973 "psllw $3, %%mm1 \n\t"
974 "psllw $8, %%mm0 \n\t"
975 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
976 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
978 "por %%mm3, %%mm1 \n\t"
979 "por %%mm1, %%mm0 \n\t"
981 MOVNTQ(%%mm0, (%4, %%eax, 2))
983 "addl $4, %%eax \n\t"
984 "cmpl %5, %%eax \n\t"
987 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
988 "m" (yalpha1), "m" (uvalpha1)
993 if(dstFormat==IMGFMT_BGR32)
996 #ifdef WORDS_BIGENDIAN
1000 // vertical linear interpolation && yuv2rgb in a single step:
1001 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1002 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1003 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1004 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1005 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1006 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1010 else if(dstFormat==IMGFMT_BGR24)
1013 for(i=0;i<dstW;i++){
1014 // vertical linear interpolation && yuv2rgb in a single step:
1015 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1016 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1017 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1018 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1019 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1020 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1024 else if(dstFormat==IMGFMT_BGR16)
1027 for(i=0;i<dstW;i++){
1028 // vertical linear interpolation && yuv2rgb in a single step:
1029 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1030 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1031 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1033 ((uint16_t*)dest)[i] =
1034 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1035 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1036 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1039 else if(dstFormat==IMGFMT_BGR15)
1042 for(i=0;i<dstW;i++){
1043 // vertical linear interpolation && yuv2rgb in a single step:
1044 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1045 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1046 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1048 ((uint16_t*)dest)[i] =
1049 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1050 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1051 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1059 if(dstFormat==IMGFMT_BGR32)
1065 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1066 "m" (yalpha1), "m" (uvalpha1)
1070 else if(dstFormat==IMGFMT_BGR24)
1073 "movl %4, %%ebx \n\t"
1077 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1078 "m" (yalpha1), "m" (uvalpha1)
1082 else if(dstFormat==IMGFMT_BGR15)
1086 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1088 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1089 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1090 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1095 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1096 "m" (yalpha1), "m" (uvalpha1)
1100 else if(dstFormat==IMGFMT_BGR16)
1104 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1106 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1107 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1108 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1113 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1114 "m" (yalpha1), "m" (uvalpha1)
1119 if(dstFormat==IMGFMT_BGR32)
1122 #ifdef WORDS_BIGENDIAN
1125 for(i=0; i<dstW-1; i+=2){
1126 // vertical linear interpolation && yuv2rgb in a single step:
1127 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1128 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1129 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1130 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1132 int Cb= yuvtab_40cf[U];
1133 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1134 int Cr= yuvtab_3343[V];
1136 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1137 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1138 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1140 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1141 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1142 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1145 else if(dstFormat==IMGFMT_BGR24)
1148 for(i=0; i<dstW-1; i+=2){
1149 // vertical linear interpolation && yuv2rgb in a single step:
1150 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1151 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1152 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1153 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1155 int Cb= yuvtab_40cf[U];
1156 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1157 int Cr= yuvtab_3343[V];
1159 dest[0]=clip_table[((Y1 + Cb) >>13)];
1160 dest[1]=clip_table[((Y1 + Cg) >>13)];
1161 dest[2]=clip_table[((Y1 + Cr) >>13)];
1163 dest[3]=clip_table[((Y2 + Cb) >>13)];
1164 dest[4]=clip_table[((Y2 + Cg) >>13)];
1165 dest[5]=clip_table[((Y2 + Cr) >>13)];
1169 else if(dstFormat==IMGFMT_BGR16)
1173 static int ditherb1=1<<14;
1174 static int ditherg1=1<<13;
1175 static int ditherr1=2<<14;
1176 static int ditherb2=3<<14;
1177 static int ditherg2=3<<13;
1178 static int ditherr2=0<<14;
1180 ditherb1 ^= (1^2)<<14;
1181 ditherg1 ^= (1^2)<<13;
1182 ditherr1 ^= (1^2)<<14;
1183 ditherb2 ^= (3^0)<<14;
1184 ditherg2 ^= (3^0)<<13;
1185 ditherr2 ^= (3^0)<<14;
1187 const int ditherb1=0;
1188 const int ditherg1=0;
1189 const int ditherr1=0;
1190 const int ditherb2=0;
1191 const int ditherg2=0;
1192 const int ditherr2=0;
1194 for(i=0; i<dstW-1; i+=2){
1195 // vertical linear interpolation && yuv2rgb in a single step:
1196 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1197 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1198 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1199 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1201 int Cb= yuvtab_40cf[U];
1202 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1203 int Cr= yuvtab_3343[V];
1205 ((uint16_t*)dest)[i] =
1206 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1207 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1208 clip_table16r[(Y1 + Cr + ditherr1) >>13];
1210 ((uint16_t*)dest)[i+1] =
1211 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1212 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1213 clip_table16r[(Y2 + Cr + ditherr2) >>13];
1216 else if(dstFormat==IMGFMT_BGR15)
1220 static int ditherb1=1<<14;
1221 static int ditherg1=1<<14;
1222 static int ditherr1=2<<14;
1223 static int ditherb2=3<<14;
1224 static int ditherg2=3<<14;
1225 static int ditherr2=0<<14;
1227 ditherb1 ^= (1^2)<<14;
1228 ditherg1 ^= (1^2)<<14;
1229 ditherr1 ^= (1^2)<<14;
1230 ditherb2 ^= (3^0)<<14;
1231 ditherg2 ^= (3^0)<<14;
1232 ditherr2 ^= (3^0)<<14;
1234 const int ditherb1=0;
1235 const int ditherg1=0;
1236 const int ditherr1=0;
1237 const int ditherb2=0;
1238 const int ditherg2=0;
1239 const int ditherr2=0;
1241 for(i=0; i<dstW-1; i+=2){
1242 // vertical linear interpolation && yuv2rgb in a single step:
1243 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1244 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1245 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1246 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1248 int Cb= yuvtab_40cf[U];
1249 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1250 int Cr= yuvtab_3343[V];
1252 ((uint16_t*)dest)[i] =
1253 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1254 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1255 clip_table15r[(Y1 + Cr + ditherr1) >>13];
1257 ((uint16_t*)dest)[i+1] =
1258 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1259 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1260 clip_table15r[(Y2 + Cr + ditherr2) >>13];
1268 * YV12 to RGB without scaling or interpolating
1270 static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1271 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags)
1273 int uvalpha1=uvalpha^4095;
1274 const int yalpha1=0;
1276 if(flags&SWS_FULL_CHR_H_INT)
1278 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags);
1283 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1285 if(dstFormat==IMGFMT_BGR32)
1290 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1291 "m" (yalpha1), "m" (uvalpha1)
1295 else if(dstFormat==IMGFMT_BGR24)
1298 "movl %4, %%ebx \n\t"
1301 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1302 "m" (yalpha1), "m" (uvalpha1)
1306 else if(dstFormat==IMGFMT_BGR15)
1310 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1312 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1313 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1314 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1317 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1318 "m" (yalpha1), "m" (uvalpha1)
1322 else if(dstFormat==IMGFMT_BGR16)
1326 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1328 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1329 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1330 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1334 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1335 "m" (yalpha1), "m" (uvalpha1)
1342 if(dstFormat==IMGFMT_BGR32)
1347 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1348 "m" (yalpha1), "m" (uvalpha1)
1352 else if(dstFormat==IMGFMT_BGR24)
1355 "movl %4, %%ebx \n\t"
1358 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1359 "m" (yalpha1), "m" (uvalpha1)
1363 else if(dstFormat==IMGFMT_BGR15)
1367 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1369 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1370 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1371 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1374 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1375 "m" (yalpha1), "m" (uvalpha1)
1379 else if(dstFormat==IMGFMT_BGR16)
1383 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1385 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1386 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1387 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1391 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1392 "m" (yalpha1), "m" (uvalpha1)
1398 //FIXME write 2 versions (for even & odd lines)
1400 if(dstFormat==IMGFMT_BGR32)
1403 #ifdef WORDS_BIGENDIAN
1406 for(i=0; i<dstW-1; i+=2){
1407 // vertical linear interpolation && yuv2rgb in a single step:
1408 int Y1=yuvtab_2568[buf0[i]>>7];
1409 int Y2=yuvtab_2568[buf0[i+1]>>7];
1410 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1411 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1413 int Cb= yuvtab_40cf[U];
1414 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1415 int Cr= yuvtab_3343[V];
1417 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1418 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1419 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1421 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1422 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1423 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1426 else if(dstFormat==IMGFMT_BGR24)
1429 for(i=0; i<dstW-1; i+=2){
1430 // vertical linear interpolation && yuv2rgb in a single step:
1431 int Y1=yuvtab_2568[buf0[i]>>7];
1432 int Y2=yuvtab_2568[buf0[i+1]>>7];
1433 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1434 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1436 int Cb= yuvtab_40cf[U];
1437 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1438 int Cr= yuvtab_3343[V];
1440 dest[0]=clip_table[((Y1 + Cb) >>13)];
1441 dest[1]=clip_table[((Y1 + Cg) >>13)];
1442 dest[2]=clip_table[((Y1 + Cr) >>13)];
1444 dest[3]=clip_table[((Y2 + Cb) >>13)];
1445 dest[4]=clip_table[((Y2 + Cg) >>13)];
1446 dest[5]=clip_table[((Y2 + Cr) >>13)];
1450 else if(dstFormat==IMGFMT_BGR16)
1454 static int ditherb1=1<<14;
1455 static int ditherg1=1<<13;
1456 static int ditherr1=2<<14;
1457 static int ditherb2=3<<14;
1458 static int ditherg2=3<<13;
1459 static int ditherr2=0<<14;
1461 ditherb1 ^= (1^2)<<14;
1462 ditherg1 ^= (1^2)<<13;
1463 ditherr1 ^= (1^2)<<14;
1464 ditherb2 ^= (3^0)<<14;
1465 ditherg2 ^= (3^0)<<13;
1466 ditherr2 ^= (3^0)<<14;
1468 const int ditherb1=0;
1469 const int ditherg1=0;
1470 const int ditherr1=0;
1471 const int ditherb2=0;
1472 const int ditherg2=0;
1473 const int ditherr2=0;
1475 for(i=0; i<dstW-1; i+=2){
1476 // vertical linear interpolation && yuv2rgb in a single step:
1477 int Y1=yuvtab_2568[buf0[i]>>7];
1478 int Y2=yuvtab_2568[buf0[i+1]>>7];
1479 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1480 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1482 int Cb= yuvtab_40cf[U];
1483 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1484 int Cr= yuvtab_3343[V];
1486 ((uint16_t*)dest)[i] =
1487 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1488 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1489 clip_table16r[(Y1 + Cr + ditherr1) >>13];
1491 ((uint16_t*)dest)[i+1] =
1492 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1493 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1494 clip_table16r[(Y2 + Cr + ditherr2) >>13];
1497 else if(dstFormat==IMGFMT_BGR15)
1501 static int ditherb1=1<<14;
1502 static int ditherg1=1<<14;
1503 static int ditherr1=2<<14;
1504 static int ditherb2=3<<14;
1505 static int ditherg2=3<<14;
1506 static int ditherr2=0<<14;
1508 ditherb1 ^= (1^2)<<14;
1509 ditherg1 ^= (1^2)<<14;
1510 ditherr1 ^= (1^2)<<14;
1511 ditherb2 ^= (3^0)<<14;
1512 ditherg2 ^= (3^0)<<14;
1513 ditherr2 ^= (3^0)<<14;
1515 const int ditherb1=0;
1516 const int ditherg1=0;
1517 const int ditherr1=0;
1518 const int ditherb2=0;
1519 const int ditherg2=0;
1520 const int ditherr2=0;
1522 for(i=0; i<dstW-1; i+=2){
1523 // vertical linear interpolation && yuv2rgb in a single step:
1524 int Y1=yuvtab_2568[buf0[i]>>7];
1525 int Y2=yuvtab_2568[buf0[i+1]>>7];
1526 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1527 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1529 int Cb= yuvtab_40cf[U];
1530 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1531 int Cr= yuvtab_3343[V];
1533 ((uint16_t*)dest)[i] =
1534 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1535 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1536 clip_table15r[(Y1 + Cr + ditherr1) >>13];
1538 ((uint16_t*)dest)[i+1] =
1539 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1540 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1541 clip_table15r[(Y2 + Cr + ditherr2) >>13];
1547 //FIXME yuy2* can read up to 7 samples too much
1549 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1553 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1554 "movl %0, %%eax \n\t"
1556 "movq (%1, %%eax,2), %%mm0 \n\t"
1557 "movq 8(%1, %%eax,2), %%mm1 \n\t"
1558 "pand %%mm2, %%mm0 \n\t"
1559 "pand %%mm2, %%mm1 \n\t"
1560 "packuswb %%mm1, %%mm0 \n\t"
1561 "movq %%mm0, (%2, %%eax) \n\t"
1562 "addl $8, %%eax \n\t"
1564 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1569 for(i=0; i<width; i++)
1574 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1576 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1578 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1579 "movl %0, %%eax \n\t"
1581 "movq (%1, %%eax,4), %%mm0 \n\t"
1582 "movq 8(%1, %%eax,4), %%mm1 \n\t"
1583 "movq (%2, %%eax,4), %%mm2 \n\t"
1584 "movq 8(%2, %%eax,4), %%mm3 \n\t"
1587 "psrlw $8, %%mm0 \n\t"
1588 "psrlw $8, %%mm1 \n\t"
1589 "packuswb %%mm1, %%mm0 \n\t"
1590 "movq %%mm0, %%mm1 \n\t"
1591 "psrlw $8, %%mm0 \n\t"
1592 "pand %%mm4, %%mm1 \n\t"
1593 "packuswb %%mm0, %%mm0 \n\t"
1594 "packuswb %%mm1, %%mm1 \n\t"
1595 "movd %%mm0, (%4, %%eax) \n\t"
1596 "movd %%mm1, (%3, %%eax) \n\t"
1597 "addl $4, %%eax \n\t"
1599 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1604 for(i=0; i<width; i++)
1606 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1607 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1612 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1614 #ifdef HAVE_MMXFIXME
1617 for(i=0; i<width; i++)
1623 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1628 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1630 #ifdef HAVE_MMXFIXME
1633 for(i=0; i<width; i++)
1635 int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1636 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1637 int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1639 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1640 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1645 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1649 "movl %2, %%eax \n\t"
1650 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1651 "movq "MANGLE(w1111)", %%mm5 \n\t"
1652 "pxor %%mm7, %%mm7 \n\t"
1653 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1656 PREFETCH" 64(%0, %%ebx) \n\t"
1657 "movd (%0, %%ebx), %%mm0 \n\t"
1658 "movd 3(%0, %%ebx), %%mm1 \n\t"
1659 "punpcklbw %%mm7, %%mm0 \n\t"
1660 "punpcklbw %%mm7, %%mm1 \n\t"
1661 "movd 6(%0, %%ebx), %%mm2 \n\t"
1662 "movd 9(%0, %%ebx), %%mm3 \n\t"
1663 "punpcklbw %%mm7, %%mm2 \n\t"
1664 "punpcklbw %%mm7, %%mm3 \n\t"
1665 "pmaddwd %%mm6, %%mm0 \n\t"
1666 "pmaddwd %%mm6, %%mm1 \n\t"
1667 "pmaddwd %%mm6, %%mm2 \n\t"
1668 "pmaddwd %%mm6, %%mm3 \n\t"
1669 #ifndef FAST_BGR2YV12
1670 "psrad $8, %%mm0 \n\t"
1671 "psrad $8, %%mm1 \n\t"
1672 "psrad $8, %%mm2 \n\t"
1673 "psrad $8, %%mm3 \n\t"
1675 "packssdw %%mm1, %%mm0 \n\t"
1676 "packssdw %%mm3, %%mm2 \n\t"
1677 "pmaddwd %%mm5, %%mm0 \n\t"
1678 "pmaddwd %%mm5, %%mm2 \n\t"
1679 "packssdw %%mm2, %%mm0 \n\t"
1680 "psraw $7, %%mm0 \n\t"
1682 "movd 12(%0, %%ebx), %%mm4 \n\t"
1683 "movd 15(%0, %%ebx), %%mm1 \n\t"
1684 "punpcklbw %%mm7, %%mm4 \n\t"
1685 "punpcklbw %%mm7, %%mm1 \n\t"
1686 "movd 18(%0, %%ebx), %%mm2 \n\t"
1687 "movd 21(%0, %%ebx), %%mm3 \n\t"
1688 "punpcklbw %%mm7, %%mm2 \n\t"
1689 "punpcklbw %%mm7, %%mm3 \n\t"
1690 "pmaddwd %%mm6, %%mm4 \n\t"
1691 "pmaddwd %%mm6, %%mm1 \n\t"
1692 "pmaddwd %%mm6, %%mm2 \n\t"
1693 "pmaddwd %%mm6, %%mm3 \n\t"
1694 #ifndef FAST_BGR2YV12
1695 "psrad $8, %%mm4 \n\t"
1696 "psrad $8, %%mm1 \n\t"
1697 "psrad $8, %%mm2 \n\t"
1698 "psrad $8, %%mm3 \n\t"
1700 "packssdw %%mm1, %%mm4 \n\t"
1701 "packssdw %%mm3, %%mm2 \n\t"
1702 "pmaddwd %%mm5, %%mm4 \n\t"
1703 "pmaddwd %%mm5, %%mm2 \n\t"
1704 "addl $24, %%ebx \n\t"
1705 "packssdw %%mm2, %%mm4 \n\t"
1706 "psraw $7, %%mm4 \n\t"
1708 "packuswb %%mm4, %%mm0 \n\t"
1709 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1711 "movq %%mm0, (%1, %%eax) \n\t"
1712 "addl $8, %%eax \n\t"
1714 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1719 for(i=0; i<width; i++)
1725 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1730 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1734 "movl %4, %%eax \n\t"
1735 "movq "MANGLE(w1111)", %%mm5 \n\t"
1736 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1737 "pxor %%mm7, %%mm7 \n\t"
1738 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1739 "addl %%ebx, %%ebx \n\t"
1742 PREFETCH" 64(%0, %%ebx) \n\t"
1743 PREFETCH" 64(%1, %%ebx) \n\t"
1744 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1745 "movq (%0, %%ebx), %%mm0 \n\t"
1746 "movq (%1, %%ebx), %%mm1 \n\t"
1747 "movq 6(%0, %%ebx), %%mm2 \n\t"
1748 "movq 6(%1, %%ebx), %%mm3 \n\t"
1751 "movq %%mm0, %%mm1 \n\t"
1752 "movq %%mm2, %%mm3 \n\t"
1753 "psrlq $24, %%mm0 \n\t"
1754 "psrlq $24, %%mm2 \n\t"
1757 "punpcklbw %%mm7, %%mm0 \n\t"
1758 "punpcklbw %%mm7, %%mm2 \n\t"
1760 "movd (%0, %%ebx), %%mm0 \n\t"
1761 "movd (%1, %%ebx), %%mm1 \n\t"
1762 "movd 3(%0, %%ebx), %%mm2 \n\t"
1763 "movd 3(%1, %%ebx), %%mm3 \n\t"
1764 "punpcklbw %%mm7, %%mm0 \n\t"
1765 "punpcklbw %%mm7, %%mm1 \n\t"
1766 "punpcklbw %%mm7, %%mm2 \n\t"
1767 "punpcklbw %%mm7, %%mm3 \n\t"
1768 "paddw %%mm1, %%mm0 \n\t"
1769 "paddw %%mm3, %%mm2 \n\t"
1770 "paddw %%mm2, %%mm0 \n\t"
1771 "movd 6(%0, %%ebx), %%mm4 \n\t"
1772 "movd 6(%1, %%ebx), %%mm1 \n\t"
1773 "movd 9(%0, %%ebx), %%mm2 \n\t"
1774 "movd 9(%1, %%ebx), %%mm3 \n\t"
1775 "punpcklbw %%mm7, %%mm4 \n\t"
1776 "punpcklbw %%mm7, %%mm1 \n\t"
1777 "punpcklbw %%mm7, %%mm2 \n\t"
1778 "punpcklbw %%mm7, %%mm3 \n\t"
1779 "paddw %%mm1, %%mm4 \n\t"
1780 "paddw %%mm3, %%mm2 \n\t"
1781 "paddw %%mm4, %%mm2 \n\t"
1782 "psrlw $2, %%mm0 \n\t"
1783 "psrlw $2, %%mm2 \n\t"
1785 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1786 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1788 "pmaddwd %%mm0, %%mm1 \n\t"
1789 "pmaddwd %%mm2, %%mm3 \n\t"
1790 "pmaddwd %%mm6, %%mm0 \n\t"
1791 "pmaddwd %%mm6, %%mm2 \n\t"
1792 #ifndef FAST_BGR2YV12
1793 "psrad $8, %%mm0 \n\t"
1794 "psrad $8, %%mm1 \n\t"
1795 "psrad $8, %%mm2 \n\t"
1796 "psrad $8, %%mm3 \n\t"
1798 "packssdw %%mm2, %%mm0 \n\t"
1799 "packssdw %%mm3, %%mm1 \n\t"
1800 "pmaddwd %%mm5, %%mm0 \n\t"
1801 "pmaddwd %%mm5, %%mm1 \n\t"
1802 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1803 "psraw $7, %%mm0 \n\t"
1805 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1806 "movq 12(%0, %%ebx), %%mm4 \n\t"
1807 "movq 12(%1, %%ebx), %%mm1 \n\t"
1808 "movq 18(%0, %%ebx), %%mm2 \n\t"
1809 "movq 18(%1, %%ebx), %%mm3 \n\t"
1812 "movq %%mm4, %%mm1 \n\t"
1813 "movq %%mm2, %%mm3 \n\t"
1814 "psrlq $24, %%mm4 \n\t"
1815 "psrlq $24, %%mm2 \n\t"
1818 "punpcklbw %%mm7, %%mm4 \n\t"
1819 "punpcklbw %%mm7, %%mm2 \n\t"
1821 "movd 12(%0, %%ebx), %%mm4 \n\t"
1822 "movd 12(%1, %%ebx), %%mm1 \n\t"
1823 "movd 15(%0, %%ebx), %%mm2 \n\t"
1824 "movd 15(%1, %%ebx), %%mm3 \n\t"
1825 "punpcklbw %%mm7, %%mm4 \n\t"
1826 "punpcklbw %%mm7, %%mm1 \n\t"
1827 "punpcklbw %%mm7, %%mm2 \n\t"
1828 "punpcklbw %%mm7, %%mm3 \n\t"
1829 "paddw %%mm1, %%mm4 \n\t"
1830 "paddw %%mm3, %%mm2 \n\t"
1831 "paddw %%mm2, %%mm4 \n\t"
1832 "movd 18(%0, %%ebx), %%mm5 \n\t"
1833 "movd 18(%1, %%ebx), %%mm1 \n\t"
1834 "movd 21(%0, %%ebx), %%mm2 \n\t"
1835 "movd 21(%1, %%ebx), %%mm3 \n\t"
1836 "punpcklbw %%mm7, %%mm5 \n\t"
1837 "punpcklbw %%mm7, %%mm1 \n\t"
1838 "punpcklbw %%mm7, %%mm2 \n\t"
1839 "punpcklbw %%mm7, %%mm3 \n\t"
1840 "paddw %%mm1, %%mm5 \n\t"
1841 "paddw %%mm3, %%mm2 \n\t"
1842 "paddw %%mm5, %%mm2 \n\t"
1843 "movq "MANGLE(w1111)", %%mm5 \n\t"
1844 "psrlw $2, %%mm4 \n\t"
1845 "psrlw $2, %%mm2 \n\t"
1847 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1848 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1850 "pmaddwd %%mm4, %%mm1 \n\t"
1851 "pmaddwd %%mm2, %%mm3 \n\t"
1852 "pmaddwd %%mm6, %%mm4 \n\t"
1853 "pmaddwd %%mm6, %%mm2 \n\t"
1854 #ifndef FAST_BGR2YV12
1855 "psrad $8, %%mm4 \n\t"
1856 "psrad $8, %%mm1 \n\t"
1857 "psrad $8, %%mm2 \n\t"
1858 "psrad $8, %%mm3 \n\t"
1860 "packssdw %%mm2, %%mm4 \n\t"
1861 "packssdw %%mm3, %%mm1 \n\t"
1862 "pmaddwd %%mm5, %%mm4 \n\t"
1863 "pmaddwd %%mm5, %%mm1 \n\t"
1864 "addl $24, %%ebx \n\t"
1865 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1866 "psraw $7, %%mm4 \n\t"
1868 "movq %%mm0, %%mm1 \n\t"
1869 "punpckldq %%mm4, %%mm0 \n\t"
1870 "punpckhdq %%mm4, %%mm1 \n\t"
1871 "packsswb %%mm1, %%mm0 \n\t"
1872 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1874 "movd %%mm0, (%2, %%eax) \n\t"
1875 "punpckhdq %%mm0, %%mm0 \n\t"
1876 "movd %%mm0, (%3, %%eax) \n\t"
1877 "addl $4, %%eax \n\t"
1879 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1884 for(i=0; i<width; i++)
1886 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1887 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1888 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1890 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1891 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1896 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1899 for(i=0; i<width; i++)
1901 int d= src[i*2] + (src[i*2+1]<<8);
1904 int r= (d>>11)&0x1F;
1906 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1910 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1913 for(i=0; i<width; i++)
1916 int d0= le2me_32( ((uint32_t*)src1)[i] );
1917 int d1= le2me_32( ((uint32_t*)src2)[i] );
1919 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1920 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1922 int dh2= (dh>>11) + (dh<<21);
1926 int r= (d>>11)&0x7F;
1929 int d0= src1[i*4] + (src1[i*4+1]<<8);
1931 int g0= (d0>>5)&0x3F;
1932 int r0= (d0>>11)&0x1F;
1934 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1936 int g1= (d1>>5)&0x3F;
1937 int r1= (d1>>11)&0x1F;
1939 int d2= src2[i*4] + (src2[i*4+1]<<8);
1941 int g2= (d2>>5)&0x3F;
1942 int r2= (d2>>11)&0x1F;
1944 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1946 int g3= (d3>>5)&0x3F;
1947 int r3= (d3>>11)&0x1F;
1949 int b= b0 + b1 + b2 + b3;
1950 int g= g0 + g1 + g2 + g3;
1951 int r= r0 + r1 + r2 + r3;
1953 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1954 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1958 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1961 for(i=0; i<width; i++)
1963 int d= src[i*2] + (src[i*2+1]<<8);
1966 int r= (d>>10)&0x1F;
1968 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1972 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1975 for(i=0; i<width; i++)
1978 int d0= le2me_32( ((uint32_t*)src1)[i] );
1979 int d1= le2me_32( ((uint32_t*)src2)[i] );
1981 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1982 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1984 int dh2= (dh>>11) + (dh<<21);
1988 int r= (d>>10)&0x7F;
1991 int d0= src1[i*4] + (src1[i*4+1]<<8);
1993 int g0= (d0>>5)&0x1F;
1994 int r0= (d0>>10)&0x1F;
1996 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1998 int g1= (d1>>5)&0x1F;
1999 int r1= (d1>>10)&0x1F;
2001 int d2= src2[i*4] + (src2[i*4+1]<<8);
2003 int g2= (d2>>5)&0x1F;
2004 int r2= (d2>>10)&0x1F;
2006 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
2008 int g3= (d3>>5)&0x1F;
2009 int r3= (d3>>10)&0x1F;
2011 int b= b0 + b1 + b2 + b3;
2012 int g= g0 + g1 + g2 + g3;
2013 int r= r0 + r1 + r2 + r3;
2015 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
2016 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
2021 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2024 for(i=0; i<width; i++)
2030 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2034 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2037 for(i=0; i<width; i++)
2039 int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
2040 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
2041 int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
2043 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2044 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2048 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2051 for(i=0; i<width; i++)
2057 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2061 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2064 for(i=0; i<width; i++)
2066 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2067 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2068 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2070 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2071 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2076 // Bilinear / Bicubic scaling
2077 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2078 int16_t *filter, int16_t *filterPos, int filterSize)
2081 if(filterSize==4) // allways true for upscaling, sometimes for down too
2083 int counter= -2*dstW;
2085 filterPos-= counter/2;
2088 "pxor %%mm7, %%mm7 \n\t"
2089 "movq "MANGLE(w02)", %%mm6 \n\t"
2090 "pushl %%ebp \n\t" // we use 7 regs here ...
2091 "movl %%eax, %%ebp \n\t"
2094 "movzwl (%2, %%ebp), %%eax \n\t"
2095 "movzwl 2(%2, %%ebp), %%ebx \n\t"
2096 "movq (%1, %%ebp, 4), %%mm1 \n\t"
2097 "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
2098 "movd (%3, %%eax), %%mm0 \n\t"
2099 "movd (%3, %%ebx), %%mm2 \n\t"
2100 "punpcklbw %%mm7, %%mm0 \n\t"
2101 "punpcklbw %%mm7, %%mm2 \n\t"
2102 "pmaddwd %%mm1, %%mm0 \n\t"
2103 "pmaddwd %%mm2, %%mm3 \n\t"
2104 "psrad $8, %%mm0 \n\t"
2105 "psrad $8, %%mm3 \n\t"
2106 "packssdw %%mm3, %%mm0 \n\t"
2107 "pmaddwd %%mm6, %%mm0 \n\t"
2108 "packssdw %%mm0, %%mm0 \n\t"
2109 "movd %%mm0, (%4, %%ebp) \n\t"
2110 "addl $4, %%ebp \n\t"
2115 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2119 else if(filterSize==8)
2121 int counter= -2*dstW;
2123 filterPos-= counter/2;
2126 "pxor %%mm7, %%mm7 \n\t"
2127 "movq "MANGLE(w02)", %%mm6 \n\t"
2128 "pushl %%ebp \n\t" // we use 7 regs here ...
2129 "movl %%eax, %%ebp \n\t"
2132 "movzwl (%2, %%ebp), %%eax \n\t"
2133 "movzwl 2(%2, %%ebp), %%ebx \n\t"
2134 "movq (%1, %%ebp, 8), %%mm1 \n\t"
2135 "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
2136 "movd (%3, %%eax), %%mm0 \n\t"
2137 "movd (%3, %%ebx), %%mm2 \n\t"
2138 "punpcklbw %%mm7, %%mm0 \n\t"
2139 "punpcklbw %%mm7, %%mm2 \n\t"
2140 "pmaddwd %%mm1, %%mm0 \n\t"
2141 "pmaddwd %%mm2, %%mm3 \n\t"
2143 "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
2144 "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
2145 "movd 4(%3, %%eax), %%mm4 \n\t"
2146 "movd 4(%3, %%ebx), %%mm2 \n\t"
2147 "punpcklbw %%mm7, %%mm4 \n\t"
2148 "punpcklbw %%mm7, %%mm2 \n\t"
2149 "pmaddwd %%mm1, %%mm4 \n\t"
2150 "pmaddwd %%mm2, %%mm5 \n\t"
2151 "paddd %%mm4, %%mm0 \n\t"
2152 "paddd %%mm5, %%mm3 \n\t"
2154 "psrad $8, %%mm0 \n\t"
2155 "psrad $8, %%mm3 \n\t"
2156 "packssdw %%mm3, %%mm0 \n\t"
2157 "pmaddwd %%mm6, %%mm0 \n\t"
2158 "packssdw %%mm0, %%mm0 \n\t"
2159 "movd %%mm0, (%4, %%ebp) \n\t"
2160 "addl $4, %%ebp \n\t"
2165 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2171 int counter= -2*dstW;
2172 // filter-= counter*filterSize/2;
2173 filterPos-= counter/2;
2176 "pxor %%mm7, %%mm7 \n\t"
2177 "movq "MANGLE(w02)", %%mm6 \n\t"
2180 "movl %2, %%ecx \n\t"
2181 "movzwl (%%ecx, %0), %%eax \n\t"
2182 "movzwl 2(%%ecx, %0), %%ebx \n\t"
2183 "movl %5, %%ecx \n\t"
2184 "pxor %%mm4, %%mm4 \n\t"
2185 "pxor %%mm5, %%mm5 \n\t"
2187 "movq (%1), %%mm1 \n\t"
2188 "movq (%1, %6), %%mm3 \n\t"
2189 "movd (%%ecx, %%eax), %%mm0 \n\t"
2190 "movd (%%ecx, %%ebx), %%mm2 \n\t"
2191 "punpcklbw %%mm7, %%mm0 \n\t"
2192 "punpcklbw %%mm7, %%mm2 \n\t"
2193 "pmaddwd %%mm1, %%mm0 \n\t"
2194 "pmaddwd %%mm2, %%mm3 \n\t"
2195 "paddd %%mm3, %%mm5 \n\t"
2196 "paddd %%mm0, %%mm4 \n\t"
2198 "addl $4, %%ecx \n\t"
2199 "cmpl %4, %%ecx \n\t"
2202 "psrad $8, %%mm4 \n\t"
2203 "psrad $8, %%mm5 \n\t"
2204 "packssdw %%mm5, %%mm4 \n\t"
2205 "pmaddwd %%mm6, %%mm4 \n\t"
2206 "packssdw %%mm4, %%mm4 \n\t"
2207 "movl %3, %%eax \n\t"
2208 "movd %%mm4, (%%eax, %0) \n\t"
2212 : "+r" (counter), "+r" (filter)
2213 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2214 "m" (src), "r" (filterSize*2)
2215 : "%ebx", "%eax", "%ecx"
2220 for(i=0; i<dstW; i++)
2223 int srcPos= filterPos[i];
2225 // printf("filterPos: %d\n", filterPos[i]);
2226 for(j=0; j<filterSize; j++)
2228 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2229 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2231 // filter += hFilterSize;
2232 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2237 // *** horizontal scale Y line to temp buffer
2238 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2239 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2240 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2241 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2242 int32_t *mmx2FilterPos)
2244 if(srcFormat==IMGFMT_YUY2)
2246 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2247 src= formatConvBuffer;
2249 else if(srcFormat==IMGFMT_BGR32)
2251 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2252 src= formatConvBuffer;
2254 else if(srcFormat==IMGFMT_BGR24)
2256 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2257 src= formatConvBuffer;
2259 else if(srcFormat==IMGFMT_BGR16)
2261 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2262 src= formatConvBuffer;
2264 else if(srcFormat==IMGFMT_BGR15)
2266 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2267 src= formatConvBuffer;
2269 else if(srcFormat==IMGFMT_RGB32)
2271 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2272 src= formatConvBuffer;
2274 else if(srcFormat==IMGFMT_RGB24)
2276 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2277 src= formatConvBuffer;
2281 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
2282 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2284 if(!(flags&SWS_FAST_BILINEAR))
2287 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2289 else // Fast Bilinear upscale / crap downscale
2297 "pxor %%mm7, %%mm7 \n\t"
2298 "movl %0, %%ecx \n\t"
2299 "movl %1, %%edi \n\t"
2300 "movl %2, %%edx \n\t"
2301 "movl %3, %%ebx \n\t"
2302 "xorl %%eax, %%eax \n\t" // i
2303 PREFETCH" (%%ecx) \n\t"
2304 PREFETCH" 32(%%ecx) \n\t"
2305 PREFETCH" 64(%%ecx) \n\t"
2307 #define FUNNY_Y_CODE \
2308 "movl (%%ebx), %%esi \n\t"\
2310 "addl (%%ebx, %%eax), %%ecx \n\t"\
2311 "addl %%eax, %%edi \n\t"\
2312 "xorl %%eax, %%eax \n\t"\
2323 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2325 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2327 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2332 //NO MMX just normal asm ...
2334 "xorl %%eax, %%eax \n\t" // i
2335 "xorl %%ebx, %%ebx \n\t" // xx
2336 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2339 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2340 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2341 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2342 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2343 "shll $16, %%edi \n\t"
2344 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2345 "movl %1, %%edi \n\t"
2346 "shrl $9, %%esi \n\t"
2347 "movw %%si, (%%edi, %%eax, 2) \n\t"
2348 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2349 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2351 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2352 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2353 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2354 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2355 "shll $16, %%edi \n\t"
2356 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2357 "movl %1, %%edi \n\t"
2358 "shrl $9, %%esi \n\t"
2359 "movw %%si, 2(%%edi, %%eax, 2) \n\t"
2360 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2361 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2364 "addl $2, %%eax \n\t"
2365 "cmpl %2, %%eax \n\t"
2369 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2370 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2373 } //if MMX2 cant be used
2377 unsigned int xpos=0;
2378 for(i=0;i<dstWidth;i++)
2380 register unsigned int xx=xpos>>16;
2381 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2382 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
/*
 * RENAME(hcscale): horizontal scaling of one chroma (U/V) line pair.
 *
 * dst        - 16bit output buffer; U goes to dst[0..dstWidth-1],
 *              V to dst[2048..] (the two planes live 2048 entries apart)
 * dstWidth   - number of output samples per plane
 * src1/src2  - 8bit input U / V lines
 * srcW, xInc - input width and 16.16 fixed-point horizontal step
 *
 * Packed (YUY2) and RGB/BGR inputs are first converted to planar 8bit
 * chroma in formatConvBuffer (U at offset 0, V at offset 2048).
 * NOTE(review): some brace / preprocessor lines of the original file are
 * not visible in this excerpt; all code lines below are kept exactly as-is.
 */
2389 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2390 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2391 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2392 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2393 int32_t *mmx2FilterPos)
/* input format conversion: turn packed/RGB sources into planar 8bit chroma */
2395 if(srcFormat==IMGFMT_YUY2)
2397 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2398 src1= formatConvBuffer;
2399 src2= formatConvBuffer+2048;
2401 else if(srcFormat==IMGFMT_BGR32)
2403 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2404 src1= formatConvBuffer;
2405 src2= formatConvBuffer+2048;
2407 else if(srcFormat==IMGFMT_BGR24)
2409 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2410 src1= formatConvBuffer;
2411 src2= formatConvBuffer+2048;
2413 else if(srcFormat==IMGFMT_BGR16)
2415 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2416 src1= formatConvBuffer;
2417 src2= formatConvBuffer+2048;
2419 else if(srcFormat==IMGFMT_BGR15)
2421 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2422 src1= formatConvBuffer;
2423 src2= formatConvBuffer+2048;
2425 else if(srcFormat==IMGFMT_RGB32)
2427 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2428 src1= formatConvBuffer;
2429 src2= formatConvBuffer+2048;
2431 else if(srcFormat==IMGFMT_RGB24)
2433 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2434 src1= formatConvBuffer;
2435 src2= formatConvBuffer+2048;
2437 else if(isGray(srcFormat))
/* gray input has no chroma; the branch body is on lines not visible in
   this excerpt -- presumably an early return, TODO confirm */
2443 // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
2444 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2446 if(!(flags&SWS_FAST_BILINEAR))
/* generic filter path: run the precomputed horizontal convolution over
   both chroma planes independently (V output 2048 entries after U) */
2449 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2450 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2452 else // Fast Bilinear upscale / crap downscale
/* MMX2 fast-bilinear path: executes run-time generated scaler code
   (funnyUVCode); this first chunk sets up registers for the U plane */
2460 "pxor %%mm7, %%mm7 \n\t"
2461 "movl %0, %%ecx \n\t"
2462 "movl %1, %%edi \n\t"
2463 "movl %2, %%edx \n\t"
2464 "movl %3, %%ebx \n\t"
2465 "xorl %%eax, %%eax \n\t" // i
2466 PREFETCH" (%%ecx) \n\t"
2467 PREFETCH" 32(%%ecx) \n\t"
2468 PREFETCH" 64(%%ecx) \n\t"
/* per-invocation glue around the generated code: reload the filter-pos
   table entry and advance src/dst pointers (the transfer into funnyUVCode
   itself is on lines not visible in this excerpt) */
2470 #define FUNNY_UV_CODE \
2471 "movl (%%ebx), %%esi \n\t"\
2473 "addl (%%ebx, %%eax), %%ecx \n\t"\
2474 "addl %%eax, %%edi \n\t"\
2475 "xorl %%eax, %%eax \n\t"\
2481 "xorl %%eax, %%eax \n\t" // i
2482 "movl %5, %%ecx \n\t" // src2: second chroma plane input
2483 "movl %1, %%edi \n\t" // buf1
2484 "addl $4096, %%edi \n\t" // +4096 bytes = dst+2048 words (V plane)
2485 PREFETCH" (%%ecx) \n\t"
2486 PREFETCH" 32(%%ecx) \n\t"
2487 PREFETCH" 64(%%ecx) \n\t"
2494 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2495 "m" (funnyUVCode), "m" (src2)
2496 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
/* fix the tail: output entries whose source position would read past
   srcW-1 get the replicated last input pixel; *128 == <<7 matches the
   16bit intermediate scale used elsewhere */
2498 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2500 // printf("%d %d %d\n", dstWidth, i, srcW);
2501 dst[i] = src1[srcW-1]*128;
2502 dst[i+2048] = src2[srcW-1]*128;
/* non-MMX2 x86 path: fixed-point bilinear in integer registers;
   eax=i, ebx=xx (integer source pos), ecx=fractional accumulator */
2509 "xorl %%eax, %%eax \n\t" // i
2510 "xorl %%ebx, %%ebx \n\t" // xx
2511 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
/* U plane sample: ((src[xx]<<16) + (src[xx+1]-src[xx])*frac) >> 9 */
2514 "movl %0, %%esi \n\t"
2515 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
2516 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
2517 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2518 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2519 "shll $16, %%edi \n\t"
2520 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2521 "movl %1, %%edi \n\t"
2522 "shrl $9, %%esi \n\t"
2523 "movw %%si, (%%edi, %%eax, 2) \n\t"
/* V plane sample, stored 4096 bytes (2048 words) further into dst */
2525 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
2526 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
2527 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2528 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2529 "shll $16, %%edi \n\t"
2530 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2531 "movl %1, %%edi \n\t"
2532 "shrl $9, %%esi \n\t"
2533 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
/* advance position: add fractional step into cx, propagate the carry
   into the integer position xx */
2535 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2536 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2537 "addl $1, %%eax \n\t"
2538 "cmpl %2, %%eax \n\t"
2541 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
2543 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2546 } // if MMX2 can't be used
/* plain C fallback (non-x86): 16.16 fixed-point bilinear interpolation */
2550 unsigned int xpos=0;
2551 for(i=0;i<dstWidth;i++)
2553 register unsigned int xx=xpos>>16;
2554 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2555 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2556 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* NOTE(review): the pair above and the pair below are alternative
   implementations of the same interpolation; the preprocessor conditional
   selecting between them is on lines not visible in this excerpt */
2558 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2559 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2567 static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
2568 int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
2570 /* load a few things into local vars to make the code more readable and faster */
2571 const int srcW= c->srcW;
2572 const int dstW= c->dstW;
2573 const int dstH= c->dstH;
2574 const int chrDstW= c->chrDstW;
2575 const int lumXInc= c->lumXInc;
2576 const int chrXInc= c->chrXInc;
2577 const int dstFormat= c->dstFormat;
2578 const int flags= c->flags;
2579 const int canMMX2BeUsed= c->canMMX2BeUsed;
2580 int16_t *vLumFilterPos= c->vLumFilterPos;
2581 int16_t *vChrFilterPos= c->vChrFilterPos;
2582 int16_t *hLumFilterPos= c->hLumFilterPos;
2583 int16_t *hChrFilterPos= c->hChrFilterPos;
2584 int16_t *vLumFilter= c->vLumFilter;
2585 int16_t *vChrFilter= c->vChrFilter;
2586 int16_t *hLumFilter= c->hLumFilter;
2587 int16_t *hChrFilter= c->hChrFilter;
2588 int16_t *lumMmxFilter= c->lumMmxFilter;
2589 int16_t *chrMmxFilter= c->chrMmxFilter;
2590 const int vLumFilterSize= c->vLumFilterSize;
2591 const int vChrFilterSize= c->vChrFilterSize;
2592 const int hLumFilterSize= c->hLumFilterSize;
2593 const int hChrFilterSize= c->hChrFilterSize;
2594 int16_t **lumPixBuf= c->lumPixBuf;
2595 int16_t **chrPixBuf= c->chrPixBuf;
2596 const int vLumBufSize= c->vLumBufSize;
2597 const int vChrBufSize= c->vChrBufSize;
2598 uint8_t *funnyYCode= c->funnyYCode;
2599 uint8_t *funnyUVCode= c->funnyUVCode;
2600 uint8_t *formatConvBuffer= c->formatConvBuffer;
2602 /* vars which will change and which we need to store back in the context */
2604 int lumBufIndex= c->lumBufIndex;
2605 int chrBufIndex= c->chrBufIndex;
2606 int lastInLumBuf= c->lastInLumBuf;
2607 int lastInChrBuf= c->lastInChrBuf;
2613 if(c->srcFormat == IMGFMT_I420){
2614 src[0]= srcParam[0];
2615 src[1]= srcParam[2];
2616 src[2]= srcParam[1];
2617 srcStride[0]= srcStrideParam[0];
2618 srcStride[1]= srcStrideParam[2];
2619 srcStride[2]= srcStrideParam[1];
2621 else if(c->srcFormat==IMGFMT_YV12){
2622 src[0]= srcParam[0];
2623 src[1]= srcParam[1];
2624 src[2]= srcParam[2];
2625 srcStride[0]= srcStrideParam[0];
2626 srcStride[1]= srcStrideParam[1];
2627 srcStride[2]= srcStrideParam[2];
2629 else if(isPacked(c->srcFormat)){
2632 src[2]= srcParam[0];
2633 srcStride[0]= srcStrideParam[0];
2635 srcStride[2]= srcStrideParam[0]<<1;
2637 else if(isGray(c->srcFormat)){
2638 src[0]= srcParam[0];
2641 srcStride[0]= srcStrideParam[0];
2646 if(dstFormat == IMGFMT_I420){
2647 dst[0]= dstParam[0];
2648 dst[1]= dstParam[2];
2649 dst[2]= dstParam[1];
2650 dstStride[0]= dstStrideParam[0];
2651 dstStride[1]= dstStrideParam[2];
2652 dstStride[2]= dstStrideParam[1];
2654 dst[0]= dstParam[0];
2655 dst[1]= dstParam[1];
2656 dst[2]= dstParam[2];
2657 dstStride[0]= dstStrideParam[0];
2658 dstStride[1]= dstStrideParam[1];
2659 dstStride[2]= dstStrideParam[2];
2662 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2663 //dstStride[0],dstStride[1],dstStride[2]);
2665 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2667 static int firstTime=1; //FIXME move this into the context perhaps
2668 if(flags & SWS_PRINT_INFO && firstTime)
2670 mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: Warning: dstStride is not aligned!\n"
2671 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2676 /* Note the user might start scaling the picture in the middle so this will not get executed
2677 this is not really intended but works currently, so ppl might do it */
2686 for(;dstY < dstH; dstY++){
2687 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2688 unsigned char *uDest=dst[1]+dstStride[1]*(dstY>>1);
2689 unsigned char *vDest=dst[2]+dstStride[2]*(dstY>>1);
2690 const int chrDstY= isHalfChrV(dstFormat) ? (dstY>>1) : dstY;
2692 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2693 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2694 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2695 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2697 //handle holes (FAST_BILINEAR & weird filters)
2698 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2699 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2700 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2701 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2702 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2704 // Do we have enough lines in this slice to output the dstY line
2705 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH + 1)>>1))
2707 //Do horizontal scaling
2708 while(lastInLumBuf < lastLumSrcY)
2710 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2712 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2713 ASSERT(lumBufIndex < 2*vLumBufSize)
2714 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2715 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2716 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2717 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2718 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2719 funnyYCode, c->srcFormat, formatConvBuffer,
2720 c->lumMmx2Filter, c->lumMmx2FilterPos);
2723 while(lastInChrBuf < lastChrSrcY)
2725 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
2726 uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
2728 ASSERT(chrBufIndex < 2*vChrBufSize)
2729 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1))
2730 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2731 //FIXME replace parameters through context struct (some at least)
2732 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
2733 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2734 funnyUVCode, c->srcFormat, formatConvBuffer,
2735 c->chrMmx2Filter, c->chrMmx2FilterPos);
2738 //wrap buf index around to stay inside the ring buffer
2739 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2740 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2742 else // not enough lines left in this slice -> load the rest in the buffer
2744 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2745 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2746 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2747 vChrBufSize, vLumBufSize);
2749 //Do horizontal scaling
2750 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2752 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2754 ASSERT(lumBufIndex < 2*vLumBufSize)
2755 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2756 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2757 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2758 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2759 funnyYCode, c->srcFormat, formatConvBuffer,
2760 c->lumMmx2Filter, c->lumMmx2FilterPos);
2763 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
2765 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
2766 uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
2768 ASSERT(chrBufIndex < 2*vChrBufSize)
2769 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1))
2770 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2771 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
2772 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2773 funnyUVCode, c->srcFormat, formatConvBuffer,
2774 c->chrMmx2Filter, c->chrMmx2FilterPos);
2777 //wrap buf index around to stay inside the ring buffer
2778 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2779 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2780 break; //we cant output a dstY line so lets try with the next slice
2784 b5Dither= dither8[dstY&1];
2785 g6Dither= dither4[dstY&1];
2786 g5Dither= dither8[dstY&1];
2787 r5Dither= dither8[(dstY+1)&1];
2791 if(isPlanarYUV(dstFormat)) //YV12 like
2793 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2794 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2796 int16_t *lumBuf = lumPixBuf[0];
2797 int16_t *chrBuf= chrPixBuf[0];
2798 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
2802 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2803 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2805 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2806 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2807 dest, uDest, vDest, dstW,
2808 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
2813 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2814 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2816 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2817 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2818 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2820 int chrAlpha= vChrFilter[2*dstY+1];
2822 RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2823 dest, dstW, chrAlpha, dstFormat, flags);
2825 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2827 int lumAlpha= vLumFilter[2*dstY+1];
2828 int chrAlpha= vChrFilter[2*dstY+1];
2830 RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2831 dest, dstW, lumAlpha, chrAlpha, dstFormat, flags);
2836 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2837 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2838 dest, dstW, dstFormat,
2839 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
2843 else // hmm looks like we cant use MMX here without overwriting this arrays tail
2845 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2846 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2847 if(isPlanarYUV(dstFormat)) //YV12
2849 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2851 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2852 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2853 dest, uDest, vDest, dstW);
2857 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2858 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2860 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2861 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2862 dest, dstW, dstFormat);
2868 __asm __volatile(SFENCE:::"memory");
2869 __asm __volatile(EMMS:::"memory");
2871 /* store changed local vars back in the context */
2873 c->lumBufIndex= lumBufIndex;
2874 c->chrBufIndex= chrBufIndex;
2875 c->lastInLumBuf= lastInLumBuf;
2876 c->lastInChrBuf= lastInChrBuf;