2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
35 #define PREFETCH "prefetch"
36 #define PREFETCHW "prefetchw"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
46 #define SFENCE "sfence"
52 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53 #elif defined (HAVE_3DNOW)
54 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
58 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
60 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
62 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
65 #include "swscale_altivec_template.c"
68 #define YSCALEYUV2YV12X(x, offset) \
69 "xor %%"REG_a", %%"REG_a" \n\t"\
70 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71 "movq %%mm3, %%mm4 \n\t"\
72 "lea " offset "(%0), %%"REG_d" \n\t"\
73 "mov (%%"REG_d"), %%"REG_S" \n\t"\
74 ".balign 16 \n\t" /* FIXME Unroll? */\
76 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
77 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79 "add $16, %%"REG_d" \n\t"\
80 "mov (%%"REG_d"), %%"REG_S" \n\t"\
81 "test %%"REG_S", %%"REG_S" \n\t"\
82 "pmulhw %%mm0, %%mm2 \n\t"\
83 "pmulhw %%mm0, %%mm5 \n\t"\
84 "paddw %%mm2, %%mm3 \n\t"\
85 "paddw %%mm5, %%mm4 \n\t"\
87 "psraw $3, %%mm3 \n\t"\
88 "psraw $3, %%mm4 \n\t"\
89 "packuswb %%mm4, %%mm3 \n\t"\
90 MOVNTQ(%%mm3, (%1, %%REGa))\
91 "add $8, %%"REG_a" \n\t"\
92 "cmp %2, %%"REG_a" \n\t"\
93 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94 "movq %%mm3, %%mm4 \n\t"\
95 "lea " offset "(%0), %%"REG_d" \n\t"\
96 "mov (%%"REG_d"), %%"REG_S" \n\t"\
99 #define YSCALEYUV2YV121 \
100 "mov %2, %%"REG_a" \n\t"\
101 ".balign 16 \n\t" /* FIXME Unroll? */\
103 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
105 "psraw $7, %%mm0 \n\t"\
106 "psraw $7, %%mm1 \n\t"\
107 "packuswb %%mm1, %%mm0 \n\t"\
108 MOVNTQ(%%mm0, (%1, %%REGa))\
109 "add $8, %%"REG_a" \n\t"\
113 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
114 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115 "r" (dest), "m" (dstW),
116 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
119 #define YSCALEYUV2PACKEDX \
120 "xor %%"REG_a", %%"REG_a" \n\t"\
124 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125 "mov (%%"REG_d"), %%"REG_S" \n\t"\
126 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127 "movq %%mm3, %%mm4 \n\t"\
130 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
131 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
132 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133 "add $16, %%"REG_d" \n\t"\
134 "mov (%%"REG_d"), %%"REG_S" \n\t"\
135 "pmulhw %%mm0, %%mm2 \n\t"\
136 "pmulhw %%mm0, %%mm5 \n\t"\
137 "paddw %%mm2, %%mm3 \n\t"\
138 "paddw %%mm5, %%mm4 \n\t"\
139 "test %%"REG_S", %%"REG_S" \n\t"\
142 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143 "mov (%%"REG_d"), %%"REG_S" \n\t"\
144 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145 "movq %%mm1, %%mm7 \n\t"\
148 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
149 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
150 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151 "add $16, %%"REG_d" \n\t"\
152 "mov (%%"REG_d"), %%"REG_S" \n\t"\
153 "pmulhw %%mm0, %%mm2 \n\t"\
154 "pmulhw %%mm0, %%mm5 \n\t"\
155 "paddw %%mm2, %%mm1 \n\t"\
156 "paddw %%mm5, %%mm7 \n\t"\
157 "test %%"REG_S", %%"REG_S" \n\t"\
161 #define YSCALEYUV2RGBX \
163 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
164 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
165 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
166 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
167 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
168 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
169 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
170 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
171 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
172 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
173 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
174 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
175 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
176 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177 "paddw %%mm3, %%mm4 \n\t"\
178 "movq %%mm2, %%mm0 \n\t"\
179 "movq %%mm5, %%mm6 \n\t"\
180 "movq %%mm4, %%mm3 \n\t"\
181 "punpcklwd %%mm2, %%mm2 \n\t"\
182 "punpcklwd %%mm5, %%mm5 \n\t"\
183 "punpcklwd %%mm4, %%mm4 \n\t"\
184 "paddw %%mm1, %%mm2 \n\t"\
185 "paddw %%mm1, %%mm5 \n\t"\
186 "paddw %%mm1, %%mm4 \n\t"\
187 "punpckhwd %%mm0, %%mm0 \n\t"\
188 "punpckhwd %%mm6, %%mm6 \n\t"\
189 "punpckhwd %%mm3, %%mm3 \n\t"\
190 "paddw %%mm7, %%mm0 \n\t"\
191 "paddw %%mm7, %%mm6 \n\t"\
192 "paddw %%mm7, %%mm3 \n\t"\
193 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194 "packuswb %%mm0, %%mm2 \n\t"\
195 "packuswb %%mm6, %%mm5 \n\t"\
196 "packuswb %%mm3, %%mm4 \n\t"\
197 "pxor %%mm7, %%mm7 \n\t"
199 #define FULL_YSCALEYUV2RGB \
200 "pxor %%mm7, %%mm7 \n\t"\
201 "movd %6, %%mm6 \n\t" /*yalpha1*/\
202 "punpcklwd %%mm6, %%mm6 \n\t"\
203 "punpcklwd %%mm6, %%mm6 \n\t"\
204 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
205 "punpcklwd %%mm5, %%mm5 \n\t"\
206 "punpcklwd %%mm5, %%mm5 \n\t"\
207 "xor %%"REG_a", %%"REG_a" \n\t"\
210 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
213 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
214 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
215 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
219 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
220 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
222 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
223 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
225 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
226 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
227 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
230 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
232 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
233 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
234 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
235 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
236 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
239 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
240 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
242 "paddw %%mm1, %%mm3 \n\t" /* B*/\
243 "paddw %%mm1, %%mm0 \n\t" /* R*/\
244 "packuswb %%mm3, %%mm3 \n\t"\
246 "packuswb %%mm0, %%mm0 \n\t"\
247 "paddw %%mm4, %%mm2 \n\t"\
248 "paddw %%mm2, %%mm1 \n\t" /* G*/\
250 "packuswb %%mm1, %%mm1 \n\t"
253 #define REAL_YSCALEYUV2PACKED(index, c) \
254 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256 "psraw $3, %%mm0 \n\t"\
257 "psraw $3, %%mm1 \n\t"\
258 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
260 "xor "#index", "#index" \n\t"\
263 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
264 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
265 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
267 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
269 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
270 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
273 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
274 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
275 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
276 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
277 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
278 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
280 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
281 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
282 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
284 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
285 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
286 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
289 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
291 #define REAL_YSCALEYUV2RGB(index, c) \
292 "xor "#index", "#index" \n\t"\
295 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
296 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
297 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
299 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
301 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
302 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
307 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
308 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
309 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
310 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
311 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
312 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
314 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
316 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
317 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
319 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
320 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
321 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
323 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
327 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
330 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
331 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
332 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
333 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334 "paddw %%mm3, %%mm4 \n\t"\
335 "movq %%mm2, %%mm0 \n\t"\
336 "movq %%mm5, %%mm6 \n\t"\
337 "movq %%mm4, %%mm3 \n\t"\
338 "punpcklwd %%mm2, %%mm2 \n\t"\
339 "punpcklwd %%mm5, %%mm5 \n\t"\
340 "punpcklwd %%mm4, %%mm4 \n\t"\
341 "paddw %%mm1, %%mm2 \n\t"\
342 "paddw %%mm1, %%mm5 \n\t"\
343 "paddw %%mm1, %%mm4 \n\t"\
344 "punpckhwd %%mm0, %%mm0 \n\t"\
345 "punpckhwd %%mm6, %%mm6 \n\t"\
346 "punpckhwd %%mm3, %%mm3 \n\t"\
347 "paddw %%mm7, %%mm0 \n\t"\
348 "paddw %%mm7, %%mm6 \n\t"\
349 "paddw %%mm7, %%mm3 \n\t"\
350 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351 "packuswb %%mm0, %%mm2 \n\t"\
352 "packuswb %%mm6, %%mm5 \n\t"\
353 "packuswb %%mm3, %%mm4 \n\t"\
354 "pxor %%mm7, %%mm7 \n\t"
355 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
357 #define REAL_YSCALEYUV2PACKED1(index, c) \
358 "xor "#index", "#index" \n\t"\
361 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
362 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
363 "psraw $7, %%mm3 \n\t" \
364 "psraw $7, %%mm4 \n\t" \
365 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
366 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
367 "psraw $7, %%mm1 \n\t" \
368 "psraw $7, %%mm7 \n\t" \
370 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
372 #define REAL_YSCALEYUV2RGB1(index, c) \
373 "xor "#index", "#index" \n\t"\
376 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
377 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
378 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
380 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
381 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
382 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
383 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
384 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
386 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
387 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
388 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
389 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
394 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
395 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
396 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
397 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398 "paddw %%mm3, %%mm4 \n\t"\
399 "movq %%mm2, %%mm0 \n\t"\
400 "movq %%mm5, %%mm6 \n\t"\
401 "movq %%mm4, %%mm3 \n\t"\
402 "punpcklwd %%mm2, %%mm2 \n\t"\
403 "punpcklwd %%mm5, %%mm5 \n\t"\
404 "punpcklwd %%mm4, %%mm4 \n\t"\
405 "paddw %%mm1, %%mm2 \n\t"\
406 "paddw %%mm1, %%mm5 \n\t"\
407 "paddw %%mm1, %%mm4 \n\t"\
408 "punpckhwd %%mm0, %%mm0 \n\t"\
409 "punpckhwd %%mm6, %%mm6 \n\t"\
410 "punpckhwd %%mm3, %%mm3 \n\t"\
411 "paddw %%mm7, %%mm0 \n\t"\
412 "paddw %%mm7, %%mm6 \n\t"\
413 "paddw %%mm7, %%mm3 \n\t"\
414 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415 "packuswb %%mm0, %%mm2 \n\t"\
416 "packuswb %%mm6, %%mm5 \n\t"\
417 "packuswb %%mm3, %%mm4 \n\t"\
418 "pxor %%mm7, %%mm7 \n\t"
419 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
421 #define REAL_YSCALEYUV2PACKED1b(index, c) \
422 "xor "#index", "#index" \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431 "psrlw $8, %%mm3 \n\t" \
432 "psrlw $8, %%mm4 \n\t" \
433 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
434 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
435 "psraw $7, %%mm1 \n\t" \
436 "psraw $7, %%mm7 \n\t"
437 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
439 // do vertical chrominance interpolation
440 #define REAL_YSCALEYUV2RGB1b(index, c) \
441 "xor "#index", "#index" \n\t"\
444 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
445 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
446 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
448 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
450 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
451 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
452 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
453 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
454 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
455 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
456 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
458 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
459 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
460 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
461 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
463 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
466 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
467 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
468 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
469 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470 "paddw %%mm3, %%mm4 \n\t"\
471 "movq %%mm2, %%mm0 \n\t"\
472 "movq %%mm5, %%mm6 \n\t"\
473 "movq %%mm4, %%mm3 \n\t"\
474 "punpcklwd %%mm2, %%mm2 \n\t"\
475 "punpcklwd %%mm5, %%mm5 \n\t"\
476 "punpcklwd %%mm4, %%mm4 \n\t"\
477 "paddw %%mm1, %%mm2 \n\t"\
478 "paddw %%mm1, %%mm5 \n\t"\
479 "paddw %%mm1, %%mm4 \n\t"\
480 "punpckhwd %%mm0, %%mm0 \n\t"\
481 "punpckhwd %%mm6, %%mm6 \n\t"\
482 "punpckhwd %%mm3, %%mm3 \n\t"\
483 "paddw %%mm7, %%mm0 \n\t"\
484 "paddw %%mm7, %%mm6 \n\t"\
485 "paddw %%mm7, %%mm3 \n\t"\
486 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487 "packuswb %%mm0, %%mm2 \n\t"\
488 "packuswb %%mm6, %%mm5 \n\t"\
489 "packuswb %%mm3, %%mm4 \n\t"\
490 "pxor %%mm7, %%mm7 \n\t"
491 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
493 #define REAL_WRITEBGR32(dst, dstw, index) \
494 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495 "movq %%mm2, %%mm1 \n\t" /* B */\
496 "movq %%mm5, %%mm6 \n\t" /* R */\
497 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
498 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
499 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
500 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
501 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
502 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
503 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
504 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
505 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
506 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
508 MOVNTQ(%%mm0, (dst, index, 4))\
509 MOVNTQ(%%mm2, 8(dst, index, 4))\
510 MOVNTQ(%%mm1, 16(dst, index, 4))\
511 MOVNTQ(%%mm3, 24(dst, index, 4))\
513 "add $8, "#index" \n\t"\
514 "cmp "#dstw", "#index" \n\t"\
516 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
518 #define REAL_WRITEBGR16(dst, dstw, index) \
519 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
520 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
521 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
522 "psrlq $3, %%mm2 \n\t"\
524 "movq %%mm2, %%mm1 \n\t"\
525 "movq %%mm4, %%mm3 \n\t"\
527 "punpcklbw %%mm7, %%mm3 \n\t"\
528 "punpcklbw %%mm5, %%mm2 \n\t"\
529 "punpckhbw %%mm7, %%mm4 \n\t"\
530 "punpckhbw %%mm5, %%mm1 \n\t"\
532 "psllq $3, %%mm3 \n\t"\
533 "psllq $3, %%mm4 \n\t"\
535 "por %%mm3, %%mm2 \n\t"\
536 "por %%mm4, %%mm1 \n\t"\
538 MOVNTQ(%%mm2, (dst, index, 2))\
539 MOVNTQ(%%mm1, 8(dst, index, 2))\
541 "add $8, "#index" \n\t"\
542 "cmp "#dstw", "#index" \n\t"\
544 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
546 #define REAL_WRITEBGR15(dst, dstw, index) \
547 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
548 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
549 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
550 "psrlq $3, %%mm2 \n\t"\
551 "psrlq $1, %%mm5 \n\t"\
553 "movq %%mm2, %%mm1 \n\t"\
554 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklbw %%mm7, %%mm3 \n\t"\
557 "punpcklbw %%mm5, %%mm2 \n\t"\
558 "punpckhbw %%mm7, %%mm4 \n\t"\
559 "punpckhbw %%mm5, %%mm1 \n\t"\
561 "psllq $2, %%mm3 \n\t"\
562 "psllq $2, %%mm4 \n\t"\
564 "por %%mm3, %%mm2 \n\t"\
565 "por %%mm4, %%mm1 \n\t"\
567 MOVNTQ(%%mm2, (dst, index, 2))\
568 MOVNTQ(%%mm1, 8(dst, index, 2))\
570 "add $8, "#index" \n\t"\
571 "cmp "#dstw", "#index" \n\t"\
573 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
575 #define WRITEBGR24OLD(dst, dstw, index) \
576 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577 "movq %%mm2, %%mm1 \n\t" /* B */\
578 "movq %%mm5, %%mm6 \n\t" /* R */\
579 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
580 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
581 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
582 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
583 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
584 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
585 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
586 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
587 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
588 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
590 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
591 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
592 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
594 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
595 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
596 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
597 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
599 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
600 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
601 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
602 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
603 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
604 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
605 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
606 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
608 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
609 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
610 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
611 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
613 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
614 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
615 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
616 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
618 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
619 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
620 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
622 MOVNTQ(%%mm0, (dst))\
623 MOVNTQ(%%mm2, 8(dst))\
624 MOVNTQ(%%mm3, 16(dst))\
625 "add $24, "#dst" \n\t"\
627 "add $8, "#index" \n\t"\
628 "cmp "#dstw", "#index" \n\t"\
631 #define WRITEBGR24MMX(dst, dstw, index) \
632 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633 "movq %%mm2, %%mm1 \n\t" /* B */\
634 "movq %%mm5, %%mm6 \n\t" /* R */\
635 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
636 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
637 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
638 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
639 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
640 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
641 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
642 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
643 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
644 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
646 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
647 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
648 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
649 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
651 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
652 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
653 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
654 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
656 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
657 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
658 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
659 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
661 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
662 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
663 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
664 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
665 MOVNTQ(%%mm0, (dst))\
667 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
668 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
669 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
670 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
671 MOVNTQ(%%mm6, 8(dst))\
673 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
674 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
675 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
676 MOVNTQ(%%mm5, 16(dst))\
678 "add $24, "#dst" \n\t"\
680 "add $8, "#index" \n\t"\
681 "cmp "#dstw", "#index" \n\t"\
684 #define WRITEBGR24MMX2(dst, dstw, index) \
685 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686 "movq "MANGLE(M24A)", %%mm0 \n\t"\
687 "movq "MANGLE(M24C)", %%mm7 \n\t"\
688 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
689 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
690 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
692 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
693 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
694 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
696 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
697 "por %%mm1, %%mm6 \n\t"\
698 "por %%mm3, %%mm6 \n\t"\
699 MOVNTQ(%%mm6, (dst))\
701 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
702 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
703 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
704 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
706 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
707 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
708 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
710 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
711 "por %%mm3, %%mm6 \n\t"\
712 MOVNTQ(%%mm6, 8(dst))\
714 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
715 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
716 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
718 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
719 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
720 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
722 "por %%mm1, %%mm3 \n\t"\
723 "por %%mm3, %%mm6 \n\t"\
724 MOVNTQ(%%mm6, 16(dst))\
726 "add $24, "#dst" \n\t"\
728 "add $8, "#index" \n\t"\
729 "cmp "#dstw", "#index" \n\t"\
734 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
737 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
740 #define REAL_WRITEYUY2(dst, dstw, index) \
741 "packuswb %%mm3, %%mm3 \n\t"\
742 "packuswb %%mm4, %%mm4 \n\t"\
743 "packuswb %%mm7, %%mm1 \n\t"\
744 "punpcklbw %%mm4, %%mm3 \n\t"\
745 "movq %%mm1, %%mm7 \n\t"\
746 "punpcklbw %%mm3, %%mm1 \n\t"\
747 "punpckhbw %%mm3, %%mm7 \n\t"\
749 MOVNTQ(%%mm1, (dst, index, 2))\
750 MOVNTQ(%%mm7, 8(dst, index, 2))\
752 "add $8, "#index" \n\t"\
753 "cmp "#dstw", "#index" \n\t"\
755 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
758 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
759 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
760 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
766 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
767 :: "r" (&c->redDither),
768 "r" (uDest), "m" ((long)chrDstW)
769 : "%"REG_a, "%"REG_d, "%"REG_S
773 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
774 :: "r" (&c->redDither),
775 "r" (vDest), "m" ((long)chrDstW)
776 : "%"REG_a, "%"REG_d, "%"REG_S
781 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
782 :: "r" (&c->redDither),
783 "r" (dest), "m" ((long)dstW)
784 : "%"REG_a, "%"REG_d, "%"REG_S
788 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
789 chrFilter, chrSrc, chrFilterSize,
790 dest, uDest, vDest, dstW, chrDstW);
792 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
793 chrFilter, chrSrc, chrFilterSize,
794 dest, uDest, vDest, dstW, chrDstW);
795 #endif //!HAVE_ALTIVEC
799 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
800 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
801 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
803 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
804 chrFilter, chrSrc, chrFilterSize,
805 dest, uDest, dstW, chrDstW, dstFormat);
808 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
809 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
816 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
823 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
831 :: "r" (lumSrc + dstW), "r" (dest + dstW),
837 for(i=0; i<dstW; i++)
839 int val= lumSrc[i]>>7;
850 for(i=0; i<chrDstW; i++)
853 int v=chrSrc[i + 2048]>>7;
857 else if (u>255) u=255;
859 else if (v>255) v=255;
870 * vertical scale YV12 to RGB
872 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
873 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
874 uint8_t *dest, int dstW, int dstY)
884 WRITEBGR32(%4, %5, %%REGa)
886 :: "r" (&c->redDither),
887 "m" (dummy), "m" (dummy), "m" (dummy),
888 "r" (dest), "m" (dstW)
889 : "%"REG_a, "%"REG_d, "%"REG_S
897 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
898 "add %4, %%"REG_b" \n\t"
899 WRITEBGR24(%%REGb, %5, %%REGa)
901 :: "r" (&c->redDither),
902 "m" (dummy), "m" (dummy), "m" (dummy),
903 "r" (dest), "m" (dstW)
904 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
912 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
914 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
915 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
916 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
919 WRITEBGR15(%4, %5, %%REGa)
921 :: "r" (&c->redDither),
922 "m" (dummy), "m" (dummy), "m" (dummy),
923 "r" (dest), "m" (dstW)
924 : "%"REG_a, "%"REG_d, "%"REG_S
932 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
934 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
935 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
936 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
939 WRITEBGR16(%4, %5, %%REGa)
941 :: "r" (&c->redDither),
942 "m" (dummy), "m" (dummy), "m" (dummy),
943 "r" (dest), "m" (dstW)
944 : "%"REG_a, "%"REG_d, "%"REG_S
952 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
954 "psraw $3, %%mm3 \n\t"
955 "psraw $3, %%mm4 \n\t"
956 "psraw $3, %%mm1 \n\t"
957 "psraw $3, %%mm7 \n\t"
958 WRITEYUY2(%4, %5, %%REGa)
960 :: "r" (&c->redDither),
961 "m" (dummy), "m" (dummy), "m" (dummy),
962 "r" (dest), "m" (dstW)
963 : "%"REG_a, "%"REG_d, "%"REG_S
970 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
971 chrFilter, chrSrc, chrFilterSize,
974 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
975 chrFilter, chrSrc, chrFilterSize,
983 * vertical bilinear scale YV12 to RGB
/*
 * Vertical bilinear blend of two source lines (buf0/buf1 = luma,
 * uvbuf0/uvbuf1 = chroma, V stored at offset +2048) with weights
 * yalpha/uvalpha, converted to the packed destination format and written to
 * dest.  MMX/MMX2 fast paths per output format, with a table-driven C
 * fallback and the generic YSCALE_YUV_2_ANYRGB_C macro at the end.
 * yalpha1/uvalpha1 = x ^ 4095, i.e. the complementary 12-bit weight
 * (equal to 4095 - x for x in [0,4095]) applied to the other line.
 */
985 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
986 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
988 int yalpha1=yalpha^4095;
989 int uvalpha1=uvalpha^4095;
/* full horizontal chroma resolution path (RGB32, then BGR24, variants below) */
993 if(flags&SWS_FULL_CHR_H_INT)
1003 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1004 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1006 "movq %%mm3, %%mm1 \n\t"
1007 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1008 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1010 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1011 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1013 "add $4, %%"REG_a" \n\t"
1014 "cmp %5, %%"REG_a" \n\t"
1018 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1019 "m" (yalpha1), "m" (uvalpha1)
/* BGR24 variant: repack the BGR0 dwords into contiguous 3-byte pixels */
1029 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1030 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1032 "movq %%mm3, %%mm1 \n\t"
1033 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1034 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1036 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1037 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1038 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1039 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1040 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1041 "movq %%mm1, %%mm2 \n\t"
1042 "psllq $48, %%mm1 \n\t" // 000000BG
1043 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1045 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1046 "psrld $16, %%mm2 \n\t" // R000R000
1047 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1048 "por %%mm2, %%mm1 \n\t" // RBGRR000
1050 "mov %4, %%"REG_b" \n\t"
1051 "add %%"REG_a", %%"REG_b" \n\t"
/* MMX2: streaming stores; plain-MMX fallback uses movd pieces below */
1055 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1056 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1058 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1059 "psrlq $32, %%mm3 \n\t"
1060 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1061 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1063 "add $4, %%"REG_a" \n\t"
1064 "cmp %5, %%"REG_a" \n\t"
1067 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1068 "m" (yalpha1), "m" (uvalpha1)
1069 : "%"REG_a, "%"REG_b
/* BGR15 output: add 5-bit dither, then shift/mask components into 5-5-5 */
1077 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1078 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1079 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1081 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1082 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1083 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1085 "psrlw $3, %%mm3 \n\t"
1086 "psllw $2, %%mm1 \n\t"
1087 "psllw $7, %%mm0 \n\t"
1088 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1089 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1091 "por %%mm3, %%mm1 \n\t"
1092 "por %%mm1, %%mm0 \n\t"
1094 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1096 "add $4, %%"REG_a" \n\t"
1097 "cmp %5, %%"REG_a" \n\t"
1100 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1101 "m" (yalpha1), "m" (uvalpha1)
/* BGR16 output: 6-bit green dither, then pack into 5-6-5 */
1110 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1111 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1112 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1114 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1115 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1116 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1118 "psrlw $3, %%mm3 \n\t"
1119 "psllw $3, %%mm1 \n\t"
1120 "psllw $8, %%mm0 \n\t"
1121 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1122 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1124 "por %%mm3, %%mm1 \n\t"
1125 "por %%mm1, %%mm0 \n\t"
1127 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1129 "add $4, %%"REG_a" \n\t"
1130 "cmp %5, %%"REG_a" \n\t"
1133 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1134 "m" (yalpha1), "m" (uvalpha1)
/* non-MMX fallback: per-pixel blend + lookup-table YUV->RGB per dstFormat */
1143 if(dstFormat==IMGFMT_BGR32)
1146 #ifdef WORDS_BIGENDIAN
1149 for(i=0;i<dstW;i++){
1150 // vertical linear interpolation && yuv2rgb in a single step:
1151 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1152 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1153 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1154 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1155 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1156 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1160 else if(dstFormat==IMGFMT_BGR24)
1163 for(i=0;i<dstW;i++){
1164 // vertical linear interpolation && yuv2rgb in a single step:
1165 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1166 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1167 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1168 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1169 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1170 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1174 else if(dstFormat==IMGFMT_BGR16)
1177 for(i=0;i<dstW;i++){
1178 // vertical linear interpolation && yuv2rgb in a single step:
1179 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1180 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1181 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1183 ((uint16_t*)dest)[i] =
1184 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1185 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1186 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1189 else if(dstFormat==IMGFMT_BGR15)
1192 for(i=0;i<dstW;i++){
1193 // vertical linear interpolation && yuv2rgb in a single step:
1194 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1195 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1196 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1198 ((uint16_t*)dest)[i] =
1199 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1200 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1201 clip_table15r[(Y + yuvtab_3343[V]) >>13];
/* MMX path: REG_SP is saved into the context (ESP_OFFSET) because the asm
   temporarily repurposes the stack pointer as an extra address register */
1209 switch(c->dstFormat)
1211 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1214 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1215 "mov %4, %%"REG_SP" \n\t"
1216 YSCALEYUV2RGB(%%REGa, %5)
1217 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1218 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1220 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1227 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1228 "mov %4, %%"REG_SP" \n\t"
1229 YSCALEYUV2RGB(%%REGa, %5)
1230 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1231 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1232 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1239 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1240 "mov %4, %%"REG_SP" \n\t"
1241 YSCALEYUV2RGB(%%REGa, %5)
1242 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1244 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1245 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1246 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1249 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1250 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1252 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1259 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1260 "mov %4, %%"REG_SP" \n\t"
1261 YSCALEYUV2RGB(%%REGa, %5)
1262 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1264 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1265 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1266 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1269 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1270 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1271 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1278 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1279 "mov %4, %%"REG_SP" \n\t"
1280 YSCALEYUV2PACKED(%%REGa, %5)
1281 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1282 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1283 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
/* generic C implementation for any remaining packed format */
1291 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1295 * YV12 to RGB without scaling or interpolating
/*
 * Write one YV12 line to a packed destination WITHOUT vertical scaling or
 * interpolation (yalpha1=0, yalpha=4096: only buf0 contributes to luma).
 * If uvalpha < 2048 a single chroma line is used (faster, but shifts
 * chrominance by 0.5 pixels -- see the inline note); otherwise the two
 * chroma lines are averaged (the YSCALEYUV2*1b / *1B variants).
 * Full-chroma-resolution requests are delegated to yuv2packed2.
 */
1297 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1298 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1300 const int yalpha1=0;
1303 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1304 const int yalpha= 4096; //FIXME ...
1306 if(flags&SWS_FULL_CHR_H_INT)
1308 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1313 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1319 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1320 "mov %4, %%"REG_SP" \n\t"
1321 YSCALEYUV2RGB1(%%REGa, %5)
1322 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1323 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1325 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1332 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1333 "mov %4, %%"REG_SP" \n\t"
1334 YSCALEYUV2RGB1(%%REGa, %5)
1335 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1336 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1338 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1345 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1346 "mov %4, %%"REG_SP" \n\t"
1347 YSCALEYUV2RGB1(%%REGa, %5)
1348 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1350 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1351 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1352 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1354 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1355 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1357 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1364 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1365 "mov %4, %%"REG_SP" \n\t"
1366 YSCALEYUV2RGB1(%%REGa, %5)
1367 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1369 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1370 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1371 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1374 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1375 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1377 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1384 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1385 "mov %4, %%"REG_SP" \n\t"
1386 YSCALEYUV2PACKED1(%%REGa, %5)
1387 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1388 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1390 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
/* uvalpha >= 2048: "1b" variants average uvbuf0 and uvbuf1 */
1403 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1404 "mov %4, %%"REG_SP" \n\t"
1405 YSCALEYUV2RGB1b(%%REGa, %5)
1406 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1407 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1409 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1416 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1417 "mov %4, %%"REG_SP" \n\t"
1418 YSCALEYUV2RGB1b(%%REGa, %5)
1419 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1420 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1422 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1429 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1430 "mov %4, %%"REG_SP" \n\t"
1431 YSCALEYUV2RGB1b(%%REGa, %5)
1432 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1434 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1435 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1436 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1438 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1439 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1441 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1448 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1449 "mov %4, %%"REG_SP" \n\t"
1450 YSCALEYUV2RGB1b(%%REGa, %5)
1451 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1453 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1454 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1455 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1458 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1459 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1461 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1468 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1469 "mov %4, %%"REG_SP" \n\t"
1470 YSCALEYUV2PACKED1b(%%REGa, %5)
1471 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1472 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1474 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
/* C fallback, again split on whether one or two chroma lines are used */
1482 if( uvalpha < 2048 )
1484 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1486 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1490 //FIXME yuy2* can read up to 7 samples too many
/*
 * Extract the luma plane from one YUY2 (Y U Y V) line into dst.
 * MMX path: mask the even (luma) bytes with bm01010101 and pack 16 source
 * bytes down to 8 luma bytes per iteration; the loop runs a negative index
 * up to zero ("g"((long)-width) with src/dst pre-biased by +width).
 * A plain C loop over `width` pixels is the fallback.
 */
1492 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1496 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1497 "mov %0, %%"REG_a" \n\t"
1499 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1500 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1501 "pand %%mm2, %%mm0 \n\t"
1502 "pand %%mm2, %%mm1 \n\t"
1503 "packuswb %%mm1, %%mm0 \n\t"
1504 "movq %%mm0, (%2, %%"REG_a") \n\t"
1505 "add $8, %%"REG_a" \n\t"
1507 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1512 for(i=0; i<width; i++)
/*
 * Average the chroma of two adjacent YUY2 lines (src1, src2) into planar
 * dstU/dstV.  In YUY2, U sits at byte offset 4*i+1 and V at 4*i+3 (see the
 * C fallback).  The MMX2/3DNow path shifts the chroma bytes down, averages
 * implicitly via the shift/mask/pack sequence and stores 4 U and 4 V bytes
 * per iteration.
 */
1517 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1519 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1521 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1522 "mov %0, %%"REG_a" \n\t"
1524 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1525 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1526 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1527 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1530 "psrlw $8, %%mm0 \n\t"
1531 "psrlw $8, %%mm1 \n\t"
1532 "packuswb %%mm1, %%mm0 \n\t"
1533 "movq %%mm0, %%mm1 \n\t"
1534 "psrlw $8, %%mm0 \n\t"
1535 "pand %%mm4, %%mm1 \n\t"
1536 "packuswb %%mm0, %%mm0 \n\t"
1537 "packuswb %%mm1, %%mm1 \n\t"
1538 "movd %%mm0, (%4, %%"REG_a") \n\t"
1539 "movd %%mm1, (%3, %%"REG_a") \n\t"
1540 "add $4, %%"REG_a" \n\t"
1542 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1547 for(i=0; i<width; i++)
1549 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1550 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1555 //this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
/*
 * Extract the luma plane from one UYVY (U Y V Y) line into dst; luma sits
 * in the odd bytes, hence the psrlw $8 (instead of yuy2ToY's mask) before
 * packing.  Same negative-index loop convention as yuy2ToY; C fallback
 * loops over `width` pixels.
 */
1556 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
1560 "mov %0, %%"REG_a" \n\t"
1562 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1563 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1564 "psrlw $8, %%mm0 \n\t"
1565 "psrlw $8, %%mm1 \n\t"
1566 "packuswb %%mm1, %%mm0 \n\t"
1567 "movq %%mm0, (%2, %%"REG_a") \n\t"
1568 "add $8, %%"REG_a" \n\t"
1570 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1575 for(i=0; i<width; i++)
/*
 * Average the chroma of two adjacent UYVY lines into planar dstU/dstV.
 * In UYVY, U sits at byte offset 4*i+0 and V at 4*i+2 (see the C fallback),
 * so the MMX2/3DNow path masks with bm01010101 where yuy2ToUV shifted.
 */
1580 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1582 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1584 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1585 "mov %0, %%"REG_a" \n\t"
1587 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1588 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1589 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1590 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1593 "pand %%mm4, %%mm0 \n\t"
1594 "pand %%mm4, %%mm1 \n\t"
1595 "packuswb %%mm1, %%mm0 \n\t"
1596 "movq %%mm0, %%mm1 \n\t"
1597 "psrlw $8, %%mm0 \n\t"
1598 "pand %%mm4, %%mm1 \n\t"
1599 "packuswb %%mm0, %%mm0 \n\t"
1600 "packuswb %%mm1, %%mm1 \n\t"
1601 "movd %%mm0, (%4, %%"REG_a") \n\t"
1602 "movd %%mm1, (%3, %%"REG_a") \n\t"
1603 "add $4, %%"REG_a" \n\t"
1605 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1610 for(i=0; i<width; i++)
1612 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1613 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
/*
 * Convert one BGR32 line to luma: each 32-bit pixel has B in the low byte,
 * then G, then R; dst[i] is the RY/GY/BY-weighted sum plus a rounding/offset
 * bias, shifted down by RGB2YUV_SHIFT.
 * (HAVE_MMXFIXME guards a not-yet-written MMX path -- presumably never
 * defined, so the C loop runs; TODO confirm.)
 */
1618 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1620 #ifdef HAVE_MMXFIXME
1623 for(i=0; i<width; i++)
1625 int b= ((uint32_t*)src)[i]&0xFF;
1626 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1627 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1629 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * Compute chroma from 2x2 blocks of BGR32 pixels (two horizontally adjacent
 * pixels from each of src1/src2).  l accumulates the four pixels' B and R
 * channels in parallel within one int (0xFF00FF lanes), h the G channel
 * (0x00FF00 lane); b is the summed blue component masked out of l.
 * The >>(RGB2YUV_SHIFT+2) accounts for the sum of four pixels.
 */
1634 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1636 #ifdef HAVE_MMXFIXME
1639 for(i=0; i<width; i++)
1641 const int a= ((uint32_t*)src1)[2*i+0];
1642 const int e= ((uint32_t*)src1)[2*i+1];
1643 const int c= ((uint32_t*)src2)[2*i+0];
1644 const int d= ((uint32_t*)src2)[2*i+1];
1645 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1646 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1647 const int b= l&0x3FF;
1651 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1652 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * Convert one BGR24 line to luma.  MMX path: REG_b walks byte offsets at
 * 3 bytes/pixel (lea reg, [reg+reg*2] = *3), loads 8 pixels per iteration
 * in two groups of four, multiplies against the bgr2YCoeff vector with
 * pmaddwd, keeps an extra >>8 of precision unless FAST_BGR2YV12 is defined,
 * then packs and adds bgr2YOffset before storing 8 luma bytes.
 * The C fallback mirrors the computation with the RY/GY/BY integer weights.
 */
1657 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1661 "mov %2, %%"REG_a" \n\t"
1662 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1663 "movq "MANGLE(w1111)", %%mm5 \n\t"
1664 "pxor %%mm7, %%mm7 \n\t"
1665 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1668 PREFETCH" 64(%0, %%"REG_b") \n\t"
/* first group: pixels 0-3 (byte offsets 0,3,6,9) */
1669 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1670 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
1671 "punpcklbw %%mm7, %%mm0 \n\t"
1672 "punpcklbw %%mm7, %%mm1 \n\t"
1673 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
1674 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
1675 "punpcklbw %%mm7, %%mm2 \n\t"
1676 "punpcklbw %%mm7, %%mm3 \n\t"
1677 "pmaddwd %%mm6, %%mm0 \n\t"
1678 "pmaddwd %%mm6, %%mm1 \n\t"
1679 "pmaddwd %%mm6, %%mm2 \n\t"
1680 "pmaddwd %%mm6, %%mm3 \n\t"
1681 #ifndef FAST_BGR2YV12
1682 "psrad $8, %%mm0 \n\t"
1683 "psrad $8, %%mm1 \n\t"
1684 "psrad $8, %%mm2 \n\t"
1685 "psrad $8, %%mm3 \n\t"
1687 "packssdw %%mm1, %%mm0 \n\t"
1688 "packssdw %%mm3, %%mm2 \n\t"
1689 "pmaddwd %%mm5, %%mm0 \n\t"
1690 "pmaddwd %%mm5, %%mm2 \n\t"
1691 "packssdw %%mm2, %%mm0 \n\t"
1692 "psraw $7, %%mm0 \n\t"
/* second group: pixels 4-7 (byte offsets 12,15,18,21) */
1694 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1695 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
1696 "punpcklbw %%mm7, %%mm4 \n\t"
1697 "punpcklbw %%mm7, %%mm1 \n\t"
1698 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
1699 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
1700 "punpcklbw %%mm7, %%mm2 \n\t"
1701 "punpcklbw %%mm7, %%mm3 \n\t"
1702 "pmaddwd %%mm6, %%mm4 \n\t"
1703 "pmaddwd %%mm6, %%mm1 \n\t"
1704 "pmaddwd %%mm6, %%mm2 \n\t"
1705 "pmaddwd %%mm6, %%mm3 \n\t"
1706 #ifndef FAST_BGR2YV12
1707 "psrad $8, %%mm4 \n\t"
1708 "psrad $8, %%mm1 \n\t"
1709 "psrad $8, %%mm2 \n\t"
1710 "psrad $8, %%mm3 \n\t"
1712 "packssdw %%mm1, %%mm4 \n\t"
1713 "packssdw %%mm3, %%mm2 \n\t"
1714 "pmaddwd %%mm5, %%mm4 \n\t"
1715 "pmaddwd %%mm5, %%mm2 \n\t"
1716 "add $24, %%"REG_b" \n\t"
1717 "packssdw %%mm2, %%mm4 \n\t"
1718 "psraw $7, %%mm4 \n\t"
1720 "packuswb %%mm4, %%mm0 \n\t"
1721 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1723 "movq %%mm0, (%1, %%"REG_a") \n\t"
1724 "add $8, %%"REG_a" \n\t"
1726 : : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
1727 : "%"REG_a, "%"REG_b
1731 for(i=0; i<width; i++)
1737 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * Compute chroma from 2x2 blocks of BGR24 pixels (two lines src1/src2).
 * MMX path: REG_b walks byte offsets at 6 bytes per output sample
 * (lea *3 then doubled), sums horizontally adjacent pixels from both lines
 * (>>2 after summing four pixels), multiplies against bgr2UCoeff (mm6) and
 * bgr2VCoeff, keeps extra precision unless FAST_BGR2YV12, then interleaves,
 * packs, adds bgr2UVOffset and stores 4 U and 4 V bytes per iteration.
 * The C fallback mirrors this with the RU/GU/BU and RV/GV/BV weights.
 */
1742 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1746 "mov %4, %%"REG_a" \n\t"
1747 "movq "MANGLE(w1111)", %%mm5 \n\t"
1748 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1749 "pxor %%mm7, %%mm7 \n\t"
1750 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
1751 "add %%"REG_b", %%"REG_b" \n\t"
1754 PREFETCH" 64(%0, %%"REG_b") \n\t"
1755 PREFETCH" 64(%1, %%"REG_b") \n\t"
1756 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1757 "movq (%0, %%"REG_b"), %%mm0 \n\t"
1758 "movq (%1, %%"REG_b"), %%mm1 \n\t"
1759 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
1760 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
1763 "movq %%mm0, %%mm1 \n\t"
1764 "movq %%mm2, %%mm3 \n\t"
1765 "psrlq $24, %%mm0 \n\t"
1766 "psrlq $24, %%mm2 \n\t"
1769 "punpcklbw %%mm7, %%mm0 \n\t"
1770 "punpcklbw %%mm7, %%mm2 \n\t"
/* plain-MMX variant: gather the four pixels with movd loads */
1772 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1773 "movd (%1, %%"REG_b"), %%mm1 \n\t"
1774 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
1775 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
1776 "punpcklbw %%mm7, %%mm0 \n\t"
1777 "punpcklbw %%mm7, %%mm1 \n\t"
1778 "punpcklbw %%mm7, %%mm2 \n\t"
1779 "punpcklbw %%mm7, %%mm3 \n\t"
1780 "paddw %%mm1, %%mm0 \n\t"
1781 "paddw %%mm3, %%mm2 \n\t"
1782 "paddw %%mm2, %%mm0 \n\t"
1783 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
1784 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
1785 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
1786 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
1787 "punpcklbw %%mm7, %%mm4 \n\t"
1788 "punpcklbw %%mm7, %%mm1 \n\t"
1789 "punpcklbw %%mm7, %%mm2 \n\t"
1790 "punpcklbw %%mm7, %%mm3 \n\t"
1791 "paddw %%mm1, %%mm4 \n\t"
1792 "paddw %%mm3, %%mm2 \n\t"
1793 "paddw %%mm4, %%mm2 \n\t"
/* four pixels summed -> divide by 4 before applying the coefficients */
1794 "psrlw $2, %%mm0 \n\t"
1795 "psrlw $2, %%mm2 \n\t"
1797 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1798 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1800 "pmaddwd %%mm0, %%mm1 \n\t"
1801 "pmaddwd %%mm2, %%mm3 \n\t"
1802 "pmaddwd %%mm6, %%mm0 \n\t"
1803 "pmaddwd %%mm6, %%mm2 \n\t"
1804 #ifndef FAST_BGR2YV12
1805 "psrad $8, %%mm0 \n\t"
1806 "psrad $8, %%mm1 \n\t"
1807 "psrad $8, %%mm2 \n\t"
1808 "psrad $8, %%mm3 \n\t"
1810 "packssdw %%mm2, %%mm0 \n\t"
1811 "packssdw %%mm3, %%mm1 \n\t"
1812 "pmaddwd %%mm5, %%mm0 \n\t"
1813 "pmaddwd %%mm5, %%mm1 \n\t"
1814 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1815 "psraw $7, %%mm0 \n\t"
/* second pair of output samples (byte offsets 12..23) */
1817 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1818 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
1819 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
1820 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
1821 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
1824 "movq %%mm4, %%mm1 \n\t"
1825 "movq %%mm2, %%mm3 \n\t"
1826 "psrlq $24, %%mm4 \n\t"
1827 "psrlq $24, %%mm2 \n\t"
1830 "punpcklbw %%mm7, %%mm4 \n\t"
1831 "punpcklbw %%mm7, %%mm2 \n\t"
1833 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1834 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
1835 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
1836 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
1837 "punpcklbw %%mm7, %%mm4 \n\t"
1838 "punpcklbw %%mm7, %%mm1 \n\t"
1839 "punpcklbw %%mm7, %%mm2 \n\t"
1840 "punpcklbw %%mm7, %%mm3 \n\t"
1841 "paddw %%mm1, %%mm4 \n\t"
1842 "paddw %%mm3, %%mm2 \n\t"
1843 "paddw %%mm2, %%mm4 \n\t"
1844 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
1845 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
1846 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
1847 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
1848 "punpcklbw %%mm7, %%mm5 \n\t"
1849 "punpcklbw %%mm7, %%mm1 \n\t"
1850 "punpcklbw %%mm7, %%mm2 \n\t"
1851 "punpcklbw %%mm7, %%mm3 \n\t"
1852 "paddw %%mm1, %%mm5 \n\t"
1853 "paddw %%mm3, %%mm2 \n\t"
1854 "paddw %%mm5, %%mm2 \n\t"
/* mm5 was clobbered above -- reload the w1111 constant */
1855 "movq "MANGLE(w1111)", %%mm5 \n\t"
1856 "psrlw $2, %%mm4 \n\t"
1857 "psrlw $2, %%mm2 \n\t"
1859 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1860 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1862 "pmaddwd %%mm4, %%mm1 \n\t"
1863 "pmaddwd %%mm2, %%mm3 \n\t"
1864 "pmaddwd %%mm6, %%mm4 \n\t"
1865 "pmaddwd %%mm6, %%mm2 \n\t"
1866 #ifndef FAST_BGR2YV12
1867 "psrad $8, %%mm4 \n\t"
1868 "psrad $8, %%mm1 \n\t"
1869 "psrad $8, %%mm2 \n\t"
1870 "psrad $8, %%mm3 \n\t"
1872 "packssdw %%mm2, %%mm4 \n\t"
1873 "packssdw %%mm3, %%mm1 \n\t"
1874 "pmaddwd %%mm5, %%mm4 \n\t"
1875 "pmaddwd %%mm5, %%mm1 \n\t"
1876 "add $24, %%"REG_b" \n\t"
1877 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1878 "psraw $7, %%mm4 \n\t"
1880 "movq %%mm0, %%mm1 \n\t"
1881 "punpckldq %%mm4, %%mm0 \n\t"
1882 "punpckhdq %%mm4, %%mm1 \n\t"
1883 "packsswb %%mm1, %%mm0 \n\t"
1884 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1886 "movd %%mm0, (%2, %%"REG_a") \n\t"
1887 "punpckhdq %%mm0, %%mm0 \n\t"
1888 "movd %%mm0, (%3, %%"REG_a") \n\t"
1889 "add $4, %%"REG_a" \n\t"
1891 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
1892 : "%"REG_a, "%"REG_b
1896 for(i=0; i<width; i++)
1898 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1899 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1900 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1902 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1903 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * Convert one line of packed 16-bit pixels to luma; r is the top 5 bits
 * (d>>11), i.e. 5-6-5 layout.  NOTE(review): the 2* weights on R/B and the
 * RGB2YUV_SHIFT-2 presumably rescale the 5/6-bit components to a common
 * range -- confirm against the bgr15 variant below.
 */
1908 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1911 for(i=0; i<width; i++)
1913 int d= ((uint16_t*)src)[i];
1916 int r= (d>>11)&0x1F;
1918 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/*
 * Compute chroma from 2x2 blocks of 5-6-5 pixels: two pixels per 32-bit
 * load from each line, summed in parallel lanes (0x07E0F81F masks keep
 * the R/G/B fields from carrying into each other), then recombined (dh2)
 * and weighted with RU/GU/BU and RV/GV/BV.
 */
1922 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1925 for(i=0; i<width; i++)
1927 int d0= ((uint32_t*)src1)[i];
1928 int d1= ((uint32_t*)src2)[i];
1930 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1931 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1933 int dh2= (dh>>11) + (dh<<21);
1937 int r= (d>>11)&0x7F;
1939 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1940 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
/*
 * Convert one line of packed 15-bit (5-5-5) pixels to luma; r is bits
 * 10..14 (d>>10), and the RGB2YUV_SHIFT-3 rescales the 5-bit components.
 */
1944 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1947 for(i=0; i<width; i++)
1949 int d= ((uint16_t*)src)[i];
1952 int r= (d>>10)&0x1F;
1954 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/*
 * Compute chroma from 2x2 blocks of 5-5-5 pixels; same parallel-lane
 * summing scheme as bgr16ToUV but with the 15-bit field masks
 * (0x03E07C1F / 0x03E0F81F).
 */
1958 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1961 for(i=0; i<width; i++)
1963 int d0= ((uint32_t*)src1)[i];
1964 int d1= ((uint32_t*)src2)[i];
1966 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1967 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1969 int dh2= (dh>>11) + (dh<<21);
1973 int r= (d>>10)&0x7F;
1975 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1976 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
/*
 * Same as bgr32ToY but for RGB32 byte order: R in the low byte of each
 * 32-bit pixel, then G, then B.
 */
1981 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1984 for(i=0; i<width; i++)
1986 int r= ((uint32_t*)src)[i]&0xFF;
1987 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1988 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1990 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * Same as bgr32ToUV but for RGB32 byte order: the low-byte lane of l now
 * holds the summed red component (r = l&0x3FF).
 */
1994 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1997 for(i=0; i<width; i++)
1999 const int a= ((uint32_t*)src1)[2*i+0];
2000 const int e= ((uint32_t*)src1)[2*i+1];
2001 const int c= ((uint32_t*)src2)[2*i+0];
2002 const int d= ((uint32_t*)src2)[2*i+1];
2003 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
2004 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
2005 const int r= l&0x3FF;
2009 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2010 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * Convert one RGB24 line to luma (C only); RY/GY/BY-weighted sum with the
 * same rounding bias as the other *ToY variants.
 */
2014 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2017 for(i=0; i<width; i++)
2023 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * Compute chroma from 2x2 blocks of RGB24 pixels: like bgr24ToUV's C
 * fallback but with R first in memory (offset 6*i+0) and B last (6*i+2).
 */
2027 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2030 for(i=0; i<width; i++)
2032 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2033 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2034 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2036 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2037 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2042 // Bilinear / Bicubic scaling
/*
 * Horizontal scaling of one 8-bit line into 15-bit dst:
 *   dst[i] = clip( (sum_j src[filterPos[i]+j] * filter[i*filterSize+j]) >> 7,
 *                  0, 2^15-1 )          (see the C fallback at the bottom).
 * filterSize must be a positive multiple of 4 (asserted).  MMX
 * specializations for filterSize 4 and 8 compute two output samples per
 * iteration, borrowing REG_BP as an extra counter (pushed/popped around the
 * loop); a generic MMX loop handles other sizes, hScale_altivec_real covers
 * AltiVec, and a plain C loop is the portable fallback.
 */
2043 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2044 int16_t *filter, int16_t *filterPos, int filterSize)
2047 assert(filterSize % 4 == 0 && filterSize>0);
2048 if(filterSize==4) // always true for upscaling, sometimes for down too
2050 long counter= -2*dstW;
2052 filterPos-= counter/2;
2055 "pxor %%mm7, %%mm7 \n\t"
2056 "movq "MANGLE(w02)", %%mm6 \n\t"
2057 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2058 "mov %%"REG_a", %%"REG_BP" \n\t"
2061 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2062 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2063 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2064 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2065 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2066 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2067 "punpcklbw %%mm7, %%mm0 \n\t"
2068 "punpcklbw %%mm7, %%mm2 \n\t"
2069 "pmaddwd %%mm1, %%mm0 \n\t"
2070 "pmaddwd %%mm2, %%mm3 \n\t"
2071 "psrad $8, %%mm0 \n\t"
2072 "psrad $8, %%mm3 \n\t"
2073 "packssdw %%mm3, %%mm0 \n\t"
2074 "pmaddwd %%mm6, %%mm0 \n\t"
2075 "packssdw %%mm0, %%mm0 \n\t"
2076 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2077 "add $4, %%"REG_BP" \n\t"
2080 "pop %%"REG_BP" \n\t"
2082 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2086 else if(filterSize==8)
2088 long counter= -2*dstW;
2090 filterPos-= counter/2;
2093 "pxor %%mm7, %%mm7 \n\t"
2094 "movq "MANGLE(w02)", %%mm6 \n\t"
2095 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2096 "mov %%"REG_a", %%"REG_BP" \n\t"
2099 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2100 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2101 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2102 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2103 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2104 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2105 "punpcklbw %%mm7, %%mm0 \n\t"
2106 "punpcklbw %%mm7, %%mm2 \n\t"
2107 "pmaddwd %%mm1, %%mm0 \n\t"
2108 "pmaddwd %%mm2, %%mm3 \n\t"
/* second set of 4 taps for each of the two output samples */
2110 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2111 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2112 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2113 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2114 "punpcklbw %%mm7, %%mm4 \n\t"
2115 "punpcklbw %%mm7, %%mm2 \n\t"
2116 "pmaddwd %%mm1, %%mm4 \n\t"
2117 "pmaddwd %%mm2, %%mm5 \n\t"
2118 "paddd %%mm4, %%mm0 \n\t"
2119 "paddd %%mm5, %%mm3 \n\t"
2121 "psrad $8, %%mm0 \n\t"
2122 "psrad $8, %%mm3 \n\t"
2123 "packssdw %%mm3, %%mm0 \n\t"
2124 "pmaddwd %%mm6, %%mm0 \n\t"
2125 "packssdw %%mm0, %%mm0 \n\t"
2126 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2127 "add $4, %%"REG_BP" \n\t"
2130 "pop %%"REG_BP" \n\t"
2132 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
/* generic filterSize: inner loop accumulates 4 taps at a time into mm4/mm5 */
2138 long counter= -2*dstW;
2139 // filter-= counter*filterSize/2;
2140 filterPos-= counter/2;
2143 "pxor %%mm7, %%mm7 \n\t"
2144 "movq "MANGLE(w02)", %%mm6 \n\t"
2147 "mov %2, %%"REG_c" \n\t"
2148 "movzwl (%%"REG_c", %0), %%eax \n\t"
2149 "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2150 "mov %5, %%"REG_c" \n\t"
2151 "pxor %%mm4, %%mm4 \n\t"
2152 "pxor %%mm5, %%mm5 \n\t"
2154 "movq (%1), %%mm1 \n\t"
2155 "movq (%1, %6), %%mm3 \n\t"
2156 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2157 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2158 "punpcklbw %%mm7, %%mm0 \n\t"
2159 "punpcklbw %%mm7, %%mm2 \n\t"
2160 "pmaddwd %%mm1, %%mm0 \n\t"
2161 "pmaddwd %%mm2, %%mm3 \n\t"
2162 "paddd %%mm3, %%mm5 \n\t"
2163 "paddd %%mm0, %%mm4 \n\t"
2165 "add $4, %%"REG_c" \n\t"
2166 "cmp %4, %%"REG_c" \n\t"
2169 "psrad $8, %%mm4 \n\t"
2170 "psrad $8, %%mm5 \n\t"
2171 "packssdw %%mm5, %%mm4 \n\t"
2172 "pmaddwd %%mm6, %%mm4 \n\t"
2173 "packssdw %%mm4, %%mm4 \n\t"
2174 "mov %3, %%"REG_a" \n\t"
2175 "movd %%mm4, (%%"REG_a", %0) \n\t"
2179 : "+r" (counter), "+r" (filter)
2180 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2181 "m" (src), "r" ((long)filterSize*2)
2182 : "%"REG_b, "%"REG_a, "%"REG_c
2187 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2190 for(i=0; i<dstW; i++)
2193 int srcPos= filterPos[i];
2195 // printf("filterPos: %d\n", filterPos[i]);
2196 for(j=0; j<filterSize; j++)
2198 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2199 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2201 // filter += hFilterSize;
2202 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2208 // *** horizontal scale Y line to temp buffer
/*
 * RENAME(hyscale): horizontally scale one luminance (Y) line from srcW
 * source pixels down/up to dstWidth 16-bit samples in dst. Output samples
 * are in 7-bit fixed point (pixel value * 128).
 *
 * Non-planar sources (YUY2/UYVY and the RGB/BGR variants) are first
 * converted to a plain Y line by the matching RENAME(*ToY) helper; the
 * converted line lands in formatConvBuffer and src is repointed at it.
 *
 * Path selection (the #if/#else preprocessor lines choosing between them
 * are elided from this view):
 *  - exact-filter path: RENAME(hScale) driven by hLumFilter/hLumFilterPos
 *  - SWS_FAST_BILINEAR + MMX2: jump into the runtime-generated scaler at
 *    funnyYCode, driven by mmx2Filter/mmx2FilterPos
 *  - SWS_FAST_BILINEAR on plain x86: hand-written bilinear asm loop
 *  - otherwise: portable C bilinear fallback
 */
2209 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2210 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2211 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2212 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2213 int32_t *mmx2FilterPos)
/* Input-format dispatch: convert to a plain 8-bit Y line if needed. */
2215 if(srcFormat==IMGFMT_YUY2)
2217 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2218 src= formatConvBuffer;
2220 else if(srcFormat==IMGFMT_UYVY)
2222 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2223 src= formatConvBuffer;
2225 else if(srcFormat==IMGFMT_BGR32)
2227 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2228 src= formatConvBuffer;
2230 else if(srcFormat==IMGFMT_BGR24)
2232 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2233 src= formatConvBuffer;
2235 else if(srcFormat==IMGFMT_BGR16)
2237 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2238 src= formatConvBuffer;
2240 else if(srcFormat==IMGFMT_BGR15)
2242 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2243 src= formatConvBuffer;
2245 else if(srcFormat==IMGFMT_RGB32)
2247 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2248 src= formatConvBuffer;
2250 else if(srcFormat==IMGFMT_RGB24)
2252 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2253 src= formatConvBuffer;
2257 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
2258 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2260 if(!(flags&SWS_FAST_BILINEAR))
2263 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2265 else // Fast Bilinear upscale / crap downscale
2267 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/* MMX2 path: load src/dst/filter/filter-pos into fixed registers, prefetch
   the first source cache lines, then fall into the generated funnyYCode
   (the asm statement head and the call itself are elided from this view). */
2273 "pxor %%mm7, %%mm7 \n\t"
2274 "mov %0, %%"REG_c" \n\t"
2275 "mov %1, %%"REG_D" \n\t"
2276 "mov %2, %%"REG_d" \n\t"
2277 "mov %3, %%"REG_b" \n\t"
2278 "xor %%"REG_a", %%"REG_a" \n\t" // i
2279 PREFETCH" (%%"REG_c") \n\t"
2280 PREFETCH" 32(%%"REG_c") \n\t"
2281 PREFETCH" 64(%%"REG_c") \n\t"
/* Two FUNNY_Y_CODE variants follow — presumably one per target
   (x86_64 vs ia32), selected by elided preprocessor conditionals.
   Each advances the src/dst pointers by the per-chunk increments stored
   at REG_b and resets the inner counter in REG_a. */
2285 #define FUNNY_Y_CODE \
2286 "movl (%%"REG_b"), %%esi \n\t"\
2288 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2289 "add %%"REG_S", %%"REG_c" \n\t"\
2290 "add %%"REG_a", %%"REG_D" \n\t"\
2291 "xor %%"REG_a", %%"REG_a" \n\t"\
2295 #define FUNNY_Y_CODE \
2296 "movl (%%"REG_b"), %%esi \n\t"\
2298 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2299 "add %%"REG_a", %%"REG_D" \n\t"\
2300 "xor %%"REG_a", %%"REG_a" \n\t"\
2313 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2315 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2317 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; // right edge: replicate last source pixel (<<7 scale) where scaling would read past srcW-1
2322 //NO MMX just normal asm ...
/* Plain x86 bilinear DDA, unrolled x2: ecx holds the 16-bit fractional
   position (2*xalpha), REG_b the integer source index xx; each output is
   (src[xx]<<16 + (src[xx+1]-src[xx])*frac) >> 9, i.e. 7-bit fixed point. */
2324 "xor %%"REG_a", %%"REG_a" \n\t" // i
2325 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2326 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2329 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2330 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2331 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2332 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2333 "shll $16, %%edi \n\t"
2334 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2335 "mov %1, %%"REG_D" \n\t"
2336 "shrl $9, %%esi \n\t"
2337 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2338 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2339 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2341 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2342 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2343 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2344 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2345 "shll $16, %%edi \n\t"
2346 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2347 "mov %1, %%"REG_D" \n\t"
2348 "shrl $9, %%esi \n\t"
2349 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2350 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2351 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2354 "add $2, %%"REG_a" \n\t"
2355 "cmp %2, %%"REG_a" \n\t"
2359 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2360 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2363 } //if MMX2 can't be used
/* Portable C fallback: 16.16 fixed-point DDA; xpos>>16 is the source
   index, the top 7 bits of the fraction are the blend weight. */
2367 unsigned int xpos=0;
2368 for(i=0;i<dstWidth;i++)
2370 register unsigned int xx=xpos>>16;
2371 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2372 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
/*
 * RENAME(hcscale): horizontally scale one chrominance line pair (U in
 * src1, V in src2) from srcW source pixels to dstWidth 16-bit samples.
 * U output goes to dst[0..], V output to dst[2048..] (2048 uint16_t
 * elements = 4096 bytes apart). Samples are 7-bit fixed point (value*128).
 *
 * Non-planar sources are first split/converted by the matching
 * RENAME(*ToUV) helper into formatConvBuffer (U) and
 * formatConvBuffer+2048 (V). Path selection mirrors RENAME(hyscale):
 * exact filter via RENAME(hScale), MMX2 funnyUVCode, plain x86 bilinear
 * asm, or the portable C fallback (the selecting #if/#else lines are
 * elided from this view).
 */
2379 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2380 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2381 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2382 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2383 int32_t *mmx2FilterPos)
/* Input-format dispatch: produce two plain 8-bit U/V lines if needed. */
2385 if(srcFormat==IMGFMT_YUY2)
2387 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2388 src1= formatConvBuffer;
2389 src2= formatConvBuffer+2048;
2391 else if(srcFormat==IMGFMT_UYVY)
2393 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2394 src1= formatConvBuffer;
2395 src2= formatConvBuffer+2048;
2397 else if(srcFormat==IMGFMT_BGR32)
2399 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2400 src1= formatConvBuffer;
2401 src2= formatConvBuffer+2048;
2403 else if(srcFormat==IMGFMT_BGR24)
2405 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2406 src1= formatConvBuffer;
2407 src2= formatConvBuffer+2048;
2409 else if(srcFormat==IMGFMT_BGR16)
2411 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2412 src1= formatConvBuffer;
2413 src2= formatConvBuffer+2048;
2415 else if(srcFormat==IMGFMT_BGR15)
2417 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2418 src1= formatConvBuffer;
2419 src2= formatConvBuffer+2048;
2421 else if(srcFormat==IMGFMT_RGB32)
2423 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2424 src1= formatConvBuffer;
2425 src2= formatConvBuffer+2048;
2427 else if(srcFormat==IMGFMT_RGB24)
2429 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2430 src1= formatConvBuffer;
2431 src2= formatConvBuffer+2048;
2433 else if(isGray(srcFormat))
2439 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
2440 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2442 if(!(flags&SWS_FAST_BILINEAR))
/* Exact-filter path: scale U and V independently with the same filter. */
2445 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2446 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2448 else // Fast Bilinear upscale / crap downscale
2450 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/* MMX2 path, first pass (U plane): same register setup as in hyscale
   (asm statement head and funnyUVCode invocation elided from this view). */
2456 "pxor %%mm7, %%mm7 \n\t"
2457 "mov %0, %%"REG_c" \n\t"
2458 "mov %1, %%"REG_D" \n\t"
2459 "mov %2, %%"REG_d" \n\t"
2460 "mov %3, %%"REG_b" \n\t"
2461 "xor %%"REG_a", %%"REG_a" \n\t" // i
2462 PREFETCH" (%%"REG_c") \n\t"
2463 PREFETCH" 32(%%"REG_c") \n\t"
2464 PREFETCH" 64(%%"REG_c") \n\t"
/* Two FUNNY_UV_CODE variants — presumably one per target (x86_64 vs
   ia32), selected by elided preprocessor conditionals; same pointer
   bookkeeping as FUNNY_Y_CODE. */
2468 #define FUNNY_UV_CODE \
2469 "movl (%%"REG_b"), %%esi \n\t"\
2471 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2472 "add %%"REG_S", %%"REG_c" \n\t"\
2473 "add %%"REG_a", %%"REG_D" \n\t"\
2474 "xor %%"REG_a", %%"REG_a" \n\t"\
2478 #define FUNNY_UV_CODE \
2479 "movl (%%"REG_b"), %%esi \n\t"\
2481 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2482 "add %%"REG_a", %%"REG_D" \n\t"\
2483 "xor %%"REG_a", %%"REG_a" \n\t"\
/* Second pass (V plane): src2 comes in via operand %5; the destination is
   dst + 4096 bytes, i.e. the V half of the buffer. */
2491 "xor %%"REG_a", %%"REG_a" \n\t" // i
2492 "mov %5, %%"REG_c" \n\t" // src
2493 "mov %1, %%"REG_D" \n\t" // buf1
2494 "add $4096, %%"REG_D" \n\t"
2495 PREFETCH" (%%"REG_c") \n\t"
2496 PREFETCH" 32(%%"REG_c") \n\t"
2497 PREFETCH" 64(%%"REG_c") \n\t"
2504 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2505 "m" (funnyUVCode), "m" (src2)
2506 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2508 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) // right edge: replicate last source pixel on both planes
2510 // printf("%d %d %d\n", dstWidth, i, srcW);
2511 dst[i] = src1[srcW-1]*128;
2512 dst[i+2048] = src2[srcW-1]*128;
/* Plain x86 bilinear DDA, one U and one V sample per iteration; same
   fixed-point scheme as the luma version. */
2519 "xor %%"REG_a", %%"REG_a" \n\t" // i
2520 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2521 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2524 "mov %0, %%"REG_S" \n\t"
2525 "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
2526 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
2527 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2528 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2529 "shll $16, %%edi \n\t"
2530 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2531 "mov %1, %%"REG_D" \n\t"
2532 "shrl $9, %%esi \n\t"
2533 "movw %%si, (%%"REG_d", %%"REG_a", 2)\n\t" // NOTE(review): stores via REG_d, but the dst pointer was just loaded into REG_D above — looks inconsistent; confirm against upstream swscale_template.c
2535 "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2536 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2537 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2538 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2539 "shll $16, %%edi \n\t"
2540 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2541 "mov %1, %%"REG_D" \n\t"
2542 "shrl $9, %%esi \n\t"
2543 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t" // V plane lives 4096 bytes (2048 uint16_t) after U
2545 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2546 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2547 "add $1, %%"REG_a" \n\t"
2548 "cmp %2, %%"REG_a" \n\t"
2551 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" ((long)(xInc>>16)), "m" ((xInc&0xFFFF)),
2553 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2556 } //if MMX2 can't be used
/* Portable C fallback: 16.16 fixed-point DDA over both planes. */
2560 unsigned int xpos=0;
2561 for(i=0;i<dstWidth;i++)
2563 register unsigned int xx=xpos>>16;
2564 register unsigned int xalpha=(xpos&0xFFFF)>>9;
/* Two alternative blend formulations follow — presumably selected by an
   elided #ifdef; the first weights with (xalpha^127), the second uses the
   same (<<7)+delta*alpha form as the luma fallback. */
2565 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2566 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2568 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2569 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
/*
 * RENAME(swScale): per-slice scaling driver.
 *
 * For each destination line dstY it (a) horizontally scales the source
 * lines needed by the vertical filter into the lumPixBuf/chrPixBuf ring
 * buffers via RENAME(hyscale)/RENAME(hcscale), and (b) vertically
 * filters those buffered lines into the output picture with the yuv2*
 * output routines. If the current input slice does not contain enough
 * lines to finish the next output line, the remaining slice lines are
 * buffered and the loop breaks so a later slice can continue. Ring-buffer
 * indices and the last-buffered line numbers are written back into the
 * context at the end.
 *
 * Returns dstY - lastDstY: the number of output lines produced by this
 * call. (Several local declarations — e.g. i, dstY, lastDstY — and a
 * number of preprocessor/brace lines are elided from this view.)
 */
2577 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2578 int srcSliceH, uint8_t* dst[], int dstStride[]){
2580 /* load a few things into local vars to make the code more readable? and faster */
2581 const int srcW= c->srcW;
2582 const int dstW= c->dstW;
2583 const int dstH= c->dstH;
2584 const int chrDstW= c->chrDstW;
2585 const int chrSrcW= c->chrSrcW;
2586 const int lumXInc= c->lumXInc;
2587 const int chrXInc= c->chrXInc;
2588 const int dstFormat= c->dstFormat;
2589 const int srcFormat= c->srcFormat;
2590 const int flags= c->flags;
2591 const int canMMX2BeUsed= c->canMMX2BeUsed;
2592 int16_t *vLumFilterPos= c->vLumFilterPos;
2593 int16_t *vChrFilterPos= c->vChrFilterPos;
2594 int16_t *hLumFilterPos= c->hLumFilterPos;
2595 int16_t *hChrFilterPos= c->hChrFilterPos;
2596 int16_t *vLumFilter= c->vLumFilter;
2597 int16_t *vChrFilter= c->vChrFilter;
2598 int16_t *hLumFilter= c->hLumFilter;
2599 int16_t *hChrFilter= c->hChrFilter;
2600 int32_t *lumMmxFilter= c->lumMmxFilter;
2601 int32_t *chrMmxFilter= c->chrMmxFilter;
2602 const int vLumFilterSize= c->vLumFilterSize;
2603 const int vChrFilterSize= c->vChrFilterSize;
2604 const int hLumFilterSize= c->hLumFilterSize;
2605 const int hChrFilterSize= c->hChrFilterSize;
2606 int16_t **lumPixBuf= c->lumPixBuf;
2607 int16_t **chrPixBuf= c->chrPixBuf;
2608 const int vLumBufSize= c->vLumBufSize;
2609 const int vChrBufSize= c->vChrBufSize;
2610 uint8_t *funnyYCode= c->funnyYCode;
2611 uint8_t *funnyUVCode= c->funnyUVCode;
2612 uint8_t *formatConvBuffer= c->formatConvBuffer;
2613 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2614 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); // ceil(srcSliceH / 2^chrSrcVSubSample)
2617 /* vars which will change and which we need to store back in the context */
2619 int lumBufIndex= c->lumBufIndex;
2620 int chrBufIndex= c->chrBufIndex;
2621 int lastInLumBuf= c->lastInLumBuf;
2622 int lastInChrBuf= c->lastInChrBuf;
/* Packed input: fake three planes by reusing plane 0's stride (the plane
   pointer setup lines are elided from this view); vChrDrop skips chroma
   input lines by widening the chroma strides. */
2624 if(isPacked(c->srcFormat)){
2630 srcStride[2]= srcStride[0];
2632 srcStride[1]<<= c->vChrDrop;
2633 srcStride[2]<<= c->vChrDrop;
2635 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2636 // (int)dst[0], (int)dst[1], (int)dst[2]);
2638 #if 0 //self test FIXME move to a vfilter or something
2640 static volatile int i=0;
2642 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2643 selfTest(src, srcStride, c->srcW, c->srcH);
2648 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2649 //dstStride[0],dstStride[1],dstStride[2]);
/* Warn once if destination strides break 8-byte alignment (the MMX
   output stores want aligned destinations). */
2651 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2653 static int firstTime=1; //FIXME move this into the context perhaps
2654 if(flags & SWS_PRINT_INFO && firstTime)
2656 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2657 "SwScaler: ->cannot do aligned memory acesses anymore\n");
/* NOTE(review): the "acesses" typo above is in a user-visible string; it
   is left unchanged here because fixing it would alter program output. */
2662 /* Note the user might start scaling the picture in the middle so this will not get executed
2663 this is not really intended but works currently, so ppl might do it */
/* Main per-output-line loop (the dstY/lastDstY initialization preceding
   it is elided from this view). */
2674 for(;dstY < dstH; dstY++){
2675 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2676 const int chrDstY= dstY>>c->chrDstVSubSample;
2677 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2678 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2680 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2681 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2682 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2683 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2685 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2686 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2687 //handle holes (FAST_BILINEAR & weird filters)
2688 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2689 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2690 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2691 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2692 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2694 // Do we have enough lines in this slice to output the dstY line
2695 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2697 //Do horizontal scaling
2698 while(lastInLumBuf < lastLumSrcY)
2700 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2702 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2703 ASSERT(lumBufIndex < 2*vLumBufSize)
2704 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2705 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2706 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2707 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2708 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2709 funnyYCode, c->srcFormat, formatConvBuffer,
2710 c->lumMmx2Filter, c->lumMmx2FilterPos);
2713 while(lastInChrBuf < lastChrSrcY)
2715 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2716 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2718 ASSERT(chrBufIndex < 2*vChrBufSize)
2719 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2720 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2721 //FIXME replace parameters through context struct (some at least)
2723 if(!(isGray(srcFormat) || isGray(dstFormat)))
2724 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2725 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2726 funnyUVCode, c->srcFormat, formatConvBuffer,
2727 c->chrMmx2Filter, c->chrMmx2FilterPos);
2730 //wrap buf index around to stay inside the ring buffer
2731 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2732 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2734 else // not enough lines left in this slice -> load the rest in the buffer
2736 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2737 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2738 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2739 vChrBufSize, vLumBufSize);*/
2741 //Do horizontal scaling
2742 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2744 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2746 ASSERT(lumBufIndex < 2*vLumBufSize)
2747 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2748 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2749 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2750 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2751 funnyYCode, c->srcFormat, formatConvBuffer,
2752 c->lumMmx2Filter, c->lumMmx2FilterPos);
2755 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2757 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2758 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2760 ASSERT(chrBufIndex < 2*vChrBufSize)
2761 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2762 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2764 if(!(isGray(srcFormat) || isGray(dstFormat)))
2765 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2766 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2767 funnyUVCode, c->srcFormat, formatConvBuffer,
2768 c->chrMmx2Filter, c->chrMmx2FilterPos);
2771 //wrap buf index around to stay inside the ring buffer
2772 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2773 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2774 break; //we can't output a dstY line so let's try with the next slice
/* Per-line dither patterns — presumably for dithered 15/16-bit RGB
   output paths; confirm against the dither table definitions. */
2778 b5Dither= dither8[dstY&1];
2779 g6Dither= dither4[dstY&1];
2780 g5Dither= dither8[dstY&1];
2781 r5Dither= dither8[(dstY+1)&1];
/* Vertical scaling. lumSrcPtr/chrSrcPtr point at the ring-buffer slot of
   the first line the vertical filter reads for this output line. */
2785 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2786 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
/* Build the interleaved MMX filter tables: [ptr, ?, coeff, coeff] per tap
   (the coefficient is replicated into two 16-bit halves via *0x10001).
   NOTE(review): casting a pointer to int32_t truncates on 64-bit targets —
   presumably this branch is only compiled/taken on 32-bit; confirm. */
2789 for(i=0; i<vLumFilterSize; i++)
2791 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2792 lumMmxFilter[4*i+2]=
2793 lumMmxFilter[4*i+3]=
2794 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2796 for(i=0; i<vChrFilterSize; i++)
2798 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2799 chrMmxFilter[4*i+2]=
2800 chrMmxFilter[4*i+3]=
2801 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
/* Output dispatch by destination format: NV12/NV21, planar YUV/gray,
   then packed formats. */
2804 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2805 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2806 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2807 RENAME(yuv2nv12X)(c,
2808 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2809 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2810 dest, uDest, dstW, chrDstW, dstFormat);
2812 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2814 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2815 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2816 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2818 int16_t *lumBuf = lumPixBuf[0];
2819 int16_t *chrBuf= chrPixBuf[0];
2820 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
/* General planar path (the callee name for this call is elided from
   this view). */
2825 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2826 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2827 dest, uDest, vDest, dstW, chrDstW);
/* Packed output: pick the cheapest routine the filter sizes allow. */
2832 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2833 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2834 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2836 int chrAlpha= vChrFilter[2*dstY+1];
2837 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2838 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2840 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2842 int lumAlpha= vLumFilter[2*dstY+1];
2843 int chrAlpha= vChrFilter[2*dstY+1];
2844 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2845 dest, dstW, lumAlpha, chrAlpha, dstY);
/* NOTE(review): the chroma filter below is indexed with dstY rather than
   chrDstY — presumably packed output has no vertical chroma subsampling
   so they coincide; confirm. */
2849 RENAME(yuv2packedX)(c,
2850 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2851 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2856 else // hmm looks like we can't use MMX here without overwriting this array's tail
/* Non-MMX mirror of the output dispatch above (callee names for two of
   the calls are elided from this view). */
2858 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2859 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2860 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2861 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2862 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2864 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2865 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2866 dest, uDest, dstW, chrDstW, dstFormat);
2868 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2870 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2871 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2873 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2874 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2875 dest, uDest, vDest, dstW, chrDstW);
2879 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2880 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2882 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2883 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
/* Flush non-temporal stores and leave MMX state (SFENCE/EMMS map to nops
   or femms on targets without them — see the macros at the top of the
   file). */
2890 __asm __volatile(SFENCE:::"memory");
2891 __asm __volatile(EMMS:::"memory");
2893 /* store changed local vars back in the context */
2895 c->lumBufIndex= lumBufIndex;
2896 c->chrBufIndex= chrBufIndex;
2897 c->lastInLumBuf= lastInLumBuf;
2898 c->lastInChrBuf= lastInChrBuf;
2900 return dstY - lastDstY;