/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
 * CPU-feature instruction-selection macros.
 * PREFETCH/PREFETCHW pick a data-prefetch instruction per CPU family,
 * SFENCE is the MMX2 store fence, PAVGB a byte-average instruction
 * (pavgb on MMX2, pavgusb on 3DNow!), and MOVNTQ a non-temporal quadword
 * store on MMX2 falling back to a plain movq otherwise.
 * NOTE(review): the #ifdef HAVE_3DNOW / #else / #endif lines that
 * originally guarded these alternatives are missing from this extract,
 * which is why conflicting unconditional #defines appear back to back.
 */
28 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
35 #define PREFETCH "prefetch"
36 #define PREFETCHW "prefetchw"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
46 #define SFENCE "sfence"
52 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53 #elif defined (HAVE_3DNOW)
54 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
58 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
60 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
62 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
65 #include "swscale_altivec_template.c"
/*
 * Vertical multi-tap filter for one plane (YV12 output).
 * Walks the per-context filter list at "offset"(%0): for each tap it
 * multiplies 16-bit source samples by the filter coefficient (pmulhw)
 * and accumulates on top of the rounder constant (paddw), then shifts
 * the sums down by 3, packs to unsigned bytes and streams the result
 * to (%1) with MOVNTQ.  `x` is a byte offset into the source row.
 * NOTE(review): the loop labels ("1:"/"2:") and the conditional branch
 * instructions of the original loop are missing from this extract.
 */
68 #define YSCALEYUV2YV12X(x, offset) \
69 "xor %%"REG_a", %%"REG_a" \n\t"\
70 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71 "movq %%mm3, %%mm4 \n\t"\
72 "lea " offset "(%0), %%"REG_d" \n\t"\
73 "mov (%%"REG_d"), %%"REG_S" \n\t"\
74 ".balign 16 \n\t" /* FIXME Unroll? */\
76 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
77 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79 "add $16, %%"REG_d" \n\t"\
80 "mov (%%"REG_d"), %%"REG_S" \n\t"\
81 "test %%"REG_S", %%"REG_S" \n\t"\
82 "pmulhw %%mm0, %%mm2 \n\t"\
83 "pmulhw %%mm0, %%mm5 \n\t"\
84 "paddw %%mm2, %%mm3 \n\t"\
85 "paddw %%mm5, %%mm4 \n\t"\
87 "psraw $3, %%mm3 \n\t"\
88 "psraw $3, %%mm4 \n\t"\
89 "packuswb %%mm4, %%mm3 \n\t"\
90 MOVNTQ(%%mm3, (%1, %%REGa))\
91 "add $8, %%"REG_a" \n\t"\
92 "cmp %2, %%"REG_a" \n\t"\
93 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94 "movq %%mm3, %%mm4 \n\t"\
95 "lea " offset "(%0), %%"REG_d" \n\t"\
96 "mov (%%"REG_d"), %%"REG_S" \n\t"\
/*
 * Unfiltered (1:1) vertical path: reads 16-bit samples from (%0),
 * shifts them down by 7 to 8-bit range, packs and streams to (%1).
 * NOTE(review): the loop label and closing branch are missing from
 * this extract.
 */
99 #define YSCALEYUV2YV121 \
100 "mov %2, %%"REG_a" \n\t"\
101 ".balign 16 \n\t" /* FIXME Unroll? */\
103 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
105 "psraw $7, %%mm0 \n\t"\
106 "psraw $7, %%mm1 \n\t"\
107 "packuswb %%mm1, %%mm0 \n\t"\
108 MOVNTQ(%%mm0, (%1, %%REGa))\
109 "add $8, %%"REG_a" \n\t"\
113 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
114 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115 "r" (dest), "m" (dstW),
116 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/*
 * Vertical multi-tap filtering for packed output: the first loop
 * accumulates chroma (U at the source pointer, V at +4096 bytes) into
 * mm3/mm4 using the filter list at CHR_MMX_FILTER_OFFSET(%0); the
 * second loop accumulates two quads of luma into mm1/mm7 using the
 * list at LUM_MMX_FILTER_OFFSET(%0).  Results stay as 16-bit words for
 * the YSCALEYUV2RGBX / WRITE* macros that follow.
 * NOTE(review): loop labels and branch instructions are missing from
 * this extract.
 */
119 #define YSCALEYUV2PACKEDX \
120 "xor %%"REG_a", %%"REG_a" \n\t"\
124 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125 "mov (%%"REG_d"), %%"REG_S" \n\t"\
126 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127 "movq %%mm3, %%mm4 \n\t"\
130 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
131 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
132 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133 "add $16, %%"REG_d" \n\t"\
134 "mov (%%"REG_d"), %%"REG_S" \n\t"\
135 "pmulhw %%mm0, %%mm2 \n\t"\
136 "pmulhw %%mm0, %%mm5 \n\t"\
137 "paddw %%mm2, %%mm3 \n\t"\
138 "paddw %%mm5, %%mm4 \n\t"\
139 "test %%"REG_S", %%"REG_S" \n\t"\
142 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143 "mov (%%"REG_d"), %%"REG_S" \n\t"\
144 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145 "movq %%mm1, %%mm7 \n\t"\
148 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
149 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
150 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151 "add $16, %%"REG_d" \n\t"\
152 "mov (%%"REG_d"), %%"REG_S" \n\t"\
153 "pmulhw %%mm0, %%mm2 \n\t"\
154 "pmulhw %%mm0, %%mm5 \n\t"\
155 "paddw %%mm2, %%mm1 \n\t"\
156 "paddw %%mm5, %%mm7 \n\t"\
157 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * Converts the filtered word-sized Y (mm1/mm7) and U/V (mm3/mm4)
 * produced by YSCALEYUV2PACKEDX into packed bytes: subtracts the
 * per-context U/V/Y offsets, multiplies by the colour-matrix
 * coefficients (pmulhw), combines into B/G/R word lanes, and packs to
 * unsigned bytes.  On exit: mm2=B, mm4=G, mm5=R (8 pixels), mm7=0,
 * as expected by the WRITE* macros.
 */
161 #define YSCALEYUV2RGBX \
163 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
164 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
165 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
166 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
167 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
168 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
169 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
170 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
171 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
172 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
173 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
174 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
175 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
176 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177 "paddw %%mm3, %%mm4 \n\t"\
178 "movq %%mm2, %%mm0 \n\t"\
179 "movq %%mm5, %%mm6 \n\t"\
180 "movq %%mm4, %%mm3 \n\t"\
181 "punpcklwd %%mm2, %%mm2 \n\t"\
182 "punpcklwd %%mm5, %%mm5 \n\t"\
183 "punpcklwd %%mm4, %%mm4 \n\t"\
184 "paddw %%mm1, %%mm2 \n\t"\
185 "paddw %%mm1, %%mm5 \n\t"\
186 "paddw %%mm1, %%mm4 \n\t"\
187 "punpckhwd %%mm0, %%mm0 \n\t"\
188 "punpckhwd %%mm6, %%mm6 \n\t"\
189 "punpckhwd %%mm3, %%mm3 \n\t"\
190 "paddw %%mm7, %%mm0 \n\t"\
191 "paddw %%mm7, %%mm6 \n\t"\
192 "paddw %%mm7, %%mm3 \n\t"\
193 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194 "packuswb %%mm0, %%mm2 \n\t"\
195 "packuswb %%mm6, %%mm5 \n\t"\
196 "packuswb %%mm3, %%mm4 \n\t"\
197 "pxor %%mm7, %%mm7 \n\t"
/*
 * Full bilinear vertical interpolation + YUV->RGB for one quad:
 * blends buf0/buf1 by yalpha1 (%6) and uvbuf0/uvbuf1 by uvalpha1 (%7)
 * using the "a + (b-a)*alpha >> 16" pmulhw trick, then converts via
 * the MANGLE()d global coefficient constants.  Leaves packed B in
 * mm3, R in mm0 and G in mm1 (each duplicated by packuswb).
 * NOTE(review): loop label lines are missing from this extract.
 */
199 #define FULL_YSCALEYUV2RGB \
200 "pxor %%mm7, %%mm7 \n\t"\
201 "movd %6, %%mm6 \n\t" /*yalpha1*/\
202 "punpcklwd %%mm6, %%mm6 \n\t"\
203 "punpcklwd %%mm6, %%mm6 \n\t"\
204 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
205 "punpcklwd %%mm5, %%mm5 \n\t"\
206 "punpcklwd %%mm5, %%mm5 \n\t"\
207 "xor %%"REG_a", %%"REG_a" \n\t"\
210 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
213 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
214 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
215 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
219 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
220 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
222 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
223 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
225 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
226 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
227 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
230 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
232 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
233 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
234 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
235 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
236 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
239 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
240 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
242 "paddw %%mm1, %%mm3 \n\t" /* B*/\
243 "paddw %%mm1, %%mm0 \n\t" /* R*/\
244 "packuswb %%mm3, %%mm3 \n\t"\
246 "packuswb %%mm0, %%mm0 \n\t"\
247 "paddw %%mm4, %%mm2 \n\t"\
248 "paddw %%mm2, %%mm1 \n\t" /* G*/\
250 "packuswb %%mm1, %%mm1 \n\t"
/*
 * Two-source bilinear vertical blend for packed-YUV output (no RGB
 * conversion): pre-shifts the stored filter coefficients by 3, then
 * per iteration blends chroma (mm3=U, mm4=V) and two luma quads
 * (mm1, mm7), leaving them as words for WRITEYUY2.  `index` is the
 * loop counter register, `c` the context pointer.
 * NOTE(review): loop label lines are missing from this extract.
 */
253 #define REAL_YSCALEYUV2PACKED(index, c) \
254 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256 "psraw $3, %%mm0 \n\t"\
257 "psraw $3, %%mm1 \n\t"\
258 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
260 "xor "#index", "#index" \n\t"\
263 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
264 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
265 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
267 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
269 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
270 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
273 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
274 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
275 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
276 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
277 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
278 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
280 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
281 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
282 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
284 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
285 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
286 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
289 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/*
 * Two-source bilinear vertical blend followed by YUV->RGB conversion
 * using the per-context ("#c") offset/coefficient tables.  Same blend
 * scheme as REAL_YSCALEYUV2PACKED but keeps 4 extra fraction bits
 * (>>4 instead of >>7) for the colour-matrix multiply.  On exit:
 * mm2=B, mm4=G, mm5=R packed bytes, mm7=0.
 * NOTE(review): loop label lines are missing from this extract.
 */
291 #define REAL_YSCALEYUV2RGB(index, c) \
292 "xor "#index", "#index" \n\t"\
295 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
296 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
297 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
299 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
301 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
302 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
307 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
308 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
309 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
310 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
311 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
312 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
314 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
316 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
317 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
319 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
320 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
321 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
323 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
327 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
330 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
331 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
332 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
333 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334 "paddw %%mm3, %%mm4 \n\t"\
335 "movq %%mm2, %%mm0 \n\t"\
336 "movq %%mm5, %%mm6 \n\t"\
337 "movq %%mm4, %%mm3 \n\t"\
338 "punpcklwd %%mm2, %%mm2 \n\t"\
339 "punpcklwd %%mm5, %%mm5 \n\t"\
340 "punpcklwd %%mm4, %%mm4 \n\t"\
341 "paddw %%mm1, %%mm2 \n\t"\
342 "paddw %%mm1, %%mm5 \n\t"\
343 "paddw %%mm1, %%mm4 \n\t"\
344 "punpckhwd %%mm0, %%mm0 \n\t"\
345 "punpckhwd %%mm6, %%mm6 \n\t"\
346 "punpckhwd %%mm3, %%mm3 \n\t"\
347 "paddw %%mm7, %%mm0 \n\t"\
348 "paddw %%mm7, %%mm6 \n\t"\
349 "paddw %%mm7, %%mm3 \n\t"\
350 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351 "packuswb %%mm0, %%mm2 \n\t"\
352 "packuswb %%mm6, %%mm5 \n\t"\
353 "packuswb %%mm3, %%mm4 \n\t"\
354 "pxor %%mm7, %%mm7 \n\t"
355 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/*
 * Single-source (no vertical interpolation) packed-YUV path: just
 * shifts one luma/chroma buffer down by 7 into the registers
 * (mm1/mm7 = Y quads, mm3 = U, mm4 = V) expected by WRITEYUY2.
 * NOTE(review): loop label lines are missing from this extract.
 */
357 #define REAL_YSCALEYUV2PACKED1(index, c) \
358 "xor "#index", "#index" \n\t"\
361 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
362 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
363 "psraw $7, %%mm3 \n\t" \
364 "psraw $7, %%mm4 \n\t" \
365 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
366 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
367 "psraw $7, %%mm1 \n\t" \
368 "psraw $7, %%mm7 \n\t" \
370 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * Single-source YUV->RGB path: reads one luma and one chroma buffer
 * (>>4, keeping fraction bits), then performs the same per-context
 * colour-matrix conversion as REAL_YSCALEYUV2RGB.  On exit: mm2=B,
 * mm4=G, mm5=R packed bytes, mm7=0.
 * NOTE(review): loop label lines are missing from this extract.
 */
372 #define REAL_YSCALEYUV2RGB1(index, c) \
373 "xor "#index", "#index" \n\t"\
376 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
377 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
378 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
380 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
381 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
382 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
383 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
384 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
386 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
387 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
388 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
389 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
394 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
395 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
396 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
397 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398 "paddw %%mm3, %%mm4 \n\t"\
399 "movq %%mm2, %%mm0 \n\t"\
400 "movq %%mm5, %%mm6 \n\t"\
401 "movq %%mm4, %%mm3 \n\t"\
402 "punpcklwd %%mm2, %%mm2 \n\t"\
403 "punpcklwd %%mm5, %%mm5 \n\t"\
404 "punpcklwd %%mm4, %%mm4 \n\t"\
405 "paddw %%mm1, %%mm2 \n\t"\
406 "paddw %%mm1, %%mm5 \n\t"\
407 "paddw %%mm1, %%mm4 \n\t"\
408 "punpckhwd %%mm0, %%mm0 \n\t"\
409 "punpckhwd %%mm6, %%mm6 \n\t"\
410 "punpckhwd %%mm3, %%mm3 \n\t"\
411 "paddw %%mm7, %%mm0 \n\t"\
412 "paddw %%mm7, %%mm6 \n\t"\
413 "paddw %%mm7, %%mm3 \n\t"\
414 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415 "packuswb %%mm0, %%mm2 \n\t"\
416 "packuswb %%mm6, %%mm5 \n\t"\
417 "packuswb %%mm3, %%mm4 \n\t"\
418 "pxor %%mm7, %%mm7 \n\t"
419 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * Packed-YUV path averaging the two chroma buffers (paddw then >>8,
 * i.e. (u0+u1)/2 scaled to 8 bits) while luma comes from a single
 * buffer (>>7).  Used when only chroma needs vertical interpolation.
 * NOTE(review): loop label lines are missing from this extract.
 */
421 #define REAL_YSCALEYUV2PACKED1b(index, c) \
422 "xor "#index", "#index" \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431 "psrlw $8, %%mm3 \n\t" \
432 "psrlw $8, %%mm4 \n\t" \
433 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
434 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
435 "psraw $7, %%mm1 \n\t" \
436 "psraw $7, %%mm7 \n\t"
437 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
/*
 * RGB path with vertically-averaged chroma: sums the two chroma
 * buffers and shifts by 5 (keeping 4 fraction bits for the matrix
 * multiply; see the in-line FIXME about potential overflow), luma from
 * a single buffer (>>4), then the standard per-context colour-matrix
 * conversion.  On exit: mm2=B, mm4=G, mm5=R packed bytes, mm7=0.
 * NOTE(review): loop label lines are missing from this extract.
 */
439 // do vertical chrominance interpolation
440 #define REAL_YSCALEYUV2RGB1b(index, c) \
441 "xor "#index", "#index" \n\t"\
444 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
445 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
446 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
448 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
450 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
451 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
452 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
453 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
454 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
455 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
456 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
458 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
459 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
460 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
461 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
463 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
466 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
467 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
468 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
469 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470 "paddw %%mm3, %%mm4 \n\t"\
471 "movq %%mm2, %%mm0 \n\t"\
472 "movq %%mm5, %%mm6 \n\t"\
473 "movq %%mm4, %%mm3 \n\t"\
474 "punpcklwd %%mm2, %%mm2 \n\t"\
475 "punpcklwd %%mm5, %%mm5 \n\t"\
476 "punpcklwd %%mm4, %%mm4 \n\t"\
477 "paddw %%mm1, %%mm2 \n\t"\
478 "paddw %%mm1, %%mm5 \n\t"\
479 "paddw %%mm1, %%mm4 \n\t"\
480 "punpckhwd %%mm0, %%mm0 \n\t"\
481 "punpckhwd %%mm6, %%mm6 \n\t"\
482 "punpckhwd %%mm3, %%mm3 \n\t"\
483 "paddw %%mm7, %%mm0 \n\t"\
484 "paddw %%mm7, %%mm6 \n\t"\
485 "paddw %%mm7, %%mm3 \n\t"\
486 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487 "packuswb %%mm0, %%mm2 \n\t"\
488 "packuswb %%mm6, %%mm5 \n\t"\
489 "packuswb %%mm3, %%mm4 \n\t"\
490 "pxor %%mm7, %%mm7 \n\t"
491 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * Stores 8 pixels as 32-bit BGR0: interleaves the packed B/G/R byte
 * registers (mm2/mm4/mm5, mm7=0) into four 0RGB quadwords and streams
 * them to dst+index*4, then advances the loop counter and compares
 * against dstw.  NOTE(review): the trailing conditional branch after
 * the cmp is missing from this extract.
 */
493 #define REAL_WRITEBGR32(dst, dstw, index) \
494 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495 "movq %%mm2, %%mm1 \n\t" /* B */\
496 "movq %%mm5, %%mm6 \n\t" /* R */\
497 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
498 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
499 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
500 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
501 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
502 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
503 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
504 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
505 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
506 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
508 MOVNTQ(%%mm0, (dst, index, 4))\
509 MOVNTQ(%%mm2, 8(dst, index, 4))\
510 MOVNTQ(%%mm1, 16(dst, index, 4))\
511 MOVNTQ(%%mm3, 24(dst, index, 4))\
513 "add $8, "#index" \n\t"\
514 "cmp "#dstw", "#index" \n\t"\
516 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/*
 * Stores 8 pixels as RGB565: masks B/R to their top 5 bits and G to
 * its top 6 (bF8/bFC constants), shifts into position, interleaves and
 * ORs the fields together, then streams two quadwords to dst+index*2.
 * NOTE(review): the trailing conditional branch after the cmp is
 * missing from this extract.
 */
518 #define REAL_WRITEBGR16(dst, dstw, index) \
519 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
520 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
521 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
522 "psrlq $3, %%mm2 \n\t"\
524 "movq %%mm2, %%mm1 \n\t"\
525 "movq %%mm4, %%mm3 \n\t"\
527 "punpcklbw %%mm7, %%mm3 \n\t"\
528 "punpcklbw %%mm5, %%mm2 \n\t"\
529 "punpckhbw %%mm7, %%mm4 \n\t"\
530 "punpckhbw %%mm5, %%mm1 \n\t"\
532 "psllq $3, %%mm3 \n\t"\
533 "psllq $3, %%mm4 \n\t"\
535 "por %%mm3, %%mm2 \n\t"\
536 "por %%mm4, %%mm1 \n\t"\
538 MOVNTQ(%%mm2, (dst, index, 2))\
539 MOVNTQ(%%mm1, 8(dst, index, 2))\
541 "add $8, "#index" \n\t"\
542 "cmp "#dstw", "#index" \n\t"\
544 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
/*
 * Stores 8 pixels as RGB555: like WRITEBGR16 but all three channels
 * are masked to 5 bits (bF8) with R additionally shifted right by 1,
 * and G shifted by 2 when merged.  Streams two quadwords to
 * dst+index*2.  NOTE(review): the trailing conditional branch after
 * the cmp is missing from this extract.
 */
546 #define REAL_WRITEBGR15(dst, dstw, index) \
547 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
548 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
549 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
550 "psrlq $3, %%mm2 \n\t"\
551 "psrlq $1, %%mm5 \n\t"\
553 "movq %%mm2, %%mm1 \n\t"\
554 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklbw %%mm7, %%mm3 \n\t"\
557 "punpcklbw %%mm5, %%mm2 \n\t"\
558 "punpckhbw %%mm7, %%mm4 \n\t"\
559 "punpckhbw %%mm5, %%mm1 \n\t"\
561 "psllq $2, %%mm3 \n\t"\
562 "psllq $2, %%mm4 \n\t"\
564 "por %%mm3, %%mm2 \n\t"\
565 "por %%mm4, %%mm1 \n\t"\
567 MOVNTQ(%%mm2, (dst, index, 2))\
568 MOVNTQ(%%mm1, 8(dst, index, 2))\
570 "add $8, "#index" \n\t"\
571 "cmp "#dstw", "#index" \n\t"\
573 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
/*
 * Legacy 24bpp writer: first expands B/G/R bytes to four 0RGB
 * quadwords (same interleave as WRITEBGR32), then squeezes out the
 * zero bytes with shift/mask/or sequences (bm* bitmask constants) to
 * emit three contiguous 8-byte RGB groups.  Superseded by the
 * WRITEBGR24MMX/MMX2 variants below; kept for reference.
 * NOTE(review): the trailing conditional branch after the cmp is
 * missing from this extract.
 */
575 #define WRITEBGR24OLD(dst, dstw, index) \
576 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577 "movq %%mm2, %%mm1 \n\t" /* B */\
578 "movq %%mm5, %%mm6 \n\t" /* R */\
579 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
580 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
581 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
582 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
583 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
584 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
585 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
586 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
587 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
588 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
590 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
591 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
592 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
594 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
595 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
596 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
597 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
599 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
600 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
601 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
602 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
603 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
604 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
605 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
606 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
608 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
609 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
610 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
611 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
613 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
614 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
615 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
616 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
618 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
619 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
620 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
622 MOVNTQ(%%mm0, (dst))\
623 MOVNTQ(%%mm2, 8(dst))\
624 MOVNTQ(%%mm3, 16(dst))\
625 "add $24, "#dst" \n\t"\
627 "add $8, "#index" \n\t"\
628 "cmp "#dstw", "#index" \n\t"\
/*
 * Plain-MMX 24bpp writer: builds four 0RGBRGB0 quadwords via
 * psllq/punpckhdq, then shifts and ORs adjacent quadwords so the
 * three MOVNTQ stores emit 24 contiguous RGB bytes; advances dst by
 * 24 and index by 8 per iteration.  NOTE(review): the trailing
 * conditional branch after the cmp is missing from this extract.
 */
631 #define WRITEBGR24MMX(dst, dstw, index) \
632 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633 "movq %%mm2, %%mm1 \n\t" /* B */\
634 "movq %%mm5, %%mm6 \n\t" /* R */\
635 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
636 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
637 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
638 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
639 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
640 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
641 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
642 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
643 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
644 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
646 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
647 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
648 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
649 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
651 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
652 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
653 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
654 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
656 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
657 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
658 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
659 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
661 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
662 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
663 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
664 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
665 MOVNTQ(%%mm0, (dst))\
667 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
668 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
669 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
670 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
671 MOVNTQ(%%mm6, 8(dst))\
673 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
674 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
675 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
676 MOVNTQ(%%mm5, 16(dst))\
678 "add $24, "#dst" \n\t"\
680 "add $8, "#index" \n\t"\
681 "cmp "#dstw", "#index" \n\t"\
/*
 * MMX2 24bpp writer: uses pshufw to replicate channel bytes into the
 * right lane positions, then masks with the M24A/M24B/M24C patterns
 * and ORs the three channels into each of three output quadwords.
 * Fewer ops than WRITEBGR24MMX but requires the pshufw instruction.
 * NOTE(review): the trailing conditional branch after the cmp is
 * missing from this extract.
 */
684 #define WRITEBGR24MMX2(dst, dstw, index) \
685 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686 "movq "MANGLE(M24A)", %%mm0 \n\t"\
687 "movq "MANGLE(M24C)", %%mm7 \n\t"\
688 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
689 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
690 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
692 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
693 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
694 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
696 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
697 "por %%mm1, %%mm6 \n\t"\
698 "por %%mm3, %%mm6 \n\t"\
699 MOVNTQ(%%mm6, (dst))\
701 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
702 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
703 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
704 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
706 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
707 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
708 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
710 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
711 "por %%mm3, %%mm6 \n\t"\
712 MOVNTQ(%%mm6, 8(dst))\
714 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
715 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
716 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
718 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
719 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
720 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
722 "por %%mm1, %%mm3 \n\t"\
723 "por %%mm3, %%mm6 \n\t"\
724 MOVNTQ(%%mm6, 16(dst))\
726 "add $24, "#dst" \n\t"\
728 "add $8, "#index" \n\t"\
729 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24 dispatch: selects the MMX2 variant when available,
 * otherwise the plain-MMX one.
 * NOTE(review): the #ifdef HAVE_MMX2 / #else / #endif lines guarding
 * these two definitions are missing from this extract — without them
 * the second #define would simply redefine the first.
 */
734 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
737 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/*
 * Stores 8 pixels as YUYV (YUY2): packs U (mm3), V (mm4) and Y
 * (mm1/mm7) to bytes, interleaves U/V then Y with chroma, and streams
 * two quadwords to dst+index*2.  NOTE(review): the trailing
 * conditional branch after the cmp is missing from this extract.
 */
740 #define REAL_WRITEYUY2(dst, dstw, index) \
741 "packuswb %%mm3, %%mm3 \n\t"\
742 "packuswb %%mm4, %%mm4 \n\t"\
743 "packuswb %%mm7, %%mm1 \n\t"\
744 "punpcklbw %%mm4, %%mm3 \n\t"\
745 "movq %%mm1, %%mm7 \n\t"\
746 "punpcklbw %%mm3, %%mm1 \n\t"\
747 "punpckhbw %%mm3, %%mm7 \n\t"\
749 MOVNTQ(%%mm1, (dst, index, 2))\
750 MOVNTQ(%%mm7, 8(dst, index, 2))\
752 "add $8, "#index" \n\t"\
753 "cmp "#dstw", "#index" \n\t"\
755 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/*
 * Vertical multi-tap scaler writing planar YV12: applies chrFilter to
 * the chroma sources (U plane and, via the +4096 offset, V plane) and
 * lumFilter to the luma sources, writing uDest/vDest/dest via the
 * YSCALEYUV2YV12X asm macro; falls back to the AltiVec or plain-C
 * implementations.  The asm clobbers REG_a/REG_d/REG_S.
 * NOTE(review): the #ifdef HAVE_MMX / #ifdef HAVE_ALTIVEC guards,
 * braces and asm volatile( openers around these statements are
 * missing from this extract — structure reconstructed only in this
 * comment, verify against the full file.
 */
758 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
759 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
760 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
766 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
767 :: "r" (&c->redDither),
768 "r" (uDest), "m" ((long)chrDstW)
769 : "%"REG_a, "%"REG_d, "%"REG_S
773 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
774 :: "r" (&c->redDither),
775 "r" (vDest), "m" ((long)chrDstW)
776 : "%"REG_a, "%"REG_d, "%"REG_S
781 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
782 :: "r" (&c->redDither),
783 "r" (dest), "m" ((long)dstW)
784 : "%"REG_a, "%"REG_d, "%"REG_S
788 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
789 chrFilter, chrSrc, chrFilterSize,
790 dest, uDest, vDest, dstW, chrDstW);
792 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
793 chrFilter, chrSrc, chrFilterSize,
794 dest, uDest, vDest, dstW, chrDstW);
795 #endif //!HAVE_ALTIVEC
/*
 * Unfiltered (1:1) vertical pass to planar output: MMX path streams
 * each plane through the YSCALEYUV2YV121 macro (negative-offset
 * pointer trick: src+width / dest+width with a negated counter); the
 * C fallback shifts each 16-bit sample down by 7 and clips to 0..255.
 * NOTE(review): most of this function's body (asm statement openers,
 * loop bodies, clipping branches) is missing from this extract —
 * only fragments are visible below; verify against the full file.
 */
799 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
800 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
807 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
814 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
822 :: "r" (lumSrc + dstW), "r" (dest + dstW),
828 for(i=0; i<dstW; i++)
830 int val= lumSrc[i]>>7;
841 for(i=0; i<chrDstW; i++)
844 int v=chrSrc[i + 2048]>>7;
848 else if (u>255) u=255;
850 else if (v>255) v=255;
/*
 * Vertical multi-tap scaler to packed output formats: runs the
 * YSCALEYUV2PACKEDX / YSCALEYUV2RGBX macros and then the WRITE*
 * macro matching the destination format (BGR32, BGR24, BGR15 and
 * BGR16 with dithering constants, or YUY2), falling back to the
 * AltiVec or plain-C implementations.
 * NOTE(review): the dstFormat switch/case lines, asm volatile(
 * openers and #ifdef guards are missing from this extract; the
 * surrounding comment block is also truncated.  Verify against the
 * full file.
 */
861 * vertical scale YV12 to RGB
863 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
864 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
865 uint8_t *dest, int dstW, int dstY)
875 WRITEBGR32(%4, %5, %%REGa)
877 :: "r" (&c->redDither),
878 "m" (dummy), "m" (dummy), "m" (dummy),
879 "r" (dest), "m" (dstW)
880 : "%"REG_a, "%"REG_d, "%"REG_S
888 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
889 "add %4, %%"REG_b" \n\t"
890 WRITEBGR24(%%REGb, %5, %%REGa)
892 :: "r" (&c->redDither),
893 "m" (dummy), "m" (dummy), "m" (dummy),
894 "r" (dest), "m" (dstW)
895 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
903 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
905 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
906 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
907 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
910 WRITEBGR15(%4, %5, %%REGa)
912 :: "r" (&c->redDither),
913 "m" (dummy), "m" (dummy), "m" (dummy),
914 "r" (dest), "m" (dstW)
915 : "%"REG_a, "%"REG_d, "%"REG_S
923 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
925 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
926 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
927 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
930 WRITEBGR16(%4, %5, %%REGa)
932 :: "r" (&c->redDither),
933 "m" (dummy), "m" (dummy), "m" (dummy),
934 "r" (dest), "m" (dstW)
935 : "%"REG_a, "%"REG_d, "%"REG_S
943 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
945 "psraw $3, %%mm3 \n\t"
946 "psraw $3, %%mm4 \n\t"
947 "psraw $3, %%mm1 \n\t"
948 "psraw $3, %%mm7 \n\t"
949 WRITEYUY2(%4, %5, %%REGa)
951 :: "r" (&c->redDither),
952 "m" (dummy), "m" (dummy), "m" (dummy),
953 "r" (dest), "m" (dstW)
954 : "%"REG_a, "%"REG_d, "%"REG_S
961 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
962 chrFilter, chrSrc, chrFilterSize,
965 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
966 chrFilter, chrSrc, chrFilterSize,
974 * vertical bilinear scale YV12 to RGB
/*
 * yuv2packed2: vertically blend two luma lines (buf0/buf1, weight yalpha)
 * and two chroma lines (uvbuf0/uvbuf1, weight uvalpha), then convert the
 * blended line to the packed destination format (BGR32/24/15/16, YUY2).
 * NOTE(review): this view of the file is elided (embedded original line
 * numbers are non-contiguous); comments describe only the visible lines.
 */
976 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
977 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
// Complementary 12-bit blend weights: alpha ^ 4095 == 4095 - alpha here.
979 int yalpha1=yalpha^4095;
980 int uvalpha1=uvalpha^4095;
984 if(flags&SWS_FULL_CHR_H_INT)
/* ---- BGR32 store: interleave B,G,R,0 bytes, 4 pixels per iteration ---- */
994 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
995 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
997 "movq %%mm3, %%mm1 \n\t"
998 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
999 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1001 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1002 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1004 "add $4, %%"REG_a" \n\t"
1005 "cmp %5, %%"REG_a" \n\t"
1009 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1010 "m" (yalpha1), "m" (uvalpha1)
/* ---- BGR24 store: build BGR0 dwords, then squeeze out the 0 bytes ---- */
1020 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1021 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1023 "movq %%mm3, %%mm1 \n\t"
1024 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1025 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1027 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1028 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1029 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1030 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1031 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1032 "movq %%mm1, %%mm2 \n\t"
1033 "psllq $48, %%mm1 \n\t" // 000000BG
1034 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1036 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1037 "psrld $16, %%mm2 \n\t" // R000R000
1038 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1039 "por %%mm2, %%mm1 \n\t" // RBGRR000
// REG_b = dest + REG_a, so stores below address dest + 3*i in effect
// (hence REG_b appears in the clobber list of this asm statement).
1041 "mov %4, %%"REG_b" \n\t"
1042 "add %%"REG_a", %%"REG_b" \n\t"
// Non-temporal 8-byte stores when movntq is available ...
1046 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1047 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
// ... otherwise the same 12 bytes via three plain movd stores.
1049 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1050 "psrlq $32, %%mm3 \n\t"
1051 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1052 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1054 "add $4, %%"REG_a" \n\t"
1055 "cmp %5, %%"REG_a" \n\t"
1058 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1059 "m" (yalpha1), "m" (uvalpha1)
1060 : "%"REG_a, "%"REG_b
/* ---- BGR15 store: add 5-bit dither, shift/mask into 5-5-5 words ---- */
1068 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1069 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1070 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1072 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1073 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1074 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1076 "psrlw $3, %%mm3 \n\t"
1077 "psllw $2, %%mm1 \n\t"
1078 "psllw $7, %%mm0 \n\t"
1079 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1080 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1082 "por %%mm3, %%mm1 \n\t"
1083 "por %%mm1, %%mm0 \n\t"
1085 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1087 "add $4, %%"REG_a" \n\t"
1088 "cmp %5, %%"REG_a" \n\t"
1091 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1092 "m" (yalpha1), "m" (uvalpha1)
/* ---- BGR16 store: 6-bit green dither, shift/mask into 5-6-5 words ---- */
1101 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1102 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1103 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1105 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1106 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1107 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1109 "psrlw $3, %%mm3 \n\t"
1110 "psllw $3, %%mm1 \n\t"
1111 "psllw $8, %%mm0 \n\t"
1112 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1113 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1115 "por %%mm3, %%mm1 \n\t"
1116 "por %%mm1, %%mm0 \n\t"
1118 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1120 "add $4, %%"REG_a" \n\t"
1121 "cmp %5, %%"REG_a" \n\t"
1124 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1125 "m" (yalpha1), "m" (uvalpha1)
/* ---- C fallback: per-pixel vertical blend + table-driven YUV->RGB ----
 * The >>19 collapses the 16-bit sample * 12-bit alpha product to a table
 * index; clip_table* clamp the 13-bit-shifted results to byte range. */
1134 if(dstFormat==IMGFMT_BGR32)
1137 #ifdef WORDS_BIGENDIAN
1140 for(i=0;i<dstW;i++){
1141 // vertical linear interpolation && yuv2rgb in a single step:
1142 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1143 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1144 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1145 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1146 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1147 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1151 else if(dstFormat==IMGFMT_BGR24)
1154 for(i=0;i<dstW;i++){
1155 // vertical linear interpolation && yuv2rgb in a single step:
1156 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1157 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1158 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1159 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1160 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1161 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1165 else if(dstFormat==IMGFMT_BGR16)
1168 for(i=0;i<dstW;i++){
1169 // vertical linear interpolation && yuv2rgb in a single step:
1170 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1171 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1172 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1174 ((uint16_t*)dest)[i] =
1175 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1176 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1177 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1180 else if(dstFormat==IMGFMT_BGR15)
1183 for(i=0;i<dstW;i++){
1184 // vertical linear interpolation && yuv2rgb in a single step:
1185 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1186 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1187 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1189 ((uint16_t*)dest)[i] =
1190 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1191 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1192 clip_table15r[(Y + yuvtab_3343[V]) >>13];
/* ---- MMX per-format dispatch. The conversion macros need every GP
 * register, so the stack pointer itself is parked in the context at
 * ESP_OFFSET for the duration of each asm statement and restored after. */
1200 switch(c->dstFormat)
1202 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1205 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1206 "mov %4, %%"REG_SP" \n\t"
1207 YSCALEYUV2RGB(%%REGa, %5)
1208 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1209 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1211 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1218 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1219 "mov %4, %%"REG_SP" \n\t"
1220 YSCALEYUV2RGB(%%REGa, %5)
1221 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1222 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1223 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1230 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1231 "mov %4, %%"REG_SP" \n\t"
1232 YSCALEYUV2RGB(%%REGa, %5)
1233 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1235 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1236 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1237 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1240 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1241 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1243 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1250 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1251 "mov %4, %%"REG_SP" \n\t"
1252 YSCALEYUV2RGB(%%REGa, %5)
1253 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1255 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1256 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1257 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1260 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1261 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1262 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1269 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1270 "mov %4, %%"REG_SP" \n\t"
1271 YSCALEYUV2PACKED(%%REGa, %5)
1272 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1273 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1274 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
// Generic C fallback for every remaining destination format.
1282 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1286 * YV12 to RGB without scaling or interpolating
/*
 * yuv2packed1: convert a single (non-interpolated) luma line buf0 plus the
 * two chroma lines uvbuf0/uvbuf1 to a packed output line. When uvalpha is
 * large, the "1b" macro variants average the two chroma lines; otherwise
 * only uvbuf0 is used. NOTE(review): this view of the file is elided; the
 * comments below describe only the visible lines.
 */
1288 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1289 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1291 const int yalpha1=0;
1294 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1295 const int yalpha= 4096; //FIXME ...
// Full-chroma path: delegate to the two-line blender with yalpha == 0.
1297 if(flags&SWS_FULL_CHR_H_INT)
1299 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1304 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
/* ---- uvbuf0-only variants (YSCALEYUV2RGB1 / YSCALEYUV2PACKED1).
 * As in yuv2packed2, the stack pointer is parked at ESP_OFFSET in the
 * context because the conversion macros use every register. ---- */
1310 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1311 "mov %4, %%"REG_SP" \n\t"
1312 YSCALEYUV2RGB1(%%REGa, %5)
1313 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1314 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1316 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1323 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1324 "mov %4, %%"REG_SP" \n\t"
1325 YSCALEYUV2RGB1(%%REGa, %5)
1326 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1327 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1329 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1336 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1337 "mov %4, %%"REG_SP" \n\t"
1338 YSCALEYUV2RGB1(%%REGa, %5)
1339 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1341 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1342 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1343 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1345 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1346 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1348 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1355 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1356 "mov %4, %%"REG_SP" \n\t"
1357 YSCALEYUV2RGB1(%%REGa, %5)
1358 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1360 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1361 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1362 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1365 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1366 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1368 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1375 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1376 "mov %4, %%"REG_SP" \n\t"
1377 YSCALEYUV2PACKED1(%%REGa, %5)
1378 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1379 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1381 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
/* ---- chroma-averaging variants (YSCALEYUV2RGB1b / ...PACKED1b):
 * same per-format structure as above, but the macros average
 * uvbuf0 and uvbuf1. ---- */
1394 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1395 "mov %4, %%"REG_SP" \n\t"
1396 YSCALEYUV2RGB1b(%%REGa, %5)
1397 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1398 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1400 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1407 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1408 "mov %4, %%"REG_SP" \n\t"
1409 YSCALEYUV2RGB1b(%%REGa, %5)
1410 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1411 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1413 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1420 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1421 "mov %4, %%"REG_SP" \n\t"
1422 YSCALEYUV2RGB1b(%%REGa, %5)
1423 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1425 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1426 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1427 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1429 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1430 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1432 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1439 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1440 "mov %4, %%"REG_SP" \n\t"
1441 YSCALEYUV2RGB1b(%%REGa, %5)
1442 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1444 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1445 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1446 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1449 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1450 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1452 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1459 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1460 "mov %4, %%"REG_SP" \n\t"
1461 YSCALEYUV2PACKED1b(%%REGa, %5)
1462 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1463 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1465 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
// C fallback: pick the single-line or chroma-averaging macro set.
1473 if( uvalpha < 2048 )
1475 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1477 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1481 //FIXME yuy2* can read up to 7 samples too many
/*
 * yuy2ToY: extract the Y (even byte) plane from a YUY2 (Y U Y V) line.
 * MMX path masks out the chroma bytes with bm01010101 and packs; the loop
 * counter runs from -width up to 0 against src+width*2 / dst+width.
 * NOTE(review): this view is elided; only visible lines are commented.
 */
1483 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1487 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1488 "mov %0, %%"REG_a" \n\t"
// Load 16 packed bytes (8 pixels), keep every other byte, pack to 8 Y bytes.
1490 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1491 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1492 "pand %%mm2, %%mm0 \n\t"
1493 "pand %%mm2, %%mm1 \n\t"
1494 "packuswb %%mm1, %%mm0 \n\t"
1495 "movq %%mm0, (%2, %%"REG_a") \n\t"
1496 "add $8, %%"REG_a" \n\t"
1498 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
// C fallback: one Y byte per pixel.
1503 for(i=0; i<width; i++)
/*
 * yuy2ToUV: extract U and V planes from two YUY2 lines (src1/src2),
 * averaging the two lines vertically. dstU/dstV receive width samples.
 * NOTE(review): this view is elided; only visible lines are commented.
 */
1508 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1510 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1512 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1513 "mov %0, %%"REG_a" \n\t"
// Load 8 pixels from each of the two lines.
1515 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1516 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1517 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1518 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
// Drop the Y bytes, then split the interleaved UV bytes into U and V.
1521 "psrlw $8, %%mm0 \n\t"
1522 "psrlw $8, %%mm1 \n\t"
1523 "packuswb %%mm1, %%mm0 \n\t"
1524 "movq %%mm0, %%mm1 \n\t"
1525 "psrlw $8, %%mm0 \n\t"
1526 "pand %%mm4, %%mm1 \n\t"
1527 "packuswb %%mm0, %%mm0 \n\t"
1528 "packuswb %%mm1, %%mm1 \n\t"
1529 "movd %%mm0, (%4, %%"REG_a") \n\t"
1530 "movd %%mm1, (%3, %%"REG_a") \n\t"
1531 "add $4, %%"REG_a" \n\t"
1533 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
// C fallback: U at byte 1, V at byte 3 of each 4-byte YUY2 pair,
// averaged between the two source lines.
1538 for(i=0; i<width; i++)
1540 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1541 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1546 //this is almost identical to the previous function, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
/*
 * uyvyToY: extract the Y (odd byte) plane from a UYVY (U Y V Y) line.
 * Same structure as yuy2ToY but shifts instead of masking, since Y sits
 * in the high byte of each 16-bit pair.
 * NOTE(review): this view is elided; only visible lines are commented.
 */
1547 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
1551 "mov %0, %%"REG_a" \n\t"
1553 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1554 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1555 "psrlw $8, %%mm0 \n\t"
1556 "psrlw $8, %%mm1 \n\t"
1557 "packuswb %%mm1, %%mm0 \n\t"
1558 "movq %%mm0, (%2, %%"REG_a") \n\t"
1559 "add $8, %%"REG_a" \n\t"
1561 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
// C fallback: one Y byte per pixel.
1566 for(i=0; i<width; i++)
/*
 * uyvyToUV: extract U and V planes from two UYVY lines, averaging the
 * two lines vertically. Mirror of yuy2ToUV with the mask/shift swapped,
 * because chroma sits in the low bytes for UYVY.
 * NOTE(review): this view is elided; only visible lines are commented.
 */
1571 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1573 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1575 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1576 "mov %0, %%"REG_a" \n\t"
1578 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1579 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1580 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1581 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
// Keep the chroma (even) bytes, then separate U from V.
1584 "pand %%mm4, %%mm0 \n\t"
1585 "pand %%mm4, %%mm1 \n\t"
1586 "packuswb %%mm1, %%mm0 \n\t"
1587 "movq %%mm0, %%mm1 \n\t"
1588 "psrlw $8, %%mm0 \n\t"
1589 "pand %%mm4, %%mm1 \n\t"
1590 "packuswb %%mm0, %%mm0 \n\t"
1591 "packuswb %%mm1, %%mm1 \n\t"
1592 "movd %%mm0, (%4, %%"REG_a") \n\t"
1593 "movd %%mm1, (%3, %%"REG_a") \n\t"
1594 "add $4, %%"REG_a" \n\t"
1596 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
// C fallback: U at byte 0, V at byte 2 of each 4-byte UYVY pair.
1601 for(i=0; i<width; i++)
1603 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1604 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
/*
 * bgr32ToY: luma from a BGR32 line (B in the low byte of each dword).
 * Plain C only; HAVE_MMXFIXME marks a never-enabled MMX placeholder.
 */
1609 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1611 #ifdef HAVE_MMXFIXME
1614 for(i=0; i<width; i++)
1616 int b= ((uint32_t*)src)[i]&0xFF;
1617 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1618 int r= (((uint32_t*)src)[i]>>16)&0xFF;
// Weighted RGB->Y with rounding term, scaled down by RGB2YUV_SHIFT.
1620 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * bgr32ToUV: chroma from two BGR32 lines, averaging 2x2 pixel blocks.
 * Sums the R/B channels (mask 0xFF00FF) and G channel (mask 0x00FF00)
 * of 4 pixels at once, then extracts per-channel sums from the packed
 * accumulators. NOTE(review): lines extracting g/r from l/h are missing
 * from this elided view.
 */
1625 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1627 #ifdef HAVE_MMXFIXME
1630 for(i=0; i<width; i++)
1632 const int a= ((uint32_t*)src1)[2*i+0];
1633 const int e= ((uint32_t*)src1)[2*i+1];
1634 const int c= ((uint32_t*)src2)[2*i+0];
1635 const int d= ((uint32_t*)src2)[2*i+1];
// l holds the 4-pixel B and R sums packed; h holds the G sums.
1636 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1637 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1638 const int b= l&0x3FF;
// +2 in the shift compensates for summing four pixels.
1642 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1643 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * bgr24ToY: luma from a packed 24-bit BGR line. The MMX loop produces 8 Y
 * bytes per iteration: each movd loads one 3-byte pixel (plus a stray
 * byte), pmaddwd with bgr2YCoeff forms the weighted sums, and pmaddwd
 * with w1111 folds the per-channel partial sums together.
 * NOTE(review): this view is elided; only visible lines are commented.
 */
1648 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1652 "mov %2, %%"REG_a" \n\t"
1653 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1654 "movq "MANGLE(w1111)", %%mm5 \n\t"
1655 "pxor %%mm7, %%mm7 \n\t"
// REG_b = 3*REG_a: byte offset into the 3-bytes-per-pixel source.
1656 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1659 PREFETCH" 64(%0, %%"REG_b") \n\t"
// First 4 pixels -> mm0..mm3, widened to 16-bit and multiplied by coeffs.
1660 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1661 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
1662 "punpcklbw %%mm7, %%mm0 \n\t"
1663 "punpcklbw %%mm7, %%mm1 \n\t"
1664 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
1665 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
1666 "punpcklbw %%mm7, %%mm2 \n\t"
1667 "punpcklbw %%mm7, %%mm3 \n\t"
1668 "pmaddwd %%mm6, %%mm0 \n\t"
1669 "pmaddwd %%mm6, %%mm1 \n\t"
1670 "pmaddwd %%mm6, %%mm2 \n\t"
1671 "pmaddwd %%mm6, %%mm3 \n\t"
// Extra precision step skipped when FAST_BGR2YV12 is defined.
1672 #ifndef FAST_BGR2YV12
1673 "psrad $8, %%mm0 \n\t"
1674 "psrad $8, %%mm1 \n\t"
1675 "psrad $8, %%mm2 \n\t"
1676 "psrad $8, %%mm3 \n\t"
1678 "packssdw %%mm1, %%mm0 \n\t"
1679 "packssdw %%mm3, %%mm2 \n\t"
1680 "pmaddwd %%mm5, %%mm0 \n\t"
1681 "pmaddwd %%mm5, %%mm2 \n\t"
1682 "packssdw %%mm2, %%mm0 \n\t"
1683 "psraw $7, %%mm0 \n\t"
// Second 4 pixels, same pipeline, result in mm4.
1685 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1686 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
1687 "punpcklbw %%mm7, %%mm4 \n\t"
1688 "punpcklbw %%mm7, %%mm1 \n\t"
1689 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
1690 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
1691 "punpcklbw %%mm7, %%mm2 \n\t"
1692 "punpcklbw %%mm7, %%mm3 \n\t"
1693 "pmaddwd %%mm6, %%mm4 \n\t"
1694 "pmaddwd %%mm6, %%mm1 \n\t"
1695 "pmaddwd %%mm6, %%mm2 \n\t"
1696 "pmaddwd %%mm6, %%mm3 \n\t"
1697 #ifndef FAST_BGR2YV12
1698 "psrad $8, %%mm4 \n\t"
1699 "psrad $8, %%mm1 \n\t"
1700 "psrad $8, %%mm2 \n\t"
1701 "psrad $8, %%mm3 \n\t"
1703 "packssdw %%mm1, %%mm4 \n\t"
1704 "packssdw %%mm3, %%mm2 \n\t"
1705 "pmaddwd %%mm5, %%mm4 \n\t"
1706 "pmaddwd %%mm5, %%mm2 \n\t"
// Advance source by 8 pixels * 3 bytes.
1707 "add $24, %%"REG_b" \n\t"
1708 "packssdw %%mm2, %%mm4 \n\t"
1709 "psraw $7, %%mm4 \n\t"
// Merge both halves to 8 bytes, add the Y offset, store.
1711 "packuswb %%mm4, %%mm0 \n\t"
1712 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1714 "movq %%mm0, (%1, %%"REG_a") \n\t"
1715 "add $8, %%"REG_a" \n\t"
1717 : : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
1718 : "%"REG_a, "%"REG_b
// C fallback: per-pixel weighted sum (r/g/b extraction elided here).
1722 for(i=0; i<width; i++)
1728 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * bgr24ToUV: chroma from two packed 24-bit BGR lines, averaging 2x2 pixel
 * blocks. The MMX loop yields 4 U and 4 V bytes per iteration: pixel pairs
 * from both lines are summed, >>2 averages them, and pmaddwd against
 * bgr2UCoeff (mm6) / bgr2VCoeff computes U and V in parallel.
 * NOTE(review): this view is elided; only visible lines are commented.
 */
1733 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1737 "mov %4, %%"REG_a" \n\t"
1738 "movq "MANGLE(w1111)", %%mm5 \n\t"
1739 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1740 "pxor %%mm7, %%mm7 \n\t"
// REG_b = 6*REG_a: two 3-byte pixels per output chroma sample.
1741 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
1742 "add %%"REG_b", %%"REG_b" \n\t"
1745 PREFETCH" 64(%0, %%"REG_b") \n\t"
1746 PREFETCH" 64(%1, %%"REG_b") \n\t"
// MMX2/3DNow variant: 8-byte loads ...
1747 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1748 "movq (%0, %%"REG_b"), %%mm0 \n\t"
1749 "movq (%1, %%"REG_b"), %%mm1 \n\t"
1750 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
1751 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
1754 "movq %%mm0, %%mm1 \n\t"
1755 "movq %%mm2, %%mm3 \n\t"
1756 "psrlq $24, %%mm0 \n\t"
1757 "psrlq $24, %%mm2 \n\t"
1760 "punpcklbw %%mm7, %%mm0 \n\t"
1761 "punpcklbw %%mm7, %%mm2 \n\t"
// ... plain MMX variant: 4-byte loads of individual pixels.
1763 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1764 "movd (%1, %%"REG_b"), %%mm1 \n\t"
1765 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
1766 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
1767 "punpcklbw %%mm7, %%mm0 \n\t"
1768 "punpcklbw %%mm7, %%mm1 \n\t"
1769 "punpcklbw %%mm7, %%mm2 \n\t"
1770 "punpcklbw %%mm7, %%mm3 \n\t"
// Sum the 2x2 block for the first two output samples.
1771 "paddw %%mm1, %%mm0 \n\t"
1772 "paddw %%mm3, %%mm2 \n\t"
1773 "paddw %%mm2, %%mm0 \n\t"
1774 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
1775 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
1776 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
1777 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
1778 "punpcklbw %%mm7, %%mm4 \n\t"
1779 "punpcklbw %%mm7, %%mm1 \n\t"
1780 "punpcklbw %%mm7, %%mm2 \n\t"
1781 "punpcklbw %%mm7, %%mm3 \n\t"
1782 "paddw %%mm1, %%mm4 \n\t"
1783 "paddw %%mm3, %%mm2 \n\t"
1784 "paddw %%mm4, %%mm2 \n\t"
// >>2 averages the four summed pixels.
1785 "psrlw $2, %%mm0 \n\t"
1786 "psrlw $2, %%mm2 \n\t"
// U via mm6 (bgr2UCoeff), V via bgr2VCoeff, computed side by side.
1788 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1789 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1791 "pmaddwd %%mm0, %%mm1 \n\t"
1792 "pmaddwd %%mm2, %%mm3 \n\t"
1793 "pmaddwd %%mm6, %%mm0 \n\t"
1794 "pmaddwd %%mm6, %%mm2 \n\t"
1795 #ifndef FAST_BGR2YV12
1796 "psrad $8, %%mm0 \n\t"
1797 "psrad $8, %%mm1 \n\t"
1798 "psrad $8, %%mm2 \n\t"
1799 "psrad $8, %%mm3 \n\t"
1801 "packssdw %%mm2, %%mm0 \n\t"
1802 "packssdw %%mm3, %%mm1 \n\t"
1803 "pmaddwd %%mm5, %%mm0 \n\t"
1804 "pmaddwd %%mm5, %%mm1 \n\t"
1805 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1806 "psraw $7, %%mm0 \n\t"
// Second pair of 2x2 blocks, same pipeline, result in mm4.
1808 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1809 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
1810 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
1811 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
1812 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
1815 "movq %%mm4, %%mm1 \n\t"
1816 "movq %%mm2, %%mm3 \n\t"
1817 "psrlq $24, %%mm4 \n\t"
1818 "psrlq $24, %%mm2 \n\t"
1821 "punpcklbw %%mm7, %%mm4 \n\t"
1822 "punpcklbw %%mm7, %%mm2 \n\t"
1824 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1825 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
1826 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
1827 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
1828 "punpcklbw %%mm7, %%mm4 \n\t"
1829 "punpcklbw %%mm7, %%mm1 \n\t"
1830 "punpcklbw %%mm7, %%mm2 \n\t"
1831 "punpcklbw %%mm7, %%mm3 \n\t"
1832 "paddw %%mm1, %%mm4 \n\t"
1833 "paddw %%mm3, %%mm2 \n\t"
1834 "paddw %%mm2, %%mm4 \n\t"
1835 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
1836 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
1837 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
1838 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
1839 "punpcklbw %%mm7, %%mm5 \n\t"
1840 "punpcklbw %%mm7, %%mm1 \n\t"
1841 "punpcklbw %%mm7, %%mm2 \n\t"
1842 "punpcklbw %%mm7, %%mm3 \n\t"
1843 "paddw %%mm1, %%mm5 \n\t"
1844 "paddw %%mm3, %%mm2 \n\t"
1845 "paddw %%mm5, %%mm2 \n\t"
// mm5 was clobbered as scratch above; reload the w1111 constant.
1846 "movq "MANGLE(w1111)", %%mm5 \n\t"
1847 "psrlw $2, %%mm4 \n\t"
1848 "psrlw $2, %%mm2 \n\t"
1850 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1851 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1853 "pmaddwd %%mm4, %%mm1 \n\t"
1854 "pmaddwd %%mm2, %%mm3 \n\t"
1855 "pmaddwd %%mm6, %%mm4 \n\t"
1856 "pmaddwd %%mm6, %%mm2 \n\t"
1857 #ifndef FAST_BGR2YV12
1858 "psrad $8, %%mm4 \n\t"
1859 "psrad $8, %%mm1 \n\t"
1860 "psrad $8, %%mm2 \n\t"
1861 "psrad $8, %%mm3 \n\t"
1863 "packssdw %%mm2, %%mm4 \n\t"
1864 "packssdw %%mm3, %%mm1 \n\t"
1865 "pmaddwd %%mm5, %%mm4 \n\t"
1866 "pmaddwd %%mm5, %%mm1 \n\t"
// Advance source by 8 pixels * 3 bytes.
1867 "add $24, %%"REG_b" \n\t"
1868 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1869 "psraw $7, %%mm4 \n\t"
// Regroup to U0..U3 / V0..V3, bias by the UV offset, store 4+4 bytes.
1871 "movq %%mm0, %%mm1 \n\t"
1872 "punpckldq %%mm4, %%mm0 \n\t"
1873 "punpckhdq %%mm4, %%mm1 \n\t"
1874 "packsswb %%mm1, %%mm0 \n\t"
1875 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1877 "movd %%mm0, (%2, %%"REG_a") \n\t"
1878 "punpckhdq %%mm0, %%mm0 \n\t"
1879 "movd %%mm0, (%3, %%"REG_a") \n\t"
1880 "add $4, %%"REG_a" \n\t"
1882 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
1883 : "%"REG_a, "%"REG_b
// C fallback: sum each channel over the 2x2 block, then weight.
1887 for(i=0; i<width; i++)
1889 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1890 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1891 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1893 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1894 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * bgr16ToY: luma from a 5-6-5 BGR16 line. R is the top 5 bits of each
 * 16-bit pixel; the 2*R/2*B weights and the -2 in the shift rescale the
 * 5-bit channels. NOTE(review): b/g extraction lines are elided here.
 */
1899 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1902 for(i=0; i<width; i++)
1904 int d= ((uint16_t*)src)[i];
1907 int r= (d>>11)&0x1F;
1909 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
/*
 * bgr16ToUV: chroma from two BGR16 lines, averaging 2x2 pixel blocks.
 * Two adjacent pixels per line are summed as one 32-bit word using the
 * non-overlapping field masks (dl carries B+R fields, dh carries G),
 * and dh2 realigns the G field next to the others.
 * NOTE(review): b/g extraction lines are elided from this view.
 */
1913 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1916 for(i=0; i<width; i++)
1918 int d0= ((uint32_t*)src1)[i];
1919 int d1= ((uint32_t*)src2)[i];
1921 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1922 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1924 int dh2= (dh>>11) + (dh<<21);
// 0x7F: four summed 5-bit values need 7 bits.
1928 int r= (d>>11)&0x7F;
// +2 for the 4-pixel sum, -2 to rescale the 5-bit channels.
1930 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1931 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
/*
 * bgr15ToY: luma from a 5-5-5 BGR15 line. R is bits 10..14; the -3 in
 * the shift rescales the 5-bit channels to full range.
 * NOTE(review): b/g extraction lines are elided here.
 */
1935 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1938 for(i=0; i<width; i++)
1940 int d= ((uint16_t*)src)[i];
1943 int r= (d>>10)&0x1F;
1945 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
/*
 * bgr15ToUV: chroma from two BGR15 lines, averaging 2x2 pixel blocks —
 * same packed-field summing trick as bgr16ToUV with 5-5-5 masks.
 * NOTE(review): b/g extraction lines are elided from this view.
 */
1949 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1952 for(i=0; i<width; i++)
1954 int d0= ((uint32_t*)src1)[i];
1955 int d1= ((uint32_t*)src2)[i];
1957 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1958 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1960 int dh2= (dh>>11) + (dh<<21);
1964 int r= (d>>10)&0x7F;
// +2 for the 4-pixel sum, -3 to rescale the 5-bit channels.
1966 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1967 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
/*
 * rgb32ToY: luma from an RGB32 line — identical to bgr32ToY except R
 * occupies the low byte of each dword and B the third byte.
 */
1972 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1975 for(i=0; i<width; i++)
1977 int r= ((uint32_t*)src)[i]&0xFF;
1978 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1979 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1981 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * rgb32ToUV: chroma from two RGB32 lines, averaging 2x2 pixel blocks.
 * Mirror of bgr32ToUV: the low-byte field of l is R here instead of B.
 * NOTE(review): the g/b extraction lines are elided from this view.
 */
1985 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1988 for(i=0; i<width; i++)
1990 const int a= ((uint32_t*)src1)[2*i+0];
1991 const int e= ((uint32_t*)src1)[2*i+1];
1992 const int c= ((uint32_t*)src2)[2*i+0];
1993 const int d= ((uint32_t*)src2)[2*i+1];
// l holds the 4-pixel R and B sums packed; h holds the G sums.
1994 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1995 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1996 const int r= l&0x3FF;
2000 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2001 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/*
 * rgb24ToY: luma from a packed 24-bit RGB line (C only).
 * NOTE(review): the r/g/b byte-extraction lines are elided from this view.
 */
2005 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2008 for(i=0; i<width; i++)
2014 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/*
 * rgb24ToUV: chroma from two packed 24-bit RGB lines, averaging 2x2
 * pixel blocks — byte order is R,G,B (cf. bgr24ToUV's B,G,R).
 */
2018 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2021 for(i=0; i<width; i++)
2023 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2024 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2025 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
// +2 in the shift compensates for summing four pixels.
2027 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2028 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2033 // Bilinear / Bicubic scaling
/*
 * hScale: horizontal FIR scaling of one 8-bit line into 16-bit dst.
 * dst[i] = clip(sum_j src[filterPos[i]+j] * filter[filterSize*i+j] >> 7).
 * Specialized MMX loops exist for filterSize 4 and 8; a generic MMX loop,
 * an AltiVec call, and a plain C loop handle everything else.
 * NOTE(review): this view is elided; only visible lines are commented.
 */
2034 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2035 int16_t *filter, int16_t *filterPos, int filterSize)
2038 assert(filterSize % 4 == 0 && filterSize>0);
2039 if(filterSize==4) // always true for upscaling, sometimes for down too
// counter runs from -2*dstW to 0; filterPos is pre-biased to match.
2041 long counter= -2*dstW;
2043 filterPos-= counter/2;
2046 "pxor %%mm7, %%mm7 \n\t"
2047 "movq "MANGLE(w02)", %%mm6 \n\t"
// EBP is commandeered as an extra counter register and restored below.
2048 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2049 "mov %%"REG_a", %%"REG_BP" \n\t"
// Two output samples per pass: positions in eax/ebx, 4 taps each.
2052 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2053 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2054 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2055 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2056 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2057 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2058 "punpcklbw %%mm7, %%mm0 \n\t"
2059 "punpcklbw %%mm7, %%mm2 \n\t"
2060 "pmaddwd %%mm1, %%mm0 \n\t"
2061 "pmaddwd %%mm2, %%mm3 \n\t"
2062 "psrad $8, %%mm0 \n\t"
2063 "psrad $8, %%mm3 \n\t"
2064 "packssdw %%mm3, %%mm0 \n\t"
2065 "pmaddwd %%mm6, %%mm0 \n\t"
2066 "packssdw %%mm0, %%mm0 \n\t"
2067 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2068 "add $4, %%"REG_BP" \n\t"
2071 "pop %%"REG_BP" \n\t"
2073 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2077 else if(filterSize==8)
2079 long counter= -2*dstW;
2081 filterPos-= counter/2;
2084 "pxor %%mm7, %%mm7 \n\t"
2085 "movq "MANGLE(w02)", %%mm6 \n\t"
2086 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2087 "mov %%"REG_a", %%"REG_BP" \n\t"
// Same as the 4-tap loop, but taps 0..3 ...
2090 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2091 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2092 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2093 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2094 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2095 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2096 "punpcklbw %%mm7, %%mm0 \n\t"
2097 "punpcklbw %%mm7, %%mm2 \n\t"
2098 "pmaddwd %%mm1, %%mm0 \n\t"
2099 "pmaddwd %%mm2, %%mm3 \n\t"
// ... then taps 4..7 accumulated on top.
2101 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2102 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2103 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2104 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2105 "punpcklbw %%mm7, %%mm4 \n\t"
2106 "punpcklbw %%mm7, %%mm2 \n\t"
2107 "pmaddwd %%mm1, %%mm4 \n\t"
2108 "pmaddwd %%mm2, %%mm5 \n\t"
2109 "paddd %%mm4, %%mm0 \n\t"
2110 "paddd %%mm5, %%mm3 \n\t"
2112 "psrad $8, %%mm0 \n\t"
2113 "psrad $8, %%mm3 \n\t"
2114 "packssdw %%mm3, %%mm0 \n\t"
2115 "pmaddwd %%mm6, %%mm0 \n\t"
2116 "packssdw %%mm0, %%mm0 \n\t"
2117 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2118 "add $4, %%"REG_BP" \n\t"
2121 "pop %%"REG_BP" \n\t"
2123 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// Generic MMX loop for arbitrary multiples of 4 taps: inner loop walks
// the source while the filter pointer (%1) advances, two outputs per pass.
2129 long counter= -2*dstW;
2130 // filter-= counter*filterSize/2;
2131 filterPos-= counter/2;
2134 "pxor %%mm7, %%mm7 \n\t"
2135 "movq "MANGLE(w02)", %%mm6 \n\t"
2138 "mov %2, %%"REG_c" \n\t"
2139 "movzwl (%%"REG_c", %0), %%eax \n\t"
2140 "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2141 "mov %5, %%"REG_c" \n\t"
2142 "pxor %%mm4, %%mm4 \n\t"
2143 "pxor %%mm5, %%mm5 \n\t"
2145 "movq (%1), %%mm1 \n\t"
2146 "movq (%1, %6), %%mm3 \n\t"
2147 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2148 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2149 "punpcklbw %%mm7, %%mm0 \n\t"
2150 "punpcklbw %%mm7, %%mm2 \n\t"
2151 "pmaddwd %%mm1, %%mm0 \n\t"
2152 "pmaddwd %%mm2, %%mm3 \n\t"
2153 "paddd %%mm3, %%mm5 \n\t"
2154 "paddd %%mm0, %%mm4 \n\t"
2156 "add $4, %%"REG_c" \n\t"
2157 "cmp %4, %%"REG_c" \n\t"
2160 "psrad $8, %%mm4 \n\t"
2161 "psrad $8, %%mm5 \n\t"
2162 "packssdw %%mm5, %%mm4 \n\t"
2163 "pmaddwd %%mm6, %%mm4 \n\t"
2164 "packssdw %%mm4, %%mm4 \n\t"
2165 "mov %3, %%"REG_a" \n\t"
2166 "movd %%mm4, (%%"REG_a", %0) \n\t"
2170 : "+r" (counter), "+r" (filter)
2171 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2172 "m" (src), "r" ((long)filterSize*2)
2173 : "%"REG_b, "%"REG_a, "%"REG_c
// AltiVec implementation (from swscale_altivec_template.c).
2178 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
// Plain C reference loop with saturation to the 15-bit output range.
2181 for(i=0; i<dstW; i++)
2184 int srcPos= filterPos[i];
2186 // printf("filterPos: %d\n", filterPos[i]);
2187 for(j=0; j<filterSize; j++)
2189 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2190 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2192 // filter += hFilterSize;
2193 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2199 // *** horizontal scale Y line to temp buffer
/*
 * hyscale: horizontally scale one luma line into dst (16-bit samples).
 * Non-planar/RGB sources are first converted to a grey line in
 * formatConvBuffer by the matching *ToY helper. Then either the generic
 * hScale FIR path, the MMX2 "funny code" (runtime-generated scaler), or a
 * plain x86/C bilinear loop is used, depending on flags and CPU caps.
 * NOTE(review): this view is elided; only visible lines are commented.
 */
2200 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2201 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2202 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2203 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2204 int32_t *mmx2FilterPos)
// --- input format -> luma conversion into formatConvBuffer ---
2206 if(srcFormat==IMGFMT_YUY2)
2208 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2209 src= formatConvBuffer;
2211 else if(srcFormat==IMGFMT_UYVY)
2213 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2214 src= formatConvBuffer;
2216 else if(srcFormat==IMGFMT_BGR32)
2218 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2219 src= formatConvBuffer;
2221 else if(srcFormat==IMGFMT_BGR24)
2223 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2224 src= formatConvBuffer;
2226 else if(srcFormat==IMGFMT_BGR16)
2228 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2229 src= formatConvBuffer;
2231 else if(srcFormat==IMGFMT_BGR15)
2233 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2234 src= formatConvBuffer;
2236 else if(srcFormat==IMGFMT_RGB32)
2238 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2239 src= formatConvBuffer;
2241 else if(srcFormat==IMGFMT_RGB24)
2243 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2244 src= formatConvBuffer;
2248 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
2249 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)
2251 if(!(flags&SWS_FAST_BILINEAR))
// High-quality path: full FIR filter.
2254 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2256 else // Fast Bilinear upscale / crap downscale
2258 #if defined(ARCH_X86) || defined(ARCH_X86_64)
// MMX2 path: jump into runtime-generated code (funnyYCode) driven by
// mmx2Filter/mmx2FilterPos.
2264 "pxor %%mm7, %%mm7 \n\t"
2265 "mov %0, %%"REG_c" \n\t"
2266 "mov %1, %%"REG_D" \n\t"
2267 "mov %2, %%"REG_d" \n\t"
2268 "mov %3, %%"REG_b" \n\t"
2269 "xor %%"REG_a", %%"REG_a" \n\t" // i
2270 PREFETCH" (%%"REG_c") \n\t"
2271 PREFETCH" 32(%%"REG_c") \n\t"
2272 PREFETCH" 64(%%"REG_c") \n\t"
2276 #define FUNNY_Y_CODE \
2277 "movl (%%"REG_b"), %%esi \n\t"\
2279 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2280 "add %%"REG_S", %%"REG_c" \n\t"\
2281 "add %%"REG_a", %%"REG_D" \n\t"\
2282 "xor %%"REG_a", %%"REG_a" \n\t"\
2286 #define FUNNY_Y_CODE \
2287 "movl (%%"REG_b"), %%esi \n\t"\
2289 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2290 "add %%"REG_a", %%"REG_D" \n\t"\
2291 "xor %%"REG_a", %%"REG_a" \n\t"\
2304 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2306 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
// Fix up the right edge: replicate the last source sample (<<7 scale).
2308 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2313 //NO MMX just normal asm ...
// Scalar x86 bilinear loop, unrolled x2: 16.16 fixed-point position,
// xalpha kept in ecx, interpolated sample written as a 16-bit word.
2315 "xor %%"REG_a", %%"REG_a" \n\t" // i
2316 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2317 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2320 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2321 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2322 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2323 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2324 "shll $16, %%edi \n\t"
2325 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2326 "mov %1, %%"REG_D" \n\t"
2327 "shrl $9, %%esi \n\t"
2328 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2329 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2330 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2332 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2333 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2334 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2335 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2336 "shll $16, %%edi \n\t"
2337 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2338 "mov %1, %%"REG_D" \n\t"
2339 "shrl $9, %%esi \n\t"
2340 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2341 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2342 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2345 "add $2, %%"REG_a" \n\t"
2346 "cmp %2, %%"REG_a" \n\t"
2350 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2351 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2354 } //if MMX2 can't be used
// Portable C bilinear loop: 16.16 fixed-point source position.
2358 unsigned int xpos=0;
2359 for(i=0;i<dstWidth;i++)
2361 register unsigned int xx=xpos>>16;
2362 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2363 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
/*
 * Horizontally scale one pair of chroma lines (U and V) from srcW input
 * samples to dstWidth output samples, writing 16-bit intermediates to
 * dst (U plane) and dst+2048 (V plane — fixed 2048-element plane offset).
 * Packed-YUV and RGB/BGR inputs are first converted to planar chroma in
 * formatConvBuffer (U at offset 0, V at offset 2048).
 * NOTE(review): several preprocessor/brace lines are not visible in this
 * excerpt; comments below describe only the visible fragments.
 */
2370 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2371 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2372 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2373 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2374 int32_t *mmx2FilterPos)
/* Input-format dispatch: convert non-planar sources to planar chroma,
   then point src1/src2 into formatConvBuffer for the scaling step. */
2376 if(srcFormat==IMGFMT_YUY2)
2378 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2379 src1= formatConvBuffer;
2380 src2= formatConvBuffer+2048;
2382 else if(srcFormat==IMGFMT_UYVY)
2384 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2385 src1= formatConvBuffer;
2386 src2= formatConvBuffer+2048;
2388 else if(srcFormat==IMGFMT_BGR32)
2390 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2391 src1= formatConvBuffer;
2392 src2= formatConvBuffer+2048;
2394 else if(srcFormat==IMGFMT_BGR24)
2396 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2397 src1= formatConvBuffer;
2398 src2= formatConvBuffer+2048;
2400 else if(srcFormat==IMGFMT_BGR16)
2402 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2403 src1= formatConvBuffer;
2404 src2= formatConvBuffer+2048;
2406 else if(srcFormat==IMGFMT_BGR15)
2408 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2409 src1= formatConvBuffer;
2410 src2= formatConvBuffer+2048;
2412 else if(srcFormat==IMGFMT_RGB32)
2414 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2415 src1= formatConvBuffer;
2416 src2= formatConvBuffer+2048;
2418 else if(srcFormat==IMGFMT_RGB24)
2420 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2421 src1= formatConvBuffer;
2422 src2= formatConvBuffer+2048;
2424 else if(isGray(srcFormat))
2430 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
2431 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
/* General (filtered) path: run the generic horizontal filter once per
   chroma plane. */
2433 if(!(flags&SWS_FAST_BILINEAR))
2436 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2437 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2439 else // Fast Bilinear upscale / crap downscale
2441 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/* MMX2 fast-bilinear path: jumps into runtime-generated code
   (funnyUVCode) via the mmx2Filter/mmx2FilterPos tables. */
2447 "pxor %%mm7, %%mm7 \n\t"
2448 "mov %0, %%"REG_c" \n\t"
2449 "mov %1, %%"REG_D" \n\t"
2450 "mov %2, %%"REG_d" \n\t"
2451 "mov %3, %%"REG_b" \n\t"
2452 "xor %%"REG_a", %%"REG_a" \n\t" // i
2453 PREFETCH" (%%"REG_c") \n\t"
2454 PREFETCH" 32(%%"REG_c") \n\t"
2455 PREFETCH" 64(%%"REG_c") \n\t"
/* 64-bit variant of the chunk-advance glue between funnyUVCode calls. */
2459 #define FUNNY_UV_CODE \
2460 "movl (%%"REG_b"), %%esi \n\t"\
2462 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2463 "add %%"REG_S", %%"REG_c" \n\t"\
2464 "add %%"REG_a", %%"REG_D" \n\t"\
2465 "xor %%"REG_a", %%"REG_a" \n\t"\
/* 32-bit variant: adds the source advance directly into REG_c. */
2469 #define FUNNY_UV_CODE \
2470 "movl (%%"REG_b"), %%esi \n\t"\
2472 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2473 "add %%"REG_a", %%"REG_D" \n\t"\
2474 "xor %%"REG_a", %%"REG_a" \n\t"\
/* Second pass: same scaling for the V plane (src2); destination is
   dst+2048 uint16_t = +4096 bytes. */
2482 "xor %%"REG_a", %%"REG_a" \n\t" // i
2483 "mov %5, %%"REG_c" \n\t" // src
2484 "mov %1, %%"REG_D" \n\t" // buf1
2485 "add $4096, %%"REG_D" \n\t"
2486 PREFETCH" (%%"REG_c") \n\t"
2487 PREFETCH" 32(%%"REG_c") \n\t"
2488 PREFETCH" 64(%%"REG_c") \n\t"
2495 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2496 "m" (funnyUVCode), "m" (src2)
2497 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* Fix up the tail: positions that would read past srcW-1 are filled
   with the last source sample scaled by 128 (7-bit headroom). */
2499 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2501 // printf("%d %d %d\n", dstWidth, i, srcW);
2502 dst[i] = src1[srcW-1]*128;
2503 dst[i+2048] = src2[srcW-1]*128;
/* Plain x86 asm bilinear path (no MMX2): 16.16 fixed-point walk with
   xalpha accumulated in cx and the carry folded into the xx index. */
2510 "xor %%"REG_a", %%"REG_a" \n\t" // i
2511 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2512 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2515 "mov %0, %%"REG_S" \n\t"
2516 "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
2517 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
2518 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2519 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2520 "shll $16, %%edi \n\t"
2521 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2522 "mov %1, %%"REG_D" \n\t"
2523 "shrl $9, %%esi \n\t"
2524 "movw %%si, (%%"REG_d", %%"REG_a", 2)\n\t"
/* Same interpolation for the V plane at byte offset 4096. */
2526 "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2527 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2528 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2529 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2530 "shll $16, %%edi \n\t"
2531 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2532 "mov %1, %%"REG_D" \n\t"
2533 "shrl $9, %%esi \n\t"
2534 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2536 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2537 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2538 "add $1, %%"REG_a" \n\t"
2539 "cmp %2, %%"REG_a" \n\t"
2542 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" ((long)(xInc>>16)), "m" ((xInc&0xFFFF)),
2544 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2547 } //if MMX2 can't be used
/* Portable C fallback: 16.16 fixed-point bilinear interpolation;
   xalpha is the top 7 fractional bits, output has 7 bits of headroom. */
2551 unsigned int xpos=0;
2552 for(i=0;i<dstWidth;i++)
2554 register unsigned int xx=xpos>>16;
2555 register unsigned int xalpha=(xpos&0xFFFF)>>9;
/* NOTE(review): the xalpha^127 form and the (<<7) form are alternate
   rounding variants; selection lines are not visible in this excerpt. */
2556 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2557 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2559 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2560 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
/*
 * Main templated scaling entry point: scales one horizontal slice of the
 * source picture (srcSliceH lines starting at srcSliceY) into dst.
 * Horizontally scaled lines are kept in ring buffers (lumPixBuf /
 * chrPixBuf) so the vertical filter can reuse them across calls.
 * Returns the number of destination lines output for this slice.
 * NOTE(review): preprocessor and brace lines are missing from this
 * excerpt; comments describe only the visible fragments.
 */
2568 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2569 int srcSliceH, uint8_t* dst[], int dstStride[]){
2571 /* load a few things into local vars to make the code more readable and faster */
2572 const int srcW= c->srcW;
2573 const int dstW= c->dstW;
2574 const int dstH= c->dstH;
2575 const int chrDstW= c->chrDstW;
2576 const int chrSrcW= c->chrSrcW;
2577 const int lumXInc= c->lumXInc;
2578 const int chrXInc= c->chrXInc;
2579 const int dstFormat= c->dstFormat;
2580 const int srcFormat= c->srcFormat;
2581 const int flags= c->flags;
2582 const int canMMX2BeUsed= c->canMMX2BeUsed;
2583 int16_t *vLumFilterPos= c->vLumFilterPos;
2584 int16_t *vChrFilterPos= c->vChrFilterPos;
2585 int16_t *hLumFilterPos= c->hLumFilterPos;
2586 int16_t *hChrFilterPos= c->hChrFilterPos;
2587 int16_t *vLumFilter= c->vLumFilter;
2588 int16_t *vChrFilter= c->vChrFilter;
2589 int16_t *hLumFilter= c->hLumFilter;
2590 int16_t *hChrFilter= c->hChrFilter;
2591 int32_t *lumMmxFilter= c->lumMmxFilter;
2592 int32_t *chrMmxFilter= c->chrMmxFilter;
2593 const int vLumFilterSize= c->vLumFilterSize;
2594 const int vChrFilterSize= c->vChrFilterSize;
2595 const int hLumFilterSize= c->hLumFilterSize;
2596 const int hChrFilterSize= c->hChrFilterSize;
2597 int16_t **lumPixBuf= c->lumPixBuf;
2598 int16_t **chrPixBuf= c->chrPixBuf;
2599 const int vLumBufSize= c->vLumBufSize;
2600 const int vChrBufSize= c->vChrBufSize;
2601 uint8_t *funnyYCode= c->funnyYCode;
2602 uint8_t *funnyUVCode= c->funnyUVCode;
2603 uint8_t *formatConvBuffer= c->formatConvBuffer;
/* Chroma slice geometry: round the height up so partial chroma lines
   at the slice edge are still counted. */
2604 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2605 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2608 /* vars which will change and which we need to store back in the context */
2610 int lumBufIndex= c->lumBufIndex;
2611 int chrBufIndex= c->chrBufIndex;
2612 int lastInLumBuf= c->lastInLumBuf;
2613 int lastInChrBuf= c->lastInChrBuf;
/* Packed sources carry everything in plane 0; mirror the stride so the
   chroma code paths can index planes 1/2 uniformly. */
2615 if(isPacked(c->srcFormat)){
2621 srcStride[2]= srcStride[0];
2623 srcStride[1]<<= c->vChrDrop;
2624 srcStride[2]<<= c->vChrDrop;
2626 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2627 // (int)dst[0], (int)dst[1], (int)dst[2]);
2629 #if 0 //self test FIXME move to a vfilter or something
2631 static volatile int i=0;
2633 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2634 selfTest(src, srcStride, c->srcW, c->srcH);
2639 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2640 //dstStride[0],dstStride[1],dstStride[2]);
/* Warn (once) when output strides break 8-byte alignment assumed by
   the SIMD stores. */
2642 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2644 static int firstTime=1; //FIXME move this into the context perhaps
2645 if(flags & SWS_PRINT_INFO && firstTime)
2647 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2648 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2653 /* Note: the user might start scaling the picture in the middle, so this will not get executed;
2654 this is not really intended, but it works currently, so people might rely on it */
/* Main loop: produce one destination line per iteration until the
   slice runs out of input lines. */
2665 for(;dstY < dstH; dstY++){
2666 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2667 const int chrDstY= dstY>>c->chrDstVSubSample;
2668 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2669 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2671 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2672 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2673 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2674 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2676 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2677 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2678 //handle holes (FAST_BILINEAR & weird filters)
2679 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2680 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2681 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2682 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2683 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2685 // Do we have enough lines in this slice to output the dstY line
2686 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2688 //Do horizontal scaling
/* Horizontally scale every luma input line needed for this output line
   into the next slot of the luma ring buffer. */
2689 while(lastInLumBuf < lastLumSrcY)
2691 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2693 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2694 ASSERT(lumBufIndex < 2*vLumBufSize)
2695 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2696 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2697 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2698 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2699 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2700 funnyYCode, c->srcFormat, formatConvBuffer,
2701 c->lumMmx2Filter, c->lumMmx2FilterPos);
/* Same for the chroma ring buffer (skipped for gray formats). */
2704 while(lastInChrBuf < lastChrSrcY)
2706 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2707 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2709 ASSERT(chrBufIndex < 2*vChrBufSize)
2710 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2711 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2712 //FIXME replace parameters through context struct (some at least)
2714 if(!(isGray(srcFormat) || isGray(dstFormat)))
2715 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2716 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2717 funnyUVCode, c->srcFormat, formatConvBuffer,
2718 c->chrMmx2Filter, c->chrMmx2FilterPos);
2721 //wrap buf index around to stay inside the ring buffer
2722 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2723 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2725 else // not enough lines left in this slice -> load the rest in the buffer
2727 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2728 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2729 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2730 vChrBufSize, vLumBufSize);*/
2732 //Do horizontal scaling
/* Buffer everything the slice still has, then break and wait for the
   next slice to finish this output line. */
2733 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2735 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2737 ASSERT(lumBufIndex < 2*vLumBufSize)
2738 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2739 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2740 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2741 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2742 funnyYCode, c->srcFormat, formatConvBuffer,
2743 c->lumMmx2Filter, c->lumMmx2FilterPos);
2746 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2748 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2749 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2751 ASSERT(chrBufIndex < 2*vChrBufSize)
2752 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2753 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2755 if(!(isGray(srcFormat) || isGray(dstFormat)))
2756 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2757 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2758 funnyUVCode, c->srcFormat, formatConvBuffer,
2759 c->chrMmx2Filter, c->chrMmx2FilterPos);
2762 //wrap buf index around to stay inside the ring buffer
2763 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2764 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2765 break; //we can't output a dstY line so let's try with the next slice
/* Alternate the ordered-dither rows per output line — presumably for
   RGB15/16 output; the dither tables are defined outside this excerpt. */
2769 b5Dither= dither8[dstY&1];
2770 g6Dither= dither4[dstY&1];
2771 g5Dither= dither8[dstY&1];
2772 r5Dither= dither8[(dstY+1)&1];
/* Vertical scaling (MMX path): point lumSrcPtr/chrSrcPtr at the ring
   buffer window covering the filter's input lines. */
2776 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2777 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
/* Pack per-tap (line pointer, duplicated 16-bit coefficient) entries
   into the MMX filter arrays consumed by the asm vertical scalers. */
2780 for(i=0; i<vLumFilterSize; i++)
2782 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2783 lumMmxFilter[4*i+2]=
2784 lumMmxFilter[4*i+3]=
2785 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2787 for(i=0; i<vChrFilterSize; i++)
2789 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2790 chrMmxFilter[4*i+2]=
2791 chrMmxFilter[4*i+3]=
2792 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2795 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2797 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2798 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2799 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2801 int16_t *lumBuf = lumPixBuf[0];
2802 int16_t *chrBuf= chrPixBuf[0];
2803 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2808 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2809 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2810 dest, uDest, vDest, dstW, chrDstW);
2815 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2816 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
/* Packed-RGB output: pick the cheapest vertical scaler that matches
   the filter sizes. */
2817 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2819 int chrAlpha= vChrFilter[2*dstY+1];
2820 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2821 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2823 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2825 int lumAlpha= vLumFilter[2*dstY+1];
2826 int chrAlpha= vChrFilter[2*dstY+1];
2827 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2828 dest, dstW, lumAlpha, chrAlpha, dstY);
2832 RENAME(yuv2packedX)(c,
2833 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2834 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2839 else // hmm looks like we can't use MMX here without overwriting this array's tail
/* Non-MMX vertical scaling fallback: same dispatch without the packed
   MMX filter tables. */
2841 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2842 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2843 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2845 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2846 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2848 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2849 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2850 dest, uDest, vDest, dstW, chrDstW);
2854 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2855 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2857 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2858 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
/* Flush write-combining buffers and leave MMX state before returning
   to C code that may use the FPU. */
2865 __asm __volatile(SFENCE:::"memory");
2866 __asm __volatile(EMMS:::"memory");
2868 /* store changed local vars back in the context */
2870 c->lumBufIndex= lumBufIndex;
2871 c->chrBufIndex= chrBufIndex;
2872 c->lastInLumBuf= lastInLumBuf;
2873 c->lastInChrBuf= lastInChrBuf;
/* Number of destination lines written during this call. */
2875 return dstY - lastDstY;