2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
/* CPU-capability instruction selection: 3DNow-style prefetch/prefetchw,
   SSE prefetchnta/prefetcht0 for MMX2, or "/nop" (no prefetch) otherwise;
   SFENCE and the pavgb/pavgusb byte-average likewise depend on CPU type,
   and MOVNTQ is a non-temporal store when available, plain movq otherwise.
   The REAL_/wrapper pair forces macro expansion of the arguments.
   NOTE(review): the surrounding #ifdef/#elif/#else/#endif lines are not
   visible in this chunk — confirm against the full file. */
35 #define PREFETCH "prefetch"
36 #define PREFETCHW "prefetchw"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
46 #define SFENCE "sfence"
52 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53 #elif defined (HAVE_3DNOW)
54 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
58 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
60 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
62 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
65 #include "swscale_altivec_template.c"
/* Vertical scale to planar 8-bit: walks the coefficient/pointer list at
   " offset "(%0), multiply-accumulating 16-bit source samples (pmulhw/paddw
   into mm3/mm4, seeded with the rounder), then >>3 and packs to bytes.
   %0 = filter context, %1 = dest, %2 = width. A NULL source pointer (the
   "test" on REG_S) terminates the coefficient list.
   NOTE(review): the inner-loop labels and jnz/jb branch lines are missing
   from this chunk of the file. */
68 #define YSCALEYUV2YV12X(x, offset) \
69 "xor %%"REG_a", %%"REG_a" \n\t"\
70 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71 "movq %%mm3, %%mm4 \n\t"\
72 "lea " offset "(%0), %%"REG_d" \n\t"\
73 "mov (%%"REG_d"), %%"REG_S" \n\t"\
74 ".balign 16 \n\t" /* FIXME Unroll? */\
76 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
77 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79 "add $16, %%"REG_d" \n\t"\
80 "mov (%%"REG_d"), %%"REG_S" \n\t"\
81 "test %%"REG_S", %%"REG_S" \n\t"\
82 "pmulhw %%mm0, %%mm2 \n\t"\
83 "pmulhw %%mm0, %%mm5 \n\t"\
84 "paddw %%mm2, %%mm3 \n\t"\
85 "paddw %%mm5, %%mm4 \n\t"\
87 "psraw $3, %%mm3 \n\t"\
88 "psraw $3, %%mm4 \n\t"\
89 "packuswb %%mm4, %%mm3 \n\t"\
90 MOVNTQ(%%mm3, (%1, %%REGa))\
91 "add $8, %%"REG_a" \n\t"\
92 "cmp %2, %%"REG_a" \n\t"\
93 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94 "movq %%mm3, %%mm4 \n\t"\
95 "lea " offset "(%0), %%"REG_d" \n\t"\
96 "mov (%%"REG_d"), %%"REG_S" \n\t"\
/* Unfiltered (1:1) vertical pass: >>7 each 16-bit sample of one source
   line and pack to 8-bit output.  %0 = src, %1 = dest; REG_a is loaded
   from %2 and counts upward (presumably a negative offset — confirm
   against the callers, which are not fully visible here). */
99 #define YSCALEYUV2YV121 \
100 "mov %2, %%"REG_a" \n\t"\
101 ".balign 16 \n\t" /* FIXME Unroll? */\
103 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
105 "psraw $7, %%mm0 \n\t"\
106 "psraw $7, %%mm1 \n\t"\
107 "packuswb %%mm1, %%mm0 \n\t"\
108 MOVNTQ(%%mm0, (%1, %%REGa))\
109 "add $8, %%"REG_a" \n\t"\
113 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
114 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115 "r" (dest), "m" (dstW),
116 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* Vertical scale for packed output: first accumulates chroma through the
   CHR coefficient list (U at offset 0, V at +4096 bytes -> mm3/mm4), then
   two groups of luma through the LUM list (-> mm1/mm7), each seeded with
   the rounder.  A NULL source pointer ends each list.
   NOTE(review): loop labels and branches are missing from this chunk. */
119 #define YSCALEYUV2PACKEDX \
120 "xor %%"REG_a", %%"REG_a" \n\t"\
124 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125 "mov (%%"REG_d"), %%"REG_S" \n\t"\
126 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127 "movq %%mm3, %%mm4 \n\t"\
130 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
131 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
132 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133 "add $16, %%"REG_d" \n\t"\
134 "mov (%%"REG_d"), %%"REG_S" \n\t"\
135 "pmulhw %%mm0, %%mm2 \n\t"\
136 "pmulhw %%mm0, %%mm5 \n\t"\
137 "paddw %%mm2, %%mm3 \n\t"\
138 "paddw %%mm5, %%mm4 \n\t"\
139 "test %%"REG_S", %%"REG_S" \n\t"\
142 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143 "mov (%%"REG_d"), %%"REG_S" \n\t"\
144 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145 "movq %%mm1, %%mm7 \n\t"\
148 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
149 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
150 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151 "add $16, %%"REG_d" \n\t"\
152 "mov (%%"REG_d"), %%"REG_S" \n\t"\
153 "pmulhw %%mm0, %%mm2 \n\t"\
154 "pmulhw %%mm0, %%mm5 \n\t"\
155 "paddw %%mm2, %%mm1 \n\t"\
156 "paddw %%mm5, %%mm7 \n\t"\
157 "test %%"REG_S", %%"REG_S" \n\t"\
/* YUV->RGB for the X (multi-tap) path: takes Y1/Y2 in mm1/mm7 and U/V in
   mm3/mm4 (as left by YSCALEYUV2PACKEDX), applies the per-context offset
   and coefficient table at (%0), interleaves per-pixel, and leaves packed
   bytes mm2=B, mm4=G, mm5=R (plus mm0/mm6/mm3 halves) with mm7 zeroed. */
161 #define YSCALEYUV2RGBX \
163 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
164 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
165 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
166 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
167 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
168 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
169 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
170 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
171 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
172 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
173 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
174 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
175 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
176 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177 "paddw %%mm3, %%mm4 \n\t"\
178 "movq %%mm2, %%mm0 \n\t"\
179 "movq %%mm5, %%mm6 \n\t"\
180 "movq %%mm4, %%mm3 \n\t"\
181 "punpcklwd %%mm2, %%mm2 \n\t"\
182 "punpcklwd %%mm5, %%mm5 \n\t"\
183 "punpcklwd %%mm4, %%mm4 \n\t"\
184 "paddw %%mm1, %%mm2 \n\t"\
185 "paddw %%mm1, %%mm5 \n\t"\
186 "paddw %%mm1, %%mm4 \n\t"\
187 "punpckhwd %%mm0, %%mm0 \n\t"\
188 "punpckhwd %%mm6, %%mm6 \n\t"\
189 "punpckhwd %%mm3, %%mm3 \n\t"\
190 "paddw %%mm7, %%mm0 \n\t"\
191 "paddw %%mm7, %%mm6 \n\t"\
192 "paddw %%mm7, %%mm3 \n\t"\
193 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194 "packuswb %%mm0, %%mm2 \n\t"\
195 "packuswb %%mm6, %%mm5 \n\t"\
196 "packuswb %%mm3, %%mm4 \n\t"\
197 "pxor %%mm7, %%mm7 \n\t"
/* Full-chroma bilinear path: interpolates between two luma lines (%0,%1)
   and two chroma lines (%2,%3; V plane at +4096 bytes) with the weights
   in %6 (yalpha1) and %7 (uvalpha1), then converts to RGB using globally
   mangled constants (w80/w400, yCoeff/ubCoeff/...).  Ends with packed
   bytes: mm3=B, mm0=R, mm1=G. */
199 #define FULL_YSCALEYUV2RGB \
200 "pxor %%mm7, %%mm7 \n\t"\
201 "movd %6, %%mm6 \n\t" /*yalpha1*/\
202 "punpcklwd %%mm6, %%mm6 \n\t"\
203 "punpcklwd %%mm6, %%mm6 \n\t"\
204 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
205 "punpcklwd %%mm5, %%mm5 \n\t"\
206 "punpcklwd %%mm5, %%mm5 \n\t"\
207 "xor %%"REG_a", %%"REG_a" \n\t"\
210 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
213 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
214 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
215 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
219 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
220 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
222 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
223 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
225 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
226 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
227 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
230 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
232 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
233 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
234 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
235 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
236 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
239 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
240 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
242 "paddw %%mm1, %%mm3 \n\t" /* B*/\
243 "paddw %%mm1, %%mm0 \n\t" /* R*/\
244 "packuswb %%mm3, %%mm3 \n\t"\
246 "packuswb %%mm0, %%mm0 \n\t"\
247 "paddw %%mm4, %%mm2 \n\t"\
248 "paddw %%mm2, %%mm1 \n\t" /* G*/\
250 "packuswb %%mm1, %%mm1 \n\t"
/* Two-line bilinear interpolation WITHOUT RGB conversion (for YUY2-style
   output): pre-shifts the stored filter weights by 3, interpolates chroma
   into mm3/mm4 and luma into mm1/mm7 with a >>7 scale.  "c" is the context
   pointer holding the CHR/LUM filter slots; "index" is the loop counter. */
253 #define REAL_YSCALEYUV2PACKED(index, c) \
254 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256 "psraw $3, %%mm0 \n\t"\
257 "psraw $3, %%mm1 \n\t"\
258 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
260 "xor "#index", "#index" \n\t"\
263 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
264 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
265 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
267 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
269 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
270 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
273 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
274 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
275 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
276 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
277 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
278 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
280 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
281 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
282 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
284 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
285 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
286 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
289 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* Two-line bilinear interpolation plus YUV->RGB using the per-context
   offset/coefficient table at ("c").  Inputs: buf0/buf1 luma via %0/%1,
   uvbuf0/uvbuf1 chroma via %2/%3 (V at +4096 bytes).  Ends with packed
   bytes mm2=B, mm4=G, mm5=R halves and mm7 = 0 (same layout the WRITE*
   macros below consume). */
291 #define REAL_YSCALEYUV2RGB(index, c) \
292 "xor "#index", "#index" \n\t"\
295 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
296 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
297 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
299 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
301 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
302 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
307 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
308 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
309 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
310 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
311 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
312 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
314 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
316 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
317 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
319 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
320 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
321 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
323 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
327 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
330 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
331 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
332 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
333 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334 "paddw %%mm3, %%mm4 \n\t"\
335 "movq %%mm2, %%mm0 \n\t"\
336 "movq %%mm5, %%mm6 \n\t"\
337 "movq %%mm4, %%mm3 \n\t"\
338 "punpcklwd %%mm2, %%mm2 \n\t"\
339 "punpcklwd %%mm5, %%mm5 \n\t"\
340 "punpcklwd %%mm4, %%mm4 \n\t"\
341 "paddw %%mm1, %%mm2 \n\t"\
342 "paddw %%mm1, %%mm5 \n\t"\
343 "paddw %%mm1, %%mm4 \n\t"\
344 "punpckhwd %%mm0, %%mm0 \n\t"\
345 "punpckhwd %%mm6, %%mm6 \n\t"\
346 "punpckhwd %%mm3, %%mm3 \n\t"\
347 "paddw %%mm7, %%mm0 \n\t"\
348 "paddw %%mm7, %%mm6 \n\t"\
349 "paddw %%mm7, %%mm3 \n\t"\
350 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351 "packuswb %%mm0, %%mm2 \n\t"\
352 "packuswb %%mm6, %%mm5 \n\t"\
353 "packuswb %%mm3, %%mm4 \n\t"\
354 "pxor %%mm7, %%mm7 \n\t"
355 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/* Single-source-line variant (no vertical interpolation), packed output:
   just >>7 the luma (buf0 via %0) and chroma (uvbuf0 via %2, V at +4096)
   into mm1/mm7 and mm3/mm4. */
357 #define REAL_YSCALEYUV2PACKED1(index, c) \
358 "xor "#index", "#index" \n\t"\
361 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
362 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
363 "psraw $7, %%mm3 \n\t" \
364 "psraw $7, %%mm4 \n\t" \
365 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
366 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
367 "psraw $7, %%mm1 \n\t" \
368 "psraw $7, %%mm7 \n\t" \
370 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* Single-source-line variant with YUV->RGB conversion (>>4 scale, then
   the same offset/coefficient pipeline as REAL_YSCALEYUV2RGB).  Leaves
   packed mm2=B, mm4=G, mm5=R and mm7 = 0. */
372 #define REAL_YSCALEYUV2RGB1(index, c) \
373 "xor "#index", "#index" \n\t"\
376 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
377 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
378 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
380 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
381 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
382 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
383 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
384 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
386 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
387 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
388 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
389 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
394 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
395 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
396 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
397 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398 "paddw %%mm3, %%mm4 \n\t"\
399 "movq %%mm2, %%mm0 \n\t"\
400 "movq %%mm5, %%mm6 \n\t"\
401 "movq %%mm4, %%mm3 \n\t"\
402 "punpcklwd %%mm2, %%mm2 \n\t"\
403 "punpcklwd %%mm5, %%mm5 \n\t"\
404 "punpcklwd %%mm4, %%mm4 \n\t"\
405 "paddw %%mm1, %%mm2 \n\t"\
406 "paddw %%mm1, %%mm5 \n\t"\
407 "paddw %%mm1, %%mm4 \n\t"\
408 "punpckhwd %%mm0, %%mm0 \n\t"\
409 "punpckhwd %%mm6, %%mm6 \n\t"\
410 "punpckhwd %%mm3, %%mm3 \n\t"\
411 "paddw %%mm7, %%mm0 \n\t"\
412 "paddw %%mm7, %%mm6 \n\t"\
413 "paddw %%mm7, %%mm3 \n\t"\
414 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415 "packuswb %%mm0, %%mm2 \n\t"\
416 "packuswb %%mm6, %%mm5 \n\t"\
417 "packuswb %%mm3, %%mm4 \n\t"\
418 "pxor %%mm7, %%mm7 \n\t"
419 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* "1b" variant for packed output: averages the two chroma buffers
   (paddw then psrlw $8) instead of weighting them, luma still >>7 from
   a single line.  Used when chroma needs vertical interpolation but
   luma does not. */
421 #define REAL_YSCALEYUV2PACKED1b(index, c) \
422 "xor "#index", "#index" \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431 "psrlw $8, %%mm3 \n\t" \
432 "psrlw $8, %%mm4 \n\t" \
433 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
434 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
435 "psraw $7, %%mm1 \n\t" \
436 "psraw $7, %%mm7 \n\t"
437 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
439 // do vertical chrominance interpolation
/* "1b" RGB variant: chroma is the sum of uvbuf0+uvbuf1 >>5 (average at
   the >>4 scale; marked FIXME as potentially overflowing), luma from a
   single line >>4, then the standard per-context RGB pipeline.  Leaves
   packed mm2=B, mm4=G, mm5=R and mm7 = 0. */
440 #define REAL_YSCALEYUV2RGB1b(index, c) \
441 "xor "#index", "#index" \n\t"\
444 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
445 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
446 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
448 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
450 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
451 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
452 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
453 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
454 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
455 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
456 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
458 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
459 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
460 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
461 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
463 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
466 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
467 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
468 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
469 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470 "paddw %%mm3, %%mm4 \n\t"\
471 "movq %%mm2, %%mm0 \n\t"\
472 "movq %%mm5, %%mm6 \n\t"\
473 "movq %%mm4, %%mm3 \n\t"\
474 "punpcklwd %%mm2, %%mm2 \n\t"\
475 "punpcklwd %%mm5, %%mm5 \n\t"\
476 "punpcklwd %%mm4, %%mm4 \n\t"\
477 "paddw %%mm1, %%mm2 \n\t"\
478 "paddw %%mm1, %%mm5 \n\t"\
479 "paddw %%mm1, %%mm4 \n\t"\
480 "punpckhwd %%mm0, %%mm0 \n\t"\
481 "punpckhwd %%mm6, %%mm6 \n\t"\
482 "punpckhwd %%mm3, %%mm3 \n\t"\
483 "paddw %%mm7, %%mm0 \n\t"\
484 "paddw %%mm7, %%mm6 \n\t"\
485 "paddw %%mm7, %%mm3 \n\t"\
486 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487 "packuswb %%mm0, %%mm2 \n\t"\
488 "packuswb %%mm6, %%mm5 \n\t"\
489 "packuswb %%mm3, %%mm4 \n\t"\
490 "pxor %%mm7, %%mm7 \n\t"
491 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* Store 8 pixels as 32-bit BGR0: interleave the packed B/G/R bytes
   (mm2/mm4/mm5, mm7=0) into four 0RGB quadwords and emit them with four
   MOVNTQs; advances "index" by 8 and compares against "dstw" (the loop
   branch line is not visible in this chunk). */
493 #define REAL_WRITEBGR32(dst, dstw, index) \
494 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495 "movq %%mm2, %%mm1 \n\t" /* B */\
496 "movq %%mm5, %%mm6 \n\t" /* R */\
497 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
498 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
499 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
500 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
501 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
502 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
503 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
504 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
505 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
506 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
508 MOVNTQ(%%mm0, (dst, index, 4))\
509 MOVNTQ(%%mm2, 8(dst, index, 4))\
510 MOVNTQ(%%mm1, 16(dst, index, 4))\
511 MOVNTQ(%%mm3, 24(dst, index, 4))\
513 "add $8, "#index" \n\t"\
514 "cmp "#dstw", "#index" \n\t"\
516 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/* Store 8 pixels as RGB565: mask B/R to 5 bits and G to 6 (bF8/bFC),
   shift into position, OR together, and emit two quadwords of 16-bit
   pixels. */
518 #define REAL_WRITEBGR16(dst, dstw, index) \
519 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
520 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
521 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
522 "psrlq $3, %%mm2 \n\t"\
524 "movq %%mm2, %%mm1 \n\t"\
525 "movq %%mm4, %%mm3 \n\t"\
527 "punpcklbw %%mm7, %%mm3 \n\t"\
528 "punpcklbw %%mm5, %%mm2 \n\t"\
529 "punpckhbw %%mm7, %%mm4 \n\t"\
530 "punpckhbw %%mm5, %%mm1 \n\t"\
532 "psllq $3, %%mm3 \n\t"\
533 "psllq $3, %%mm4 \n\t"\
535 "por %%mm3, %%mm2 \n\t"\
536 "por %%mm4, %%mm1 \n\t"\
538 MOVNTQ(%%mm2, (dst, index, 2))\
539 MOVNTQ(%%mm1, 8(dst, index, 2))\
541 "add $8, "#index" \n\t"\
542 "cmp "#dstw", "#index" \n\t"\
544 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
/* Store 8 pixels as RGB555: all three channels masked to 5 bits (bF8),
   R additionally >>1 to drop into the 15-bit layout, then merged and
   emitted as two quadwords of 16-bit pixels. */
546 #define REAL_WRITEBGR15(dst, dstw, index) \
547 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
548 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
549 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
550 "psrlq $3, %%mm2 \n\t"\
551 "psrlq $1, %%mm5 \n\t"\
553 "movq %%mm2, %%mm1 \n\t"\
554 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklbw %%mm7, %%mm3 \n\t"\
557 "punpcklbw %%mm5, %%mm2 \n\t"\
558 "punpckhbw %%mm7, %%mm4 \n\t"\
559 "punpckhbw %%mm5, %%mm1 \n\t"\
561 "psllq $2, %%mm3 \n\t"\
562 "psllq $2, %%mm4 \n\t"\
564 "por %%mm3, %%mm2 \n\t"\
565 "por %%mm4, %%mm1 \n\t"\
567 MOVNTQ(%%mm2, (dst, index, 2))\
568 MOVNTQ(%%mm1, 8(dst, index, 2))\
570 "add $8, "#index" \n\t"\
571 "cmp "#dstw", "#index" \n\t"\
573 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
/* Legacy 24bpp packer: builds four 0RGB quadwords, then squeezes out the
   padding bytes with shift/mask/or sequences (bm* byte masks) into three
   24-byte-aligned quadword stores.  Superseded by the MMX/MMX2 variants
   below but kept for reference. */
575 #define WRITEBGR24OLD(dst, dstw, index) \
576 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577 "movq %%mm2, %%mm1 \n\t" /* B */\
578 "movq %%mm5, %%mm6 \n\t" /* R */\
579 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
580 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
581 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
582 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
583 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
584 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
585 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
586 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
587 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
588 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
590 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
591 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
592 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
594 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
595 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
596 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
597 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
599 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
600 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
601 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
602 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
603 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
604 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
605 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
606 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
608 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
609 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
610 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
611 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
613 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
614 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
615 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
616 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
618 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
619 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
620 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
622 MOVNTQ(%%mm0, (dst))\
623 MOVNTQ(%%mm2, 8(dst))\
624 MOVNTQ(%%mm3, 16(dst))\
625 "add $24, "#dst" \n\t"\
627 "add $8, "#index" \n\t"\
628 "cmp "#dstw", "#index" \n\t"\
/* 24bpp packer using plain MMX: interleave to four 0RGB quadwords, shift
   each to 0RGBRGB0 form via psllq/punpckhdq, then fold the three padding
   bytes away across three quadword stores (24 output bytes per 8 pixels). */
631 #define WRITEBGR24MMX(dst, dstw, index) \
632 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633 "movq %%mm2, %%mm1 \n\t" /* B */\
634 "movq %%mm5, %%mm6 \n\t" /* R */\
635 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
636 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
637 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
638 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
639 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
640 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
641 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
642 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
643 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
644 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
646 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
647 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
648 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
649 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
651 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
652 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
653 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
654 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
656 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
657 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
658 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
659 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
661 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
662 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
663 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
664 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
665 MOVNTQ(%%mm0, (dst))\
667 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
668 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
669 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
670 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
671 MOVNTQ(%%mm6, 8(dst))\
673 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
674 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
675 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
676 MOVNTQ(%%mm5, 16(dst))\
678 "add $24, "#dst" \n\t"\
680 "add $8, "#index" \n\t"\
681 "cmp "#dstw", "#index" \n\t"\
/* 24bpp packer using MMX2 pshufw: replicates channel bytes with pshufw,
   masks them into place with the M24A/M24B/M24C byte patterns, and ORs
   three output quadwords together — fewer shifts than the MMX version. */
684 #define WRITEBGR24MMX2(dst, dstw, index) \
685 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686 "movq "MANGLE(M24A)", %%mm0 \n\t"\
687 "movq "MANGLE(M24C)", %%mm7 \n\t"\
688 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
689 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
690 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
692 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
693 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
694 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
696 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
697 "por %%mm1, %%mm6 \n\t"\
698 "por %%mm3, %%mm6 \n\t"\
699 MOVNTQ(%%mm6, (dst))\
701 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
702 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
703 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
704 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
706 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
707 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
708 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
710 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
711 "por %%mm3, %%mm6 \n\t"\
712 MOVNTQ(%%mm6, 8(dst))\
714 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
715 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
716 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
718 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
719 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
720 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
722 "por %%mm1, %%mm3 \n\t"\
723 "por %%mm3, %%mm6 \n\t"\
724 MOVNTQ(%%mm6, 16(dst))\
726 "add $24, "#dst" \n\t"\
728 "add $8, "#index" \n\t"\
729 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24 dispatches to the MMX2 (pshufw) or plain MMX packer.
   NOTE(review): the #ifdef HAVE_MMX2 / #else / #endif lines that select
   between these two definitions are not visible in this chunk. */
734 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
737 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/* Pack to YUY2: Y from mm1/mm7, U/V from mm3/mm4; interleaves bytes into
   the Y U Y V ordering and emits two quadwords (8 pixels) per iteration. */
740 #define REAL_WRITEYUY2(dst, dstw, index) \
741 "packuswb %%mm3, %%mm3 \n\t"\
742 "packuswb %%mm4, %%mm4 \n\t"\
743 "packuswb %%mm7, %%mm1 \n\t"\
744 "punpcklbw %%mm4, %%mm3 \n\t"\
745 "movq %%mm1, %%mm7 \n\t"\
746 "punpcklbw %%mm3, %%mm1 \n\t"\
747 "punpckhbw %%mm3, %%mm7 \n\t"\
749 MOVNTQ(%%mm1, (dst, index, 2))\
750 MOVNTQ(%%mm7, 8(dst, index, 2))\
752 "add $8, "#index" \n\t"\
753 "cmp "#dstw", "#index" \n\t"\
755 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
758 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
759 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
760 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
766 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
767 :: "r" (&c->redDither),
768 "r" (uDest), "m" ((long)chrDstW)
769 : "%"REG_a, "%"REG_d, "%"REG_S
773 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
774 :: "r" (&c->redDither),
775 "r" (vDest), "m" ((long)chrDstW)
776 : "%"REG_a, "%"REG_d, "%"REG_S
781 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
782 :: "r" (&c->redDither),
783 "r" (dest), "m" ((long)dstW)
784 : "%"REG_a, "%"REG_d, "%"REG_S
788 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
789 chrFilter, chrSrc, chrFilterSize,
790 dest, uDest, vDest, dstW, chrDstW);
792 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
793 chrFilter, chrSrc, chrFilterSize,
794 dest, uDest, vDest, dstW, chrDstW);
795 #endif //!HAVE_ALTIVEC
799 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
800 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
807 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
814 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
822 :: "r" (lumSrc + dstW), "r" (dest + dstW),
828 for(i=0; i<dstW; i++)
830 int val= lumSrc[i]>>7;
841 for(i=0; i<chrDstW; i++)
844 int v=chrSrc[i + 2048]>>7;
848 else if (u>255) u=255;
850 else if (v>255) v=255;
861 * vertical scale YV12 to RGB
863 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
864 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
865 uint8_t *dest, int dstW, int dstY)
875 WRITEBGR32(%4, %5, %%REGa)
877 :: "r" (&c->redDither),
878 "m" (dummy), "m" (dummy), "m" (dummy),
879 "r" (dest), "m" (dstW)
880 : "%"REG_a, "%"REG_d, "%"REG_S
888 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
889 "add %4, %%"REG_b" \n\t"
890 WRITEBGR24(%%REGb, %5, %%REGa)
892 :: "r" (&c->redDither),
893 "m" (dummy), "m" (dummy), "m" (dummy),
894 "r" (dest), "m" (dstW)
895 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
903 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
905 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
906 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
907 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
910 WRITEBGR15(%4, %5, %%REGa)
912 :: "r" (&c->redDither),
913 "m" (dummy), "m" (dummy), "m" (dummy),
914 "r" (dest), "m" (dstW)
915 : "%"REG_a, "%"REG_d, "%"REG_S
923 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
925 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
926 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
927 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
930 WRITEBGR16(%4, %5, %%REGa)
932 :: "r" (&c->redDither),
933 "m" (dummy), "m" (dummy), "m" (dummy),
934 "r" (dest), "m" (dstW)
935 : "%"REG_a, "%"REG_d, "%"REG_S
943 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
945 "psraw $3, %%mm3 \n\t"
946 "psraw $3, %%mm4 \n\t"
947 "psraw $3, %%mm1 \n\t"
948 "psraw $3, %%mm7 \n\t"
949 WRITEYUY2(%4, %5, %%REGa)
951 :: "r" (&c->redDither),
952 "m" (dummy), "m" (dummy), "m" (dummy),
953 "r" (dest), "m" (dstW)
954 : "%"REG_a, "%"REG_d, "%"REG_S
961 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
962 chrFilter, chrSrc, chrFilterSize,
965 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
966 chrFilter, chrSrc, chrFilterSize,
974 * vertical bilinear scale YV12 to RGB
976 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
977 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
979 int yalpha1=yalpha^4095;
980 int uvalpha1=uvalpha^4095;
984 if(flags&SWS_FULL_CHR_H_INT)
994 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
995 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
997 "movq %%mm3, %%mm1 \n\t"
998 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
999 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1001 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1002 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1004 "add $4, %%"REG_a" \n\t"
1005 "cmp %5, %%"REG_a" \n\t"
1009 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1010 "m" (yalpha1), "m" (uvalpha1)
1020 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1021 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1023 "movq %%mm3, %%mm1 \n\t"
1024 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1025 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1027 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1028 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1029 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1030 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1031 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1032 "movq %%mm1, %%mm2 \n\t"
1033 "psllq $48, %%mm1 \n\t" // 000000BG
1034 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1036 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1037 "psrld $16, %%mm2 \n\t" // R000R000
1038 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1039 "por %%mm2, %%mm1 \n\t" // RBGRR000
1041 "mov %4, %%"REG_b" \n\t"
1042 "add %%"REG_a", %%"REG_b" \n\t"
1046 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1047 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1049 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1050 "psrlq $32, %%mm3 \n\t"
1051 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1052 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1054 "add $4, %%"REG_a" \n\t"
1055 "cmp %5, %%"REG_a" \n\t"
1058 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1059 "m" (yalpha1), "m" (uvalpha1)
1060 : "%"REG_a, "%"REG_b
1068 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1069 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1070 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1072 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1073 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1074 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1076 "psrlw $3, %%mm3 \n\t"
1077 "psllw $2, %%mm1 \n\t"
1078 "psllw $7, %%mm0 \n\t"
1079 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1080 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1082 "por %%mm3, %%mm1 \n\t"
1083 "por %%mm1, %%mm0 \n\t"
1085 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1087 "add $4, %%"REG_a" \n\t"
1088 "cmp %5, %%"REG_a" \n\t"
1091 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1092 "m" (yalpha1), "m" (uvalpha1)
1101 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1102 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1103 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1105 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1106 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1107 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1109 "psrlw $3, %%mm3 \n\t"
1110 "psllw $3, %%mm1 \n\t"
1111 "psllw $8, %%mm0 \n\t"
1112 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1113 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1115 "por %%mm3, %%mm1 \n\t"
1116 "por %%mm1, %%mm0 \n\t"
1118 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1120 "add $4, %%"REG_a" \n\t"
1121 "cmp %5, %%"REG_a" \n\t"
1124 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1125 "m" (yalpha1), "m" (uvalpha1)
1134 if(dstFormat==IMGFMT_BGR32)
1137 #ifdef WORDS_BIGENDIAN
1140 for(i=0;i<dstW;i++){
1141 // vertical linear interpolation && yuv2rgb in a single step:
1142 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1143 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1144 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1145 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1146 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1147 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1151 else if(dstFormat==IMGFMT_BGR24)
1154 for(i=0;i<dstW;i++){
1155 // vertical linear interpolation && yuv2rgb in a single step:
1156 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1157 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1158 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1159 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1160 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1161 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1165 else if(dstFormat==IMGFMT_BGR16)
1168 for(i=0;i<dstW;i++){
1169 // vertical linear interpolation && yuv2rgb in a single step:
1170 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1171 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1172 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1174 ((uint16_t*)dest)[i] =
1175 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1176 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1177 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1180 else if(dstFormat==IMGFMT_BGR15)
1183 for(i=0;i<dstW;i++){
1184 // vertical linear interpolation && yuv2rgb in a single step:
1185 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1186 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1187 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1189 ((uint16_t*)dest)[i] =
1190 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1191 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1192 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1200 switch(c->dstFormat)
1202 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1205 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1206 "mov %4, %%"REG_SP" \n\t"
1207 YSCALEYUV2RGB(%%REGa, %5)
1208 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1209 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1211 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1218 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1219 "mov %4, %%"REG_SP" \n\t"
1220 YSCALEYUV2RGB(%%REGa, %5)
1221 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1222 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1223 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1230 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1231 "mov %4, %%"REG_SP" \n\t"
1232 YSCALEYUV2RGB(%%REGa, %5)
1233 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1235 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1236 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1237 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1240 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1241 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1243 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1250 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1251 "mov %4, %%"REG_SP" \n\t"
1252 YSCALEYUV2RGB(%%REGa, %5)
1253 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1255 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1256 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1257 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1260 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1261 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1262 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1269 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1270 "mov %4, %%"REG_SP" \n\t"
1271 YSCALEYUV2PACKED(%%REGa, %5)
1272 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1273 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1274 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1282 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1286 * YV12 to RGB without scaling or interpolating
// yuv2packed1: converts a single luma line (buf0) plus one or two chroma
// lines to the packed destination format — the no-vertical-scaling case.
// With uvalpha < 2048 only uvbuf0 is used (YSCALEYUV2*1 variants);
// otherwise the two chroma lines are averaged (YSCALEYUV2*1b variants).
// NOTE(review): sampled listing — original lines are missing between the
// numbered lines below.
1288 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1289 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1291 const int yalpha1=0;
1294 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1295 const int yalpha= 4096; //FIXME ...
// Full-chroma requests are delegated to the two-line blender with a zero
// luma blend weight (buf0 passed twice).
1297 if(flags&SWS_FULL_CHR_H_INT)
1299 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1304 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
// Single-chroma-line asm variants; each parks REG_SP in the context
// (ESP_OFFSET) so it can be reused, then restores it.
1310 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1311 "mov %4, %%"REG_SP" \n\t"
1312 YSCALEYUV2RGB1(%%REGa, %5)
1313 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1314 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1316 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1323 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1324 "mov %4, %%"REG_SP" \n\t"
1325 YSCALEYUV2RGB1(%%REGa, %5)
1326 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1327 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1329 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1336 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1337 "mov %4, %%"REG_SP" \n\t"
1338 YSCALEYUV2RGB1(%%REGa, %5)
1339 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1341 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1342 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1343 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1345 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1346 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1348 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1355 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1356 "mov %4, %%"REG_SP" \n\t"
1357 YSCALEYUV2RGB1(%%REGa, %5)
1358 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1360 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1361 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1362 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1365 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1366 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1368 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1375 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1376 "mov %4, %%"REG_SP" \n\t"
1377 YSCALEYUV2PACKED1(%%REGa, %5)
1378 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1379 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1381 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
// Averaged-chroma (uvalpha >= 2048) asm variants: same structure, using
// the *1b macros which average uvbuf0 and uvbuf1.
1394 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1395 "mov %4, %%"REG_SP" \n\t"
1396 YSCALEYUV2RGB1b(%%REGa, %5)
1397 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1398 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1400 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1407 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1408 "mov %4, %%"REG_SP" \n\t"
1409 YSCALEYUV2RGB1b(%%REGa, %5)
1410 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1411 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1413 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1420 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1421 "mov %4, %%"REG_SP" \n\t"
1422 YSCALEYUV2RGB1b(%%REGa, %5)
1423 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1425 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1426 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1427 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1429 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1430 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1432 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1439 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1440 "mov %4, %%"REG_SP" \n\t"
1441 YSCALEYUV2RGB1b(%%REGa, %5)
1442 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1444 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1445 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1446 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1449 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1450 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1452 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1459 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1460 "mov %4, %%"REG_SP" \n\t"
1461 YSCALEYUV2PACKED1b(%%REGa, %5)
1462 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1463 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1465 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
// C fallback: pick the single-line or averaged-chroma macro variant.
1473 if( uvalpha < 2048 )
1475 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1477 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1481 //FIXME yuy2* can read up to 7 samples too much
// yuy2ToY: extracts the luma bytes (even bytes, Y0 U Y1 V ...) from one
// YUY2 line into a planar 8-bit luma line. MMX path masks with
// bm01010101 and repacks; the loop runs a negative index up to zero.
// NOTE(review): sampled listing — loop labels/braces are not visible here.
1483 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1487 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1488 "mov %0, %%"REG_a" \n\t"
// Process 8 output pixels (16 input bytes) per iteration.
1490 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1491 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1492 "pand %%mm2, %%mm0 \n\t"
1493 "pand %%mm2, %%mm1 \n\t"
1494 "packuswb %%mm1, %%mm0 \n\t"
1495 "movq %%mm0, (%2, %%"REG_a") \n\t"
1496 "add $8, %%"REG_a" \n\t"
// Pointers are passed pre-advanced to the line end; index counts up from -width.
1498 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
// Plain C fallback.
1503 for(i=0; i<width; i++)
// yuy2ToUV: extracts U and V from two YUY2 lines (src1/src2), averaging
// the two lines vertically, into planar dstU/dstV.
// NOTE(review): sampled listing — the averaging instructions between the
// loads and the psrlw sequence are not visible in this extract.
1508 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1510 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1512 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1513 "mov %0, %%"REG_a" \n\t"
// Load 4 UV pairs (16 bytes) from each source line.
1515 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1516 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1517 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1518 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
// Drop the luma bytes, then split the remaining UVUV... stream into
// separate U (masked) and V (shifted) nibbles of the register.
1521 "psrlw $8, %%mm0 \n\t"
1522 "psrlw $8, %%mm1 \n\t"
1523 "packuswb %%mm1, %%mm0 \n\t"
1524 "movq %%mm0, %%mm1 \n\t"
1525 "psrlw $8, %%mm0 \n\t"
1526 "pand %%mm4, %%mm1 \n\t"
1527 "packuswb %%mm0, %%mm0 \n\t"
1528 "packuswb %%mm1, %%mm1 \n\t"
1529 "movd %%mm0, (%4, %%"REG_a") \n\t"
1530 "movd %%mm1, (%3, %%"REG_a") \n\t"
1531 "add $4, %%"REG_a" \n\t"
1533 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
// C fallback: average U/V of the two lines; U at byte 1, V at byte 3.
1538 for(i=0; i<width; i++)
1540 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1541 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1546 //this is almost identical to the previous function, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
// uyvyToY: extracts luma (odd bytes, U Y0 V Y1 ...) from one UYVY line
// into a planar luma line; same structure as yuy2ToY but shifts instead
// of masking because Y sits in the high byte of each word.
1547 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
1551 "mov %0, %%"REG_a" \n\t"
// 8 output pixels (16 input bytes) per iteration.
1553 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1554 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1555 "psrlw $8, %%mm0 \n\t"
1556 "psrlw $8, %%mm1 \n\t"
1557 "packuswb %%mm1, %%mm0 \n\t"
1558 "movq %%mm0, (%2, %%"REG_a") \n\t"
1559 "add $8, %%"REG_a" \n\t"
1561 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
// Plain C fallback.
1566 for(i=0; i<width; i++)
// uyvyToUV: extracts U and V from two UYVY lines (vertically averaged)
// into planar dstU/dstV; mirror of yuy2ToUV with chroma in the low bytes
// (pand selects chroma instead of psrlw).
1571 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1573 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1575 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1576 "mov %0, %%"REG_a" \n\t"
// Load 4 UV pairs (16 bytes) from each source line.
1578 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1579 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1580 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1581 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
// Keep the chroma bytes (low byte of each word), then deinterleave
// into U and V halves and store 4 bytes of each.
1584 "pand %%mm4, %%mm0 \n\t"
1585 "pand %%mm4, %%mm1 \n\t"
1586 "packuswb %%mm1, %%mm0 \n\t"
1587 "movq %%mm0, %%mm1 \n\t"
1588 "psrlw $8, %%mm0 \n\t"
1589 "pand %%mm4, %%mm1 \n\t"
1590 "packuswb %%mm0, %%mm0 \n\t"
1591 "packuswb %%mm1, %%mm1 \n\t"
1592 "movd %%mm0, (%4, %%"REG_a") \n\t"
1593 "movd %%mm1, (%3, %%"REG_a") \n\t"
1594 "add $4, %%"REG_a" \n\t"
1596 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
// C fallback: U at byte 0, V at byte 2 in UYVY ordering.
1601 for(i=0; i<width; i++)
1603 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1604 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
// bgr32ToY: converts one line of 32-bit BGR (B in the low byte of each
// 32-bit pixel) to 8-bit luma. HAVE_MMXFIXME is a deliberately-undefined
// guard, so only the C loop is compiled.
1609 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1611 #ifdef HAVE_MMXFIXME
1614 for(i=0; i<width; i++)
// Unpack B, G, R from the packed 32-bit pixel.
1616 int b= ((uint32_t*)src)[i]&0xFF;
1617 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1618 int r= (((uint32_t*)src)[i]>>16)&0xFF;
// Weighted RGB->Y sum; (33<<(RGB2YUV_SHIFT-1)) is the bias/rounding constant.
1620 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
// bgr32ToUV: converts a 2x2 block of BGR32 pixels (two per line from
// src1/src2) to one subsampled U/V pair. Sums the four pixels' B+R in
// `l` (0xFF00FF lanes) and G in `h` (0x00FF00 lane), then extracts the
// 10-bit per-channel sums; >>(RGB2YUV_SHIFT+2) folds the /4 average in.
1625 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1627 #ifdef HAVE_MMXFIXME
1630 for(i=0; i<width; i++)
1632 const int a= ((uint32_t*)src1)[2*i+0];
1633 const int e= ((uint32_t*)src1)[2*i+1];
1634 const int c= ((uint32_t*)src2)[2*i+0];
1635 const int d= ((uint32_t*)src2)[2*i+1];
// l holds blue (bits 0-9) and red (bits 16-25) sums; h holds green.
1636 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1637 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1638 const int b= l&0x3FF;
1642 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1643 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1648 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1652 "mov %2, %%"REG_a" \n\t"
1653 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1654 "movq "MANGLE(w1111)", %%mm5 \n\t"
1655 "pxor %%mm7, %%mm7 \n\t"
1656 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1659 PREFETCH" 64(%0, %%"REG_b") \n\t"
1660 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1661 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
1662 "punpcklbw %%mm7, %%mm0 \n\t"
1663 "punpcklbw %%mm7, %%mm1 \n\t"
1664 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
1665 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
1666 "punpcklbw %%mm7, %%mm2 \n\t"
1667 "punpcklbw %%mm7, %%mm3 \n\t"
1668 "pmaddwd %%mm6, %%mm0 \n\t"
1669 "pmaddwd %%mm6, %%mm1 \n\t"
1670 "pmaddwd %%mm6, %%mm2 \n\t"
1671 "pmaddwd %%mm6, %%mm3 \n\t"
1672 #ifndef FAST_BGR2YV12
1673 "psrad $8, %%mm0 \n\t"
1674 "psrad $8, %%mm1 \n\t"
1675 "psrad $8, %%mm2 \n\t"
1676 "psrad $8, %%mm3 \n\t"
1678 "packssdw %%mm1, %%mm0 \n\t"
1679 "packssdw %%mm3, %%mm2 \n\t"
1680 "pmaddwd %%mm5, %%mm0 \n\t"
1681 "pmaddwd %%mm5, %%mm2 \n\t"
1682 "packssdw %%mm2, %%mm0 \n\t"
1683 "psraw $7, %%mm0 \n\t"
1685 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1686 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
1687 "punpcklbw %%mm7, %%mm4 \n\t"
1688 "punpcklbw %%mm7, %%mm1 \n\t"
1689 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
1690 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
1691 "punpcklbw %%mm7, %%mm2 \n\t"
1692 "punpcklbw %%mm7, %%mm3 \n\t"
1693 "pmaddwd %%mm6, %%mm4 \n\t"
1694 "pmaddwd %%mm6, %%mm1 \n\t"
1695 "pmaddwd %%mm6, %%mm2 \n\t"
1696 "pmaddwd %%mm6, %%mm3 \n\t"
1697 #ifndef FAST_BGR2YV12
1698 "psrad $8, %%mm4 \n\t"
1699 "psrad $8, %%mm1 \n\t"
1700 "psrad $8, %%mm2 \n\t"
1701 "psrad $8, %%mm3 \n\t"
1703 "packssdw %%mm1, %%mm4 \n\t"
1704 "packssdw %%mm3, %%mm2 \n\t"
1705 "pmaddwd %%mm5, %%mm4 \n\t"
1706 "pmaddwd %%mm5, %%mm2 \n\t"
1707 "add $24, %%"REG_b" \n\t"
1708 "packssdw %%mm2, %%mm4 \n\t"
1709 "psraw $7, %%mm4 \n\t"
1711 "packuswb %%mm4, %%mm0 \n\t"
1712 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1714 "movq %%mm0, (%1, %%"REG_a") \n\t"
1715 "add $8, %%"REG_a" \n\t"
1717 : : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
1718 : "%"REG_a, "%"REG_b
1722 for(i=0; i<width; i++)
1728 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1733 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1737 "mov %4, %%"REG_a" \n\t"
1738 "movq "MANGLE(w1111)", %%mm5 \n\t"
1739 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1740 "pxor %%mm7, %%mm7 \n\t"
1741 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
1742 "add %%"REG_b", %%"REG_b" \n\t"
1745 PREFETCH" 64(%0, %%"REG_b") \n\t"
1746 PREFETCH" 64(%1, %%"REG_b") \n\t"
1747 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1748 "movq (%0, %%"REG_b"), %%mm0 \n\t"
1749 "movq (%1, %%"REG_b"), %%mm1 \n\t"
1750 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
1751 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
1754 "movq %%mm0, %%mm1 \n\t"
1755 "movq %%mm2, %%mm3 \n\t"
1756 "psrlq $24, %%mm0 \n\t"
1757 "psrlq $24, %%mm2 \n\t"
1760 "punpcklbw %%mm7, %%mm0 \n\t"
1761 "punpcklbw %%mm7, %%mm2 \n\t"
1763 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1764 "movd (%1, %%"REG_b"), %%mm1 \n\t"
1765 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
1766 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
1767 "punpcklbw %%mm7, %%mm0 \n\t"
1768 "punpcklbw %%mm7, %%mm1 \n\t"
1769 "punpcklbw %%mm7, %%mm2 \n\t"
1770 "punpcklbw %%mm7, %%mm3 \n\t"
1771 "paddw %%mm1, %%mm0 \n\t"
1772 "paddw %%mm3, %%mm2 \n\t"
1773 "paddw %%mm2, %%mm0 \n\t"
1774 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
1775 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
1776 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
1777 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
1778 "punpcklbw %%mm7, %%mm4 \n\t"
1779 "punpcklbw %%mm7, %%mm1 \n\t"
1780 "punpcklbw %%mm7, %%mm2 \n\t"
1781 "punpcklbw %%mm7, %%mm3 \n\t"
1782 "paddw %%mm1, %%mm4 \n\t"
1783 "paddw %%mm3, %%mm2 \n\t"
1784 "paddw %%mm4, %%mm2 \n\t"
1785 "psrlw $2, %%mm0 \n\t"
1786 "psrlw $2, %%mm2 \n\t"
1788 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1789 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1791 "pmaddwd %%mm0, %%mm1 \n\t"
1792 "pmaddwd %%mm2, %%mm3 \n\t"
1793 "pmaddwd %%mm6, %%mm0 \n\t"
1794 "pmaddwd %%mm6, %%mm2 \n\t"
1795 #ifndef FAST_BGR2YV12
1796 "psrad $8, %%mm0 \n\t"
1797 "psrad $8, %%mm1 \n\t"
1798 "psrad $8, %%mm2 \n\t"
1799 "psrad $8, %%mm3 \n\t"
1801 "packssdw %%mm2, %%mm0 \n\t"
1802 "packssdw %%mm3, %%mm1 \n\t"
1803 "pmaddwd %%mm5, %%mm0 \n\t"
1804 "pmaddwd %%mm5, %%mm1 \n\t"
1805 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1806 "psraw $7, %%mm0 \n\t"
1808 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1809 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
1810 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
1811 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
1812 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
1815 "movq %%mm4, %%mm1 \n\t"
1816 "movq %%mm2, %%mm3 \n\t"
1817 "psrlq $24, %%mm4 \n\t"
1818 "psrlq $24, %%mm2 \n\t"
1821 "punpcklbw %%mm7, %%mm4 \n\t"
1822 "punpcklbw %%mm7, %%mm2 \n\t"
1824 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1825 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
1826 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
1827 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
1828 "punpcklbw %%mm7, %%mm4 \n\t"
1829 "punpcklbw %%mm7, %%mm1 \n\t"
1830 "punpcklbw %%mm7, %%mm2 \n\t"
1831 "punpcklbw %%mm7, %%mm3 \n\t"
1832 "paddw %%mm1, %%mm4 \n\t"
1833 "paddw %%mm3, %%mm2 \n\t"
1834 "paddw %%mm2, %%mm4 \n\t"
1835 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
1836 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
1837 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
1838 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
1839 "punpcklbw %%mm7, %%mm5 \n\t"
1840 "punpcklbw %%mm7, %%mm1 \n\t"
1841 "punpcklbw %%mm7, %%mm2 \n\t"
1842 "punpcklbw %%mm7, %%mm3 \n\t"
1843 "paddw %%mm1, %%mm5 \n\t"
1844 "paddw %%mm3, %%mm2 \n\t"
1845 "paddw %%mm5, %%mm2 \n\t"
1846 "movq "MANGLE(w1111)", %%mm5 \n\t"
1847 "psrlw $2, %%mm4 \n\t"
1848 "psrlw $2, %%mm2 \n\t"
1850 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1851 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1853 "pmaddwd %%mm4, %%mm1 \n\t"
1854 "pmaddwd %%mm2, %%mm3 \n\t"
1855 "pmaddwd %%mm6, %%mm4 \n\t"
1856 "pmaddwd %%mm6, %%mm2 \n\t"
1857 #ifndef FAST_BGR2YV12
1858 "psrad $8, %%mm4 \n\t"
1859 "psrad $8, %%mm1 \n\t"
1860 "psrad $8, %%mm2 \n\t"
1861 "psrad $8, %%mm3 \n\t"
1863 "packssdw %%mm2, %%mm4 \n\t"
1864 "packssdw %%mm3, %%mm1 \n\t"
1865 "pmaddwd %%mm5, %%mm4 \n\t"
1866 "pmaddwd %%mm5, %%mm1 \n\t"
1867 "add $24, %%"REG_b" \n\t"
1868 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1869 "psraw $7, %%mm4 \n\t"
1871 "movq %%mm0, %%mm1 \n\t"
1872 "punpckldq %%mm4, %%mm0 \n\t"
1873 "punpckhdq %%mm4, %%mm1 \n\t"
1874 "packsswb %%mm1, %%mm0 \n\t"
1875 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1877 "movd %%mm0, (%2, %%"REG_a") \n\t"
1878 "punpckhdq %%mm0, %%mm0 \n\t"
1879 "movd %%mm0, (%3, %%"REG_a") \n\t"
1880 "add $4, %%"REG_a" \n\t"
1882 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
1883 : "%"REG_a, "%"REG_b
1887 for(i=0; i<width; i++)
1889 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1890 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1891 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1893 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1894 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
// bgr16ToY: converts one line of RGB565 (B bits 0-4, G bits 5-10,
// R bits 11-15) to 8-bit luma. The 2* factors and reduced shift
// compensate for the 5-bit R/B vs 6-bit G channel widths.
1899 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1902 for(i=0; i<width; i++)
1904 int d= ((uint16_t*)src)[i];
1907 int r= (d>>11)&0x1F;
1909 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
// bgr16ToUV: converts 2x2 blocks of RGB565 (two pixels per 32-bit load,
// one line each from src1/src2) to subsampled U/V. The masked sums in
// dl/dh keep each summed channel in its own bit field; dh2+dl merges
// them so b/g/r sums can be extracted with shifts and masks.
1913 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1916 for(i=0; i<width; i++)
1918 int d0= ((uint32_t*)src1)[i];
1919 int d1= ((uint32_t*)src2)[i];
1921 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1922 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1924 int dh2= (dh>>11) + (dh<<21);
1928 int r= (d>>11)&0x7F;
// +2-2: the /4 average cancels against the doubled 5-bit channel scale.
1930 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1931 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
// bgr15ToY: converts one line of RGB555 (B bits 0-4, G bits 5-9,
// R bits 10-14) to 8-bit luma; -3 in the shift rescales the 5-bit
// channels to 8-bit range.
1935 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1938 for(i=0; i<width; i++)
1940 int d= ((uint16_t*)src)[i];
1943 int r= (d>>10)&0x1F;
1945 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
// bgr15ToUV: converts 2x2 blocks of RGB555 to subsampled U/V; same
// field-preserving sum trick as bgr16ToUV with RGB555 masks.
1949 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1952 for(i=0; i<width; i++)
1954 int d0= ((uint32_t*)src1)[i];
1955 int d1= ((uint32_t*)src2)[i];
1957 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1958 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1960 int dh2= (dh>>11) + (dh<<21);
1964 int r= (d>>10)&0x7F;
// +2-3: /4 average combined with the 5-bit -> 8-bit channel rescale.
1966 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1967 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
// rgb32ToY: like bgr32ToY but with R in the low byte of each 32-bit
// pixel (byte order swapped relative to BGR32).
1972 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1975 for(i=0; i<width; i++)
1977 int r= ((uint32_t*)src)[i]&0xFF;
1978 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1979 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1981 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
// rgb32ToUV: like bgr32ToUV but with R in the low lane of `l`
// (channel order swapped); averages a 2x2 pixel block per U/V sample.
1985 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1988 for(i=0; i<width; i++)
1990 const int a= ((uint32_t*)src1)[2*i+0];
1991 const int e= ((uint32_t*)src1)[2*i+1];
1992 const int c= ((uint32_t*)src2)[2*i+0];
1993 const int d= ((uint32_t*)src2)[2*i+1];
// l holds red (bits 0-9) and blue (bits 16-25) sums; h holds green.
1994 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1995 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1996 const int r= l&0x3FF;
2000 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2001 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
// rgb24ToY: converts one line of 24-bit RGB (R first in memory) to
// 8-bit luma; C-only, mirror of the bgr24 C fallback with channels swapped.
// NOTE(review): the r/g/b extraction lines are not visible in this
// sampled listing.
2005 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2008 for(i=0; i<width; i++)
2014 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
// rgb24ToUV: converts 2x2 blocks of 24-bit RGB to subsampled U/V;
// mirror of bgr24ToUV's C fallback with R at offset 0 and B at offset 2.
2018 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2021 for(i=0; i<width; i++)
2023 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2024 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2025 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2027 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2028 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2033 // Bilinear / Bicubic scaling
// hScale: horizontal FIR scaling — for each output sample i, convolves
// filterSize source bytes starting at filterPos[i] with 16-bit filter
// coefficients, clipping the result to 15-bit range. Specialized MMX
// paths exist for filterSize 4 and 8; a generic MMX loop and AltiVec /
// C fallbacks handle the rest.
// NOTE(review): sampled listing — loop labels and #ifdef/#else lines are
// not visible between the numbered lines below.
2034 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2035 int16_t *filter, int16_t *filterPos, int filterSize)
2038 assert(filterSize % 4 == 0 && filterSize>0);
2039 if(filterSize==4) // always true for upscaling, sometimes for down too
// counter runs from -2*dstW to 0 (2 bytes per int16 output sample);
// filterPos is pre-biased so it can be indexed by the negative counter.
2041 long counter= -2*dstW;
2043 filterPos-= counter/2;
2046 "pxor %%mm7, %%mm7 \n\t"
2047 "movq "MANGLE(w02)", %%mm6 \n\t"
2048 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2049 "mov %%"REG_a", %%"REG_BP" \n\t"
// Two output samples per iteration: fetch positions, coefficients,
// 4 source bytes each, multiply-accumulate, scale and store.
2052 "movzxw (%2, %%"REG_BP"), %%"REG_a"\n\t"
2053 "movzxw 2(%2, %%"REG_BP"), %%"REG_b"\n\t"
2054 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2055 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2056 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2057 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2058 "punpcklbw %%mm7, %%mm0 \n\t"
2059 "punpcklbw %%mm7, %%mm2 \n\t"
2060 "pmaddwd %%mm1, %%mm0 \n\t"
2061 "pmaddwd %%mm2, %%mm3 \n\t"
2062 "psrad $8, %%mm0 \n\t"
2063 "psrad $8, %%mm3 \n\t"
2064 "packssdw %%mm3, %%mm0 \n\t"
2065 "pmaddwd %%mm6, %%mm0 \n\t"
2066 "packssdw %%mm0, %%mm0 \n\t"
2067 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2068 "add $4, %%"REG_BP" \n\t"
2071 "pop %%"REG_BP" \n\t"
2073 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// filterSize==8: same structure, two 4-tap accumulations per sample.
2077 else if(filterSize==8)
2079 long counter= -2*dstW;
2081 filterPos-= counter/2;
2084 "pxor %%mm7, %%mm7 \n\t"
2085 "movq "MANGLE(w02)", %%mm6 \n\t"
2086 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2087 "mov %%"REG_a", %%"REG_BP" \n\t"
2090 "movzxw (%2, %%"REG_BP"), %%"REG_a"\n\t"
2091 "movzxw 2(%2, %%"REG_BP"), %%"REG_b"\n\t"
2092 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2093 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2094 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2095 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2096 "punpcklbw %%mm7, %%mm0 \n\t"
2097 "punpcklbw %%mm7, %%mm2 \n\t"
2098 "pmaddwd %%mm1, %%mm0 \n\t"
2099 "pmaddwd %%mm2, %%mm3 \n\t"
2101 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2102 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2103 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2104 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2105 "punpcklbw %%mm7, %%mm4 \n\t"
2106 "punpcklbw %%mm7, %%mm2 \n\t"
2107 "pmaddwd %%mm1, %%mm4 \n\t"
2108 "pmaddwd %%mm2, %%mm5 \n\t"
2109 "paddd %%mm4, %%mm0 \n\t"
2110 "paddd %%mm5, %%mm3 \n\t"
2112 "psrad $8, %%mm0 \n\t"
2113 "psrad $8, %%mm3 \n\t"
2114 "packssdw %%mm3, %%mm0 \n\t"
2115 "pmaddwd %%mm6, %%mm0 \n\t"
2116 "packssdw %%mm0, %%mm0 \n\t"
2117 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2118 "add $4, %%"REG_BP" \n\t"
2121 "pop %%"REG_BP" \n\t"
2123 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
// Generic filterSize: inner loop accumulates 4 taps at a time for two
// output samples (mm4/mm5) until the source pointer reaches the end.
2129 long counter= -2*dstW;
2130 // filter-= counter*filterSize/2;
2131 filterPos-= counter/2;
2134 "pxor %%mm7, %%mm7 \n\t"
2135 "movq "MANGLE(w02)", %%mm6 \n\t"
2138 "mov %2, %%"REG_c" \n\t"
2139 "movzxw (%%"REG_c", %0), %%"REG_a"\n\t"
2140 "movzxw 2(%%"REG_c", %0), %%"REG_b"\n\t"
2141 "mov %5, %%"REG_c" \n\t"
2142 "pxor %%mm4, %%mm4 \n\t"
2143 "pxor %%mm5, %%mm5 \n\t"
2145 "movq (%1), %%mm1 \n\t"
2146 "movq (%1, %6), %%mm3 \n\t"
2147 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2148 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2149 "punpcklbw %%mm7, %%mm0 \n\t"
2150 "punpcklbw %%mm7, %%mm2 \n\t"
2151 "pmaddwd %%mm1, %%mm0 \n\t"
2152 "pmaddwd %%mm2, %%mm3 \n\t"
2153 "paddd %%mm3, %%mm5 \n\t"
2154 "paddd %%mm0, %%mm4 \n\t"
2156 "add $4, %%"REG_c" \n\t"
2157 "cmp %4, %%"REG_c" \n\t"
2160 "psrad $8, %%mm4 \n\t"
2161 "psrad $8, %%mm5 \n\t"
2162 "packssdw %%mm5, %%mm4 \n\t"
2163 "pmaddwd %%mm6, %%mm4 \n\t"
2164 "packssdw %%mm4, %%mm4 \n\t"
2165 "mov %3, %%"REG_a" \n\t"
2166 "movd %%mm4, (%%"REG_a", %0) \n\t"
2170 : "+r" (counter), "+r" (filter)
2171 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2172 "m" (src), "r" ((long)filterSize*2)
2173 : "%"REG_b, "%"REG_a, "%"REG_c
// AltiVec fallback.
2178 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
// Plain C fallback: direct convolution with >>7 scale and 15-bit clip.
2181 for(i=0; i<dstW; i++)
2184 int srcPos= filterPos[i];
2186 // printf("filterPos: %d\n", filterPos[i]);
2187 for(j=0; j<filterSize; j++)
2189 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2190 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2192 // filter += hFilterSize;
2193 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2199 // *** horizontal scale Y line to temp buffer
// hyscale: horizontally scales one luma line into the 16-bit temp buffer
// dst. Non-planar / RGB sources are first converted to planar 8-bit luma
// in formatConvBuffer, then either the exact FIR path (hScale), the MMX2
// "funny code" (runtime-generated scaler), or a fast-bilinear asm/C path
// is used depending on flags and CPU capabilities.
// NOTE(review): sampled listing — labels, braces and several asm lines
// are missing between the numbered lines below.
2200 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2201 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2202 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2203 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2204 int32_t *mmx2FilterPos)
// Input format dispatch: convert to planar luma where necessary.
2206 if(srcFormat==IMGFMT_YUY2)
2208 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2209 src= formatConvBuffer;
2211 else if(srcFormat==IMGFMT_UYVY)
2213 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2214 src= formatConvBuffer;
2216 else if(srcFormat==IMGFMT_BGR32)
2218 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2219 src= formatConvBuffer;
2221 else if(srcFormat==IMGFMT_BGR24)
2223 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2224 src= formatConvBuffer;
2226 else if(srcFormat==IMGFMT_BGR16)
2228 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2229 src= formatConvBuffer;
2231 else if(srcFormat==IMGFMT_BGR15)
2233 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2234 src= formatConvBuffer;
2236 else if(srcFormat==IMGFMT_RGB32)
2238 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2239 src= formatConvBuffer;
2241 else if(srcFormat==IMGFMT_RGB24)
2243 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2244 src= formatConvBuffer;
2248 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
2249 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2251 if(!(flags&SWS_FAST_BILINEAR))
2254 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2256 else // Fast Bilinear upscale / crap downscale
2258 #if defined(ARCH_X86) || defined(ARCH_X86_64)
// MMX2 path: drives the runtime-generated scaler blocks via funnyYCode;
// mmx2FilterPos/mmx2Filter describe the precomputed positions/weights.
2264 "pxor %%mm7, %%mm7 \n\t"
2265 "mov %0, %%"REG_c" \n\t"
2266 "mov %1, %%"REG_D" \n\t"
2267 "mov %2, %%"REG_d" \n\t"
2268 "mov %3, %%"REG_b" \n\t"
2269 "xor %%"REG_a", %%"REG_a" \n\t" // i
2270 PREFETCH" (%%"REG_c") \n\t"
2271 PREFETCH" 32(%%"REG_c") \n\t"
2272 PREFETCH" 64(%%"REG_c") \n\t"
2274 #define FUNNY_Y_CODE \
2275 "mov (%%"REG_b"), %%"REG_S" \n\t"\
2277 "addl (%%"REG_b", %%"REG_a"), %%ecx\n\t"\
2278 "add %%"REG_a", %%"REG_d" \n\t"\
2279 "xor %%"REG_a", %%"REG_a" \n\t"\
2290 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
// NOTE(review): "%"REG_d appears twice in this clobber list — one entry
// was presumably meant to be REG_D (rdi/edi); verify against upstream.
2292 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_d
// Fix up the tail pixels the asm may not have filled correctly.
2294 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2299 //NO MMX just normal asm ...
// Plain x86 bilinear path: 16.16 fixed-point position; two unrolled
// interpolation steps per loop iteration, output scaled to 15 bits (>>9
// of the <<16 lerp -> 128x range used by the vertical scaler).
2301 "xor %%"REG_a", %%"REG_a" \n\t" // i
2302 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2303 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2306 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2307 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2308 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2309 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2310 "shll $16, %%edi \n\t"
2311 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2312 "mov %1, %%"REG_D" \n\t"
2313 "shrl $9, %%esi \n\t"
2314 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2315 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2316 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2318 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2319 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2320 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2321 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2322 "shll $16, %%edi \n\t"
2323 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2324 "mov %1, %%"REG_D" \n\t"
2325 "shrl $9, %%esi \n\t"
2326 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2327 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2328 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2331 "add $2, %%"REG_a" \n\t"
2332 "cmp %2, %%"REG_a" \n\t"
2336 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2337 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2340 } //if MMX2 can't be used
// Portable C bilinear fallback: 16.16 fixed-point walk over the source.
2344 unsigned int xpos=0;
2345 for(i=0;i<dstWidth;i++)
2347 register unsigned int xx=xpos>>16;
2348 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2349 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2356 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2357 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2358 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2359 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2360 int32_t *mmx2FilterPos)
2362 if(srcFormat==IMGFMT_YUY2)
2364 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2365 src1= formatConvBuffer;
2366 src2= formatConvBuffer+2048;
2368 else if(srcFormat==IMGFMT_UYVY)
2370 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2371 src1= formatConvBuffer;
2372 src2= formatConvBuffer+2048;
2374 else if(srcFormat==IMGFMT_BGR32)
2376 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2377 src1= formatConvBuffer;
2378 src2= formatConvBuffer+2048;
2380 else if(srcFormat==IMGFMT_BGR24)
2382 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2383 src1= formatConvBuffer;
2384 src2= formatConvBuffer+2048;
2386 else if(srcFormat==IMGFMT_BGR16)
2388 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2389 src1= formatConvBuffer;
2390 src2= formatConvBuffer+2048;
2392 else if(srcFormat==IMGFMT_BGR15)
2394 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2395 src1= formatConvBuffer;
2396 src2= formatConvBuffer+2048;
2398 else if(srcFormat==IMGFMT_RGB32)
2400 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2401 src1= formatConvBuffer;
2402 src2= formatConvBuffer+2048;
2404 else if(srcFormat==IMGFMT_RGB24)
2406 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2407 src1= formatConvBuffer;
2408 src2= formatConvBuffer+2048;
2410 else if(isGray(srcFormat))
2416 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2417 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2419 if(!(flags&SWS_FAST_BILINEAR))
2422 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2423 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2425 else // Fast Bilinear upscale / crap downscale
2427 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2433 "pxor %%mm7, %%mm7 \n\t"
2434 "mov %0, %%"REG_c" \n\t"
2435 "mov %1, %%"REG_D" \n\t"
2436 "mov %2, %%"REG_d" \n\t"
2437 "mov %3, %%"REG_b" \n\t"
2438 "xor %%"REG_a", %%"REG_a" \n\t" // i
2439 PREFETCH" (%%"REG_c") \n\t"
2440 PREFETCH" 32(%%"REG_c") \n\t"
2441 PREFETCH" 64(%%"REG_c") \n\t"
2443 #define FUNNY_UV_CODE \
2444 "movl (%%"REG_b"), %%esi \n\t"\
2446 "addl (%%"REG_b", %%"REG_a"), %%ecx\n\t"\
2447 "add %%"REG_a", %%"REG_D" \n\t"\
2448 "xor %%"REG_a", %%"REG_a" \n\t"\
2454 "xor %%"REG_a", %%"REG_a" \n\t" // i
2455 "mov %5, %%"REG_c" \n\t" // src
2456 "mov %1, %%"REG_D" \n\t" // buf1
2457 "add $4096, %%"REG_D" \n\t"
2458 PREFETCH" (%%"REG_c") \n\t"
2459 PREFETCH" 32(%%"REG_c") \n\t"
2460 PREFETCH" 64(%%"REG_c") \n\t"
2467 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2468 "m" (funnyUVCode), "m" (src2)
2469 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%esi", "%"REG_D
2471 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2473 // printf("%d %d %d\n", dstWidth, i, srcW);
2474 dst[i] = src1[srcW-1]*128;
2475 dst[i+2048] = src2[srcW-1]*128;
2482 "xor %%"REG_a", %%"REG_a" \n\t" // i
2483 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2484 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2487 "mov %0, %%"REG_S" \n\t"
2488 "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
2489 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
2490 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2491 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2492 "shll $16, %%edi \n\t"
2493 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2494 "mov %1, %%"REG_D" \n\t"
2495 "shrl $9, %%esi \n\t"
2496 "movw %%si, (%%"REG_d", %%"REG_a", 2)\n\t"
2498 "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2499 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2500 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2501 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2502 "shll $16, %%edi \n\t"
2503 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2504 "mov %1, %%"REG_D" \n\t"
2505 "shrl $9, %%esi \n\t"
2506 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2508 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2509 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2510 "add $1, %%"REG_a" \n\t"
2511 "cmp %2, %%"REG_a" \n\t"
2514 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" ((long)(xInc>>16)), "m" ((xInc&0xFFFF)),
2516 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2519 } //if MMX2 can't be used
2523 unsigned int xpos=0;
2524 for(i=0;i<dstWidth;i++)
2526 register unsigned int xx=xpos>>16;
2527 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2528 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2529 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2531 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2532 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
/* RENAME(swScale): scale/convert one horizontal slice of the source image
 * (srcSliceH lines starting at line srcSliceY) into dst.  Input lines are
 * horizontally scaled into the lum/chr ring buffers (hyscale/hcscale), then
 * vertically scaled and converted to the output format (yuv2yuv*/yuv2packed*).
 * Returns the number of destination lines produced by this call
 * (dstY - lastDstY; both are declared in lines elided from this listing).
 * NOTE: this listing omits some original lines (braces, #ifdefs, a few
 * statements); the comments below describe only the visible code. */
2540 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2541 int srcSliceH, uint8_t* dst[], int dstStride[]){
2543 /* load a few things into local vars to make the code more readable? and faster */
2544 const int srcW= c->srcW;
2545 const int dstW= c->dstW;
2546 const int dstH= c->dstH;
2547 const int chrDstW= c->chrDstW;
2548 const int chrSrcW= c->chrSrcW;
2549 const int lumXInc= c->lumXInc;
2550 const int chrXInc= c->chrXInc;
2551 const int dstFormat= c->dstFormat;
2552 const int srcFormat= c->srcFormat;
2553 const int flags= c->flags;
2554 const int canMMX2BeUsed= c->canMMX2BeUsed;
2555 int16_t *vLumFilterPos= c->vLumFilterPos;
2556 int16_t *vChrFilterPos= c->vChrFilterPos;
2557 int16_t *hLumFilterPos= c->hLumFilterPos;
2558 int16_t *hChrFilterPos= c->hChrFilterPos;
2559 int16_t *vLumFilter= c->vLumFilter;
2560 int16_t *vChrFilter= c->vChrFilter;
2561 int16_t *hLumFilter= c->hLumFilter;
2562 int16_t *hChrFilter= c->hChrFilter;
2563 int32_t *lumMmxFilter= c->lumMmxFilter;
2564 int32_t *chrMmxFilter= c->chrMmxFilter;
2565 const int vLumFilterSize= c->vLumFilterSize;
2566 const int vChrFilterSize= c->vChrFilterSize;
2567 const int hLumFilterSize= c->hLumFilterSize;
2568 const int hChrFilterSize= c->hChrFilterSize;
2569 int16_t **lumPixBuf= c->lumPixBuf;
2570 int16_t **chrPixBuf= c->chrPixBuf;
2571 const int vLumBufSize= c->vLumBufSize;
2572 const int vChrBufSize= c->vChrBufSize;
2573 uint8_t *funnyYCode= c->funnyYCode;
2574 uint8_t *funnyUVCode= c->funnyUVCode;
2575 uint8_t *formatConvBuffer= c->formatConvBuffer;
2576 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2577 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2580 /* vars which will change and which we need to store back in the context */
2582 int lumBufIndex= c->lumBufIndex;
2583 int chrBufIndex= c->chrBufIndex;
2584 int lastInLumBuf= c->lastInLumBuf;
2585 int lastInChrBuf= c->lastInChrBuf;
/* packed input: reuse the packed stride for plane 2 as well (the matching
   assignment to srcStride[1] is elided in this listing) */
2587 if(isPacked(c->srcFormat)){
2593 srcStride[2]= srcStride[0];
/* vChrDrop skips chroma input lines by doubling the chroma strides */
2595 srcStride[1]<<= c->vChrDrop;
2596 srcStride[2]<<= c->vChrDrop;
2598 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2599 // (int)dst[0], (int)dst[1], (int)dst[2]);
2601 #if 0 //self test FIXME move to a vfilter or something
2603 static volatile int i=0;
2605 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2606 selfTest(src, srcStride, c->srcW, c->srcH);
2611 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2612 //dstStride[0],dstStride[1],dstStride[2]);
/* warn once if any destination stride is not 8-byte aligned */
2614 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2616 static int firstTime=1; //FIXME move this into the context perhaps
2617 if(flags & SWS_PRINT_INFO && firstTime)
2619 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2620 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2625 /* Note the user might start scaling the picture in the middle so this will not get executed
2626 this is not really intended but works currently, so ppl might do it */
/* main loop: produce one destination line per iteration (dstY is declared
   in a line elided from this listing) */
2637 for(;dstY < dstH; dstY++){
2638 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2639 const int chrDstY= dstY>>c->chrDstVSubSample;
2640 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2641 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2643 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2644 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2645 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2646 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2648 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2649 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2650 //handle holes (FAST_BILINEAR & weird filters)
2651 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2652 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2653 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2654 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2655 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2657 // Do we have enough lines in this slice to output the dstY line
2658 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2660 //Do horizontal scaling
/* pull every luma input line needed for this output line into the ring
   buffer (lumBufIndex increment is elided in this listing) */
2661 while(lastInLumBuf < lastLumSrcY)
2663 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2665 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2666 ASSERT(lumBufIndex < 2*vLumBufSize)
2667 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2668 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2669 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2670 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2671 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2672 funnyYCode, c->srcFormat, formatConvBuffer,
2673 c->lumMmx2Filter, c->lumMmx2FilterPos);
/* same for the chroma lines needed by this output line */
2676 while(lastInChrBuf < lastChrSrcY)
2678 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2679 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2681 ASSERT(chrBufIndex < 2*vChrBufSize)
2682 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2683 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2684 //FIXME replace parameters through context struct (some at least)
2686 if(!(isGray(srcFormat) || isGray(dstFormat)))
2687 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2688 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2689 funnyUVCode, c->srcFormat, formatConvBuffer,
2690 c->chrMmx2Filter, c->chrMmx2FilterPos);
2693 //wrap buf index around to stay inside the ring buffer
2694 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2695 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2697 else // not enough lines left in this slice -> load the rest in the buffer
2699 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2700 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2701 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2702 vChrBufSize, vLumBufSize);*/
2704 //Do horizontal scaling
/* buffer everything that remains in this slice, then break and wait for
   the next slice to be delivered */
2705 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2707 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2709 ASSERT(lumBufIndex < 2*vLumBufSize)
2710 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2711 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2712 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2713 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2714 funnyYCode, c->srcFormat, formatConvBuffer,
2715 c->lumMmx2Filter, c->lumMmx2FilterPos);
2718 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2720 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2721 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2723 ASSERT(chrBufIndex < 2*vChrBufSize)
2724 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2725 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2727 if(!(isGray(srcFormat) || isGray(dstFormat)))
2728 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2729 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2730 funnyUVCode, c->srcFormat, formatConvBuffer,
2731 c->chrMmx2Filter, c->chrMmx2FilterPos);
2734 //wrap buf index around to stay inside the ring buffer
2735 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2736 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2737 break; //we can't output a dstY line so let's try with the next slice
/* dither values for 15/16-bit RGB output, alternated by output-line parity */
2741 b5Dither= dither8[dstY&1];
2742 g6Dither= dither4[dstY&1];
2743 g5Dither= dither8[dstY&1];
2744 r5Dither= dither8[(dstY+1)&1];
/* window of vLumFilterSize / vChrFilterSize ring-buffer line pointers
   ending at the newest buffered line */
2748 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2749 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
/* pack line pointers + 16-bit coefficients (replicated via *0x10001) into
   the lumMmxFilter/chrMmxFilter arrays consumed by the MMX vertical scalers.
   NOTE(review): casting int16_t* to int32_t truncates the pointer on 64-bit
   targets (this file is also built for ARCH_X86_64 above) — verify how the
   asm consumers reconstruct the address. */
2752 for(i=0; i<vLumFilterSize; i++)
2754 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2755 lumMmxFilter[4*i+2]=
2756 lumMmxFilter[4*i+3]=
2757 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2759 for(i=0; i<vChrFilterSize; i++)
2761 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2762 chrMmxFilter[4*i+2]=
2763 chrMmxFilter[4*i+3]=
2764 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
/* vertical scaling + output conversion, dispatching on destination format
   and vertical filter size */
2767 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2769 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2770 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2771 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2773 int16_t *lumBuf = lumPixBuf[0];
2774 int16_t *chrBuf= chrPixBuf[0];
2775 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2780 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2781 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2782 dest, uDest, vDest, dstW, chrDstW);
2787 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2788 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2789 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2791 int chrAlpha= vChrFilter[2*dstY+1];
2792 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2793 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2795 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2797 int lumAlpha= vLumFilter[2*dstY+1];
2798 int chrAlpha= vChrFilter[2*dstY+1];
2799 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2800 dest, dstW, lumAlpha, chrAlpha, dstY);
2804 RENAME(yuv2packedX)(c,
2805 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2806 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2811 else // hmm looks like we can't use MMX here without overwriting this array's tail
2813 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2814 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2815 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2817 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2818 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2820 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2821 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2822 dest, uDest, vDest, dstW, chrDstW);
2826 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2827 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2829 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2830 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
/* drain write-combining stores and reset the FPU/MMX state after the asm paths */
2837 __asm __volatile(SFENCE:::"memory");
2838 __asm __volatile(EMMS:::"memory");
2840 /* store changed local vars back in the context */
2842 c->lumBufIndex= lumBufIndex;
2843 c->chrBufIndex= chrBufIndex;
2844 c->lastInLumBuf= lastInLumBuf;
2845 c->lastInChrBuf= lastInChrBuf;
2847 return dstY - lastDstY;