2 // Software scaling and colorspace conversion routines for MPlayer
4 // Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
5 // current version mostly by Michael Niedermayer (michaelni@gmx.at)
6 // the parts written by michael are under GNU GPL
10 #include "../config.h"
12 #include "../mmx_defs.h"
21 //disables the unscaled height version
24 #define RET 0xC3 // x86 "ret" opcode byte — presumably appended to the runtime-generated scaler code in funnyYCode/funnyUVCode (generator not visible in this chunk; confirm)
28 known BUGS with known cause (no bugreports please!, but patches are welcome :) )
29 horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)
31 Supported output formats BGR15 BGR16 BGR24 BGR32
32 BGR15 & BGR16 MMX versions support dithering
33 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
36 more intelligent misalignment avoidance for the horizontal scaler
39 change the distance of the u & v buffer
// Small arithmetic helper macros.
// NOTE(review): rewritten as GCC statement expressions so that each argument
// is evaluated exactly once — the old ternary forms evaluated one argument
// twice, which is unsafe with side-effecting arguments such as MIN(i++, n).
// This file is already GCC-specific (inline asm, __attribute__), so
// __typeof__ and statement expressions are available.
#define ABS(a) ({ __typeof__(a) _abs_a= (a); _abs_a > 0 ? _abs_a : -_abs_a; })
#define MIN(a,b) ({ __typeof__(a) _min_a= (a); __typeof__(b) _min_b= (b); _min_a > _min_b ? _min_b : _min_a; })
#define MAX(a,b) ({ __typeof__(a) _max_a= (a); __typeof__(b) _max_b= (b); _max_a < _max_b ? _max_b : _max_a; })
47 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
48 #elif defined (HAVE_3DNOW)
49 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
53 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
55 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
// Packed-word constants for the MMX YUV->RGB code below (used by symbol name
// inside the inline asm via pmulhw/psubw/pand); 8-byte alignment is required
// for movq operands. The asm comments indicate yCoeff scales (Y-16), ub/ug
// the (U-128) term and vr/vg the (V-128) term; the exact fixed-point scale
// is not derivable from this chunk — confirm against the yuvtab_* tables.
60 static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
61 static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
62 static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
63 static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
64 static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
// Byte masks (keep top 5 / top 6 bits of each byte) used by the 15/16 bpp writers.
65 static uint64_t __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
66 static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
// Word bias constants: w400 is subtracted from chroma ("U-128"/"V-128" per the
// asm comments), w80 from luma ("Y-16").
67 static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
68 static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
69 static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
// Byte-lane masks used by the 24bpp packing code (names read LSB-first).
70 static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
71 static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
72 static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
// Per-line dither state, read/written by name from the inline asm (volatile
// so the compiler does not cache or elide accesses around the asm blocks).
74 static volatile uint64_t __attribute__((aligned(8))) b5Dither;
75 static volatile uint64_t __attribute__((aligned(8))) g5Dither;
76 static volatile uint64_t __attribute__((aligned(8))) g6Dither;
77 static volatile uint64_t __attribute__((aligned(8))) r5Dither;
// Dither base patterns for the 5- and 6-bit channels.
// NOTE(review): both arrays are declared with 2 elements but only one
// initializer is visible here — an initializer line appears to be missing
// from this view of the file; confirm against the full source.
79 static uint64_t __attribute__((aligned(8))) dither4[2]={
81 0x0200020002000200LL,};
83 static uint64_t __attribute__((aligned(8))) dither8[2]={
85 0x0004000400040004LL,};
// Per-channel word masks for the RGB565 (16 bpp) and RGB555 (15 bpp) packers.
87 static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
88 static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
89 static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
90 static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
91 static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
92 static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;
// Byte-select masks for the MMX2 (pshufw) 24bpp writer, WRITEBGR24MMX2.
94 static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
95 static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
96 static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;
// Memory slots referenced by name from the inline asm: asm_yalpha1 /
// asm_uvalpha1 hold the broadcast interpolation weights (see YSCALEYUV2RGB);
// temp0's use is not visible in this chunk — confirm against the full file.
98 static uint64_t __attribute__((aligned(8))) temp0;
99 static uint64_t __attribute__((aligned(8))) asm_yalpha1;
100 static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
103 // temporary storage for 4 yuv lines:
104 // 16bit for now (mmx likes it more compact)
106 static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
107 static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
// NOTE(review): pix_buf_y/pix_buf_uv are declared twice — presumably an
// #ifdef HAVE_MMX / #else pair (aligned vs. unaligned variants) was lost
// from this view of the file; confirm before editing.
109 static uint16_t pix_buf_y[4][2048];
110 static uint16_t pix_buf_uv[2][2048*2];
113 // clipping helper table for C implementations:
114 static unsigned char clip_table[768];
// Pre-shifted/clipped per-channel lookup tables for the C 16/15 bpp paths;
// indexed with a biased value and OR-ed together to form one output pixel.
116 static unsigned short clip_table16b[768];
117 static unsigned short clip_table16g[768];
118 static unsigned short clip_table16r[768];
119 static unsigned short clip_table15b[768];
120 static unsigned short clip_table15g[768];
121 static unsigned short clip_table15r[768];
123 // yuv->rgb conversion tables:
124 static int yuvtab_2568[256];
125 static int yuvtab_3343[256];
126 static int yuvtab_0c92[256];
127 static int yuvtab_1a1e[256];
128 static int yuvtab_40cf[256];
// Buffers for runtime-generated horizontal scaler code (the generator is not
// visible in this chunk; see the RET opcode define near the top of the file).
131 static uint8_t funnyYCode[10000];
132 static uint8_t funnyUVCode[10000];
// Whether the MMX2 horizontal scaler may be used — presumably set at init
// time (initialization code not visible in this chunk; confirm).
135 static int canMMX2BeUsed=0;
/* Vertically interpolate one line of luma (buf0/buf1 weighted by %6=yalpha1)
 * and per-pixel chroma (uvbuf0/uvbuf1 weighted by %7=uvalpha1; the V plane
 * lives 4096 bytes = 2048 uint16 words past U) and convert to packed bytes:
 * B in mm3, R in mm0, G in mm1 (see the asm comments). Operands: %0/%1 =
 * luma line pointers, %2/%3 = chroma line pointers. Clobbers mm0-mm7, eax.
 * NOTE(review): some continuation lines (loop label etc.) are not visible in
 * this view of the file; the macro body below is kept unmodified. */
137 #define FULL_YSCALEYUV2RGB \
138 "pxor %%mm7, %%mm7 \n\t"\
139 "movd %6, %%mm6 \n\t" /*yalpha1*/\
140 "punpcklwd %%mm6, %%mm6 \n\t"\
141 "punpcklwd %%mm6, %%mm6 \n\t"\
142 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
143 "punpcklwd %%mm5, %%mm5 \n\t"\
144 "punpcklwd %%mm5, %%mm5 \n\t"\
145 "xorl %%eax, %%eax \n\t"\
148 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
149 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
150 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
151 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
152 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
153 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
154 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
155 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
156 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
157 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
158 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
159 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
160 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
161 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
162 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
163 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
164 "psubw w400, %%mm3 \n\t" /* 8(U-128)*/\
165 "pmulhw yCoeff, %%mm1 \n\t"\
168 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
169 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
170 "pmulhw ubCoeff, %%mm3 \n\t"\
171 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
172 "pmulhw ugCoeff, %%mm2 \n\t"\
173 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
174 "psubw w400, %%mm0 \n\t" /* (V-128)8*/\
177 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
178 "pmulhw vrCoeff, %%mm0 \n\t"\
179 "pmulhw vgCoeff, %%mm4 \n\t"\
180 "paddw %%mm1, %%mm3 \n\t" /* B*/\
181 "paddw %%mm1, %%mm0 \n\t" /* R*/\
182 "packuswb %%mm3, %%mm3 \n\t"\
184 "packuswb %%mm0, %%mm0 \n\t"\
185 "paddw %%mm4, %%mm2 \n\t"\
186 "paddw %%mm2, %%mm1 \n\t" /* G*/\
188 "packuswb %%mm1, %%mm1 \n\t"
/* Like FULL_YSCALEYUV2RGB but processes 8 luma pixels per iteration with
 * 2:1 horizontally subsampled chroma. The broadcast weights are spilled to
 * the asm_yalpha1/asm_uvalpha1 memory slots because all eight MMX registers
 * are needed. Ends with packed bytes B in mm2, G in mm4, R in mm5 and mm7
 * cleared — the register layout the WRITEBGR* macros expect.
 * NOTE(review): some continuation lines (loop label etc.) are not visible in
 * this view of the file; the macro body below is kept unmodified. */
190 #define YSCALEYUV2RGB \
191 "movd %6, %%mm6 \n\t" /*yalpha1*/\
192 "punpcklwd %%mm6, %%mm6 \n\t"\
193 "punpcklwd %%mm6, %%mm6 \n\t"\
194 "movq %%mm6, asm_yalpha1 \n\t"\
195 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
196 "punpcklwd %%mm5, %%mm5 \n\t"\
197 "punpcklwd %%mm5, %%mm5 \n\t"\
198 "movq %%mm5, asm_uvalpha1 \n\t"\
199 "xorl %%eax, %%eax \n\t"\
202 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
203 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
204 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
205 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
206 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
207 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
208 "movq asm_uvalpha1, %%mm0 \n\t"\
209 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
210 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
211 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
212 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
213 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
214 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
215 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
216 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
217 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
218 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
219 "pmulhw ugCoeff, %%mm3 \n\t"\
220 "pmulhw vgCoeff, %%mm4 \n\t"\
221 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
222 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
223 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
224 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
225 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
226 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
227 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
228 "pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
229 "pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
230 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
231 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
232 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
233 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
234 "pmulhw ubCoeff, %%mm2 \n\t"\
235 "pmulhw vrCoeff, %%mm5 \n\t"\
236 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
237 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
238 "pmulhw yCoeff, %%mm1 \n\t"\
239 "pmulhw yCoeff, %%mm7 \n\t"\
240 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
241 "paddw %%mm3, %%mm4 \n\t"\
242 "movq %%mm2, %%mm0 \n\t"\
243 "movq %%mm5, %%mm6 \n\t"\
244 "movq %%mm4, %%mm3 \n\t"\
245 "punpcklwd %%mm2, %%mm2 \n\t"\
246 "punpcklwd %%mm5, %%mm5 \n\t"\
247 "punpcklwd %%mm4, %%mm4 \n\t"\
248 "paddw %%mm1, %%mm2 \n\t"\
249 "paddw %%mm1, %%mm5 \n\t"\
250 "paddw %%mm1, %%mm4 \n\t"\
251 "punpckhwd %%mm0, %%mm0 \n\t"\
252 "punpckhwd %%mm6, %%mm6 \n\t"\
253 "punpckhwd %%mm3, %%mm3 \n\t"\
254 "paddw %%mm7, %%mm0 \n\t"\
255 "paddw %%mm7, %%mm6 \n\t"\
256 "paddw %%mm7, %%mm3 \n\t"\
257 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
258 "packuswb %%mm0, %%mm2 \n\t"\
259 "packuswb %%mm6, %%mm5 \n\t"\
260 "packuswb %%mm3, %%mm4 \n\t"\
261 "pxor %%mm7, %%mm7 \n\t"
/* Variant of YSCALEYUV2RGB with no vertical interpolation at all: reads only
 * buf0 (%0) and uvbuf0 (%2); buf1/uvbuf1 are not touched. Same outputs as
 * YSCALEYUV2RGB: packed B in mm2, G in mm4, R in mm5, mm7 = 0.
 * NOTE(review): some continuation lines (loop label etc.) are not visible in
 * this view of the file; the macro body below is kept unmodified. */
263 #define YSCALEYUV2RGB1 \
264 "xorl %%eax, %%eax \n\t"\
267 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
268 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
269 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
270 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
271 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
272 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
273 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
274 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
275 "pmulhw ugCoeff, %%mm3 \n\t"\
276 "pmulhw vgCoeff, %%mm4 \n\t"\
277 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
278 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
279 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
280 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
281 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
282 "pmulhw ubCoeff, %%mm2 \n\t"\
283 "pmulhw vrCoeff, %%mm5 \n\t"\
284 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
285 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
286 "pmulhw yCoeff, %%mm1 \n\t"\
287 "pmulhw yCoeff, %%mm7 \n\t"\
288 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
289 "paddw %%mm3, %%mm4 \n\t"\
290 "movq %%mm2, %%mm0 \n\t"\
291 "movq %%mm5, %%mm6 \n\t"\
292 "movq %%mm4, %%mm3 \n\t"\
293 "punpcklwd %%mm2, %%mm2 \n\t"\
294 "punpcklwd %%mm5, %%mm5 \n\t"\
295 "punpcklwd %%mm4, %%mm4 \n\t"\
296 "paddw %%mm1, %%mm2 \n\t"\
297 "paddw %%mm1, %%mm5 \n\t"\
298 "paddw %%mm1, %%mm4 \n\t"\
299 "punpckhwd %%mm0, %%mm0 \n\t"\
300 "punpckhwd %%mm6, %%mm6 \n\t"\
301 "punpckhwd %%mm3, %%mm3 \n\t"\
302 "paddw %%mm7, %%mm0 \n\t"\
303 "paddw %%mm7, %%mm6 \n\t"\
304 "paddw %%mm7, %%mm3 \n\t"\
305 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
306 "packuswb %%mm0, %%mm2 \n\t"\
307 "packuswb %%mm6, %%mm5 \n\t"\
308 "packuswb %%mm3, %%mm4 \n\t"\
309 "pxor %%mm7, %%mm7 \n\t"
/* Variant of YSCALEYUV2RGB1 that averages uvbuf0 and uvbuf1 (paddw + psrlw,
 * i.e. fixed 1:1 vertical chroma weights) while luma still comes from buf0
 * only. Same outputs: packed B in mm2, G in mm4, R in mm5, mm7 = 0.
 * NOTE(review): some continuation lines (loop label etc.) are not visible in
 * this view of the file; the macro body below is kept unmodified. */
311 // do vertical chrominance interpolation
312 #define YSCALEYUV2RGB1b \
313 "xorl %%eax, %%eax \n\t"\
316 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
317 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
318 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
319 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
320 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
321 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
322 "psrlw $5, %%mm3 \n\t"\
323 "psrlw $5, %%mm4 \n\t"\
324 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
325 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
326 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
327 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
328 "pmulhw ugCoeff, %%mm3 \n\t"\
329 "pmulhw vgCoeff, %%mm4 \n\t"\
330 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
331 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
332 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
333 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
334 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
335 "pmulhw ubCoeff, %%mm2 \n\t"\
336 "pmulhw vrCoeff, %%mm5 \n\t"\
337 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
338 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
339 "pmulhw yCoeff, %%mm1 \n\t"\
340 "pmulhw yCoeff, %%mm7 \n\t"\
341 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
342 "paddw %%mm3, %%mm4 \n\t"\
343 "movq %%mm2, %%mm0 \n\t"\
344 "movq %%mm5, %%mm6 \n\t"\
345 "movq %%mm4, %%mm3 \n\t"\
346 "punpcklwd %%mm2, %%mm2 \n\t"\
347 "punpcklwd %%mm5, %%mm5 \n\t"\
348 "punpcklwd %%mm4, %%mm4 \n\t"\
349 "paddw %%mm1, %%mm2 \n\t"\
350 "paddw %%mm1, %%mm5 \n\t"\
351 "paddw %%mm1, %%mm4 \n\t"\
352 "punpckhwd %%mm0, %%mm0 \n\t"\
353 "punpckhwd %%mm6, %%mm6 \n\t"\
354 "punpckhwd %%mm3, %%mm3 \n\t"\
355 "paddw %%mm7, %%mm0 \n\t"\
356 "paddw %%mm7, %%mm6 \n\t"\
357 "paddw %%mm7, %%mm3 \n\t"\
358 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
359 "packuswb %%mm0, %%mm2 \n\t"\
360 "packuswb %%mm6, %%mm5 \n\t"\
361 "packuswb %%mm3, %%mm4 \n\t"\
362 "pxor %%mm7, %%mm7 \n\t"
/* NOTE(review): the "#define" header lines introducing the next three macro
 * bodies are missing from this view of the file. Judging by the store
 * patterns they are presumably the pixel writers consuming B=mm2, G=mm4,
 * R=mm5, mm7=0 from the YSCALEYUV2RGB* macros:
 *   1st body: 4-byte stores at (%4,%%eax,4)          -> 32 bpp writer
 *   2nd body: bF8/bFC masks, 2-byte-scaled stores    -> 16 bpp (565) writer
 *   3rd body: bF8 masks + psrlq $1 on R              -> 15 bpp (555) writer
 * Restore the headers from the full source before building; the bodies below
 * are kept unmodified. */
365 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
366 "movq %%mm2, %%mm1 \n\t" /* B */\
367 "movq %%mm5, %%mm6 \n\t" /* R */\
368 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
369 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
370 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
371 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
372 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
373 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
374 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
375 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
376 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
377 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
379 MOVNTQ(%%mm0, (%4, %%eax, 4))\
380 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
381 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
382 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
384 "addl $8, %%eax \n\t"\
385 "cmpl %5, %%eax \n\t"\
389 "pand bF8, %%mm2 \n\t" /* B */\
390 "pand bFC, %%mm4 \n\t" /* G */\
391 "pand bF8, %%mm5 \n\t" /* R */\
392 "psrlq $3, %%mm2 \n\t"\
394 "movq %%mm2, %%mm1 \n\t"\
395 "movq %%mm4, %%mm3 \n\t"\
397 "punpcklbw %%mm7, %%mm3 \n\t"\
398 "punpcklbw %%mm5, %%mm2 \n\t"\
399 "punpckhbw %%mm7, %%mm4 \n\t"\
400 "punpckhbw %%mm5, %%mm1 \n\t"\
402 "psllq $3, %%mm3 \n\t"\
403 "psllq $3, %%mm4 \n\t"\
405 "por %%mm3, %%mm2 \n\t"\
406 "por %%mm4, %%mm1 \n\t"\
408 MOVNTQ(%%mm2, (%4, %%eax, 2))\
409 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
411 "addl $8, %%eax \n\t"\
412 "cmpl %5, %%eax \n\t"\
416 "pand bF8, %%mm2 \n\t" /* B */\
417 "pand bF8, %%mm4 \n\t" /* G */\
418 "pand bF8, %%mm5 \n\t" /* R */\
419 "psrlq $3, %%mm2 \n\t"\
420 "psrlq $1, %%mm5 \n\t"\
422 "movq %%mm2, %%mm1 \n\t"\
423 "movq %%mm4, %%mm3 \n\t"\
425 "punpcklbw %%mm7, %%mm3 \n\t"\
426 "punpcklbw %%mm5, %%mm2 \n\t"\
427 "punpckhbw %%mm7, %%mm4 \n\t"\
428 "punpckhbw %%mm5, %%mm1 \n\t"\
430 "psllq $2, %%mm3 \n\t"\
431 "psllq $2, %%mm4 \n\t"\
433 "por %%mm3, %%mm2 \n\t"\
434 "por %%mm4, %%mm1 \n\t"\
436 MOVNTQ(%%mm2, (%4, %%eax, 2))\
437 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
439 "addl $8, %%eax \n\t"\
440 "cmpl %5, %%eax \n\t"\
/* Pack B (mm2), G (mm4), R (mm5) byte vectors into 24 bytes of 24bpp BGR and
 * store them at %%ebx via MOVNTQ; advances ebx by 24 and eax by 8. Older
 * shift/mask implementation (superseded by WRITEBGR24MMX/MMX2 below).
 * NOTE(review): some continuation lines are not visible in this view; the
 * macro body below is kept unmodified. */
443 #define WRITEBGR24OLD \
444 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
445 "movq %%mm2, %%mm1 \n\t" /* B */\
446 "movq %%mm5, %%mm6 \n\t" /* R */\
447 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
448 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
449 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
450 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
451 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
452 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
453 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
454 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
455 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
456 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
458 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
459 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
460 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\
461 "pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\
462 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
463 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
464 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
465 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
467 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
468 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
469 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
470 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
471 "pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\
472 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
473 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
474 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\
475 "pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\
476 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
477 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
478 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
479 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
481 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
482 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
483 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
484 "pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\
485 "pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\
486 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
487 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
488 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
490 MOVNTQ(%%mm0, (%%ebx))\
491 MOVNTQ(%%mm2, 8(%%ebx))\
492 MOVNTQ(%%mm3, 16(%%ebx))\
493 "addl $24, %%ebx \n\t"\
495 "addl $8, %%eax \n\t"\
496 "cmpl %5, %%eax \n\t"\
/* Plain-MMX 24bpp writer: packs B (mm2), G (mm4), R (mm5) into 24 bytes at
 * %%ebx using shifts/punpckhdq; advances ebx by 24 and eax by 8.
 * NOTE(review): some continuation lines are not visible in this view; the
 * macro body below is kept unmodified. */
499 #define WRITEBGR24MMX \
500 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
501 "movq %%mm2, %%mm1 \n\t" /* B */\
502 "movq %%mm5, %%mm6 \n\t" /* R */\
503 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
504 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
505 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
506 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
507 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
508 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
509 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
510 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
511 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
512 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
514 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
515 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
516 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
517 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
519 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
520 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
521 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
522 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
524 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
525 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
526 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
527 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
529 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
530 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
531 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
532 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
533 MOVNTQ(%%mm0, (%%ebx))\
535 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
536 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
537 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
538 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
539 MOVNTQ(%%mm6, 8(%%ebx))\
541 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
542 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
543 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
544 MOVNTQ(%%mm5, 16(%%ebx))\
546 "addl $24, %%ebx \n\t"\
548 "addl $8, %%eax \n\t"\
549 "cmpl %5, %%eax \n\t"\
/* MMX2 24bpp writer: same contract as WRITEBGR24MMX but uses pshufw (an
 * MMX2-only instruction) with the M24A/M24B/M24C byte masks; advances ebx by
 * 24 and eax by 8.
 * NOTE(review): some continuation lines are not visible in this view; the
 * macro body below is kept unmodified. */
552 #define WRITEBGR24MMX2 \
553 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
554 "movq M24A, %%mm0 \n\t"\
555 "movq M24C, %%mm7 \n\t"\
556 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
557 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
558 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
560 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
561 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
562 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
564 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
565 "por %%mm1, %%mm6 \n\t"\
566 "por %%mm3, %%mm6 \n\t"\
567 MOVNTQ(%%mm6, (%%ebx))\
569 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
570 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
571 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
572 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
574 "pand M24B, %%mm1 \n\t" /* B5 B4 B3 */\
575 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
576 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
578 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
579 "por %%mm3, %%mm6 \n\t"\
580 MOVNTQ(%%mm6, 8(%%ebx))\
582 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
583 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
584 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
586 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
587 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
588 "pand M24B, %%mm6 \n\t" /* R7 R6 R5 */\
590 "por %%mm1, %%mm3 \n\t"\
591 "por %%mm3, %%mm6 \n\t"\
592 MOVNTQ(%%mm6, 16(%%ebx))\
594 "addl $24, %%ebx \n\t"\
596 "addl $8, %%eax \n\t"\
597 "cmpl %5, %%eax \n\t"\
/* Select the 24bpp writer: the pshufw-based version requires MMX2, otherwise
 * fall back to the plain MMX version.
 * NOTE(review): the original conditional lines were lost from this view of
 * the file (the two #defines appeared back-to-back, which would redefine
 * WRITEBGR24); this restores the obvious HAVE_MMX2 guard — confirm against
 * the full source. */
#ifdef HAVE_MMX2
#define WRITEBGR24 WRITEBGR24MMX2
#else
#define WRITEBGR24 WRITEBGR24MMX
#endif
// Sums every constant that is referenced only by name from inline asm —
// presumably so the compiler does not warn about (or discard) "unused"
// statics. Never meant to be called for its value.
// NOTE(review): the function's braces and the tail of this expression are
// not visible in this view of the file; lines kept unmodified.
607 void in_asm_used_var_warning_killer()
609 int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
610 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+
/* Vertically blend two 16-bit intermediate luma lines (buf0/buf1, weights
 * yalpha1/yalpha) and two chroma line pairs (uvbuf0/uvbuf1, weights
 * uvalpha1/uvalpha; the V samples live at word offset +2048) down to 8-bit
 * Y/U/V output planes (dest/uDest/vDest). yalpha1 = yalpha ^ 4095 is the
 * complementary 12-bit weight; products are reduced with >>19.
 * Chroma runs over dstw/2 samples (2:1 horizontal subsampling).
 * NOTE(review): loop headers / braces / #ifdef arms are partially missing
 * from this view of the file; all lines kept unmodified. */
616 static inline void yuv2yuv(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
617 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstw, int yalpha, int uvalpha)
619 int yalpha1=yalpha^4095;
620 int uvalpha1=uvalpha^4095;
623 asm volatile ("\n\t"::: "memory");
627 ((uint8_t*)dest)[i] = (buf0[i]*yalpha1+buf1[i]*yalpha)>>19;
632 for(i=0; i<(dstw>>1); i++)
634 ((uint8_t*)uDest)[i] = (uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19;
635 ((uint8_t*)vDest)[i] = (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;
/* Vertically interpolate one output line from the 16-bit intermediate
 * buffers (weights yalpha/uvalpha, complement = value ^ 4095, reduction
 * >>19 as in yuv2yuv) and convert YUV->RGB at dstbpp = 32/24/16/15.
 * The MMX paths use the YSCALEYUV2RGB / MOVNTQ machinery and the *Dither /
 * *Mask constants; the C fallback uses the yuvtab_* and clip_table* lookup
 * tables, handling two pixels per iteration (shared U/V per pixel pair).
 * NOTE(review): large parts of this function — the #ifdef structure, asm
 * block openings, loop headers and closing braces — are not visible in this
 * view of the file; all lines below are kept unmodified. */
641 * vertical scale YV12 to RGB
643 static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
644 uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
646 int yalpha1=yalpha^4095;
647 int uvalpha1=uvalpha^4095;
659 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
660 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
662 "movq %%mm3, %%mm1 \n\t"
663 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
664 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
666 MOVNTQ(%%mm3, (%4, %%eax, 4))
667 MOVNTQ(%%mm1, 8(%4, %%eax, 4))
669 "addl $4, %%eax \n\t"
670 "cmpl %5, %%eax \n\t"
674 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
675 "m" (yalpha1), "m" (uvalpha1)
686 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
687 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
689 "movq %%mm3, %%mm1 \n\t"
690 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
691 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
693 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
694 "psrlq $8, %%mm3 \n\t" // GR0BGR00
695 "pand bm00000111, %%mm2 \n\t" // BGR00000
696 "pand bm11111000, %%mm3 \n\t" // 000BGR00
697 "por %%mm2, %%mm3 \n\t" // BGRBGR00
698 "movq %%mm1, %%mm2 \n\t"
699 "psllq $48, %%mm1 \n\t" // 000000BG
700 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
702 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
703 "psrld $16, %%mm2 \n\t" // R000R000
704 "psrlq $24, %%mm1 \n\t" // 0BGR0000
705 "por %%mm2, %%mm1 \n\t" // RBGRR000
707 "movl %4, %%ebx \n\t"
708 "addl %%eax, %%ebx \n\t"
712 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
713 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
715 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
716 "psrlq $32, %%mm3 \n\t"
717 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
718 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
720 "addl $4, %%eax \n\t"
721 "cmpl %5, %%eax \n\t"
724 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
725 "m" (yalpha1), "m" (uvalpha1)
735 "paddusb g5Dither, %%mm1 \n\t"
736 "paddusb r5Dither, %%mm0 \n\t"
737 "paddusb b5Dither, %%mm3 \n\t"
739 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
740 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
741 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
743 "psrlw $3, %%mm3 \n\t"
744 "psllw $2, %%mm1 \n\t"
745 "psllw $7, %%mm0 \n\t"
746 "pand g15Mask, %%mm1 \n\t"
747 "pand r15Mask, %%mm0 \n\t"
749 "por %%mm3, %%mm1 \n\t"
750 "por %%mm1, %%mm0 \n\t"
752 MOVNTQ(%%mm0, (%4, %%eax, 2))
754 "addl $4, %%eax \n\t"
755 "cmpl %5, %%eax \n\t"
758 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
759 "m" (yalpha1), "m" (uvalpha1)
769 "paddusb g6Dither, %%mm1 \n\t"
770 "paddusb r5Dither, %%mm0 \n\t"
771 "paddusb b5Dither, %%mm3 \n\t"
773 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
774 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
775 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
777 "psrlw $3, %%mm3 \n\t"
778 "psllw $3, %%mm1 \n\t"
779 "psllw $8, %%mm0 \n\t"
780 "pand g16Mask, %%mm1 \n\t"
781 "pand r16Mask, %%mm0 \n\t"
783 "por %%mm3, %%mm1 \n\t"
784 "por %%mm1, %%mm0 \n\t"
786 MOVNTQ(%%mm0, (%4, %%eax, 2))
788 "addl $4, %%eax \n\t"
789 "cmpl %5, %%eax \n\t"
792 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
793 "m" (yalpha1), "m" (uvalpha1)
798 asm volatile ("\n\t"::: "memory");
// C fallback, full-chroma path: 32 and 24 bpp share per-pixel chroma.
800 if(dstbpp==32 || dstbpp==24)
804 // vertical linear interpolation && yuv2rgb in a single step:
805 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
806 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
807 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
808 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
809 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
810 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
818 // vertical linear interpolation && yuv2rgb in a single step:
819 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
820 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
821 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
823 ((uint16_t*)dest)[i] =
824 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
825 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
826 clip_table16r[(Y + yuvtab_3343[V]) >>13];
833 // vertical linear interpolation && yuv2rgb in a single step:
834 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
835 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
836 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
838 ((uint16_t*)dest)[i] =
839 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
840 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
841 clip_table15r[(Y + yuvtab_3343[V]) >>13];
855 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
856 "m" (yalpha1), "m" (uvalpha1)
863 "movl %4, %%ebx \n\t"
867 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
868 "m" (yalpha1), "m" (uvalpha1)
876 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
878 "paddusb b5Dither, %%mm2 \n\t"
879 "paddusb g5Dither, %%mm4 \n\t"
880 "paddusb r5Dither, %%mm5 \n\t"
885 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
886 "m" (yalpha1), "m" (uvalpha1)
894 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
896 "paddusb b5Dither, %%mm2 \n\t"
897 "paddusb g6Dither, %%mm4 \n\t"
898 "paddusb r5Dither, %%mm5 \n\t"
903 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
904 "m" (yalpha1), "m" (uvalpha1)
909 asm volatile ("\n\t"::: "memory");
// C fallback, subsampled-chroma path: two pixels per iteration share U/V.
914 for(i=0; i<dstw-1; i+=2){
915 // vertical linear interpolation && yuv2rgb in a single step:
916 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
917 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
918 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
919 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
921 int Cb= yuvtab_40cf[U];
922 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
923 int Cr= yuvtab_3343[V];
925 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
926 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
927 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
929 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
930 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
931 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
937 for(i=0; i<dstw-1; i+=2){
938 // vertical linear interpolation && yuv2rgb in a single step:
939 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
940 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
941 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
942 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
944 int Cb= yuvtab_40cf[U];
945 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
946 int Cr= yuvtab_3343[V];
948 dest[0]=clip_table[((Y1 + Cb) >>13)];
949 dest[1]=clip_table[((Y1 + Cg) >>13)];
950 dest[2]=clip_table[((Y1 + Cr) >>13)];
952 dest[3]=clip_table[((Y2 + Cb) >>13)];
953 dest[4]=clip_table[((Y2 + Cg) >>13)];
954 dest[5]=clip_table[((Y2 + Cr) >>13)];
961 for(i=0; i<dstw-1; i+=2){
962 // vertical linear interpolation && yuv2rgb in a single step:
963 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
964 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
965 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
966 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
968 int Cb= yuvtab_40cf[U];
969 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
970 int Cr= yuvtab_3343[V];
972 ((uint16_t*)dest)[i] =
973 clip_table16b[(Y1 + Cb) >>13] |
974 clip_table16g[(Y1 + Cg) >>13] |
975 clip_table16r[(Y1 + Cr) >>13];
977 ((uint16_t*)dest)[i+1] =
978 clip_table16b[(Y2 + Cb) >>13] |
979 clip_table16g[(Y2 + Cg) >>13] |
980 clip_table16r[(Y2 + Cr) >>13];
986 for(i=0; i<dstw-1; i+=2){
987 // vertical linear interpolation && yuv2rgb in a single step:
988 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
989 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
990 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
991 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
993 int Cb= yuvtab_40cf[U];
994 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
995 int Cr= yuvtab_3343[V];
997 ((uint16_t*)dest)[i] =
998 clip_table15b[(Y1 + Cb) >>13] |
999 clip_table15g[(Y1 + Cg) >>13] |
1000 clip_table15r[(Y1 + Cr) >>13];
1002 ((uint16_t*)dest)[i+1] =
1003 clip_table15b[(Y2 + Cb) >>13] |
1004 clip_table15g[(Y2 + Cg) >>13] |
1005 clip_table15r[(Y2 + Cr) >>13];
1013 * YV12 to RGB without scaling or interpolating
// Fast path for (near) 1:1 vertical scaling: instead of blending the two luma
// lines it simply picks the nearer one, and only the chroma may be blended.
// buf0/buf1 are horizontally pre-scaled luma lines; uvbuf0/uvbuf1 hold the
// chroma, with V stored 2048 uint16_t entries after U.  yalpha/uvalpha are
// 12-bit (0..4095) vertical blend weights towards buf1/uvbuf1; dstbpp selects
// the packed-RGB output depth (32/24/16/15 paths visible below).
// NOTE(review): this chunk is a partial extraction (interior lines missing);
// comments describe only the code that is visible here.
1015 static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1016 uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
// complementary weights: x^4095 == 4095-x for 0 <= x <= 4095
1018 int uvalpha1=uvalpha^4095;
1020 int yalpha1=yalpha^4095;
// full chroma interpolation requested -> delegate to the general blending
// version and skip the fast path entirely
1023 if(fullUVIpol || allwaysIpol)
1025 yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
// nearest-neighbour in y for luma: take whichever line is closer
1028 if( yalpha > 2048 ) buf0 = buf1;
1031 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
// The asm variants below share the operand layout: %0..%3 = the four pixel
// buffers, %4 = dest (register or memory depending on variant), %5 = dstw,
// with yalpha1/uvalpha1 as memory operands.
1038 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
1039 "m" (yalpha1), "m" (uvalpha1)
1046 "movl %4, %%ebx \n\t"
1049 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
1050 "m" (yalpha1), "m" (uvalpha1)
// 15 bpp path: add 5-bit ordered-dither constants to all three channels
// before packing (mm2/mm4/mm5 carry B/G/R as noted)
1058 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1060 "paddusb b5Dither, %%mm2 \n\t"
1061 "paddusb g5Dither, %%mm4 \n\t"
1062 "paddusb r5Dither, %%mm5 \n\t"
1065 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
1066 "m" (yalpha1), "m" (uvalpha1)
1074 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
// 16 bpp path: green uses the 6-bit dither table (g6Dither), red/blue 5-bit
1076 "paddusb b5Dither, %%mm2 \n\t"
1077 "paddusb g6Dither, %%mm4 \n\t"
1078 "paddusb r5Dither, %%mm5 \n\t"
1082 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
1083 "m" (yalpha1), "m" (uvalpha1)
// second group of asm variants (other chroma-blend branch), same operand
// layout and dither scheme as above
1095 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
1096 "m" (yalpha1), "m" (uvalpha1)
1103 "movl %4, %%ebx \n\t"
1106 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
1107 "m" (yalpha1), "m" (uvalpha1)
1115 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1117 "paddusb b5Dither, %%mm2 \n\t"
1118 "paddusb g5Dither, %%mm4 \n\t"
1119 "paddusb r5Dither, %%mm5 \n\t"
1122 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
1123 "m" (yalpha1), "m" (uvalpha1)
1131 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1133 "paddusb b5Dither, %%mm2 \n\t"
1134 "paddusb g6Dither, %%mm4 \n\t"
1135 "paddusb r5Dither, %%mm5 \n\t"
1139 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
1140 "m" (yalpha1), "m" (uvalpha1)
1146 //FIXME write 2 versions (for even & odd lines)
// compiler barrier: keep the C fallbacks below from being reordered around
// the preceding MMX code
1147 asm volatile ("\n\t"::: "memory");
// --- plain C fallbacks: two pixels per iteration, sharing one chroma
// sample (i>>1).  Luma is taken from buf0 only (>>7 drops the 7 fractional
// bits produced by hyscale); chroma is still blended with uvalpha. ---
// 32 bpp (4 bytes/pixel, byte order B,G,R within each pixel):
1152 for(i=0; i<dstw-1; i+=2){
1153 // vertical linear interpolation && yuv2rgb in a single step:
1154 int Y1=yuvtab_2568[buf0[i]>>7];
1155 int Y2=yuvtab_2568[buf0[i+1]>>7];
1156 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1157 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1159 int Cb= yuvtab_40cf[U];
1160 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1161 int Cr= yuvtab_3343[V];
1163 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1164 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1165 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1167 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1168 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1169 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
// 24 bpp (3 bytes/pixel; dest is presumably advanced per pixel pair in a
// line not visible in this extraction -- TODO confirm against full source):
1175 for(i=0; i<dstw-1; i+=2){
1176 // vertical linear interpolation && yuv2rgb in a single step:
1177 int Y1=yuvtab_2568[buf0[i]>>7];
1178 int Y2=yuvtab_2568[buf0[i+1]>>7];
1179 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1180 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1182 int Cb= yuvtab_40cf[U];
1183 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1184 int Cr= yuvtab_3343[V];
1186 dest[0]=clip_table[((Y1 + Cb) >>13)];
1187 dest[1]=clip_table[((Y1 + Cg) >>13)];
1188 dest[2]=clip_table[((Y1 + Cr) >>13)];
1190 dest[3]=clip_table[((Y2 + Cb) >>13)];
1191 dest[4]=clip_table[((Y2 + Cg) >>13)];
1192 dest[5]=clip_table[((Y2 + Cr) >>13)];
// 16 bpp (RGB565 via the precomputed per-channel clip/shift tables):
1199 for(i=0; i<dstw-1; i+=2){
1200 // vertical linear interpolation && yuv2rgb in a single step:
1201 int Y1=yuvtab_2568[buf0[i]>>7];
1202 int Y2=yuvtab_2568[buf0[i+1]>>7];
1203 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1204 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1206 int Cb= yuvtab_40cf[U];
1207 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1208 int Cr= yuvtab_3343[V];
1210 ((uint16_t*)dest)[i] =
1211 clip_table16b[(Y1 + Cb) >>13] |
1212 clip_table16g[(Y1 + Cg) >>13] |
1213 clip_table16r[(Y1 + Cr) >>13];
1215 ((uint16_t*)dest)[i+1] =
1216 clip_table16b[(Y2 + Cb) >>13] |
1217 clip_table16g[(Y2 + Cg) >>13] |
1218 clip_table16r[(Y2 + Cr) >>13];
// 15 bpp (RGB555, same structure with the 15-bit tables):
1224 for(i=0; i<dstw-1; i+=2){
1225 // vertical linear interpolation && yuv2rgb in a single step:
1226 int Y1=yuvtab_2568[buf0[i]>>7];
1227 int Y2=yuvtab_2568[buf0[i+1]>>7];
1228 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1229 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1231 int Cb= yuvtab_40cf[U];
1232 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1233 int Cr= yuvtab_3343[V];
1235 ((uint16_t*)dest)[i] =
1236 clip_table15b[(Y1 + Cb) >>13] |
1237 clip_table15g[(Y1 + Cg) >>13] |
1238 clip_table15r[(Y1 + Cr) >>13];
1240 ((uint16_t*)dest)[i+1] =
1241 clip_table15b[(Y2 + Cb) >>13] |
1242 clip_table15g[(Y2 + Cg) >>13] |
1243 clip_table15r[(Y2 + Cr) >>13];
// Horizontally scale one 8-bit luma line (src, srcWidth samples) into dst
// (dstWidth uint16_t samples with 7 fractional bits, i.e. value*128).
// xInc is the 16.16 fixed-point source step per destination pixel.
// Three variants: MMX2 (calls runtime-generated code at funnyYCode),
// plain x86 asm, and portable C.
1250 static inline void hyscale(uint16_t *dst, int dstWidth, uint8_t *src, int srcWidth, int xInc)
1252 // *** horizontal scale Y line to temp buffer
// --- MMX2 variant: build the per-lane start offsets 0,t,2t,3t (t = xInc
// fraction) in mm2, park them in temp0, and broadcast the 4-pixel step
// (xInc*4) into mm6 for the generated inner loop. ---
1259 "pxor %%mm7, %%mm7 \n\t"
1260 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
1261 "movd %5, %%mm6 \n\t" // xInc&0xFFFF
1262 "punpcklwd %%mm6, %%mm6 \n\t"
1263 "punpcklwd %%mm6, %%mm6 \n\t"
1264 "movq %%mm6, %%mm2 \n\t"
1265 "psllq $16, %%mm2 \n\t"
1266 "paddw %%mm6, %%mm2 \n\t"
1267 "psllq $16, %%mm2 \n\t"
1268 "paddw %%mm6, %%mm2 \n\t"
1269 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
1270 "movq %%mm2, temp0 \n\t"
1271 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
1272 "punpcklwd %%mm6, %%mm6 \n\t"
1273 "punpcklwd %%mm6, %%mm6 \n\t"
1274 "xorl %%eax, %%eax \n\t" // i
1275 "movl %0, %%esi \n\t" // src
1276 "movl %1, %%edi \n\t" // buf1
1277 "movl %3, %%edx \n\t" // (xInc*4)>>16
1278 "xorl %%ecx, %%ecx \n\t"
1279 "xorl %%ebx, %%ebx \n\t"
1280 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
// prefetch ahead, run the generated scaler fragment, then restore the
// per-lane fractional positions and reset the accumulated carry
1282 #define FUNNY_Y_CODE \
1283 PREFETCH" 1024(%%esi) \n\t"\
1284 PREFETCH" 1056(%%esi) \n\t"\
1285 PREFETCH" 1088(%%esi) \n\t"\
1286 "call funnyYCode \n\t"\
1287 "movq temp0, %%mm2 \n\t"\
1288 "xorl %%ecx, %%ecx \n\t"
1299 :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
1300 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
1301 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
// tail fix-up: any dst pixel whose source position falls past the last
// readable sample pair gets the (scaled) last source sample
1303 for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth-1; i--) dst[i] = src[srcWidth-1]*128;
1308 //NO MMX just normal asm ...
// --- plain-asm variant: classic add/adc walk, ecx holds the 16-bit
// fractional position, ebx the integer source index; two pixels per
// iteration, result >>9 leaves 7 fractional bits. ---
1310 "xorl %%eax, %%eax \n\t" // i
1311 "xorl %%ebx, %%ebx \n\t" // xx
1312 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
1315 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
1316 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
1317 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1318 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1319 "shll $16, %%edi \n\t"
1320 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1321 "movl %1, %%edi \n\t"
1322 "shrl $9, %%esi \n\t"
1323 "movw %%si, (%%edi, %%eax, 2) \n\t"
1324 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
1325 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
1327 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
1328 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
1329 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1330 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1331 "shll $16, %%edi \n\t"
1332 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1333 "movl %1, %%edi \n\t"
1334 "shrl $9, %%esi \n\t"
1335 "movw %%si, 2(%%edi, %%eax, 2) \n\t"
1336 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
1337 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
1340 "addl $2, %%eax \n\t"
1341 "cmpl %2, %%eax \n\t"
1345 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
1346 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
1349 } //if MMX2 cant be used
// --- portable C variant: same bilinear interpolation in 16.16 fixed point;
// xalpha is the 7-bit fraction, output keeps 7 fractional bits (<<7). ---
1353 unsigned int xpos=0;
1354 for(i=0;i<dstWidth;i++)
1356 register unsigned int xx=xpos>>16;
1357 register unsigned int xalpha=(xpos&0xFFFF)>>9;
1358 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
// Horizontally scale one pair of 8-bit chroma lines (src1 = U, src2 = V,
// each srcWidth/2 samples) into dst: U goes to dst[0..dstWidth-1], V to
// dst[2048..] (the asm uses the equivalent byte offset 4096).  Output is
// uint16_t with 7 fractional bits, xInc is the 16.16 step per dst pixel.
// Same three variants as hyscale: generated-MMX2, plain asm, portable C.
1364 inline static void hcscale(uint16_t *dst, int dstWidth,
1365 uint8_t *src1, uint8_t *src2, int srcWidth, int xInc)
// --- MMX2 variant setup: identical scheme to hyscale (per-lane start
// fractions 0,t,2t,3t saved to temp0, 4-pixel step broadcast in mm6) ---
1373 "pxor %%mm7, %%mm7 \n\t"
1374 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
1375 "movd %5, %%mm6 \n\t" // xInc&0xFFFF
1376 "punpcklwd %%mm6, %%mm6 \n\t"
1377 "punpcklwd %%mm6, %%mm6 \n\t"
1378 "movq %%mm6, %%mm2 \n\t"
1379 "psllq $16, %%mm2 \n\t"
1380 "paddw %%mm6, %%mm2 \n\t"
1381 "psllq $16, %%mm2 \n\t"
1382 "paddw %%mm6, %%mm2 \n\t"
1383 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
1384 "movq %%mm2, temp0 \n\t"
1385 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
1386 "punpcklwd %%mm6, %%mm6 \n\t"
1387 "punpcklwd %%mm6, %%mm6 \n\t"
1388 "xorl %%eax, %%eax \n\t" // i
1389 "movl %0, %%esi \n\t" // src
1390 "movl %1, %%edi \n\t" // buf1
1391 "movl %3, %%edx \n\t" // (xInc*4)>>16
1392 "xorl %%ecx, %%ecx \n\t"
1393 "xorl %%ebx, %%ebx \n\t"
1394 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
// run the generated chroma scaler, then restore lane fractions / carry
1396 #define FUNNYUVCODE \
1397 PREFETCH" 1024(%%esi) \n\t"\
1398 PREFETCH" 1056(%%esi) \n\t"\
1399 PREFETCH" 1088(%%esi) \n\t"\
1400 "call funnyUVCode \n\t"\
1401 "movq temp0, %%mm2 \n\t"\
1402 "xorl %%ecx, %%ecx \n\t"
// second pass over the V plane: same dst buffer, offset by 4096 bytes
// (== 2048 uint16_t entries)
1413 "xorl %%eax, %%eax \n\t" // i
1414 "movl %6, %%esi \n\t" // src
1415 "movl %1, %%edi \n\t" // buf1
1416 "addl $4096, %%edi \n\t"
1428 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
1429 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
1430 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
// tail fix-up: clamp trailing dst pixels to the last chroma sample of
// each plane (srcWidth is the luma width, so chroma has srcWidth/2 samples)
1432 for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth/2-1; i--)
1434 dst[i] = src1[srcWidth/2-1]*128;
1435 dst[i+2048] = src2[srcWidth/2-1]*128;
// --- plain-asm variant: interpolates U and V inside the same iteration,
// sharing one fractional position (ecx) and source index (ebx);
// one pixel per iteration, results >>9 keep 7 fractional bits. ---
1442 "xorl %%eax, %%eax \n\t" // i
1443 "xorl %%ebx, %%ebx \n\t" // xx
1444 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
1447 "movl %0, %%esi \n\t"
1448 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
1449 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
1450 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1451 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1452 "shll $16, %%edi \n\t"
1453 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1454 "movl %1, %%edi \n\t"
1455 "shrl $9, %%esi \n\t"
1456 "movw %%si, (%%edi, %%eax, 2) \n\t"
1458 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
1459 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
1460 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1461 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1462 "shll $16, %%edi \n\t"
1463 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1464 "movl %1, %%edi \n\t"
1465 "shrl $9, %%esi \n\t"
1466 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
1468 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
1469 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
1470 "addl $1, %%eax \n\t"
1471 "cmpl %2, %%eax \n\t"
1474 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
1476 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
1479 } //if MMX2 cant be used
// --- portable C variant: both an (xalpha^127)-weighted form and the
// difference form appear here (presumably #if-selected in the full file --
// TODO confirm); both keep 7 fractional bits. ---
1483 unsigned int xpos=0;
1484 for(i=0;i<dstWidth;i++)
1486 register unsigned int xx=xpos>>16;
1487 register unsigned int xalpha=(xpos&0xFFFF)>>9;
1488 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1489 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1491 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
1492 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
1500 // *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
1501 // *** Note: it's called multiple times while decoding a frame, first time y==0
1502 // *** Designed to upscale, but may work for downscale too.
1503 // s_xinc = (src_width << 16) / dst_width
1504 // s_yinc = (src_height << 16) / dst_height
// Main entry point: consumes a YV12 slice (srcptr[0..2] = Y,U,V planes with
// per-plane strides) starting at source line y, h lines tall, and emits the
// corresponding destination lines.  Keeps static state between calls
// (s_srcypos, s_last_ypos, s_last_y1pos, old_dstw, old_s_xinc) so slices of
// one frame chain together; relies on the caller starting each frame at y==0.
// NOTE(review): not reentrant / not thread-safe because of that static state.
1505 void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int y, int h,
1506 uint8_t* dstptr[], int dststride, int dstw, int dstbpp,
1507 unsigned int s_xinc,unsigned int s_yinc){
1510 //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
1511 //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;
1513 unsigned int s_xinc2;
1515 static int s_srcypos; // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src)
1518 // last horizontally interpolated lines, used to avoid unnecessary calculations
1519 static int s_last_ypos;
1520 static int s_last_y1pos;
1523 // used to detect a horizontal size change
1524 static int old_dstw= -1;
1525 static int old_s_xinc= -1;
// round dstw down to a multiple of 8 if the rounded-up width would not fit
// in dststride (the scalers write in 8-pixel granules)
1532 if(((dstw + 7)&(~7)) >= dststride) dstw&= ~7;
1534 srcWidth= (dstw*s_xinc + 0x8000)>>16;
1535 dstUVw= fullUVIpol ? dstw : dstw/2;
// the generated MMX2 scaler needs no downscale and 32/16-aligned widths
1538 canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0;
1541 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1542 // n-2 is the last chrominance sample available
1543 // FIXME this is not perfect, but no one should notice the difference, the more correct variant
1544 // would be like the vertical one, but that would require some special code for the
1545 // first and last pixel
1546 if(canMMX2BeUsed) s_xinc+= 20;
1547 else s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20;
// chroma runs at half the luma step when it is interpolated to full width
// (and the target is RGB, not YV12)
1549 if(fullUVIpol && !(dstbpp==12)) s_xinc2= s_xinc>>1;
1550 else s_xinc2= s_xinc;
1551 // force calculation of the horizontal interpolation of the first line
1554 // printf("dstw %d, srcw %d, mmx2 %d\n", dstw, srcWidth, canMMX2BeUsed);
1557 s_srcypos= s_yinc/2 - 0x8000;
1560 // clean the buffers so that no green stuff is drawn if the width is not sane (%8=0)
// neutral chroma is 128, stored with 7 fractional bits -> 128*128
1561 for(i=dstw-2; i<dstw+20; i++)
1563 pix_buf_uv[0][i] = pix_buf_uv[1][i]
1564 = pix_buf_uv[0][2048+i] = pix_buf_uv[1][2048+i] = 128*128;
1565 pix_buf_uv[0][i/2] = pix_buf_uv[1][i/2]
1566 = pix_buf_uv[0][2048+i/2] = pix_buf_uv[1][2048+i/2] = 128*128;
1567 pix_buf_y[0][i]= pix_buf_y[1][i]= 0;
1571 // cant downscale !!!
// (re)generate the optimized horizontal scalers only when the geometry
// actually changed since the last call
1572 if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed)
1584 // create an optimized horizontal scaling routine
// Template fragment that gets copied into funnyYCode/funnyUVCode below.
// Per 4-pixel group: load 8 source bytes, gather the pixel pairs with
// pshufw (immediates patched at run time), bilinear-blend with the lane
// fractions in mm2/mm3, and store 4 uint16 results.
1592 "movq (%%esi), %%mm0 \n\t" //FIXME Alignment
1593 "movq %%mm0, %%mm1 \n\t"
1594 "psrlq $8, %%mm0 \n\t"
1595 "punpcklbw %%mm7, %%mm1 \n\t"
1596 "movq %%mm2, %%mm3 \n\t"
1597 "punpcklbw %%mm7, %%mm0 \n\t"
1598 "addw %%bx, %%cx \n\t" //2*xalpha += (4*s_xinc)&0xFFFF
1599 "pshufw $0xFF, %%mm1, %%mm1 \n\t"
1601 "adcl %%edx, %%esi \n\t" //xx+= (4*s_xinc)>>16 + carry
1602 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
1604 "psrlw $9, %%mm3 \n\t"
1605 "psubw %%mm1, %%mm0 \n\t"
1606 "pmullw %%mm3, %%mm0 \n\t"
1607 "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
1608 "psllw $7, %%mm1 \n\t"
1609 "paddw %%mm1, %%mm0 \n\t"
1611 "movq %%mm0, (%%edi, %%eax) \n\t"
1613 "addl $8, %%eax \n\t"
// outputs: address of the template, byte offsets of the two pshufw
// immediates inside it, and its total length
1626 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
1627 "=r" (fragmentLength)
1630 xpos= 0; //s_xinc/2 - 0x8000; // difference between pixel centers
1632 /* choose xinc so that all 8 parts fit exactly
1633 Note: we cannot use just 1 part because it would not fit in the code cache */
1634 // s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))-10;
1635 // s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8));
1637 // s_xinc2_diff+= ((0x10000/(dstw/8)));
1639 // s_xinc_diff= s_xinc2_diff*2;
1641 // s_xinc2+= s_xinc2_diff;
1642 // s_xinc+= s_xinc_diff;
1644 // old_s_xinc= s_xinc;
// stamp out the luma scaler: one patched copy of the template per
// 8 destination pixels, pshufw immediates encode the 4 source offsets
// a..d (each 0..3) for that group
1646 for(i=0; i<dstw/8; i++)
1653 int b=((xpos+s_xinc)>>16) - xx;
1654 int c=((xpos+s_xinc*2)>>16) - xx;
1655 int d=((xpos+s_xinc*3)>>16) - xx;
1657 memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);
1659 funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
1660 funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
1661 a | (b<<2) | (c<<4) | (d<<6);
1663 // if we dont need to read 8 bytes than dont :), reduces the chance of
1664 // crossing a cache line
// 0x6E presumably rewrites the first opcode byte to the narrower load --
// TODO confirm against the template encoding
1665 if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E;
// terminate the generated routine with a near RET (see #define RET)
1667 funnyYCode[fragmentLength*(i+4)/4]= RET;
// same stamping for the chroma scaler, using the chroma step s_xinc2
1672 xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chrom samples
1673 for(i=0; i<dstUVw/8; i++)
1680 int b=((xpos+s_xinc2)>>16) - xx;
1681 int c=((xpos+s_xinc2*2)>>16) - xx;
1682 int d=((xpos+s_xinc2*3)>>16) - xx;
1684 memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);
1686 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
1687 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
1688 a | (b<<2) | (c<<4) | (d<<6);
1690 // if we dont need to read 8 bytes than dont :), reduces the chance of
1691 // crossing a cache line
1692 if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;
1694 funnyUVCode[fragmentLength*(i+4)/4]= RET;
1698 // funnyCode[0]= RET;
// --- per-destination-line loop: locate the bracketing source lines,
// horizontally scale them on demand (double-buffered in pix_buf_y /
// pix_buf_uv), then vertically blend + convert ---
1705 unsigned char *dest =dstptr[0]+dststride*s_ypos;
1706 unsigned char *uDest=dstptr[1]+(dststride>>1)*(s_ypos>>1);
1707 unsigned char *vDest=dstptr[2]+(dststride>>1)*(s_ypos>>1);
1709 int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line
1710 // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src)
1711 int srcuvpos= dstbpp==12 ? s_srcypos + s_yinc/2 - 0x8000 :
1713 int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
1714 int yalpha=((s_srcypos-1)&0xFFFF)>>4;
1715 int uvalpha=((srcuvpos-1)&0x1FFFF)>>5;
// double buffering: even/odd source line index selects the buffer half
1716 uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice
1717 uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice
1718 uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice
1719 uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice
1721 if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are duplicates anyway
1723 if((y0&1) && dstbpp==12) uvalpha=-1; // there is no alpha if there is no line
1725 s_ypos++; s_srcypos+=s_yinc;
1727 //only interpolate the src line horizontally if we didn't do it already
1731 // skip if first line has been horiz scaled already
1732 if(s_last_ypos != y0-1)
1734 // check if first line is before any available src lines
1735 if(y0-1 < y) src=srcptr[0]+(0 )*stride[0];
1736 else src=srcptr[0]+(y0-y-1)*stride[0];
1738 hyscale(buf0, dstw, src, srcWidth, s_xinc);
1740 // check if second line is after any available src lines
1741 if(y0-y >= h) src=srcptr[0]+(h-1)*stride[0];
1742 else src=srcptr[0]+(y0-y)*stride[0];
1744 // the min() is required to avoid reusing lines which were not available
1745 s_last_ypos= MIN(y0, y+h-1);
1746 hyscale(buf1, dstw, src, srcWidth, s_xinc);
1748 // printf("%d %d %d %d\n", y, y1, s_last_y1pos, h);
1749 // *** horizontal scale U and V lines to temp buffer
1750 if(s_last_y1pos!=y1)
1752 uint8_t *src1, *src2;
1753 // skip if first line has been horiz scaled already
1754 if(s_last_y1pos != y1-1)
1756 // check if first line is before any available src lines
1759 src1= srcptr[1]+(0)*stride[1];
1760 src2= srcptr[2]+(0)*stride[2];
1762 src1= srcptr[1]+(y1-y/2-1)*stride[1];
1763 src2= srcptr[2]+(y1-y/2-1)*stride[2];
1765 hcscale(uvbuf0, dstUVw, src1, src2, srcWidth, s_xinc2);
1768 // check if second line is after any available src lines
1771 src1= srcptr[1]+(h/2-1)*stride[1];
1772 src2= srcptr[2]+(h/2-1)*stride[2];
1774 src1= srcptr[1]+(y1-y/2)*stride[1];
1775 src2= srcptr[2]+(y1-y/2)*stride[2];
1777 hcscale(uvbuf1, dstUVw, src1, src2, srcWidth, s_xinc2);
1779 // the min() is required to avoid reusing lines which were not available
1780 s_last_y1pos= MIN(y1, y/2+h/2-1);
// alternate the dither patterns between even and odd output lines
1783 b5Dither= dither8[s_ypos&1];
1784 g6Dither= dither4[s_ypos&1];
1785 g5Dither= dither8[s_ypos&1];
1786 r5Dither= dither8[(s_ypos+1)&1];
// dispatch on target format / vertical scale factor
1789 if(dstbpp==12) //YV12
1790 yuv2yuv(buf0, buf1, uvbuf0, uvbuf1, dest, uDest, vDest, dstw, yalpha, uvalpha);
1791 else if(ABS(s_yinc - 0x10000) < 10)
1792 yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
1794 yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
// flush non-temporal stores and leave MMX state clean for the FPU
1798 __asm __volatile(SFENCE:::"memory");
1799 __asm __volatile(EMMS:::"memory");
1804 void SwScale_Init(){
1805 // generating tables:
1809 clip_table[i+256]=i;
1810 clip_table[i+512]=255;
1811 yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
1812 yuvtab_3343[i]=0x3343*(i-128);
1813 yuvtab_0c92[i]=-0x0c92*(i-128);
1814 yuvtab_1a1e[i]=-0x1a1e*(i-128);
1815 yuvtab_40cf[i]=0x40cf*(i-128);
1818 for(i=0; i<768; i++)
1820 int v= clip_table[i];
1821 clip_table16b[i]= v>>3;
1822 clip_table16g[i]= (v<<3)&0x07E0;
1823 clip_table16r[i]= (v<<8)&0xF800;
1824 clip_table15b[i]= v>>3;
1825 clip_table15g[i]= (v<<2)&0x03E0;
1826 clip_table15r[i]= (v<<7)&0x7C00;