// Software scaling and colorspace conversion routines for MPlayer

#include <inttypes.h>
#include "../config.h"

#undef HAVE_MMX2 //code is buggy
//#undef HAVE_MMX

#define RET 0xC3 //near return opcode

// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];

// clipping helper table for C implementations:
static unsigned char clip_table[768];

// yuv->rgb conversion tables:
static    int yuvtab_2568[256];
static    int yuvtab_3343[256];
static    int yuvtab_0c92[256];
static    int yuvtab_1a1e[256];
static    int yuvtab_40cf[256];

static uint64_t yCoeff=    0x2568256825682568LL;
static uint64_t ubCoeff=   0x3343334333433343LL;
static uint64_t vrCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t ugCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t vgCoeff=   0xF36EF36EF36EF36ELL;
static uint64_t w80=       0x0080008000800080LL;
static uint64_t w10=       0x0010001000100010LL;

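// The yuvtab_* tables above and these packed words use the same signed fixed
// point format: 2^13 = 8192 is 1.0 (the C path shifts results >>13; the MMX
// path shifts its inputs <<3 and lets pmulhw drop the low 16 bits, which is
// the same /8192). Sanity check of the magnitudes against the usual BT.601
// coefficient set:
//   0x2568 =  9576 ->  9576/8192 ~  1.169   (luma scale, ~255/219 = 1.164)
//   0x3343 = 13123 -> 13123/8192 ~  1.602   (~1.596)
//   0x40cf = 16591 -> 16591/8192 ~  2.025   (~2.018)
//   0xE5E2 = -6686 -> -6686/8192 ~ -0.816   (~-0.813)
//   0xF36E = -3218 -> -3218/8192 ~ -0.393   (~-0.391)
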
static uint64_t b16Dither= 0x0004000400040004LL;
static uint64_t b16Dither1=0x0004000400040004LL;
static uint64_t b16Dither2=0x0602060206020602LL;
static uint64_t g16Dither= 0x0002000200020002LL;
static uint64_t g16Dither1=0x0002000200020002LL;
static uint64_t g16Dither2=0x0301030103010301LL;

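// These are rotated once per output line (see the end of the scale loop), so
// paddusb applies a 2x2 ordered dither before the 16bpp truncation to 5/6
// bits. Per byte the patterns work out to:
//   blue/red: 0,4,0,4,... on one line, 6,2,6,2,... on the next (matrix {0,4; 6,2})
//   green:    0,2,0,2,... on one line, 3,1,3,1,... on the next (matrix {0,2; 3,1})
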
static uint64_t b16Mask=   0x001F001F001F001FLL;
static uint64_t g16Mask=   0x07E007E007E007E0LL;
static uint64_t r16Mask=   0xF800F800F800F800LL;
static uint64_t temp0;

static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];

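// The two buffers above receive horizontal scaler code that is generated at
// runtime: the first asm block in SwScale_YV12slice_brg24() emits a template
// between its local labels 0 and 9, the init path then memcpy()s one copy of
// the template per 4 output pixels, patches the two pshufw immediates of each
// copy to select the right source pixels for that x position, and finally
// writes RET (0xC3) after the last copy so the whole buffer can be call'ed.
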
// *** bilinear scaling and yuv->rgb conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
// s_xinc = (src_width << 8) / dst_width
// s_yinc = (src_height << 16) / dst_height
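// e.g. upscaling 320x240 -> 640x480:
//   s_xinc = (320<<8)/640  =   128 ->   128/256   = 0.5 src pixels per dst pixel
//   s_yinc = (240<<16)/480 = 32768 -> 32768/65536 = 0.5 src lines  per dst line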
void SwScale_YV12slice_brg24(unsigned char* srcptr[],int stride[], int y, int h,
                             unsigned char* dstptr, int dststride, int dstw, int dstbpp,
                             unsigned int s_xinc,unsigned int s_yinc){

// scaling factors:
//static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
//static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;

unsigned int s_xinc2=s_xinc>>1;

static int s_srcypos;
static int s_ypos;
static int s_last_ypos;
static int static_dstw;

#ifdef HAVE_MMX2
static int old_dstw= -1;
static int old_s_xinc= -1;
#endif

s_xinc&= -2; //clear the last bit, otherwise U/V and Y might end up shifted relative to each other

  if(y==0){
      s_srcypos=-2*s_yinc;
      s_ypos=-2;
      s_last_ypos=-2;
#ifdef HAVE_MMX2
// can't downscale !!!
        if(old_s_xinc != s_xinc || old_dstw!=dstw)
        {
                uint8_t *fragment;
                int imm8OfPShufW1;
                int imm8OfPShufW2;
                int fragmentLength;

                int xpos, xx, xalpha, i;

                old_s_xinc= s_xinc;
                old_dstw= dstw;

                static_dstw= dstw;

                // create an optimized horizontal scaling routine

                //code fragment

//              fragmentLength=0;
//              printf("%d, %d\n", fragmentLength,imm8OfPShufW1);

                asm volatile(
                        "jmp 9f                         \n\t"
                // Begin
                        "0:                             \n\t"
                        "movq (%%esi, %%ebx), %%mm0     \n\t" //FIXME Alignment
                        "movq %%mm0, %%mm1              \n\t"
                        "psrlq $8, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm1 \n\t"
                        "punpcklbw %%mm7, %%mm0 \n\t"
                        "pshufw $0xFF, %%mm1, %%mm1     \n\t"
                        "1:                             \n\t"
                        "pshufw $0xFF, %%mm0, %%mm0     \n\t"
                        "2:                             \n\t"
                        "psubw %%mm1, %%mm0             \n\t"
                        "psraw $1, %%mm0                \n\t"
                        "pmullw %%mm2, %%mm0            \n\t"
                        "psllw $7, %%mm1                \n\t"
                        "paddw %%mm1, %%mm0             \n\t"
                        "movq %%mm0, (%%edi, %%eax)     \n\t"
                        "paddb %%mm6, %%mm2             \n\t" // 2*alpha += xpos&0xFF

                        "addb %%ch, %%cl                \n\t" //2*xalpha += (4*s_xinc)&0xFF
                        "adcl %%edx, %%ebx              \n\t" //xx+= (4*s_xinc)>>8 + carry

                        "addl $8, %%eax                 \n\t"
                // End
                        "9:                             \n\t"
//              "int $3\n\t"
                        "leal 0b, %0                    \n\t"
                        "leal 1b, %1                    \n\t"
                        "leal 2b, %2                    \n\t"
                        "decl %1                        \n\t"
                        "decl %2                        \n\t"
                        "subl %0, %1                    \n\t"
                        "subl %0, %2                    \n\t"
                        "leal 9b, %3                    \n\t"
                        "subl %0, %3                    \n\t"
                        :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
                         "=r" (fragmentLength)
                );
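                // What the leal/decl arithmetic above computes:
                //   fragment       = address of local label 0 (template start)
                //   leal 1b/2b     = addresses right after the two pshufw
                //                    imm8 bytes; the decls step back onto the
                //                    immediates themselves
                //   subl %0        = turn the addresses into template offsets
                //   fragmentLength = label 9 - label 0 = bytes to memcpy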

                xpos= xx=xalpha= 0;
                //FIXME choose size and/or xinc so that they fit exactly
                for(i=0; i<dstw/8; i++)
                {
                        int xx=xpos>>8;

                        if((i&3) == 0)
                        {
                                int a=0;
                                int b=((xpos+s_xinc)>>8) - xx;
                                int c=((xpos+s_xinc*2)>>8) - xx;
                                int d=((xpos+s_xinc*3)>>8) - xx;

                                memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);

                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                        a | (b<<2) | (c<<4) | (d<<6);

                                funnyYCode[fragmentLength*(i+4)/4]= RET;
                        }
                        xpos+=s_xinc;
                }
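                // Example of the patched immediate for the 2x upscale case
                // (s_xinc=128): at i=0, xx=0 and b,c,d come out as 0,1,1, so
                // the byte written is 0 | 0<<2 | 1<<4 | 1<<6 = 0x50; pshufw
                // with $0x50 then selects source pixel offsets 0,0,1,1 for
                // the 4 output pixels (mm1 gets each left neighbour, mm0 the
                // matching right one). Note the inner xx shadows the outer
                // one, which is unused in this branch anyway.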

                xpos= xx=xalpha= 0;
                //FIXME choose size and/or xinc so that they fit exactly
                for(i=0; i<dstw/8; i++)
                {
                        int xx=xpos>>8;

                        if((i&3) == 0)
                        {
                                int a=0;
                                int b=((xpos+s_xinc2)>>8) - xx;
                                int c=((xpos+s_xinc2*2)>>8) - xx;
                                int d=((xpos+s_xinc2*3)>>8) - xx;

                                memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);

                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                        a | (b<<2) | (c<<4) | (d<<6);

                                funnyUVCode[fragmentLength*(i+4)/4]= RET;
                        }
                        xpos+=s_xinc2;
                }
//              funnyCode[0]= RET;


        }
#endif
  } // reset counters

  while(1){
    unsigned char *dest=dstptr+dststride*s_ypos;
    int y0=2+(s_srcypos>>16);
    int y1=1+(s_srcypos>>17);
    int yalpha=(s_srcypos&0xFFFF)>>7;
    int yalpha1=yalpha^511;
    int uvalpha=((s_srcypos>>1)&0xFFFF)>>7;
    int uvalpha1=uvalpha^511;
    uint16_t *buf0=pix_buf_y[y0&3];
    uint16_t *buf1=pix_buf_y[((y0+1)&3)];
    uint16_t *uvbuf0=pix_buf_uv[y1&1];
    uint16_t *uvbuf1=pix_buf_uv[(y1&1)^1];
    int i;

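    // s_srcypos is the source line position in 16.16 fixed point; y0/y1 are
    // the luma/chroma source lines needed next. pix_buf_y acts as a 4 line
    // ring buffer (indexed y0&3), pix_buf_uv as a 2 line one (y1&1). yalpha
    // is a 9 bit vertical blend factor, and since 0<=yalpha<=511 the ^511
    // is just a cheap 511-yalpha.
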
    if(y0>=y+h) break;

    s_ypos++; s_srcypos+=s_yinc;

    if(s_last_ypos!=y0){
      unsigned char *src=srcptr[0]+(y0-y)*stride[0];
      unsigned int xpos=0;
      s_last_ypos=y0;
      // *** horizontal scale Y line to temp buffer
      // this loop should be rewritten in MMX assembly!!!!
#ifdef HAVE_MMX2
        asm volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm2, %%mm2              \n\t" // 2*xalpha
                "movd %5, %%mm6                 \n\t" // s_xinc&0xFF
                "punpcklwd %%mm6, %%mm6         \n\t"
                "punpcklwd %%mm6, %%mm6         \n\t"
                "movq %%mm6, %%mm2              \n\t"
                "psllq $16, %%mm2               \n\t"
                "paddb %%mm6, %%mm2             \n\t"
                "psllq $16, %%mm2               \n\t"
                "paddb %%mm6, %%mm2             \n\t"
                "psllq $16, %%mm2               \n\t" //0,t,2t,3t               t=s_xinc&0xFF
                "movq %%mm2, temp0              \n\t"
                "movd %4, %%mm6                 \n\t" //(s_xinc*4)&0xFF
                "punpcklwd %%mm6, %%mm6         \n\t"
                "punpcklwd %%mm6, %%mm6         \n\t"
                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "movl %0, %%esi                 \n\t" // src
                "movl %1, %%edi                 \n\t" // buf1
                "movl %3, %%edx                 \n\t" // (s_xinc*4)>>8
                "xorl %%ecx, %%ecx              \n\t"
                "movb %4, %%ch                  \n\t" // (s_xinc*4)&0xFF
//      "int $3\n\t"
                "call funnyYCode                        \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyYCode                        \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyYCode                        \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyYCode                        \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyYCode                        \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyYCode                        \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyYCode                        \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyYCode                        \n\t"
                :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>8),
                  "m" ((s_xinc*4)&0xFF), "m" (s_xinc&0xFF)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
        );

#elif defined (ARCH_X86)
        //NO MMX just normal asm ... FIXME try/write funny MMX2 variant
        //FIXME add prefetch
        asm volatile(
                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
                "1:                             \n\t"
                "movzbl  (%0, %%ebx), %%edi     \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $8, %%edi                 \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $1, %%esi                 \n\t"
                "movw %%si, (%%edi, %%eax, 2)   \n\t"
                "addb %4, %%cl                  \n\t" //2*xalpha += s_xinc&0xFF
                "adcl %3, %%ebx                 \n\t" //xx+= s_xinc>>8 + carry

                "movzbl (%0, %%ebx), %%edi      \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $8, %%edi                 \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $1, %%esi                 \n\t"
                "movw %%si, 2(%%edi, %%eax, 2)  \n\t"
                "addb %4, %%cl                  \n\t" //2*xalpha += s_xinc&0xFF
                "adcl %3, %%ebx                 \n\t" //xx+= s_xinc>>8 + carry


                "addl $2, %%eax                 \n\t"
                "cmpl %2, %%eax                 \n\t"
                " jb 1b                         \n\t"


                :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>8), "m" (s_xinc&0xFF)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#else
      for(i=0;i<dstw;i++){
        register unsigned int xx=xpos>>8;
        register unsigned int xalpha=(xpos&0xFF)>>1;
        buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha);
        xpos+=s_xinc;
      }
#endif
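      // The C fallback blends with 7 bit precision: xalpha is 0..127 and
      // xalpha^127 = 127-xalpha, so buf1[] holds pixel*127, a 15 bit value.
      // e.g. src[xx]=100, src[xx+1]=200, fractional position 0x80:
      //   xalpha = 0x80>>1 = 64, buf1[i] = 100*63 + 200*64 = 19100 ~ 150.4*127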
      // *** horizontal scale U and V lines to temp buffer
      if(!(y0&1)){
        unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1];
        unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2];
        xpos=0;
        // this loop should be rewritten in MMX assembly!!!!
#ifdef HAVE_MMX2
        asm volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm2, %%mm2              \n\t" // 2*xalpha
                "movd %5, %%mm6                 \n\t" // s_xinc2&0xFF
                "punpcklwd %%mm6, %%mm6         \n\t"
                "punpcklwd %%mm6, %%mm6         \n\t"
                "movq %%mm6, %%mm2              \n\t"
                "psllq $16, %%mm2               \n\t"
                "paddb %%mm6, %%mm2             \n\t"
                "psllq $16, %%mm2               \n\t"
                "paddb %%mm6, %%mm2             \n\t"
                "psllq $16, %%mm2               \n\t" //0,t,2t,3t               t=s_xinc2&0xFF
                "movq %%mm2, temp0              \n\t"
                "movd %4, %%mm6                 \n\t" //(s_xinc2*4)&0xFF
                "punpcklwd %%mm6, %%mm6         \n\t"
                "punpcklwd %%mm6, %%mm6         \n\t"
                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "movl %0, %%esi                 \n\t" // src1
                "movl %1, %%edi                 \n\t" // uvbuf1
                "movl %3, %%edx                 \n\t" // (s_xinc2*4)>>8
                "xorl %%ecx, %%ecx              \n\t"
                "movb %4, %%ch                  \n\t" // (s_xinc2*4)&0xFF
//      "int $3\n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"

                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "movl %6, %%esi                 \n\t" // src2
                "movl %1, %%edi                 \n\t" // uvbuf1
                "addl $4096, %%edi              \n\t"

                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"
                "movq temp0, %%mm2              \n\t"
                "xorb %%cl, %%cl                \n\t"
                "call funnyUVCode                       \n\t"

                :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>8),
                  "m" ((s_xinc2*4)&0xFF), "m" (s_xinc2&0xFF), "m" (src2)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
        );

#elif defined (ARCH_X86)
        //NO MMX just normal asm ... FIXME try/write funny MMX2 variant
        asm volatile(
                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
                "1:                             \n\t"
                "movl %0, %%esi                 \n\t"
                "movzbl  (%%esi, %%ebx), %%edi  \n\t" //src[xx]
                "movzbl 1(%%esi, %%ebx), %%esi  \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $8, %%edi                 \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $1, %%esi                 \n\t"
                "movw %%si, (%%edi, %%eax, 2)   \n\t"

                "movzbl  (%5, %%ebx), %%edi     \n\t" //src[xx]
                "movzbl 1(%5, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $8, %%edi                 \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $1, %%esi                 \n\t"
                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"

                "addb %4, %%cl                  \n\t" //2*xalpha += s_xinc&0xFF
                "adcl %3, %%ebx                 \n\t" //xx+= s_xinc>>8 + carry
                "addl $1, %%eax                 \n\t"
                "cmpl %2, %%eax                 \n\t"
                " jb 1b                         \n\t"


                :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>8), "m" (s_xinc2&0xFF),
                "r" (src2)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#else
        for(i=0;i<dstw;i++){
          register unsigned int xx=xpos>>8;
          register unsigned int xalpha=(xpos&0xFF)>>1;
          uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
          uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
          xpos+=s_xinc2;
        }
#endif
      }
      if(!y0) continue;
    }

    // Note1: this code can be restricted to n*8 (or n*16) width lines to simplify optimization...
    // Re: Note1: ok, n*4 for now
    // Note2: instead of using lookup tables, the mmx version could do the multiply...
    // Re: Note2: yep
    // Note3: maybe we should make separate 15/16, 24 and 32bpp versions of this:
    // Re: done (32 & 16) and 16 has dithering :) but 16 is untested
#ifdef HAVE_MMX
        //FIXME write lq version with less uv ...
        //FIXME reorder / optimize
        if(dstbpp == 4)
        {
                asm volatile(

#define YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7              \n\t"\
                "movd %6, %%mm6                 \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6         \n\t"\
                "punpcklwd %%mm6, %%mm6         \n\t"\
                "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "xorl %%eax, %%eax              \n\t"\
                "1:                             \n\t"\
                "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
                "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $7, %%mm1                \n\t" /* buf1[eax] >>7*/\
                "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "psubw w10, %%mm1               \n\t" /* Y-16*/\
                "psllw $3, %%mm1                \n\t" /* (Y-16)*8*/\
                "pmulhw yCoeff, %%mm1           \n\t"\
\
                "movq (%2, %%eax,2), %%mm2      \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax,2), %%mm3      \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $7, %%mm3                \n\t" /* uvbuf1[eax] >>7*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
                "psubw w80, %%mm3               \n\t" /* (U-128)*/\
                "psllw $3, %%mm3                \n\t" /* (U-128)*8*/\
\
                "movq 4096(%2, %%eax,2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax,2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $7, %%mm0                \n\t" /* uvbuf1[eax+2048] >>7*/\
                "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw w80, %%mm0               \n\t" /* (V-128)*/\
                "psllw $3, %%mm0                \n\t" /* (V-128)*8*/\
\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)*8*/\
                "pmulhw ubCoeff, %%mm3          \n\t"\
                "paddw %%mm1, %%mm3             \n\t" /* B*/\
\
                "movq %%mm0, %%mm4              \n\t" /* (V-128)*8*/\
                "pmulhw vrCoeff, %%mm0          \n\t"\
                "paddw %%mm1, %%mm0             \n\t" /* R*/\
\
                "pmulhw ugCoeff, %%mm2          \n\t"\
                "pmulhw vgCoeff, %%mm4          \n\t"\
                "paddw %%mm4, %%mm2             \n\t"\
                "paddw %%mm2, %%mm1             \n\t" /* G*/\
\
                "packuswb %%mm3, %%mm3          \n\t"\
                "packuswb %%mm0, %%mm0          \n\t"\
                "packuswb %%mm1, %%mm1          \n\t"

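// Going by the register comments above, YSCALEYUV2RGB leaves the three
// channels in %%mm3 (B), %%mm1 (G) and %%mm0 (R) as 4 saturated bytes each,
// with %%mm7 zeroed and %%eax counting pixels; each bpp variant below only
// has to interleave the bytes, store them and jump back to the 1: label.
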
YSCALEYUV2RGB
                "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1              \n\t"
                "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
#ifdef HAVE_MMX2
                "movntq %%mm3, (%4, %%eax, 4)   \n\t"
                "movntq %%mm1, 8(%4, %%eax, 4)  \n\t"
#else
                "movq %%mm3, (%4, %%eax, 4)     \n\t"
                "movq %%mm1, 8(%4, %%eax, 4)    \n\t"
#endif
                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"


                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
                );
        }
        else if(dstbpp==2)
        {
                asm volatile(

YSCALEYUV2RGB
                "paddusb g16Dither, %%mm1       \n\t"
                "paddusb b16Dither, %%mm0       \n\t"
                "paddusb b16Dither, %%mm3       \n\t"
                "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3                \n\t"
                "psllw $3, %%mm1                \n\t"
                "psllw $8, %%mm0                \n\t"
                "pand g16Mask, %%mm1            \n\t"
                "pand r16Mask, %%mm0            \n\t"

                "por %%mm3, %%mm1               \n\t"
                "por %%mm1, %%mm0               \n\t"
#ifdef HAVE_MMX2
                "movntq %%mm0, (%4, %%eax, 2)   \n\t"
#else
                "movq %%mm0, (%4, %%eax, 2)     \n\t"
#endif
                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
                );
        }
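        // The 16bpp path above is the usual RGB565 packing, 4 pixels at a
        // time, with the dither bytes paddusb'ed in first:
        //   B:  b>>3           -> bits  0..4
        //   G: (g<<3) & 0x07E0 -> bits  5..10  (same as (g>>2)<<5)
        //   R: (r<<8) & 0xF800 -> bits 11..15  (same as (r>>3)<<11)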
#else
        if(dstbpp==4 || dstbpp==3)
        {
                for(i=0;i<dstw;i++){
                        // vertical linear interpolation && yuv2rgb in a single step:
                        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
                        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
                        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
                        dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
                        dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
                        dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
                        dest+=dstbpp;
                }
        }
        else if(dstbpp==2) //16bit
        {
                for(i=0;i<dstw;i++){
                        // vertical linear interpolation && yuv2rgb in a single step:
                        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
                        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
                        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);

                        ((uint16_t*)dest)[0] =
                                (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
                                ((clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0) |
                                ((clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800);
                        dest+=dstbpp;
                }
        }
        else if(dstbpp==2) //15bit FIXME how do I find out if it's 15 or 16bpp?
                           //      (dead code as written: the 16bit case above
                           //       already catches dstbpp==2)
        {
                for(i=0;i<dstw;i++){
                        // vertical linear interpolation && yuv2rgb in a single step:
                        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
                        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
                        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);

                        ((uint16_t*)dest)[0] =
                                (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
                                ((clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0) |
                                ((clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00);
                        dest+=dstbpp;
                }
        }
#endif
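        // Fixed point bookkeeping in the C paths above: the horizontal scaler
        // left pixel*127 (15 bit) in the buffers, the vertical blend then
        // multiplies by two 9 bit alphas that sum to 511, so >>16 recovers
        // roughly the 8 bit value (127*511/65536 ~ 0.99). The yuvtab_*
        // entries carry the 2^13 scale, hence the >>13, and yuvtab_2568 also
        // hides a +256 bias (its 256<<13 term) that moves the result into the
        // identity window of clip_table.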

        b16Dither= b16Dither1;
        b16Dither1= b16Dither2;
        b16Dither2= b16Dither;

        g16Dither= g16Dither1;
        g16Dither1= g16Dither2;
        g16Dither2= g16Dither;
  }

}

void SwScale_Init(){
    // generating tables:
    int i;
    for(i=0;i<256;i++){
        clip_table[i]=0;
        clip_table[i+256]=i;
        clip_table[i+512]=255;
        yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
        yuvtab_3343[i]=0x3343*(i-128);
        yuvtab_0c92[i]=-0x0c92*(i-128);
        yuvtab_1a1e[i]=-0x1a1e*(i-128);
        yuvtab_40cf[i]=0x40cf*(i-128);
    }
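    // clip_table implements a clamp to [0,255] for indices biased by +256:
    // [0..255] -> 0, [256..511] -> i-256, [512..767] -> 255. The bias itself
    // comes from the (256<<13) term in yuvtab_2568 above.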

}