
// Software scaling and colorspace conversion routines for MPlayer

// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)

#include <inttypes.h>
#include "../config.h"

//#undef HAVE_MMX2
//#undef HAVE_MMX
//#undef ARCH_X86
#define DITHER16BPP
#define ALT_ERROR

#define RET 0xC3 //near return opcode
/*
NOTES

known BUGS with known cause (no bug reports please!)
line at the right (C, asm and MMX2)
code reads 1 sample too much (might cause a sig11)

TODO
check alignment of everything
*/

static uint64_t yCoeff=    0x2568256825682568LL;
static uint64_t ubCoeff=   0x3343334333433343LL;
static uint64_t vrCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t ugCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t vgCoeff=   0xF36EF36EF36EF36ELL;
static uint64_t w80=       0x0080008000800080LL;
static uint64_t w10=       0x0010001000100010LL;
static uint64_t bm00000111=0x0000000000FFFFFFLL;
static uint64_t bm11111000=0xFFFFFFFFFF000000LL;
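// Note: the coefficients are 3.13 fixed point, replicated into 4 words for MMX.
// E.g. 0x2568/8192 ~ 1.169 is the luma scale; ugCoeff/vgCoeff are the negative
// green coefficients stored as 16-bit two's complement (0xE5E2 == -0x1A1E).
// The vertical pass shifts its 8-bit inputs left by 3 before pmulhw (which
// keeps the high 16 bits of the product), for a net >>13 matching this format.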

static uint64_t b16Dither= 0x0004000400040004LL;
static uint64_t b16Dither1=0x0004000400040004LL;
static uint64_t b16Dither2=0x0602060206020602LL;
static uint64_t g16Dither= 0x0002000200020002LL;
static uint64_t g16Dither1=0x0002000200020002LL;
static uint64_t g16Dither2=0x0301030103010301LL;
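// the b16Dither/g16Dither values are rotated at the end of every output line
// (see the bottom of the scaling loop), giving a cheap two-line ordered dither
// for the 16bpp path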

static uint64_t b16Mask=   0x001F001F001F001FLL;
static uint64_t g16Mask=   0x07E007E007E007E0LL;
static uint64_t r16Mask=   0xF800F800F800F800LL;
static uint64_t temp0;


// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];

// clipping helper table for C implementations:
static unsigned char clip_table[768];
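// clip_table[x+256] clamps x to 0..255: entries 0..255 are 0, 256..511 are the
// identity, 512..767 are 255 (see SwScale_Init below).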

// yuv->rgb conversion tables:
static    int yuvtab_2568[256];
static    int yuvtab_3343[256];
static    int yuvtab_0c92[256];
static    int yuvtab_1a1e[256];
static    int yuvtab_40cf[256];
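// Each table holds its hex-named coefficient in 3.13 fixed point:
// yuvtab_2568[i] = 0x2568*(i-16) + (256<<13), the chroma tables hold
// (+/-)0xXXXX*(i-128). The 256<<13 bias in the luma table folds clip_table's
// +256 offset into Y, so (Y + chroma_terms)>>13 indexes clip_table directly.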


static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
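// Note: these buffers receive run-time generated MMX2 code: the template
// fragment below is copied in repeatedly, its two pshufw immediates are
// patched per 4-pixel group, and a RET opcode terminates the routine, which
// the scaling loops then call directly.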


// *** bilinear scaling and yuv->rgb conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
// s_xinc = (src_width << 16) / dst_width
// s_yinc = (src_height << 16) / dst_height
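// Both increments are 16.16 fixed-point source steps per destination pixel.
// E.g. upscaling 320 -> 640 gives s_xinc = (320<<16)/640 = 0x8000, i.e. half a
// source pixel per destination pixel; s_xinc <= 0x10000 thus means upscaling.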
void SwScale_YV12slice_brg24(unsigned char* srcptr[],int stride[], int y, int h,
                             unsigned char* dstptr, int dststride, int dstw, int dstbpp,
                             unsigned int s_xinc,unsigned int s_yinc){

// scaling factors:
//static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
//static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;

unsigned int s_xinc2;

static int s_srcypos; // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
static int s_ypos;

// last horizontally interpolated lines, used to avoid unnecessary calculations
static int s_last_ypos;
static int s_last_y1pos;

static int static_dstw;

#ifdef HAVE_MMX2
// used to detect a horizontal size change
static int old_dstw= -1;
static int old_s_xinc= -1;

// difference between the requested xinc and the required one for the mmx2 routine
static int s_xinc_diff=0;
static int s_xinc2_diff=0;
#endif
int canMMX2BeUsed;

// we need at least this precision for the mmx2 code
//s_xinc*= 256;
s_xinc2=s_xinc>>1;
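// the MMX2 scaler cannot downscale, and its generated code processes the line
// in fixed-size blocks, so require upscaling (s_xinc <= 1.0 in 16.16) and a
// destination width that is a multiple of 32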
canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0) ? 1 : 0;

#ifdef HAVE_MMX2
        if(canMMX2BeUsed)
        {
                s_xinc+= s_xinc_diff;
                s_xinc2+= s_xinc2_diff;
        }
#endif

  // force calculation of the horizontal interpolation of the first line
  s_last_ypos=-99;
  s_last_y1pos=-99;

  if(y==0){
      s_srcypos= s_yinc/2 - 0x8000;
      s_ypos=0;
#ifdef HAVE_MMX2
// can't downscale !!!
        if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed)
        {
                uint8_t *fragment;
                int imm8OfPShufW1;
                int imm8OfPShufW2;
                int fragmentLength;

                int xpos, xx, xalpha, i;

                old_s_xinc= s_xinc;
                old_dstw= dstw;

                static_dstw= dstw;

                // create an optimized horizontal scaling routine

                //code fragment

                asm volatile(
                        "jmp 9f                         \n\t"
                // Begin
                        "0:                             \n\t"
                        "movq (%%esi), %%mm0            \n\t" //FIXME Alignment
                        "movq %%mm0, %%mm1              \n\t"
                        "psrlq $8, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm1 \n\t"
                        "movq %%mm2, %%mm3              \n\t"
                        "punpcklbw %%mm7, %%mm0 \n\t"
                        "addw %%bx, %%cx                \n\t" //2*xalpha += (4*s_xinc)&0xFFFF
                        "pshufw $0xFF, %%mm1, %%mm1     \n\t"
                        "1:                             \n\t"
                        "adcl %%edx, %%esi              \n\t" //xx+= (4*s_xinc)>>16 + carry
                        "pshufw $0xFF, %%mm0, %%mm0     \n\t"
                        "2:                             \n\t"
                        "psrlw $9, %%mm3                \n\t"
                        "psubw %%mm1, %%mm0             \n\t"
                        "pmullw %%mm3, %%mm0            \n\t"
                        "paddw %%mm6, %%mm2             \n\t" // 2*alpha += xpos&0xFFFF
                        "psllw $7, %%mm1                \n\t"
                        "paddw %%mm1, %%mm0             \n\t"

                        "movq %%mm0, (%%edi, %%eax)     \n\t"

                        "addl $8, %%eax                 \n\t"
                // End
                        "9:                             \n\t"
//              "int $3\n\t"
                        "leal 0b, %0                    \n\t"
                        "leal 1b, %1                    \n\t"
                        "leal 2b, %2                    \n\t"
                        "decl %1                        \n\t"
                        "decl %2                        \n\t"
                        "subl %0, %1                    \n\t"
                        "subl %0, %2                    \n\t"
                        "leal 9b, %3                    \n\t"
                        "subl %0, %3                    \n\t"
                        :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
                         "=r" (fragmentLength)
                );
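                // Note: the fragment between labels 0 and 9 is never executed
                // here (the jmp skips it); the leal/subl arithmetic only
                // recovers its start address, the offsets of the two pshufw
                // immediate bytes (labels 1 and 2 minus one) and its length.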

                xpos= xx=xalpha= 0;

                /* choose xinc so that all 8 parts fit exactly
                   Note: we cannot use just 1 part because it would not fit in the code cache */
                s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))+10;
//              s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8));
#ifdef ALT_ERROR
                s_xinc2_diff+= ((0x10000/(dstw/8)));
#endif
                s_xinc_diff= s_xinc2_diff*2;

                s_xinc2+= s_xinc2_diff;
                s_xinc+= s_xinc_diff;

                old_s_xinc= s_xinc;

                for(i=0; i<dstw/8; i++)
                {
                        int xx=xpos>>16;

                        if((i&3) == 0)
                        {
                                int a=0;
                                int b=((xpos+s_xinc)>>16) - xx;
                                int c=((xpos+s_xinc*2)>>16) - xx;
                                int d=((xpos+s_xinc*3)>>16) - xx;

                                memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);

                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                        a | (b<<2) | (c<<4) | (d<<6);

                                funnyYCode[fragmentLength*(i+4)/4]= RET;
                        }
                        xpos+=s_xinc;
                }
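                // The patched immediate packs four 2-bit word indices
                // (a | b<<2 | c<<4 | d<<6), telling each pshufw which of the
                // loaded source words the 4 output pixels start at; the RET
                // written after each copy survives only after the last one.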

                xpos= xx=xalpha= 0;
                //FIXME choose size and/or xinc so that they fit exactly
                for(i=0; i<dstw/8; i++)
                {
                        int xx=xpos>>16;

                        if((i&3) == 0)
                        {
                                int a=0;
                                int b=((xpos+s_xinc2)>>16) - xx;
                                int c=((xpos+s_xinc2*2)>>16) - xx;
                                int d=((xpos+s_xinc2*3)>>16) - xx;

                                memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);

                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                        a | (b<<2) | (c<<4) | (d<<6);

                                funnyUVCode[fragmentLength*(i+4)/4]= RET;
                        }
                        xpos+=s_xinc2;
                }
//              funnyCode[0]= RET;
        }

#endif // HAVE_MMX2
  } // reset counters


  while(1){
    unsigned char *dest=dstptr+dststride*s_ypos;
    int y0=(s_srcypos + 0xFFFF)>>16;  // first luminance source line number below the dst line
        // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
    int srcuvpos= s_srcypos + s_yinc/2 - 0x8000;
    int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
    int yalpha=((s_srcypos-1)&0xFFFF)>>7;
    int yalpha1=yalpha^511;
    int uvalpha=((srcuvpos-1)&0x1FFFF)>>8;
    int uvalpha1=uvalpha^511;
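    // the alphas are 9-bit (0..511) fractional vertical positions; as x^511 ==
    // 511-x for such values, yalpha1/uvalpha1 are the complementary weights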
    uint16_t *buf0=pix_buf_y[y0&1];             // top line of the interpolated slice
    uint16_t *buf1=pix_buf_y[((y0+1)&1)];       // bottom line of the interpolated slice
    uint16_t *uvbuf0=pix_buf_uv[y1&1];          // top line of the interpolated slice
    uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1];      // bottom line of the interpolated slice
    int i;

    // if this is before the first line then use only the first src line
    if(y0==0) buf0= buf1;
    if(y1==0) uvbuf0= uvbuf1; // yes we do have to check this, it's not the same as y0==0

    if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are duplicates anyway

    // if this is after the last line then use only the last src line
    if(y0>=y+h)
    {
        buf1= buf0;
        s_last_ypos=y0;
    }
    if(y1>=(y+h)/2)
    {
        uvbuf1= uvbuf0;
        s_last_y1pos=y1;
    }


    s_ypos++; s_srcypos+=s_yinc;

    //only interpolate the src line horizontally if we didn't do it already
    if(s_last_ypos!=y0){
      unsigned char *src=srcptr[0]+(y0-y)*stride[0];
      unsigned int xpos=0;
      s_last_ypos=y0;
      // *** horizontal scale Y line to temp buffer
#ifdef ARCH_X86

#ifdef HAVE_MMX2
        if(canMMX2BeUsed)
        {
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
                        "pxor %%mm2, %%mm2              \n\t" // 2*xalpha
                        "movd %5, %%mm6                 \n\t" // s_xinc&0xFFFF
                        "punpcklwd %%mm6, %%mm6         \n\t"
                        "punpcklwd %%mm6, %%mm6         \n\t"
                        "movq %%mm6, %%mm2              \n\t"
                        "psllq $16, %%mm2               \n\t"
                        "paddw %%mm6, %%mm2             \n\t"
                        "psllq $16, %%mm2               \n\t"
                        "paddw %%mm6, %%mm2             \n\t"
                        "psllq $16, %%mm2               \n\t" //0,t,2t,3t               t=s_xinc&0xFFFF
                        "movq %%mm2, temp0              \n\t"
                        "movd %4, %%mm6                 \n\t" //(s_xinc*4)&0xFFFF
                        "punpcklwd %%mm6, %%mm6         \n\t"
                        "punpcklwd %%mm6, %%mm6         \n\t"
                        "xorl %%eax, %%eax              \n\t" // i
                        "movl %0, %%esi                 \n\t" // src
                        "movl %1, %%edi                 \n\t" // buf1
                        "movl %3, %%edx                 \n\t" // (s_xinc*4)>>16
                        "xorl %%ecx, %%ecx              \n\t"
                        "xorl %%ebx, %%ebx              \n\t"
                        "movw %4, %%bx                  \n\t" // (s_xinc*4)&0xFFFF
        //      "int $3\n\t"
                        "call funnyYCode                        \n\t"
                        "movq temp0, %%mm2              \n\t"
                        "xorl %%ecx, %%ecx              \n\t"
                        "call funnyYCode                        \n\t"
                        "movq temp0, %%mm2              \n\t"
                        "xorl %%ecx, %%ecx              \n\t"
                        "call funnyYCode                        \n\t"
                        "movq temp0, %%mm2              \n\t"
                        "xorl %%ecx, %%ecx              \n\t"
                        "call funnyYCode                        \n\t"
                        "movq temp0, %%mm2              \n\t"
                        "xorl %%ecx, %%ecx              \n\t"
                        "call funnyYCode                        \n\t"
                        "movq temp0, %%mm2              \n\t"
                        "xorl %%ecx, %%ecx              \n\t"
                        "call funnyYCode                        \n\t"
                        "movq temp0, %%mm2              \n\t"
                        "xorl %%ecx, %%ecx              \n\t"
                        "call funnyYCode                        \n\t"
                        "movq temp0, %%mm2              \n\t"
                        "xorl %%ecx, %%ecx              \n\t"
                        "call funnyYCode                        \n\t"
                        :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16),
                        "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
                );
        }
        else
        {
#endif
        //no MMX2, just plain asm ... FIXME try/write funny MMX2 variant
        //FIXME add prefetch
        asm volatile(
                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
                "1:                             \n\t"
                "movzbl  (%0, %%ebx), %%edi     \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, (%%edi, %%eax, 2)   \n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += s_xinc&0xFFFF
                "adcl %3, %%ebx                 \n\t" //xx+= s_xinc>>16 + carry

                "movzbl (%0, %%ebx), %%edi      \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, 2(%%edi, %%eax, 2)  \n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += s_xinc&0xFFFF
                "adcl %3, %%ebx                 \n\t" //xx+= s_xinc>>16 + carry


                "addl $2, %%eax                 \n\t"
                "cmpl %2, %%eax                 \n\t"
                " jb 1b                         \n\t"


                :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>16), "m" (s_xinc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
      for(i=0;i<dstw;i++){
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha);
        xpos+=s_xinc;
      }
#endif
    }
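    // All three paths compute the same fixed-point bilinear blend, roughly
    // src[xx]*(1-f) + src[xx+1]*f scaled by 128 (f = fractional part of xpos),
    // so the temp buffers hold 15-bit samples for the vertical pass below.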
      // *** horizontal scale U and V lines to temp buffer
    if(s_last_y1pos!=y1){
        unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1];
        unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2];
        int xpos=0;
        s_last_y1pos= y1;
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        if(canMMX2BeUsed)
        {
                asm volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm2, %%mm2              \n\t" // 2*xalpha
                "movd %5, %%mm6                 \n\t" // s_xinc2&0xFFFF
                "punpcklwd %%mm6, %%mm6         \n\t"
                "punpcklwd %%mm6, %%mm6         \n\t"
                "movq %%mm6, %%mm2              \n\t"
                "psllq $16, %%mm2               \n\t"
                "paddw %%mm6, %%mm2             \n\t"
                "psllq $16, %%mm2               \n\t"
                "paddw %%mm6, %%mm2             \n\t"
                "psllq $16, %%mm2               \n\t" //0,t,2t,3t               t=s_xinc2&0xFFFF
                "movq %%mm2, temp0              \n\t"
                "movd %4, %%mm6                 \n\t" //(s_xinc2*4)&0xFFFF
                "punpcklwd %%mm6, %%mm6         \n\t"
                "punpcklwd %%mm6, %%mm6         \n\t"
                "xorl %%eax, %%eax              \n\t" // i
                "movl %0, %%esi                 \n\t" // src1
                "movl %1, %%edi                 \n\t" // uvbuf1
                "movl %3, %%edx                 \n\t" // (s_xinc2*4)>>16
                "xorl %%ecx, %%ecx              \n\t"
                "xorl %%ebx, %%ebx              \n\t"
                "movw %4, %%bx                  \n\t" // (s_xinc2*4)&0xFFFF

//      "int $3\n\t"
#define FUNNYUVCODE \
                "call funnyUVCode               \n\t"\
                "movq temp0, %%mm2              \n\t"\
                "xorl %%ecx, %%ecx              \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE



                "xorl %%eax, %%eax              \n\t" // i
                "movl %6, %%esi                 \n\t" // src2
                "movl %1, %%edi                 \n\t" // uvbuf1
                "addl $4096, %%edi              \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

                :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16),
                  "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
        );
        }
        else
        {
#endif
        asm volatile(
                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
                "1:                             \n\t"
                "movl %0, %%esi                 \n\t"
                "movzbl  (%%esi, %%ebx), %%edi  \n\t" //src[xx]
                "movzbl 1(%%esi, %%ebx), %%esi  \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, (%%edi, %%eax, 2)   \n\t"

                "movzbl  (%5, %%ebx), %%edi     \n\t" //src[xx]
                "movzbl 1(%5, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"

                "addw %4, %%cx                  \n\t" //2*xalpha += s_xinc2&0xFFFF
                "adcl %3, %%ebx                 \n\t" //xx+= s_xinc2>>16 + carry
                "addl $1, %%eax                 \n\t"
                "cmpl %2, %%eax                 \n\t"
                " jb 1b                         \n\t"


                :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF),
                "r" (src2)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
      for(i=0;i<dstw;i++){
          register unsigned int xx=xpos>>16;
          register unsigned int xalpha=(xpos&0xFFFF)>>9;
          uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
          uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
          xpos+=s_xinc2;
      }
#endif
    }
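    // Note: the 4096-byte offset in the asm and uvbuf1[i+2048] in C are the
    // same split: U occupies the first 2048 uint16_t of a uv buffer, V the rest.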


    // Note1: this code can be restricted to n*8 (or n*16) width lines to simplify optimization...
    // Re: Note1: ok n*4 for now
    // Note2: instead of using lookup tabs, mmx version could do the multiply...
    // Re: Note2: yep
    // Note3: maybe we should make separate 15/16, 24 and 32bpp versions of this:
    // Re: done (32 & 16) and 16 has dithering :) but 16 is untested
#ifdef HAVE_MMX
        //FIXME write lq version with less uv ...
        //FIXME reorder / optimize
        if(dstbpp == 32)
        {
                asm volatile(

#define YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7              \n\t"\
                "movd %6, %%mm6                 \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6         \n\t"\
                "punpcklwd %%mm6, %%mm6         \n\t"\
                "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "xorl %%eax, %%eax              \n\t"\
                "1:                             \n\t"\
                "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
                "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
                "movq (%2, %%eax,2), %%mm2      \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax,2), %%mm3      \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $7, %%mm1                \n\t" /* buf1[eax] >>7*/\
                "movq 4096(%2, %%eax,2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $7, %%mm3                \n\t" /* uvbuf1[eax] >>7*/\
                "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "movq 4096(%3, %%eax,2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
                "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "psubw w10, %%mm1               \n\t" /* Y-16*/\
                "psubw w80, %%mm3               \n\t" /* (U-128)*/\
                "psllw $3, %%mm1                \n\t" /* (Y-16)*8*/\
                "psllw $3, %%mm3                \n\t" /* (U-128)*8*/\
                "pmulhw yCoeff, %%mm1           \n\t"\
\
\
                "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)*8*/\
                "pmulhw ubCoeff, %%mm3          \n\t"\
                "psraw $7, %%mm0                \n\t" /* uvbuf1[eax+2048] >>7*/\
                "pmulhw ugCoeff, %%mm2          \n\t"\
                "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw w80, %%mm0               \n\t" /* (V-128)*/\
                "psllw $3, %%mm0                \n\t" /* (V-128)*8*/\
\
\
                "movq %%mm0, %%mm4              \n\t" /* (V-128)*8*/\
                "pmulhw vrCoeff, %%mm0          \n\t"\
                "pmulhw vgCoeff, %%mm4          \n\t"\
                "paddw %%mm1, %%mm3             \n\t" /* B*/\
                "paddw %%mm1, %%mm0             \n\t" /* R*/\
                "packuswb %%mm3, %%mm3          \n\t"\
\
                "packuswb %%mm0, %%mm0          \n\t"\
                "paddw %%mm4, %%mm2             \n\t"\
                "paddw %%mm2, %%mm1             \n\t" /* G*/\
\
                "packuswb %%mm1, %%mm1          \n\t"

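// YSCALEYUV2RGB ends with the vertically interpolated, converted pixels as
// saturated 8-bit values: B in mm3, G in mm1, R in mm0; the per-depth code
// below only packs and stores them.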
YSCALEYUV2RGB
                "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1              \n\t"
                "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
#ifdef HAVE_MMX2
                "movntq %%mm3, (%4, %%eax, 4)   \n\t"
                "movntq %%mm1, 8(%4, %%eax, 4)  \n\t"
#else
                "movq %%mm3, (%4, %%eax, 4)     \n\t"
                "movq %%mm1, 8(%4, %%eax, 4)    \n\t"
#endif
                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"


                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
                );
        }
        else if(dstbpp==24)
        {
                asm volatile(

YSCALEYUV2RGB

                                                        // lsb ... msb
                "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1              \n\t"
                "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0

                "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
                "psrlq $8, %%mm3                \n\t" // GR0BGR00
                "pand bm00000111, %%mm2         \n\t" // BGR00000
                "pand bm11111000, %%mm3         \n\t" // 000BGR00
                "por %%mm2, %%mm3               \n\t" // BGRBGR00
                "movq %%mm1, %%mm2              \n\t"
                "psllq $48, %%mm1               \n\t" // 000000BG
                "por %%mm1, %%mm3               \n\t" // BGRBGRBG

                "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
                "psrld $16, %%mm2               \n\t" // R000R000
                "psrlq $24, %%mm1               \n\t" // 0BGR0000
                "por %%mm2, %%mm1               \n\t" // RBGRR000

                "movl %4, %%ebx                 \n\t"
                "addl %%eax, %%ebx              \n\t"
#ifdef HAVE_MMX2
                //FIXME Alignment
                "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
                "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
                "movd %%mm3, (%%ebx, %%eax, 2)  \n\t"
                "psrlq $32, %%mm3               \n\t"
                "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
                "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
                "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
                );
        }
        else if(dstbpp==16)
        {
                asm volatile(

YSCALEYUV2RGB
#ifdef DITHER16BPP
                "paddusb g16Dither, %%mm1       \n\t"
                "paddusb b16Dither, %%mm0       \n\t"
                "paddusb b16Dither, %%mm3       \n\t"
#endif
                "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3                \n\t"
                "psllw $3, %%mm1                \n\t"
                "psllw $8, %%mm0                \n\t"
                "pand g16Mask, %%mm1            \n\t"
                "pand r16Mask, %%mm0            \n\t"

                "por %%mm3, %%mm1               \n\t"
                "por %%mm1, %%mm0               \n\t"
#ifdef HAVE_MMX2
                "movntq %%mm0, (%4, %%eax, 2)   \n\t"
#else
                "movq %%mm0, (%4, %%eax, 2)     \n\t"
#endif
                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
                );
        }
#else
        if(dstbpp==32 || dstbpp==24)
        {
                for(i=0;i<dstw;i++){
                        // vertical linear interpolation && yuv2rgb in a single step:
                        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
                        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
                        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
                        dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
                        dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
                        dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
                        dest+=dstbpp>>3;
                }
        }
        else if(dstbpp==16)
        {
                for(i=0;i<dstw;i++){
                        // vertical linear interpolation && yuv2rgb in a single step:
                        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
                        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
                        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);

                        ((uint16_t*)dest)[0] =
                                (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
                                (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
                                (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
                        dest+=2;
                }
        }
        else if(dstbpp==15) //15bit FIXME how do I figure out if it's 15 or 16?
        {
                for(i=0;i<dstw;i++){
                        // vertical linear interpolation && yuv2rgb in a single step:
                        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
                        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
                        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);

                        ((uint16_t*)dest)[0] =
                                (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
                                (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
                                (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
                        dest+=2;
                }
        }
#endif
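        // The C paths double as a readable reference for the MMX code: buf
        // samples are scaled by ~128 and the alphas by 512, so the >>16
        // recovers 8-bit values, and each table term applies a BT.601-style
        // coefficient in 3.13 fixed point; the final >>13, plus the bias baked
        // into yuvtab_2568, turns the sum straight into a clip_table index.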

        b16Dither= b16Dither1;
        b16Dither1= b16Dither2;
        b16Dither2= b16Dither;

        g16Dither= g16Dither1;
        g16Dither1= g16Dither2;
        g16Dither2= g16Dither;
  }

#ifdef HAVE_3DNOW
        asm volatile("femms");
#elif defined (HAVE_MMX)
        asm volatile("emms");
#endif
}


void SwScale_Init(){
    // generating tables:
    int i;
    for(i=0;i<256;i++){
        clip_table[i]=0;
        clip_table[i+256]=i;
        clip_table[i+512]=255;
        yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
        yuvtab_3343[i]=0x3343*(i-128);
        yuvtab_0c92[i]=-0x0c92*(i-128);
        yuvtab_1a1e[i]=-0x1a1e*(i-128);
        yuvtab_40cf[i]=0x40cf*(i-128);
    }

}
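
/* Minimal usage sketch (illustrative only; the caller-side names below are
   hypothetical, not part of this file). SwScale_Init() must run once before
   the first frame; slices are then fed top to bottom, the first call of each
   frame with y==0 so the static per-frame state is reset:

       unsigned char *src[3];   // Y, U, V plane pointers for the slice
       int stride[3];           // per-plane strides in bytes
       SwScale_Init();
       SwScale_YV12slice_brg24(src, stride, y, h,
                               dst, dststride, dstw, dstbpp,
                               (src_width<<16)/dstw, (src_height<<16)/dsth);
*/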