]> git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb_template.c
rgb24toyv12 in MMX (untested)
[ffmpeg] / postproc / rgb2rgb_template.c
1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB convertor
4  *  pluralize by Software PAL8 to RGB convertor
5  *               Software YUV to YUV convertor
6  *               Software YUV to RGB convertor
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  */
10
/* Clear any previous definitions before (re)defining the helper macros below. */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* Block-size unit (in bytes) used by the MMX loops in this file. */
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch and packed byte-average mnemonics.
 * Without 3DNow! or MMX2 the prefetch macros expand to an assembler comment,
 * so whatever operand text follows them on the same line is ignored.
 * '#' is used rather than '/' because '#' starts a comment anywhere on a line.
 * PAVGB stays undefined in that configuration.
 */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Non-temporal store and its fence; plain movq and an assembler comment otherwise. */
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
52
/**
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * Each group of three source bytes is copied in order and followed by one
 * zero byte (the MMX path obtains the fourth byte by masking with mask32).
 *
 * @param src      packed 3-byte pixels
 * @param dst      receives 4 bytes for every complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  /*
   * One iteration converts 8 pixels (24 source bytes -> 32 destination bytes).
   * The last load starts at offset 21 and is 4 bytes wide, so 25 readable
   * bytes are required. The bound is expressed in bytes remaining instead of
   * the alignment of the end address, so the loop cannot run past the buffer.
   */
  while(end - s >= 25)
  {
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movd   %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd   6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd   12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd   18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand   %%mm7, %%mm0\n\t"
        "pand   %%mm7, %%mm1\n\t"
        "pand   %%mm7, %%mm2\n\t"
        "pand   %%mm7, %%mm3\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm2, 16%0\n\t"
        MOVNTQ" %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar path: whole buffer without MMX, otherwise the remaining pixels. */
  while(end - s >= 3)
  {
    dest[0] = s[0];
    dest[1] = s[1];
    dest[2] = s[2];
    dest[3] = 0;
    dest += 4;
    s += 3;
  }
}
104
/**
 * Pack 4-byte pixels into 3-byte pixels.
 *
 * The first three bytes of every source pixel are copied in order; the
 * fourth byte is dropped.
 *
 * @param src      4-byte pixels
 * @param dst      receives 3 bytes for every complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  /*
   * One iteration consumes 32 source bytes and produces 24. The bound is the
   * number of bytes left, so it is valid for any alignment of src.
   */
  while(end - s >= 32)
  {
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movq   %1, %%mm0\n\t"
        "movq   8%1, %%mm1\n\t"
        "movq   16%1, %%mm4\n\t"
        "movq   24%1, %%mm5\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "movq   %%mm4, %%mm6\n\t"
        "movq   %%mm5, %%mm7\n\t"
        "psrlq  $8, %%mm2\n\t"
        "psrlq  $8, %%mm3\n\t"
        "psrlq  $8, %%mm6\n\t"
        "psrlq  $8, %%mm7\n\t"
        "pand   %2, %%mm0\n\t"
        "pand   %2, %%mm1\n\t"
        "pand   %2, %%mm4\n\t"
        "pand   %2, %%mm5\n\t"
        "pand   %3, %%mm2\n\t"
        "pand   %3, %%mm3\n\t"
        "pand   %3, %%mm6\n\t"
        "pand   %3, %%mm7\n\t"
        "por    %%mm2, %%mm0\n\t"
        "por    %%mm3, %%mm1\n\t"
        "por    %%mm6, %%mm4\n\t"
        "por    %%mm7, %%mm5\n\t"

        "movq   %%mm1, %%mm2\n\t"
        "movq   %%mm4, %%mm3\n\t"
        "psllq  $48, %%mm2\n\t"
        "psllq  $32, %%mm3\n\t"
        "pand   %4, %%mm2\n\t"
        "pand   %5, %%mm3\n\t"
        "por    %%mm2, %%mm0\n\t"
        "psrlq  $16, %%mm1\n\t"
        "psrlq  $32, %%mm4\n\t"
        "psllq  $16, %%mm5\n\t"
        "por    %%mm3, %%mm1\n\t"
        "pand   %6, %%mm5\n\t"
        "por    %%mm5, %%mm4\n\t"

        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
    dest += 24;
    s += 32;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar path: whole buffer without MMX, otherwise the remaining pixels. */
  while(end - s >= 4)
  {
    dest[0] = s[0];
    dest[1] = s[1];
    dest[2] = s[2];
    dest += 3;
    s += 4;
  }
}
181
182 /*
183  Original by Strepto/Astral
184  ported to gcc & bugfixed : A'rpi
185  MMX2, 3DNOW optimization by Nick Kurshev
186  32bit c version, and and&add trick by Michael Niedermayer
187 */
/**
 * Convert 15-bit (0rrrrrgggggbbbbb) pixels to 16-bit (rrrrrggggggbbbbb) pixels.
 *
 * The upper ten colour bits are moved up by one position by adding the
 * masked value to itself; the low five bits are kept.
 *
 * @param src      15-bit pixels, two bytes each, in native byte order
 * @param dst      receives one 16-bit pixel per source pixel
 * @param src_size size of src in bytes; a trailing odd byte is ignored
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  const uint8_t *s = src;
  const uint8_t *end = s + src_size;
  uint8_t *d = dst;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s));
  __asm __volatile(
        "movq   %0, %%mm4\n\t"
        ::"m"(mask15s));
  /* 16 bytes (8 pixels) per iteration; the remainder is left to the C loops. */
  while(end - s >= 16)
  {
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "pand   %%mm4, %%mm0\n\t"
                "pand   %%mm4, %%mm2\n\t"
                "paddw  %%mm1, %%mm0\n\t"
                "paddw  %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                :"memory");
        d += 16;
        s += 16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Two pixels at a time; unsigned arithmetic so the addition cannot overflow. */
  while(end - s >= 4)
  {
        const uint32_t x = *(const uint32_t *)s;
        /* x + (x&0x7FE07FE0) would be faster but needs the top bit of each pixel clear */
        *(uint32_t *)d = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d += 4;
        s += 4;
  }
  /* Last pixel when the pixel count is odd. */
  if(end - s >= 2)
  {
        const unsigned x = *(const uint16_t *)s;
        *(uint16_t *)d = (uint16_t)((x&0x7FFF) + (x&0x7FE0));
  }
}
247
/**
 * Convert 4-byte pixels to 16-bit 5-6-5 pixels.
 *
 * For every source pixel, byte 0 supplies the low 5 bits, byte 1 the middle
 * 6 bits and byte 2 the top 5 bits of the output word; byte 3 is ignored.
 *
 * @param src      4-byte pixels
 * @param dst      receives one 16-bit word per complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /*
         * Four pixels (16 source bytes) per iteration. The bound counts the
         * bytes left, so it holds for any alignment of src.
         */
        while(end - s >= 16)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path; each pixel is 4 bytes wide, so the 4th byte is skipped. */
        while(end - s >= 4)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];

                s += 4;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
318
/**
 * Convert 4-byte pixels to 15-bit 5-5-5 pixels stored in 16-bit words.
 *
 * For every source pixel, byte 0 supplies bits 0-4, byte 1 bits 5-9 and
 * byte 2 bits 10-14 of the output word; byte 3 is ignored.
 *
 * @param src      4-byte pixels
 * @param dst      receives one 16-bit word per complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /*
         * Four pixels (16 source bytes) per iteration. The bound counts the
         * bytes left, so it holds for any alignment of src.
         */
        while(end - s >= 16)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path; each pixel is 4 bytes wide, so the 4th byte is skipped. */
        while(end - s >= 4)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];

                s += 4;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
389
/**
 * Convert packed 3-byte pixels to 16-bit 5-6-5 pixels.
 *
 * For every source pixel, byte 0 supplies the low 5 bits, byte 1 the middle
 * 6 bits and byte 2 the top 5 bits of the output word.
 *
 * @param src      packed 3-byte pixels
 * @param dst      receives one 16-bit word per complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /*
         * Four pixels (12 source bytes) per iteration. The last load starts
         * at offset 9 and is 4 bytes wide, so 13 readable bytes are needed.
         */
        while(end - s >= 13)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: whole buffer without MMX, otherwise the remaining pixels. */
        while(end - s >= 3)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];

                s += 3;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
461
/**
 * Convert packed 3-byte pixels to 15-bit 5-5-5 pixels stored in 16-bit words.
 *
 * For every source pixel, byte 0 supplies bits 0-4, byte 1 bits 5-9 and
 * byte 2 bits 10-14 of the output word.
 *
 * @param src      packed 3-byte pixels
 * @param dst      receives one 16-bit word per complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /*
         * Four pixels (12 source bytes) per iteration. The last load starts
         * at offset 9 and is 4 bytes wide, so 13 readable bytes are needed.
         */
        while(end - s >= 13)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: whole buffer without MMX, otherwise the remaining pixels. */
        while(end - s >= 3)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];

                s += 3;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
533
/**
 * Swap bytes 0 and 2 of every 4-byte pixel.
 *
 * The scalar path copies bytes 0..2 (reordered) and leaves byte 3 of each
 * destination pixel untouched. The MMX path stores whole 8-byte words built
 * from the mask32r/mask32g/mask32b selections, so byte 3 is whatever those
 * masks let through (NOTE(review): mask values are defined outside this file).
 *
 * @param src      4-byte pixels
 * @param dst      receives src_size / 4 pixels
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        unsigned int i = 0;     /* byte offset of the next pixel to convert */
#ifdef HAVE_MMX
        /* Bytes the MMX loop may handle: whole 8-byte (two pixel) groups only. */
        const unsigned int mmx_bytes = src_size & ~7u;

        /* The asm loop tests its bound at the bottom, so it must not be entered for 0. */
        if(mmx_bytes)
        {
                asm volatile (
                        "xorl %%eax, %%eax              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        PREFETCH" 32(%0, %%eax)         \n\t"
                        "movq (%0, %%eax), %%mm0        \n\t"
                        "movq %%mm0, %%mm1              \n\t"
                        "movq %%mm0, %%mm2              \n\t"
                        "pslld $16, %%mm0               \n\t"
                        "psrld $16, %%mm1               \n\t"
                        "pand mask32r, %%mm0            \n\t"
                        "pand mask32g, %%mm2            \n\t"
                        "pand mask32b, %%mm1            \n\t"
                        "por %%mm0, %%mm2               \n\t"
                        "por %%mm1, %%mm2               \n\t"
                        MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
                        /* eax is a byte offset: advance by the 8 bytes just stored */
                        "addl $8, %%eax                 \n\t"
                        "cmpl %2, %%eax                 \n\t"
                        " jb 1b                         \n\t"
                        :: "r" (src), "r"(dst), "r" (mmx_bytes)
                        : "%eax", "memory"
                );
                i = mmx_bytes;
        }

        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: whole buffer without MMX, otherwise the odd last pixel. */
        for(; src_size - i >= 4; i += 4)
        {
                dst[i + 0] = src[i + 2];
                dst[i + 1] = src[i + 1];
                dst[i + 2] = src[i + 0];
        }
}
573
/**
 * Interleave planar Y, U and V into packed Y U Y V.
 *
 * Each output line takes one luma line; the chroma pointers advance only
 * after every odd line, so two consecutive luma lines share a chroma line.
 * width >> 1 pixel pairs are written per line, i.e. the last pixel of an odd
 * width is not converted. The MMX loop handles 16 pixels per iteration and
 * a scalar loop finishes the rest of each line.
 *
 * @param ysrc,usrc,vsrc source planes
 * @param dst            packed output, 2 bytes per pixel
 * @param width,height   picture size in pixels
 * @param lumStride      bytes between luma lines
 * @param chromStride    bytes between chroma lines
 * @param dstStride      bytes between output lines
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
        unsigned int y;
        const unsigned int chromWidth= width>>1;
#ifdef HAVE_MMX
        /* Chroma samples per line that the MMX loop may handle (groups of 8). */
        const unsigned int mmxWidth= chromWidth & ~7u;
#endif
        for(y=0; y<height; y++)
        {
                unsigned int i= 0;
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
                /* The asm loop tests its bound at the bottom, so skip it when there is no full group. */
                if(mmxWidth)
                {
                        asm volatile(
                                "xorl %%eax, %%eax              \n\t"
                                ".balign 16                     \n\t"
                                "1:                             \n\t"
                                PREFETCH" 32(%1, %%eax, 2)      \n\t"
                                PREFETCH" 32(%2, %%eax)         \n\t"
                                PREFETCH" 32(%3, %%eax)         \n\t"
                                "movq (%2, %%eax), %%mm0        \n\t" // U(0)
                                "movq %%mm0, %%mm2              \n\t" // U(0)
                                "movq (%3, %%eax), %%mm1        \n\t" // V(0)
                                "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
                                "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)

                                "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
                                "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
                                "movq %%mm3, %%mm4              \n\t" // Y(0)
                                "movq %%mm5, %%mm6              \n\t" // Y(8)
                                "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
                                "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
                                "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
                                "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)

                                MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
                                MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
                                MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
                                MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

                                "addl $8, %%eax                 \n\t"
                                "cmpl %4, %%eax                 \n\t"
                                " jb 1b                         \n\t"
                                ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (mmxWidth)
                                : "%eax", "memory"
                        );
                        i= mmxWidth;
                }
#endif
                /* Scalar path: whole line without MMX, otherwise the remainder. */
                for(; i<chromWidth; i++)
                {
                        dst[4*i+0] = ysrc[2*i+0];
                        dst[4*i+1] = usrc[i];
                        dst[4*i+2] = ysrc[2*i+1];
                        dst[4*i+3] = vsrc[i];
                }
                if(y&1)
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
646
/**
 * Split packed Y U Y V into planar Y, U and V.
 *
 * Lines are handled in pairs: the first line of a pair supplies luma and
 * chroma, the second line supplies luma only (its chroma is discarded).
 * With an odd height the last line is treated as a first line and no
 * further line is touched. width >> 1 pixel pairs are read per line. The
 * MMX loops handle 16 pixels per iteration and scalar loops finish each line.
 *
 * @param src            packed input, 2 bytes per pixel
 * @param ydst,udst,vdst destination planes
 * @param width,height   picture size in pixels
 * @param lumStride      bytes between luma lines
 * @param chromStride    bytes between chroma lines
 * @param srcStride      bytes between input lines
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
        unsigned int y;
        const unsigned int chromWidth= width>>1;
#ifdef HAVE_MMX
        /* Chroma samples per line that the MMX loops may handle (groups of 8). */
        const unsigned int mmxWidth= chromWidth & ~7u;
#endif
        for(y=0; y<height; y+=2)
        {
                unsigned int i= 0;

                /* First line of the pair: luma and chroma. */
#ifdef HAVE_MMX
                /* The asm loop tests its bound at the bottom, so skip it when there is no full group. */
                if(mmxWidth)
                {
                        asm volatile(
                                "xorl %%eax, %%eax              \n\t"
                                "pcmpeqw %%mm7, %%mm7           \n\t"
                                "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                                ".balign 16                     \n\t"
                                "1:                             \n\t"
                                PREFETCH" 64(%0, %%eax, 4)      \n\t"
                                "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
                                "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
                                "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
                                "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
                                "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
                                "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
                                "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
                                "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
                                "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
                                "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)

                                MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"

                                "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
                                "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
                                "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
                                "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
                                "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
                                "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
                                "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
                                "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
                                "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
                                "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)

                                MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"

                                "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
                                "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
                                "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                                "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                                "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
                                "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
                                "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
                                "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)

                                MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
                                MOVNTQ" %%mm2, (%2, %%eax)      \n\t"

                                "addl $8, %%eax                 \n\t"
                                "cmpl %4, %%eax                 \n\t"
                                " jb 1b                         \n\t"
                                ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (mmxWidth)
                                : "memory", "%eax"
                        );
                        i= mmxWidth;
                }
#endif
                for(; i<chromWidth; i++)
                {
                        ydst[2*i+0]     = src[4*i+0];
                        udst[i]         = src[4*i+1];
                        ydst[2*i+1]     = src[4*i+2];
                        vdst[i]         = src[4*i+3];
                }
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;

                /* Odd height: there is no second line in this pair. */
                if(y+1 >= height)
                        break;

                /* Second line of the pair: luma only. */
                i= 0;
#ifdef HAVE_MMX
                if(mmxWidth)
                {
                        asm volatile(
                                "xorl %%eax, %%eax              \n\t"
                                /* rebuild the mask here instead of relying on mm7 surviving from the block above */
                                "pcmpeqw %%mm7, %%mm7           \n\t"
                                "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                                ".balign 16                     \n\t"
                                "1:                             \n\t"
                                PREFETCH" 64(%0, %%eax, 4)      \n\t"
                                "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
                                "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
                                "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
                                "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
                                "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
                                "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
                                "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
                                "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
                                "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
                                "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)

                                MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
                                MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"

                                "addl $8, %%eax                 \n\t"
                                "cmpl %2, %%eax                 \n\t"
                                " jb 1b                         \n\t"

                                ::"r"(src), "r"(ydst), "r" (mmxWidth)
                                : "memory", "%eax"
                        );
                        i= mmxWidth;
                }
#endif
                for(; i<chromWidth; i++)
                {
                        ydst[2*i+0]     = src[4*i+0];
                        ydst[2*i+1]     = src[4*i+2];
                }
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
771
772 /**
773  *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line; the others are ignored. FIXME: write HQ version
777  */
778 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
779         unsigned int width, unsigned int height,
780         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
781 {
782         int y;
783         const int chromWidth= width>>1;
784         for(y=0; y<height; y+=2)
785         {
786 #ifdef HAVE_MMX
787                 asm volatile(
788                         "xorl %%eax, %%eax              \n\t"
789                         "pcmpeqw %%mm7, %%mm7           \n\t"
790                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
791                         ".balign 16                     \n\t"
792                         "1:                             \n\t"
793                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
794                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
795                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
796                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
797                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
798                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
799                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
800                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
801                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
802                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
803                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
804
805                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
806
807                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
808                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
809                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
810                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
811                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
812                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
813                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
814                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
815                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
816                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
817
818                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
819
820                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
821                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
822                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
823                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
824                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
825                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
826                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
827                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
828
829                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
830                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
831
832                         "addl $8, %%eax                 \n\t"
833                         "cmpl %4, %%eax                 \n\t"
834                         " jb 1b                         \n\t"
835                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
836                         : "memory", "%eax"
837                 );
838
839                 ydst += lumStride;
840                 src  += srcStride;
841
842                 asm volatile(
843                         "xorl %%eax, %%eax              \n\t"
844                         ".balign 16                     \n\t"
845                         "1:                             \n\t"
846                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
847                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
848                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
849                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
850                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
851                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
852                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
853                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
854                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
855                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
856                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
857
858                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
859                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
860
861                         "addl $8, %%eax                 \n\t"
862                         "cmpl %4, %%eax                 \n\t"
863                         " jb 1b                         \n\t"
864
865                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
866                         : "memory", "%eax"
867                 );
868 #else
869                 int i;
870                 for(i=0; i<chromWidth; i++)
871                 {
872                         udst[i]         = src[4*i+0];
873                         ydst[2*i+0]     = src[4*i+1];
874                         vdst[i]         = src[4*i+2];
875                         ydst[2*i+1]     = src[4*i+3];
876                 }
877                 ydst += lumStride;
878                 src  += srcStride;
879
880                 for(i=0; i<chromWidth; i++)
881                 {
882                         ydst[2*i+0]     = src[4*i+1];
883                         ydst[2*i+1]     = src[4*i+3];
884                 }
885 #endif
886                 udst += chromStride;
887                 vdst += chromStride;
888                 ydst += lumStride;
889                 src  += srcStride;
890         }
891 #ifdef HAVE_MMX
892 asm volatile(   EMMS" \n\t"
893                 SFENCE" \n\t"
894                 :::"memory");
895 #endif
896 }
897
898 /**
899  *
900  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
901  * problem for anyone then tell me, and ill fix it)
902  * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
903  */
904 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
905         unsigned int width, unsigned int height,
906         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
907 {
908         int y;
909         const int chromWidth= width>>1;
910 #ifdef HAVE_MMX
911         for(y=0; y<height-2; y+=2)
912         {
913                 int i;
914                 for(i=0; i<2; i++)
915                 {
916                         asm volatile(
917                                 "movl %2, %%eax                 \n\t"
918                                 "movq bgr2YCoeff, %%mm6         \n\t"
919                                 "movq w1111, %%mm5              \n\t"
920                                 "pxor %%mm7, %%mm7              \n\t"
921                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
922                                 ".balign 16                     \n\t"
923                                 "1:                             \n\t"
924                                 PREFETCH" 64(%0, %%ebx)         \n\t"
925                                 "movd (%0, %%ebx), %%mm0        \n\t"
926                                 "movd 3(%0, %%ebx), %%mm1       \n\t"
927                                 "punpcklbw %%mm7, %%mm0         \n\t"
928                                 "punpcklbw %%mm7, %%mm1         \n\t"
929                                 "movd 6(%0, %%ebx), %%mm2       \n\t"
930                                 "movd 9(%0, %%ebx), %%mm3       \n\t"
931                                 "punpcklbw %%mm7, %%mm2         \n\t"
932                                 "punpcklbw %%mm7, %%mm3         \n\t"
933                                 "pmaddwd %%mm6, %%mm0           \n\t"
934                                 "pmaddwd %%mm6, %%mm1           \n\t"
935                                 "pmaddwd %%mm6, %%mm2           \n\t"
936                                 "pmaddwd %%mm6, %%mm3           \n\t"
937 #ifndef FAST_BGR2YV12
938                                 "psrad $8, %%mm0                \n\t"
939                                 "psrad $8, %%mm1                \n\t"
940                                 "psrad $8, %%mm2                \n\t"
941                                 "psrad $8, %%mm3                \n\t"
942 #endif
943                                 "packssdw %%mm1, %%mm0          \n\t"
944                                 "packssdw %%mm3, %%mm2          \n\t"
945                                 "pmaddwd %%mm5, %%mm0           \n\t"
946                                 "pmaddwd %%mm5, %%mm2           \n\t"
947                                 "packssdw %%mm2, %%mm0          \n\t"
948                                 "psraw $7, %%mm0                \n\t"
949
950                                 "movd 12(%0, %%ebx), %%mm4      \n\t"
951                                 "movd 15(%0, %%ebx), %%mm1      \n\t"
952                                 "punpcklbw %%mm7, %%mm4         \n\t"
953                                 "punpcklbw %%mm7, %%mm1         \n\t"
954                                 "movd 18(%0, %%ebx), %%mm2      \n\t"
955                                 "movd 21(%0, %%ebx), %%mm3      \n\t"
956                                 "punpcklbw %%mm7, %%mm2         \n\t"
957                                 "punpcklbw %%mm7, %%mm3         \n\t"
958                                 "pmaddwd %%mm6, %%mm4           \n\t"
959                                 "pmaddwd %%mm6, %%mm1           \n\t"
960                                 "pmaddwd %%mm6, %%mm2           \n\t"
961                                 "pmaddwd %%mm6, %%mm3           \n\t"
962 #ifndef FAST_BGR2YV12
963                                 "psrad $8, %%mm4                \n\t"
964                                 "psrad $8, %%mm1                \n\t"
965                                 "psrad $8, %%mm2                \n\t"
966                                 "psrad $8, %%mm3                \n\t"
967 #endif
968                                 "packssdw %%mm1, %%mm4          \n\t"
969                                 "packssdw %%mm3, %%mm2          \n\t"
970                                 "pmaddwd %%mm5, %%mm4           \n\t"
971                                 "pmaddwd %%mm5, %%mm2           \n\t"
972                                 "addl $24, %%ebx                \n\t"
973                                 "packssdw %%mm2, %%mm4          \n\t"
974                                 "psraw $7, %%mm4                \n\t"
975
976                                 "packuswb %%mm4, %%mm0          \n\t"
977                                 "paddusb bgr2YOffset, %%mm0     \n\t"
978
979                                 MOVNTQ" %%mm0, (%1, %%eax)      \n\t"
980                                 "addl $8, %%eax                 \n\t"
981                                 " js 1b                         \n\t"
982                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
983                                 : "%eax", "%ebx"
984                         );
985                         ydst += lumStride;
986                         src  += srcStride;
987                 }
988                 src -= srcStride*2;
989                 asm volatile(
990                         "movl %4, %%eax                 \n\t"
991                         "movq w1111, %%mm5              \n\t"
992                         "movq bgr2UCoeff, %%mm6         \n\t"
993                         "pxor %%mm7, %%mm7              \n\t"
994                         "leal (%%eax, %%eax, 2), %%ebx  \n\t"
995                         "addl %%ebx, %%ebx              \n\t"
996                         ".balign 16                     \n\t"
997                         "1:                             \n\t"
998                         PREFETCH" 64(%0, %%ebx)         \n\t"
999                         PREFETCH" 64(%1, %%ebx)         \n\t"
1000 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1001                         "movq (%0, %%ebx), %%mm0        \n\t"
1002                         "movq (%1, %%ebx), %%mm1        \n\t"
1003                         "movq 6(%0, %%ebx), %%mm2       \n\t"
1004                         "movq 6(%1, %%ebx), %%mm3       \n\t"
1005                         PAVGB" %%mm1, %%mm0             \n\t"
1006                         PAVGB" %%mm3, %%mm2             \n\t"
1007                         "movq %%mm0, %%mm1              \n\t"
1008                         "movq %%mm2, %%mm3              \n\t"
1009                         "psrlq $24, %%mm0               \n\t"
1010                         "psrlq $24, %%mm2               \n\t"
1011                         PAVGB" %%mm1, %%mm0             \n\t"
1012                         PAVGB" %%mm3, %%mm2             \n\t"
1013                         "punpcklbw %%mm7, %%mm0         \n\t"
1014                         "punpcklbw %%mm7, %%mm2         \n\t"
1015 #else
1016                         "movd (%0, %%ebx), %%mm0        \n\t"
1017                         "movd (%1, %%ebx), %%mm1        \n\t"
1018                         "movd 3(%0, %%ebx), %%mm2       \n\t"
1019                         "movd 3(%1, %%ebx), %%mm3       \n\t"
1020                         "punpcklbw %%mm7, %%mm0         \n\t"
1021                         "punpcklbw %%mm7, %%mm1         \n\t"
1022                         "punpcklbw %%mm7, %%mm2         \n\t"
1023                         "punpcklbw %%mm7, %%mm3         \n\t"
1024                         "paddw %%mm1, %%mm0             \n\t"
1025                         "paddw %%mm3, %%mm2             \n\t"
1026                         "paddw %%mm2, %%mm0             \n\t"
1027                         "movd 6(%0, %%ebx), %%mm4       \n\t"
1028                         "movd 6(%1, %%ebx), %%mm1       \n\t"
1029                         "movd 9(%0, %%ebx), %%mm2       \n\t"
1030                         "movd 9(%1, %%ebx), %%mm3       \n\t"
1031                         "punpcklbw %%mm7, %%mm4         \n\t"
1032                         "punpcklbw %%mm7, %%mm1         \n\t"
1033                         "punpcklbw %%mm7, %%mm2         \n\t"
1034                         "punpcklbw %%mm7, %%mm3         \n\t"
1035                         "paddw %%mm1, %%mm4             \n\t"
1036                         "paddw %%mm3, %%mm2             \n\t"
1037                         "paddw %%mm4, %%mm2             \n\t"
1038                         "psrlw $2, %%mm0                \n\t"
1039                         "psrlw $2, %%mm2                \n\t"
1040 #endif
1041                         "movq bgr2VCoeff, %%mm1         \n\t"
1042                         "movq bgr2VCoeff, %%mm3         \n\t"
1043
1044                         "pmaddwd %%mm0, %%mm1           \n\t"
1045                         "pmaddwd %%mm2, %%mm3           \n\t"
1046                         "pmaddwd %%mm6, %%mm0           \n\t"
1047                         "pmaddwd %%mm6, %%mm2           \n\t"
1048 #ifndef FAST_BGR2YV12
1049                         "psrad $8, %%mm0                \n\t"
1050                         "psrad $8, %%mm1                \n\t"
1051                         "psrad $8, %%mm2                \n\t"
1052                         "psrad $8, %%mm3                \n\t"
1053 #endif
1054                         "packssdw %%mm2, %%mm0          \n\t"
1055                         "packssdw %%mm3, %%mm1          \n\t"
1056                         "pmaddwd %%mm5, %%mm0           \n\t"
1057                         "pmaddwd %%mm5, %%mm1           \n\t"
1058                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1059                         "psraw $7, %%mm0                \n\t"
1060
1061 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1062                         "movq 12(%0, %%ebx), %%mm4      \n\t"
1063                         "movq 12(%1, %%ebx), %%mm1      \n\t"
1064                         "movq 18(%0, %%ebx), %%mm2      \n\t"
1065                         "movq 18(%1, %%ebx), %%mm3      \n\t"
1066                         PAVGB" %%mm1, %%mm4             \n\t"
1067                         PAVGB" %%mm3, %%mm2             \n\t"
1068                         "movq %%mm4, %%mm1              \n\t"
1069                         "movq %%mm2, %%mm3              \n\t"
1070                         "psrlq $24, %%mm4               \n\t"
1071                         "psrlq $24, %%mm2               \n\t"
1072                         PAVGB" %%mm1, %%mm4             \n\t"
1073                         PAVGB" %%mm3, %%mm2             \n\t"
1074                         "punpcklbw %%mm7, %%mm4         \n\t"
1075                         "punpcklbw %%mm7, %%mm2         \n\t"
1076 #else
1077                         "movd 12(%0, %%ebx), %%mm4      \n\t"
1078                         "movd 12(%1, %%ebx), %%mm1      \n\t"
1079                         "movd 15(%0, %%ebx), %%mm2      \n\t"
1080                         "movd 15(%1, %%ebx), %%mm3      \n\t"
1081                         "punpcklbw %%mm7, %%mm4         \n\t"
1082                         "punpcklbw %%mm7, %%mm1         \n\t"
1083                         "punpcklbw %%mm7, %%mm2         \n\t"
1084                         "punpcklbw %%mm7, %%mm3         \n\t"
1085                         "paddw %%mm1, %%mm4             \n\t"
1086                         "paddw %%mm3, %%mm2             \n\t"
1087                         "paddw %%mm2, %%mm4             \n\t"
1088                         "movd 18(%0, %%ebx), %%mm5      \n\t"
1089                         "movd 18(%1, %%ebx), %%mm1      \n\t"
1090                         "movd 21(%0, %%ebx), %%mm2      \n\t"
1091                         "movd 21(%1, %%ebx), %%mm3      \n\t"
1092                         "punpcklbw %%mm7, %%mm5         \n\t"
1093                         "punpcklbw %%mm7, %%mm1         \n\t"
1094                         "punpcklbw %%mm7, %%mm2         \n\t"
1095                         "punpcklbw %%mm7, %%mm3         \n\t"
1096                         "paddw %%mm1, %%mm5             \n\t"
1097                         "paddw %%mm3, %%mm2             \n\t"
1098                         "paddw %%mm5, %%mm2             \n\t"
1099                         "movq w1111, %%mm5              \n\t"
1100                         "psrlw $2, %%mm4                \n\t"
1101                         "psrlw $2, %%mm2                \n\t"
1102 #endif
1103                         "movq bgr2VCoeff, %%mm1         \n\t"
1104                         "movq bgr2VCoeff, %%mm3         \n\t"
1105
1106                         "pmaddwd %%mm4, %%mm1           \n\t"
1107                         "pmaddwd %%mm2, %%mm3           \n\t"
1108                         "pmaddwd %%mm6, %%mm4           \n\t"
1109                         "pmaddwd %%mm6, %%mm2           \n\t"
1110 #ifndef FAST_BGR2YV12
1111                         "psrad $8, %%mm4                \n\t"
1112                         "psrad $8, %%mm1                \n\t"
1113                         "psrad $8, %%mm2                \n\t"
1114                         "psrad $8, %%mm3                \n\t"
1115 #endif
1116                         "packssdw %%mm2, %%mm4          \n\t"
1117                         "packssdw %%mm3, %%mm1          \n\t"
1118                         "pmaddwd %%mm5, %%mm4           \n\t"
1119                         "pmaddwd %%mm5, %%mm1           \n\t"
1120                         "addl $24, %%ebx                \n\t"
1121                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1122                         "psraw $7, %%mm4                \n\t"
1123
1124                         "movq %%mm0, %%mm1              \n\t"
1125                         "punpckldq %%mm4, %%mm0         \n\t"
1126                         "punpckhdq %%mm4, %%mm1         \n\t"
1127                         "packsswb %%mm1, %%mm0          \n\t"
1128                         "paddb bgr2UVOffset, %%mm0      \n\t"
1129
1130                         "movd %%mm0, (%2, %%eax)        \n\t"
1131                         "punpckhdq %%mm0, %%mm0         \n\t"
1132                         "movd %%mm0, (%3, %%eax)        \n\t"
1133                         "addl $4, %%eax                 \n\t"
1134                         " js 1b                         \n\t"
1135                         : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
1136                         : "%eax", "%ebx"
1137                 );
1138
1139                 udst += chromStride;
1140                 vdst += chromStride;
1141                 src  += srcStride*2;
1142         }
1143
1144         asm volatile(   EMMS" \n\t"
1145                         SFENCE" \n\t"
1146                         :::"memory");
1147 #else
1148         y=0;
1149 #endif
1150         for(; y<height; y+=2)
1151         {
1152                 int i;
1153                 for(i=0; i<chromWidth; i++)
1154                 {
1155                         unsigned int b= src[6*i+0];
1156                         unsigned int g= src[6*i+1];
1157                         unsigned int r= src[6*i+2];
1158
1159                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1160                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
1161                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
1162
1163                         udst[i]         = U;
1164                         vdst[i]         = V;
1165                         ydst[2*i]       = Y;
1166
1167                         b= src[6*i+3];
1168                         g= src[6*i+4];
1169                         r= src[6*i+5];
1170
1171                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1172                         ydst[2*i+1]     = Y;
1173                 }
1174                 ydst += lumStride;
1175                 src  += srcStride;
1176
1177                 for(i=0; i<chromWidth; i++)
1178                 {
1179                         unsigned int b= src[6*i+0];
1180                         unsigned int g= src[6*i+1];
1181                         unsigned int r= src[6*i+2];
1182
1183                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1184
1185                         ydst[2*i]       = Y;
1186
1187                         b= src[6*i+3];
1188                         g= src[6*i+4];
1189                         r= src[6*i+5];
1190
1191                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1192                         ydst[2*i+1]     = Y;
1193                 }
1194                 udst += chromStride;
1195                 vdst += chromStride;
1196                 ydst += lumStride;
1197                 src  += srcStride;
1198         }
1199 }