]> git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb_template.c
pre-yvu9toyv12 converter, only grayscale Y-plane coping :)
[ffmpeg] / postproc / rgb2rgb_template.c
1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB converter
4  *  extended with Software PAL8 to RGB converter
5  *               Software YUV to YUV converter
6  *               Software YUV to RGB converter
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  */
10
/*
 * Per-CPU instruction selection for the inline assembly below.
 * Every helper macro is #undef'd first so that it can be redefined here;
 * presumably this template is included more than once with different
 * HAVE_* settings -- confirm against the including file.
 */
11 #undef PREFETCH
12 #undef MOVNTQ
13 #undef EMMS
14 #undef SFENCE
15 #undef MMREG_SIZE
16 #undef PREFETCHW
17 #undef PAVGB
18
/* Size in bytes of one SIMD register: 16 with SSE2, otherwise 8. */
19 #ifdef HAVE_SSE2
20 #define MMREG_SIZE 16
21 #else
22 #define MMREG_SIZE 8
23 #endif
24
/*
 * Prefetch and byte-average mnemonics. Without 3DNow! or MMX2 the prefetch
 * macros expand to "/nop" and PAVGB stays undefined.
 * NOTE(review): "/nop" presumably relies on the assembler treating '/' as a
 * comment introducer so that the operand text following it is ignored --
 * confirm for every assembler this is built with.
 */
25 #ifdef HAVE_3DNOW
26 #define PREFETCH  "prefetch"
27 #define PREFETCHW "prefetchw"
28 #define PAVGB     "pavgusb"
29 #elif defined ( HAVE_MMX2 )
30 #define PREFETCH "prefetchnta"
31 #define PREFETCHW "prefetcht0"
32 #define PAVGB     "pavgb"
33 #else
34 #define PREFETCH "/nop"
35 #define PREFETCHW "/nop"
36 #endif
37
/* Instruction used to leave MMX state after an asm section. */
38 #ifdef HAVE_3DNOW
39 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
40 #define EMMS     "femms"
41 #else
42 #define EMMS     "emms"
43 #endif
44
/*
 * Store instruction and store fence: movntq/sfence with MMX2, otherwise a
 * plain movq and the "/nop" placeholder.
 */
45 #ifdef HAVE_MMX2
46 #define MOVNTQ "movntq"
47 #define SFENCE "sfence"
48 #else
49 #define MOVNTQ "movq"
50 #define SFENCE "/nop"
51 #endif
52
/**
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * The three source bytes are copied unchanged and a zero byte is appended.
 *
 * @param src      source buffer of 3-byte pixels
 * @param dst      destination buffer, receives 4 bytes per converted pixel
 * @param src_size size of the source data in bytes
 *
 * Only whole source pixels are converted; a trailing fragment of one or two
 * bytes is ignored instead of being read past the end of the buffer.
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  /* mask32 is defined outside this template; presumably it clears the
     fourth byte of every 32-bit pixel -- confirm. */
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  /* One pass converts 24 source bytes (8 pixels) into 32 output bytes.
     The last load (punpckldq at offset 21) is 4 bytes wide and therefore
     touches source byte 24, so at least 25 bytes must remain; a final
     group of exactly 24 bytes is left to the scalar loop below. */
  while(end - s > 24)
  {
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movd   %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd   6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd   12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd   18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand   %%mm7, %%mm0\n\t"
        "pand   %%mm7, %%mm1\n\t"
        "pand   %%mm7, %%mm2\n\t"
        "pand   %%mm7, %%mm3\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm2, 16%0\n\t"
        MOVNTQ" %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar conversion of whatever the MMX loop did not handle. */
  while(end - s >= 3)
  {
    dest[0] = s[0];
    dest[1] = s[1];
    dest[2] = s[2];
    dest[3] = 0;
    dest += 4;
    s += 3;
  }
}
103
/**
 * Pack 4-byte pixels into 3-byte pixels by dropping every fourth byte.
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, receives 3 bytes per converted pixel
 * @param src_size size of the source data in bytes
 *
 * Only whole source pixels are converted; a trailing fragment shorter than
 * four bytes is ignored instead of being read past the end of the buffer.
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  /* One pass converts 32 source bytes (8 pixels) into 24 output bytes.
     The mask24* constants are defined outside this template. */
  while(end - s >= 32)
  {
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movq   %1, %%mm0\n\t"
        "movq   8%1, %%mm1\n\t"
        "movq   16%1, %%mm4\n\t"
        "movq   24%1, %%mm5\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "movq   %%mm4, %%mm6\n\t"
        "movq   %%mm5, %%mm7\n\t"
        "psrlq  $8, %%mm2\n\t"
        "psrlq  $8, %%mm3\n\t"
        "psrlq  $8, %%mm6\n\t"
        "psrlq  $8, %%mm7\n\t"
        "pand   %2, %%mm0\n\t"
        "pand   %2, %%mm1\n\t"
        "pand   %2, %%mm4\n\t"
        "pand   %2, %%mm5\n\t"
        "pand   %3, %%mm2\n\t"
        "pand   %3, %%mm3\n\t"
        "pand   %3, %%mm6\n\t"
        "pand   %3, %%mm7\n\t"
        "por    %%mm2, %%mm0\n\t"
        "por    %%mm3, %%mm1\n\t"
        "por    %%mm6, %%mm4\n\t"
        "por    %%mm7, %%mm5\n\t"

        "movq   %%mm1, %%mm2\n\t"
        "movq   %%mm4, %%mm3\n\t"
        "psllq  $48, %%mm2\n\t"
        "psllq  $32, %%mm3\n\t"
        "pand   %4, %%mm2\n\t"
        "pand   %5, %%mm3\n\t"
        "por    %%mm2, %%mm0\n\t"
        "psrlq  $16, %%mm1\n\t"
        "psrlq  $32, %%mm4\n\t"
        "psllq  $16, %%mm5\n\t"
        "por    %%mm3, %%mm1\n\t"
        "pand   %6, %%mm5\n\t"
        "por    %%mm5, %%mm4\n\t"

        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
    dest += 24;
    s += 32;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar conversion of the remaining whole pixels. */
  while(end - s >= 4)
  {
    dest[0] = s[0];
    dest[1] = s[1];
    dest[2] = s[2];
    dest += 3;
    s += 4;   /* the fourth source byte is skipped */
  }
}
180
181 /*
182  Original by Strepto/Astral
183  ported to gcc & bugfixed : A'rpi
184  MMX2, 3DNOW optimization by Nick Kurshev
185  32bit c version, and and&add trick by Michael Niedermayer
186 */
/**
 * Convert 15-bit pixels (0rrrrrgggggbbbbb) to 16-bit pixels
 * (rrrrrggggggbbbbb).
 *
 * The blue field stays in place while the red and green fields move up by
 * one bit; this is done by adding the red/green bits of each pixel to the
 * pixel itself (x + (x & 0x7FE0)) after clearing the top bit.
 *
 * @param src      source buffer of 16-bit words holding 15-bit pixels
 * @param dst      destination buffer, same size as the source
 * @param src_size size of the source data in bytes
 *
 * Every whole 16-bit pixel is converted, including the remainder left by
 * the 16-byte MMX loop and a trailing odd pixel; a single stray byte at the
 * end is ignored. The scalar code accesses the buffers through 32-bit and
 * 16-bit pointers exactly as the previous implementation did, so they are
 * assumed to be suitably aligned for that -- confirm for new callers.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  const uint8_t *s = src;
  uint8_t *d = dst;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s));
  /* mask15s is defined outside this template. */
  __asm __volatile(
        "movq   %0, %%mm4\n\t"
        ::"m"(mask15s));
  /* One pass converts 16 bytes (8 pixels). */
  while(end - s >= 16)
  {
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "pand   %%mm4, %%mm0\n\t"
                "pand   %%mm4, %%mm2\n\t"
                "paddw  %%mm1, %%mm0\n\t"
                "paddw  %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                :"memory");
        d += 16;
        s += 16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Two pixels at a time. Unsigned arithmetic: the sum can exceed INT_MAX. */
  while(end - s >= 4)
  {
        const uint32_t x = *(const uint32_t *)s;
        *(uint32_t *)d = (x & 0x7FFF7FFF) + (x & 0x7FE07FE0);
        s += 4;
        d += 4;
  }
  /* Trailing single pixel when the pixel count is odd. */
  if(end - s >= 2)
  {
        const unsigned x = *(const uint16_t *)s;
        *(uint16_t *)d = (uint16_t)((x & 0x7FFF) + (x & 0x7FE0));
  }
}
246
/**
 * Swap the first and third byte of every 3-byte pixel.
 *
 * @param src      source buffer of 3-byte pixels
 * @param dst      destination buffer, same size as the source
 * @param src_size size of the source data in bytes
 *
 * All src_size / 3 whole pixels are converted (the previous loop stepped its
 * counter by 3 while comparing it with the pixel count and so stopped after
 * the first third of the image). Both outer bytes are read before anything
 * is stored, so dst may be the same buffer as src.
 */
static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const unsigned num_pixels = src_size / 3;
        unsigned p;
        for(p = 0; p < num_pixels; p++)
        {
                const unsigned o = 3 * p;
                const uint8_t first = src[o + 0];
                const uint8_t third = src[o + 2];
                dst[o + 0] = third;
                dst[o + 1] = src[o + 1];
                dst[o + 2] = first;
        }
}
257
/**
 * Convert 4-byte pixels to 16-bit pixels.
 *
 * With b, g, r being the first three bytes of a source pixel, the output
 * word is (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); the fourth source byte is
 * ignored.
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, written as 16-bit words
 * @param src_size size of the source data in bytes
 *
 * Only whole source pixels are converted in every build: the scalar loop is
 * shared by the MMX and the plain C configuration, so a trailing fragment
 * is ignored rather than read past the end of the buffer.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* The *_16mask constants are defined outside this template. */
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /* One pass converts 16 source bytes (4 pixels) into 8 output bytes. */
        while(end - s >= 16)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar conversion of the remaining whole pixels. */
        while(end - s >= 4)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
                s += 4;
        }
}
329
/**
 * Convert 4-byte pixels to 15-bit pixels stored in 16-bit words.
 *
 * With b, g, r being the first three bytes of a source pixel, the output
 * word is (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); the fourth source byte is
 * ignored.
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, written as 16-bit words
 * @param src_size size of the source data in bytes
 *
 * Only whole source pixels are converted in every build: the scalar loop is
 * shared by the MMX and the plain C configuration, so a trailing fragment
 * is ignored rather than read past the end of the buffer.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* The *_15mask constants are defined outside this template. */
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* One pass converts 16 source bytes (4 pixels) into 8 output bytes. */
        while(end - s >= 16)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar conversion of the remaining whole pixels. */
        while(end - s >= 4)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
                s += 4;
        }
}
401
/**
 * Convert 3-byte pixels to 16-bit pixels.
 *
 * With b, g, r being the three bytes of a source pixel, the output word is
 * (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 *
 * @param src      source buffer of 3-byte pixels
 * @param dst      destination buffer, written as 16-bit words
 * @param src_size size of the source data in bytes
 *
 * Only whole source pixels are converted in every build: the scalar loop is
 * shared by the MMX and the plain C configuration, so a trailing fragment
 * is ignored rather than read past the end of the buffer.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* The *_16mask constants are defined outside this template. */
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /* One pass converts 12 source bytes (4 pixels) into 8 output bytes.
           The last load (punpckldq at offset 9) is 4 bytes wide and touches
           source byte 12, so at least 13 bytes must remain; a final group
           of exactly 12 bytes is left to the scalar loop below. */
        while(end - s > 12)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar conversion of the remaining whole pixels. */
        while(end - s >= 3)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
                s += 3;
        }
}
472
/**
 * Convert 3-byte pixels to 15-bit pixels stored in 16-bit words.
 *
 * With b, g, r being the three bytes of a source pixel, the output word is
 * (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 *
 * @param src      source buffer of 3-byte pixels
 * @param dst      destination buffer, written as 16-bit words
 * @param src_size size of the source data in bytes
 *
 * Only whole source pixels are converted in every build: the scalar loop is
 * shared by the MMX and the plain C configuration, so a trailing fragment
 * is ignored rather than read past the end of the buffer.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* The *_15mask constants are defined outside this template. */
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* One pass converts 12 source bytes (4 pixels) into 8 output bytes.
           The last load (punpckldq at offset 9) is 4 bytes wide and touches
           source byte 12, so at least 13 bytes must remain; a final group
           of exactly 12 bytes is left to the scalar loop below. */
        while(end - s > 12)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar conversion of the remaining whole pixels. */
        while(end - s >= 3)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
                s += 3;
        }
}
543
/**
 * Swap the first and third byte of every 4-byte pixel.
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, same size as the source
 * @param src_size size of the source data in bytes
 *
 * The MMX loop handles only complete 8-byte groups (it used to run at least
 * once and to round the size up, writing past the end for an odd pixel
 * count); the remaining whole pixels are converted by the scalar loop. A
 * trailing fragment shorter than four bytes is ignored.
 *
 * The scalar loop reads both outer bytes before storing, so dst may be the
 * same buffer as src. As before, the scalar loop does not store the fourth
 * byte of a destination pixel.
 * NOTE(review): mask32r/g/b are defined outside this template; if they only
 * select the three colour bytes, the MMX path stores 0 in the fourth byte
 * while the scalar path leaves it untouched -- confirm which is intended.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        unsigned i = 0;
#ifdef HAVE_MMX
        const unsigned mmx_bytes = src_size & ~7U;
        if(mmx_bytes)
        {
                /* The asm stores through %1, which is only passed as a
                   register, hence the "memory" clobber. */
                asm volatile (
                        "xorl %%eax, %%eax              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        PREFETCH" 32(%0, %%eax)         \n\t"
                        "movq (%0, %%eax), %%mm0        \n\t"
                        "movq %%mm0, %%mm1              \n\t"
                        "movq %%mm0, %%mm2              \n\t"
                        "pslld $16, %%mm0               \n\t"
                        "psrld $16, %%mm1               \n\t"
                        "pand "MANGLE(mask32r)", %%mm0          \n\t"
                        "pand "MANGLE(mask32g)", %%mm2          \n\t"
                        "pand "MANGLE(mask32b)", %%mm1          \n\t"
                        "por %%mm0, %%mm2               \n\t"
                        "por %%mm1, %%mm2               \n\t"
                        MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
                        "addl $8, %%eax                 \n\t"
                        "cmpl %2, %%eax                 \n\t"
                        " jb 1b                         \n\t"
                        :: "r" (src), "r"(dst), "r" (mmx_bytes)
                        : "%eax", "memory"
                );

                __asm __volatile(SFENCE:::"memory");
                __asm __volatile(EMMS:::"memory");
                i = mmx_bytes;
        }
#endif
        /* i never exceeds src_size, so the subtraction cannot wrap. */
        for(; src_size - i >= 4; i += 4)
        {
                const uint8_t first = src[i + 0];
                const uint8_t third = src[i + 2];
                dst[i + 0] = third;
                dst[i + 1] = src[i + 1];
                dst[i + 2] = first;
        }
}
583
/**
 * Swap the first and third byte of every 3-byte pixel.
 *
 * @param src      source buffer of 3-byte pixels
 * @param dst      destination buffer, same size as the source
 * @param src_size size of the source data in bytes
 *
 * The scalar loop reads the third byte into a temporary before storing, so
 * dst may be the same buffer as src. Only whole pixels are converted; a
 * trailing fragment of one or two bytes is ignored.
 *
 * The MMX loop is entered only when it can run without touching memory
 * beyond the buffers (it used to execute once even for short inputs, which
 * overran both buffers and corrupted the remaining-size computation).
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        unsigned i;
#ifdef HAVE_MMX
        /* Each pass swaps 24 bytes, but its last load (movq at offset 18)
           reads source bytes up to offset 25. The counter is therefore
           biased by 25: it stays negative exactly while at least 26 bytes
           remain, and 25 - counter is the number of bytes left afterwards. */
        int mmx_size= 25 - src_size;
        if(mmx_size < 0)
        {
                /* The asm stores through %2, which is only passed as a
                   register, hence the "memory" clobber. */
                asm volatile (
                        "movq "MANGLE(mask24r)", %%mm5  \n\t"
                        "movq "MANGLE(mask24g)", %%mm6  \n\t"
                        "movq "MANGLE(mask24b)", %%mm7  \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        PREFETCH" 32(%1, %%eax)         \n\t"
                        "movq   (%1, %%eax), %%mm0      \n\t" // BGR BGR BG
                        "movq   (%1, %%eax), %%mm1      \n\t" // BGR BGR BG
                        "movq  2(%1, %%eax), %%mm2      \n\t" // R BGR BGR B
                        "psllq $16, %%mm0               \n\t" // 00 BGR BGR
                        "pand %%mm5, %%mm0              \n\t"
                        "pand %%mm6, %%mm1              \n\t"
                        "pand %%mm7, %%mm2              \n\t"
                        "por %%mm0, %%mm1               \n\t"
                        "por %%mm2, %%mm1               \n\t"
                        "movq  6(%1, %%eax), %%mm0      \n\t" // BGR BGR BG
                        MOVNTQ" %%mm1,   (%2, %%eax)    \n\t" // RGB RGB RG
                        "movq  8(%1, %%eax), %%mm1      \n\t" // R BGR BGR B
                        "movq 10(%1, %%eax), %%mm2      \n\t" // GR BGR BGR
                        "pand %%mm7, %%mm0              \n\t"
                        "pand %%mm5, %%mm1              \n\t"
                        "pand %%mm6, %%mm2              \n\t"
                        "por %%mm0, %%mm1               \n\t"
                        "por %%mm2, %%mm1               \n\t"
                        "movq 14(%1, %%eax), %%mm0      \n\t" // R BGR BGR B
                        MOVNTQ" %%mm1,  8(%2, %%eax)    \n\t" // B RGB RGB R
                        "movq 16(%1, %%eax), %%mm1      \n\t" // GR BGR BGR
                        "movq 18(%1, %%eax), %%mm2      \n\t" // BGR BGR BG
                        "pand %%mm6, %%mm0              \n\t"
                        "pand %%mm7, %%mm1              \n\t"
                        "pand %%mm5, %%mm2              \n\t"
                        "por %%mm0, %%mm1               \n\t"
                        "por %%mm2, %%mm1               \n\t"
                        MOVNTQ" %%mm1, 16(%2, %%eax)    \n\t"
                        "addl $24, %%eax                \n\t"
                        " js 1b                         \n\t"
                        : "+a" (mmx_size)
                        : "r" (src-mmx_size), "r"(dst-mmx_size)
                        : "memory"
                );

                __asm __volatile(SFENCE:::"memory");
                __asm __volatile(EMMS:::"memory");
        }

        if(mmx_size==25) return; // nothing left to convert
        /* Re-base src/dst/src_size onto the part the asm loop left over. */
        src+= src_size;
        dst+= src_size;
        src_size= 25 - mmx_size;
        src-= src_size;
        dst-= src_size;
#endif
        /* i never exceeds src_size, so the subtraction cannot wrap. */
        for(i=0; src_size - i >= 3; i+=3)
        {
                const uint8_t third = src[i + 2];
                dst[i + 1] = src[i + 1];
                dst[i + 2] = src[i + 0];
                dst[i + 0] = third;
        }
}
649
/**
 * Interleave separate Y, U and V planes into packed Y U Y V output.
 *
 * For every pair of luma samples one U and one V sample are consumed and
 * four output bytes (Y0 U Y1 V) are produced.
 *
 * @param ysrc             luma plane
 * @param usrc             first chroma plane (stored in output byte 1)
 * @param vsrc             second chroma plane (stored in output byte 3)
 * @param dst              packed output
 * @param width            luma samples per line; width/2 chroma samples
 *                         are processed per line
 * @param height           number of luma lines
 * @param lumStride        byte distance between luma lines
 * @param chromStride      byte distance between chroma lines
 * @param dstStride        byte distance between output lines
 * @param vertLumPerChroma number of luma lines sharing one chroma line; the
 *                         chroma pointers advance after every line whose
 *                         index has all bits of (vertLumPerChroma-1) set,
 *                         so this is expected to be a power of two
 *
 * The MMX loop handles 8 chroma samples (16 luma samples) per pass and does
 * not process a partial group separately, so with HAVE_MMX the width should
 * be a multiple of 16; a zero width is skipped.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        const unsigned chromaLineMask= vertLumPerChroma-1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
                /* The asm stores through %0, which is only passed as a
                   register, hence the "memory" clobber. */
                if(chromWidth)
                asm volatile(
                        "xorl %%eax, %%eax              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        PREFETCH" 32(%1, %%eax, 2)      \n\t"
                        PREFETCH" 32(%2, %%eax)         \n\t"
                        PREFETCH" 32(%3, %%eax)         \n\t"
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2              \n\t" // U(0)
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)

                        "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
                        "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
                        "movq %%mm3, %%mm4              \n\t" // Y(0)
                        "movq %%mm5, %%mm6              \n\t" // Y(8)
                        "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
                        "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
                        "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
                        "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)

                        MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
                        MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

                        "addl $8, %%eax                 \n\t"
                        "cmpl %4, %%eax                 \n\t"
                        " jb 1b                         \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
                        : "%eax", "memory"
                );
#else
                unsigned i;
                for(i=0; i<chromWidth; i++)
                {
                        dst[4*i+0] = ysrc[2*i+0];
                        dst[4*i+1] = usrc[i];
                        dst[4*i+2] = ysrc[2*i+1];
                        dst[4*i+3] = vsrc[i];
                }
#endif
                /* Advance chroma only after the last luma line that shares it. */
                if((y&chromaLineMask)==chromaLineMask)
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
717
/**
 * Pack planar Y/U/V data into Y U Y V output, with every chroma line shared
 * by two consecutive luma lines.
 *
 * Forwards all arguments unchanged to yuvPlanartoyuy2() with
 * vertLumPerChroma = 2.
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if
 * this is a problem for anyone then tell me, and I'll fix it).
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
        //FIXME interpolate chroma
        const int lumaLinesPerChromaLine = 2;

        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst,
                                width, height,
                                lumStride, chromStride, dstStride,
                                lumaLinesPerChromaLine);
}
730
/**
 * Pack planar Y/U/V data into Y U Y V output, with one chroma line per luma
 * line.
 *
 * Forwards all arguments unchanged to yuvPlanartoyuy2() with
 * vertLumPerChroma = 1.
 *
 * width should be a multiple of 16
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
        const int lumaLinesPerChromaLine = 1;

        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst,
                                width, height,
                                lumStride, chromStride, dstStride,
                                lumaLinesPerChromaLine);
}
741
742 /**
743  *
744  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
745  * problem for anyone then tell me, and ill fix it)
746  */
747 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
748         unsigned int width, unsigned int height,
749         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
750 {
751         int y;
752         const int chromWidth= width>>1;
753         for(y=0; y<height; y+=2)
754         {
755 #ifdef HAVE_MMX
756                 asm volatile(
757                         "xorl %%eax, %%eax              \n\t"
758                         "pcmpeqw %%mm7, %%mm7           \n\t"
759                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
760                         ".balign 16                     \n\t"
761                         "1:                             \n\t"
762                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
763                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
764                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
765                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
766                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
767                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
768                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
769                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
770                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
771                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
772                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
773
774                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
775
776                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
777                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
778                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
779                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
780                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
781                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
782                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
783                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
784                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
785                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
786
787                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
788
789                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
790                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
791                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
792                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
793                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
794                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
795                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
796                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
797
798                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
799                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
800
801                         "addl $8, %%eax                 \n\t"
802                         "cmpl %4, %%eax                 \n\t"
803                         " jb 1b                         \n\t"
804                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
805                         : "memory", "%eax"
806                 );
807
808                 ydst += lumStride;
809                 src  += srcStride;
810
811                 asm volatile(
812                         "xorl %%eax, %%eax              \n\t"
813                         ".balign 16                     \n\t"
814                         "1:                             \n\t"
815                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
816                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
817                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
818                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
819                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
820                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
821                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
822                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
823                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
824                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
825                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
826
827                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
828                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
829
830                         "addl $8, %%eax                 \n\t"
831                         "cmpl %4, %%eax                 \n\t"
832                         " jb 1b                         \n\t"
833
834                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
835                         : "memory", "%eax"
836                 );
837 #else
838                 int i;
839                 for(i=0; i<chromWidth; i++)
840                 {
841                         ydst[2*i+0]     = src[4*i+0];
842                         udst[i]         = src[4*i+1];
843                         ydst[2*i+1]     = src[4*i+2];
844                         vdst[i]         = src[4*i+3];
845                 }
846                 ydst += lumStride;
847                 src  += srcStride;
848
849                 for(i=0; i<chromWidth; i++)
850                 {
851                         ydst[2*i+0]     = src[4*i+0];
852                         ydst[2*i+1]     = src[4*i+2];
853                 }
854 #endif
855                 udst += chromStride;
856                 vdst += chromStride;
857                 ydst += lumStride;
858                 src  += srcStride;
859         }
860 #ifdef HAVE_MMX
861 asm volatile(   EMMS" \n\t"
862                 SFENCE" \n\t"
863                 :::"memory");
864 #endif
865 }
866
867 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
868         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
869         unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
870 {
871         /* Y Plane */
872         memcpy(ydst, ysrc, width*height);
873
874         /* XXX: implement upscaling for U,V */
875 }
876
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, the others are ignored. FIXME: write HQ version
 */
883 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
884         unsigned int width, unsigned int height,
885         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
886 {
887         int y;
888         const int chromWidth= width>>1;
889         for(y=0; y<height; y+=2)
890         {
891 #ifdef HAVE_MMX
892                 asm volatile(
893                         "xorl %%eax, %%eax              \n\t"
894                         "pcmpeqw %%mm7, %%mm7           \n\t"
895                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
896                         ".balign 16                     \n\t"
897                         "1:                             \n\t"
898                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
899                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
900                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
901                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
902                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
903                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
904                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
905                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
906                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
907                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
908                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
909
910                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
911
912                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
913                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
914                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
915                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
916                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
917                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
918                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
919                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
920                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
921                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
922
923                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
924
925                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
926                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
927                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
928                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
929                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
930                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
931                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
932                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
933
934                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
935                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
936
937                         "addl $8, %%eax                 \n\t"
938                         "cmpl %4, %%eax                 \n\t"
939                         " jb 1b                         \n\t"
940                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
941                         : "memory", "%eax"
942                 );
943
944                 ydst += lumStride;
945                 src  += srcStride;
946
947                 asm volatile(
948                         "xorl %%eax, %%eax              \n\t"
949                         ".balign 16                     \n\t"
950                         "1:                             \n\t"
951                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
952                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
953                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
954                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
955                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
956                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
957                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
958                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
959                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
960                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
961                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
962
963                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
964                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
965
966                         "addl $8, %%eax                 \n\t"
967                         "cmpl %4, %%eax                 \n\t"
968                         " jb 1b                         \n\t"
969
970                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
971                         : "memory", "%eax"
972                 );
973 #else
974                 int i;
975                 for(i=0; i<chromWidth; i++)
976                 {
977                         udst[i]         = src[4*i+0];
978                         ydst[2*i+0]     = src[4*i+1];
979                         vdst[i]         = src[4*i+2];
980                         ydst[2*i+1]     = src[4*i+3];
981                 }
982                 ydst += lumStride;
983                 src  += srcStride;
984
985                 for(i=0; i<chromWidth; i++)
986                 {
987                         ydst[2*i+0]     = src[4*i+1];
988                         ydst[2*i+1]     = src[4*i+3];
989                 }
990 #endif
991                 udst += chromStride;
992                 vdst += chromStride;
993                 ydst += lumStride;
994                 src  += srcStride;
995         }
996 #ifdef HAVE_MMX
997 asm volatile(   EMMS" \n\t"
998                 SFENCE" \n\t"
999                 :::"memory");
1000 #endif
1001 }
1002
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, the others are ignored in the C version. FIXME: write HQ version
 */
1009 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1010         unsigned int width, unsigned int height,
1011         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1012 {
1013         int y;
1014         const int chromWidth= width>>1;
1015 #ifdef HAVE_MMX
1016         for(y=0; y<height-2; y+=2)
1017         {
1018                 int i;
1019                 for(i=0; i<2; i++)
1020                 {
1021                         asm volatile(
1022                                 "movl %2, %%eax                 \n\t"
1023                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1024                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1025                                 "pxor %%mm7, %%mm7              \n\t"
1026                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1027                                 ".balign 16                     \n\t"
1028                                 "1:                             \n\t"
1029                                 PREFETCH" 64(%0, %%ebx)         \n\t"
1030                                 "movd (%0, %%ebx), %%mm0        \n\t"
1031                                 "movd 3(%0, %%ebx), %%mm1       \n\t"
1032                                 "punpcklbw %%mm7, %%mm0         \n\t"
1033                                 "punpcklbw %%mm7, %%mm1         \n\t"
1034                                 "movd 6(%0, %%ebx), %%mm2       \n\t"
1035                                 "movd 9(%0, %%ebx), %%mm3       \n\t"
1036                                 "punpcklbw %%mm7, %%mm2         \n\t"
1037                                 "punpcklbw %%mm7, %%mm3         \n\t"
1038                                 "pmaddwd %%mm6, %%mm0           \n\t"
1039                                 "pmaddwd %%mm6, %%mm1           \n\t"
1040                                 "pmaddwd %%mm6, %%mm2           \n\t"
1041                                 "pmaddwd %%mm6, %%mm3           \n\t"
1042 #ifndef FAST_BGR2YV12
1043                                 "psrad $8, %%mm0                \n\t"
1044                                 "psrad $8, %%mm1                \n\t"
1045                                 "psrad $8, %%mm2                \n\t"
1046                                 "psrad $8, %%mm3                \n\t"
1047 #endif
1048                                 "packssdw %%mm1, %%mm0          \n\t"
1049                                 "packssdw %%mm3, %%mm2          \n\t"
1050                                 "pmaddwd %%mm5, %%mm0           \n\t"
1051                                 "pmaddwd %%mm5, %%mm2           \n\t"
1052                                 "packssdw %%mm2, %%mm0          \n\t"
1053                                 "psraw $7, %%mm0                \n\t"
1054
1055                                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1056                                 "movd 15(%0, %%ebx), %%mm1      \n\t"
1057                                 "punpcklbw %%mm7, %%mm4         \n\t"
1058                                 "punpcklbw %%mm7, %%mm1         \n\t"
1059                                 "movd 18(%0, %%ebx), %%mm2      \n\t"
1060                                 "movd 21(%0, %%ebx), %%mm3      \n\t"
1061                                 "punpcklbw %%mm7, %%mm2         \n\t"
1062                                 "punpcklbw %%mm7, %%mm3         \n\t"
1063                                 "pmaddwd %%mm6, %%mm4           \n\t"
1064                                 "pmaddwd %%mm6, %%mm1           \n\t"
1065                                 "pmaddwd %%mm6, %%mm2           \n\t"
1066                                 "pmaddwd %%mm6, %%mm3           \n\t"
1067 #ifndef FAST_BGR2YV12
1068                                 "psrad $8, %%mm4                \n\t"
1069                                 "psrad $8, %%mm1                \n\t"
1070                                 "psrad $8, %%mm2                \n\t"
1071                                 "psrad $8, %%mm3                \n\t"
1072 #endif
1073                                 "packssdw %%mm1, %%mm4          \n\t"
1074                                 "packssdw %%mm3, %%mm2          \n\t"
1075                                 "pmaddwd %%mm5, %%mm4           \n\t"
1076                                 "pmaddwd %%mm5, %%mm2           \n\t"
1077                                 "addl $24, %%ebx                \n\t"
1078                                 "packssdw %%mm2, %%mm4          \n\t"
1079                                 "psraw $7, %%mm4                \n\t"
1080
1081                                 "packuswb %%mm4, %%mm0          \n\t"
1082                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1083
1084                                 MOVNTQ" %%mm0, (%1, %%eax)      \n\t"
1085                                 "addl $8, %%eax                 \n\t"
1086                                 " js 1b                         \n\t"
1087                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
1088                                 : "%eax", "%ebx"
1089                         );
1090                         ydst += lumStride;
1091                         src  += srcStride;
1092                 }
1093                 src -= srcStride*2;
1094                 asm volatile(
1095                         "movl %4, %%eax                 \n\t"
1096                         "movq "MANGLE(w1111)", %%mm5            \n\t"
1097                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1098                         "pxor %%mm7, %%mm7              \n\t"
1099                         "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1100                         "addl %%ebx, %%ebx              \n\t"
1101                         ".balign 16                     \n\t"
1102                         "1:                             \n\t"
1103                         PREFETCH" 64(%0, %%ebx)         \n\t"
1104                         PREFETCH" 64(%1, %%ebx)         \n\t"
1105 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1106                         "movq (%0, %%ebx), %%mm0        \n\t"
1107                         "movq (%1, %%ebx), %%mm1        \n\t"
1108                         "movq 6(%0, %%ebx), %%mm2       \n\t"
1109                         "movq 6(%1, %%ebx), %%mm3       \n\t"
1110                         PAVGB" %%mm1, %%mm0             \n\t"
1111                         PAVGB" %%mm3, %%mm2             \n\t"
1112                         "movq %%mm0, %%mm1              \n\t"
1113                         "movq %%mm2, %%mm3              \n\t"
1114                         "psrlq $24, %%mm0               \n\t"
1115                         "psrlq $24, %%mm2               \n\t"
1116                         PAVGB" %%mm1, %%mm0             \n\t"
1117                         PAVGB" %%mm3, %%mm2             \n\t"
1118                         "punpcklbw %%mm7, %%mm0         \n\t"
1119                         "punpcklbw %%mm7, %%mm2         \n\t"
1120 #else
1121                         "movd (%0, %%ebx), %%mm0        \n\t"
1122                         "movd (%1, %%ebx), %%mm1        \n\t"
1123                         "movd 3(%0, %%ebx), %%mm2       \n\t"
1124                         "movd 3(%1, %%ebx), %%mm3       \n\t"
1125                         "punpcklbw %%mm7, %%mm0         \n\t"
1126                         "punpcklbw %%mm7, %%mm1         \n\t"
1127                         "punpcklbw %%mm7, %%mm2         \n\t"
1128                         "punpcklbw %%mm7, %%mm3         \n\t"
1129                         "paddw %%mm1, %%mm0             \n\t"
1130                         "paddw %%mm3, %%mm2             \n\t"
1131                         "paddw %%mm2, %%mm0             \n\t"
1132                         "movd 6(%0, %%ebx), %%mm4       \n\t"
1133                         "movd 6(%1, %%ebx), %%mm1       \n\t"
1134                         "movd 9(%0, %%ebx), %%mm2       \n\t"
1135                         "movd 9(%1, %%ebx), %%mm3       \n\t"
1136                         "punpcklbw %%mm7, %%mm4         \n\t"
1137                         "punpcklbw %%mm7, %%mm1         \n\t"
1138                         "punpcklbw %%mm7, %%mm2         \n\t"
1139                         "punpcklbw %%mm7, %%mm3         \n\t"
1140                         "paddw %%mm1, %%mm4             \n\t"
1141                         "paddw %%mm3, %%mm2             \n\t"
1142                         "paddw %%mm4, %%mm2             \n\t"
1143                         "psrlw $2, %%mm0                \n\t"
1144                         "psrlw $2, %%mm2                \n\t"
1145 #endif
1146                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1147                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1148
1149                         "pmaddwd %%mm0, %%mm1           \n\t"
1150                         "pmaddwd %%mm2, %%mm3           \n\t"
1151                         "pmaddwd %%mm6, %%mm0           \n\t"
1152                         "pmaddwd %%mm6, %%mm2           \n\t"
1153 #ifndef FAST_BGR2YV12
1154                         "psrad $8, %%mm0                \n\t"
1155                         "psrad $8, %%mm1                \n\t"
1156                         "psrad $8, %%mm2                \n\t"
1157                         "psrad $8, %%mm3                \n\t"
1158 #endif
1159                         "packssdw %%mm2, %%mm0          \n\t"
1160                         "packssdw %%mm3, %%mm1          \n\t"
1161                         "pmaddwd %%mm5, %%mm0           \n\t"
1162                         "pmaddwd %%mm5, %%mm1           \n\t"
1163                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1164                         "psraw $7, %%mm0                \n\t"
1165
1166 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1167                         "movq 12(%0, %%ebx), %%mm4      \n\t"
1168                         "movq 12(%1, %%ebx), %%mm1      \n\t"
1169                         "movq 18(%0, %%ebx), %%mm2      \n\t"
1170                         "movq 18(%1, %%ebx), %%mm3      \n\t"
1171                         PAVGB" %%mm1, %%mm4             \n\t"
1172                         PAVGB" %%mm3, %%mm2             \n\t"
1173                         "movq %%mm4, %%mm1              \n\t"
1174                         "movq %%mm2, %%mm3              \n\t"
1175                         "psrlq $24, %%mm4               \n\t"
1176                         "psrlq $24, %%mm2               \n\t"
1177                         PAVGB" %%mm1, %%mm4             \n\t"
1178                         PAVGB" %%mm3, %%mm2             \n\t"
1179                         "punpcklbw %%mm7, %%mm4         \n\t"
1180                         "punpcklbw %%mm7, %%mm2         \n\t"
1181 #else
1182                         "movd 12(%0, %%ebx), %%mm4      \n\t"
1183                         "movd 12(%1, %%ebx), %%mm1      \n\t"
1184                         "movd 15(%0, %%ebx), %%mm2      \n\t"
1185                         "movd 15(%1, %%ebx), %%mm3      \n\t"
1186                         "punpcklbw %%mm7, %%mm4         \n\t"
1187                         "punpcklbw %%mm7, %%mm1         \n\t"
1188                         "punpcklbw %%mm7, %%mm2         \n\t"
1189                         "punpcklbw %%mm7, %%mm3         \n\t"
1190                         "paddw %%mm1, %%mm4             \n\t"
1191                         "paddw %%mm3, %%mm2             \n\t"
1192                         "paddw %%mm2, %%mm4             \n\t"
1193                         "movd 18(%0, %%ebx), %%mm5      \n\t"
1194                         "movd 18(%1, %%ebx), %%mm1      \n\t"
1195                         "movd 21(%0, %%ebx), %%mm2      \n\t"
1196                         "movd 21(%1, %%ebx), %%mm3      \n\t"
1197                         "punpcklbw %%mm7, %%mm5         \n\t"
1198                         "punpcklbw %%mm7, %%mm1         \n\t"
1199                         "punpcklbw %%mm7, %%mm2         \n\t"
1200                         "punpcklbw %%mm7, %%mm3         \n\t"
1201                         "paddw %%mm1, %%mm5             \n\t"
1202                         "paddw %%mm3, %%mm2             \n\t"
1203                         "paddw %%mm5, %%mm2             \n\t"
1204                         "movq "MANGLE(w1111)", %%mm5            \n\t"
1205                         "psrlw $2, %%mm4                \n\t"
1206                         "psrlw $2, %%mm2                \n\t"
1207 #endif
1208                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1209                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1210
1211                         "pmaddwd %%mm4, %%mm1           \n\t"
1212                         "pmaddwd %%mm2, %%mm3           \n\t"
1213                         "pmaddwd %%mm6, %%mm4           \n\t"
1214                         "pmaddwd %%mm6, %%mm2           \n\t"
1215 #ifndef FAST_BGR2YV12
1216                         "psrad $8, %%mm4                \n\t"
1217                         "psrad $8, %%mm1                \n\t"
1218                         "psrad $8, %%mm2                \n\t"
1219                         "psrad $8, %%mm3                \n\t"
1220 #endif
1221                         "packssdw %%mm2, %%mm4          \n\t"
1222                         "packssdw %%mm3, %%mm1          \n\t"
1223                         "pmaddwd %%mm5, %%mm4           \n\t"
1224                         "pmaddwd %%mm5, %%mm1           \n\t"
1225                         "addl $24, %%ebx                \n\t"
1226                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1227                         "psraw $7, %%mm4                \n\t"
1228
1229                         "movq %%mm0, %%mm1              \n\t"
1230                         "punpckldq %%mm4, %%mm0         \n\t"
1231                         "punpckhdq %%mm4, %%mm1         \n\t"
1232                         "packsswb %%mm1, %%mm0          \n\t"
1233                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
1234
1235                         "movd %%mm0, (%2, %%eax)        \n\t"
1236                         "punpckhdq %%mm0, %%mm0         \n\t"
1237                         "movd %%mm0, (%3, %%eax)        \n\t"
1238                         "addl $4, %%eax                 \n\t"
1239                         " js 1b                         \n\t"
1240                         : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
1241                         : "%eax", "%ebx"
1242                 );
1243
1244                 udst += chromStride;
1245                 vdst += chromStride;
1246                 src  += srcStride*2;
1247         }
1248
1249         asm volatile(   EMMS" \n\t"
1250                         SFENCE" \n\t"
1251                         :::"memory");
1252 #else
1253         y=0;
1254 #endif
1255         for(; y<height; y+=2)
1256         {
1257                 int i;
1258                 for(i=0; i<chromWidth; i++)
1259                 {
1260                         unsigned int b= src[6*i+0];
1261                         unsigned int g= src[6*i+1];
1262                         unsigned int r= src[6*i+2];
1263
1264                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1265                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
1266                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
1267
1268                         udst[i]         = U;
1269                         vdst[i]         = V;
1270                         ydst[2*i]       = Y;
1271
1272                         b= src[6*i+3];
1273                         g= src[6*i+4];
1274                         r= src[6*i+5];
1275
1276                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1277                         ydst[2*i+1]     = Y;
1278                 }
1279                 ydst += lumStride;
1280                 src  += srcStride;
1281
1282                 for(i=0; i<chromWidth; i++)
1283                 {
1284                         unsigned int b= src[6*i+0];
1285                         unsigned int g= src[6*i+1];
1286                         unsigned int r= src[6*i+2];
1287
1288                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1289
1290                         ydst[2*i]       = Y;
1291
1292                         b= src[6*i+3];
1293                         g= src[6*i+4];
1294                         r= src[6*i+5];
1295
1296                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1297                         ydst[2*i+1]     = Y;
1298                 }
1299                 udst += chromStride;
1300                 vdst += chromStride;
1301                 ydst += lumStride;
1302                 src  += srcStride;
1303         }
1304 }
1305
1306 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
1307                             int width, int height, int src1Stride, int src2Stride, int dstStride){
1308         int h;
1309
1310         for(h=0; h < height; h++)
1311         {
1312                 int w;
1313
1314 #ifdef HAVE_MMX
1315 #ifdef HAVE_SSE2
1316                 asm(
1317                         "xorl %%eax, %%eax              \n\t"
1318                         "1:                             \n\t"
1319                         PREFETCH" 64(%1, %%eax)         \n\t"
1320                         PREFETCH" 64(%2, %%eax)         \n\t"
1321                         "movdqa (%1, %%eax), %%xmm0     \n\t"
1322                         "movdqa (%1, %%eax), %%xmm1     \n\t"
1323                         "movdqa (%2, %%eax), %%xmm2     \n\t"
1324                         "punpcklbw %%xmm2, %%xmm0       \n\t"
1325                         "punpckhbw %%xmm2, %%xmm1       \n\t"
1326                         "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
1327                         "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
1328                         "addl $16, %%eax                        \n\t"
1329                         "cmpl %3, %%eax                 \n\t"
1330                         " jb 1b                         \n\t"
1331                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
1332                         : "memory", "%eax"
1333                 );
1334 #else
1335                 asm(
1336                         "xorl %%eax, %%eax              \n\t"
1337                         "1:                             \n\t"
1338                         PREFETCH" 64(%1, %%eax)         \n\t"
1339                         PREFETCH" 64(%2, %%eax)         \n\t"
1340                         "movq (%1, %%eax), %%mm0        \n\t"
1341                         "movq 8(%1, %%eax), %%mm2       \n\t"
1342                         "movq %%mm0, %%mm1              \n\t"
1343                         "movq %%mm2, %%mm3              \n\t"
1344                         "movq (%2, %%eax), %%mm4        \n\t"
1345                         "movq 8(%2, %%eax), %%mm5       \n\t"
1346                         "punpcklbw %%mm4, %%mm0         \n\t"
1347                         "punpckhbw %%mm4, %%mm1         \n\t"
1348                         "punpcklbw %%mm5, %%mm2         \n\t"
1349                         "punpckhbw %%mm5, %%mm3         \n\t"
1350                         MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
1351                         MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
1352                         MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
1353                         MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
1354                         "addl $16, %%eax                        \n\t"
1355                         "cmpl %3, %%eax                 \n\t"
1356                         " jb 1b                         \n\t"
1357                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
1358                         : "memory", "%eax"
1359                 );
1360 #endif
1361                 for(w= (width&(~15)); w < width; w++)
1362                 {
1363                         dest[2*w+0] = src1[w];
1364                         dest[2*w+1] = src2[w];
1365                 }
1366 #else
1367                 for(w=0; w < width; w++)
1368                 {
1369                         dest[2*w+0] = src1[w];
1370                         dest[2*w+1] = src2[w];
1371                 }
1372 #endif
1373                 dest += dstStride;
1374                 src1 += src1Stride;
1375                 src2 += src2Stride;
1376         }
1377 #ifdef HAVE_MMX
1378         asm(
1379                 EMMS" \n\t"
1380                 SFENCE" \n\t"
1381                 ::: "memory"
1382                 );
1383 #endif
1384 }