]> git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb_template.c
fixing end overwrite bugs (some at least)
[ffmpeg] / postproc / rgb2rgb_template.c
1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB convertor
4  *  pluralize by Software PAL8 to RGB convertor
5  *               Software YUV to YUV convertor
6  *               Software YUV to RGB convertor
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  */
10
/*
 * Instruction selection for this pass over the template.  All helper macros
 * are dropped first so the definitions below always reflect the HAVE_*
 * switches that are in effect here.
 */
#undef MMREG_SIZE
#undef PREFETCH
#undef PREFETCHW
#undef PAVGB
#undef EMMS
#undef MOVNTQ
#undef SFENCE

/* MMREG_SIZE: 16 when HAVE_SSE2 is set, 8 otherwise. */
#if defined(HAVE_SSE2)
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch hints and packed byte average.  Without 3DNow! or MMX2 the two
 * prefetch macros expand to "/nop" and PAVGB stays undefined.
 */
#if defined(HAVE_3DNOW)
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined(HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  "/nop"
#define PREFETCHW "/nop"
#endif

/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#if defined(HAVE_3DNOW)
#define EMMS      "femms"
#else
#define EMMS      "emms"
#endif

/* Non-temporal store plus its fence with MMX2; plain movq and "/nop" without. */
#if !defined(HAVE_MMX2)
#define MOVNTQ    "movq"
#define SFENCE    "/nop"
#else
#define MOVNTQ    "movntq"
#define SFENCE    "sfence"
#endif
52
53 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
54 {
55   uint8_t *dest = dst;
56   const uint8_t *s = src;
57   const uint8_t *end;
58 #ifdef HAVE_MMX
59   const uint8_t *mm_end;
60 #endif
61   end = s + src_size;
62 #ifdef HAVE_MMX
63   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
64   mm_end = end - 23;
65   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
66   while(s < mm_end)
67   {
68     __asm __volatile(
69         PREFETCH"       32%1\n\t"
70         "movd   %1, %%mm0\n\t"
71         "punpckldq 3%1, %%mm0\n\t"
72         "movd   6%1, %%mm1\n\t"
73         "punpckldq 9%1, %%mm1\n\t"
74         "movd   12%1, %%mm2\n\t"
75         "punpckldq 15%1, %%mm2\n\t"
76         "movd   18%1, %%mm3\n\t"
77         "punpckldq 21%1, %%mm3\n\t"
78         "pand   %%mm7, %%mm0\n\t"
79         "pand   %%mm7, %%mm1\n\t"
80         "pand   %%mm7, %%mm2\n\t"
81         "pand   %%mm7, %%mm3\n\t"
82         MOVNTQ" %%mm0, %0\n\t"
83         MOVNTQ" %%mm1, 8%0\n\t"
84         MOVNTQ" %%mm2, 16%0\n\t"
85         MOVNTQ" %%mm3, 24%0"
86         :"=m"(*dest)
87         :"m"(*s)
88         :"memory");
89     dest += 32;
90     s += 24;
91   }
92   __asm __volatile(SFENCE:::"memory");
93   __asm __volatile(EMMS:::"memory");
94 #endif
95   while(s < end)
96   {
97     *dest++ = *s++;
98     *dest++ = *s++;
99     *dest++ = *s++;
100     *dest++ = 0;
101   }
102 }
103
104 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
105 {
106   uint8_t *dest = dst;
107   const uint8_t *s = src;
108   const uint8_t *end;
109 #ifdef HAVE_MMX
110   const uint8_t *mm_end;
111 #endif
112   end = s + src_size;
113 #ifdef HAVE_MMX
114   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
115   mm_end = end - 31;
116   while(s < mm_end)
117   {
118     __asm __volatile(
119         PREFETCH"       32%1\n\t"
120         "movq   %1, %%mm0\n\t"
121         "movq   8%1, %%mm1\n\t"
122         "movq   16%1, %%mm4\n\t"
123         "movq   24%1, %%mm5\n\t"
124         "movq   %%mm0, %%mm2\n\t"
125         "movq   %%mm1, %%mm3\n\t"
126         "movq   %%mm4, %%mm6\n\t"
127         "movq   %%mm5, %%mm7\n\t"
128         "psrlq  $8, %%mm2\n\t"
129         "psrlq  $8, %%mm3\n\t"
130         "psrlq  $8, %%mm6\n\t"
131         "psrlq  $8, %%mm7\n\t"
132         "pand   %2, %%mm0\n\t"
133         "pand   %2, %%mm1\n\t"
134         "pand   %2, %%mm4\n\t"
135         "pand   %2, %%mm5\n\t"
136         "pand   %3, %%mm2\n\t"
137         "pand   %3, %%mm3\n\t"
138         "pand   %3, %%mm6\n\t"
139         "pand   %3, %%mm7\n\t"
140         "por    %%mm2, %%mm0\n\t"
141         "por    %%mm3, %%mm1\n\t"
142         "por    %%mm6, %%mm4\n\t"
143         "por    %%mm7, %%mm5\n\t"
144
145         "movq   %%mm1, %%mm2\n\t"
146         "movq   %%mm4, %%mm3\n\t"
147         "psllq  $48, %%mm2\n\t"
148         "psllq  $32, %%mm3\n\t"
149         "pand   %4, %%mm2\n\t"
150         "pand   %5, %%mm3\n\t"
151         "por    %%mm2, %%mm0\n\t"
152         "psrlq  $16, %%mm1\n\t"
153         "psrlq  $32, %%mm4\n\t"
154         "psllq  $16, %%mm5\n\t"
155         "por    %%mm3, %%mm1\n\t"
156         "pand   %6, %%mm5\n\t"
157         "por    %%mm5, %%mm4\n\t"
158
159         MOVNTQ" %%mm0, %0\n\t"
160         MOVNTQ" %%mm1, 8%0\n\t"
161         MOVNTQ" %%mm4, 16%0"
162         :"=m"(*dest)
163         :"m"(*s),"m"(mask24l),
164          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
165         :"memory");
166     dest += 24;
167     s += 32;
168   }
169   __asm __volatile(SFENCE:::"memory");
170   __asm __volatile(EMMS:::"memory");
171 #endif
172   while(s < end)
173   {
174     *dest++ = *s++;
175     *dest++ = *s++;
176     *dest++ = *s++;
177     s++;
178   }
179 }
180
181 /*
182  Original by Strepto/Astral
183  ported to gcc & bugfixed : A'rpi
184  MMX2, 3DNOW optimization by Nick Kurshev
185  32bit c version, and and&add trick by Michael Niedermayer
186 */
187 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
188 {
189 #ifdef HAVE_MMX
190   register int offs=15-src_size;
191   register const char* s=src-offs;
192   register char* d=dst-offs;
193   __asm __volatile(PREFETCH"    %0"::"m"(*(s+offs)));
194   __asm __volatile(
195         "movq   %0, %%mm4\n\t"
196         ::"m"(mask15s));
197   while(offs<0)
198   {
199         __asm __volatile(
200                 PREFETCH"       32%1\n\t"
201                 "movq   %1, %%mm0\n\t"
202                 "movq   8%1, %%mm2\n\t"
203                 "movq   %%mm0, %%mm1\n\t"
204                 "movq   %%mm2, %%mm3\n\t"
205                 "pand   %%mm4, %%mm0\n\t"
206                 "pand   %%mm4, %%mm2\n\t"
207                 "paddw  %%mm1, %%mm0\n\t"
208                 "paddw  %%mm3, %%mm2\n\t"
209                 MOVNTQ" %%mm0, %0\n\t"
210                 MOVNTQ" %%mm2, 8%0"
211                 :"=m"(*(d+offs))
212                 :"m"(*(s+offs))
213                 );
214         offs+=16;
215   }
216   __asm __volatile(SFENCE:::"memory");
217   __asm __volatile(EMMS:::"memory");
218 #else
219 #if 0
220    const uint16_t *s1=( uint16_t * )src;
221    uint16_t *d1=( uint16_t * )dst;
222    uint16_t *e=((uint8_t *)s1)+src_size;
223    while( s1<e ){
224      register int x=*( s1++ );
225      /* rrrrrggggggbbbbb
226         0rrrrrgggggbbbbb
227         0111 1111 1110 0000=0x7FE0
228         00000000000001 1111=0x001F */
229      *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
230    }
231 #else
232         const unsigned *s1=( unsigned * )src;
233         unsigned *d1=( unsigned * )dst;
234         int i;
235         int size= src_size>>2;
236         for(i=0; i<size; i++)
237         {
238                 register int x= s1[i];
239 //              d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true
240                 d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
241
242         }
243 #endif
244 #endif
245 }
246
247 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
248 {
249 #ifdef HAVE_MMX
250         const uint8_t *s = src;
251         const uint8_t *end,*mm_end;
252         uint16_t *d = (uint16_t *)dst;
253         end = s + src_size;
254         mm_end = end - 15;
255         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
256         __asm __volatile(
257             "movq       %0, %%mm7\n\t"
258             "movq       %1, %%mm6\n\t"
259             ::"m"(red_16mask),"m"(green_16mask));
260         while(s < mm_end)
261         {
262             __asm __volatile(
263                 PREFETCH" 32%1\n\t"
264                 "movd   %1, %%mm0\n\t"
265                 "movd   4%1, %%mm3\n\t"
266                 "punpckldq 8%1, %%mm0\n\t"
267                 "punpckldq 12%1, %%mm3\n\t"
268                 "movq   %%mm0, %%mm1\n\t"
269                 "movq   %%mm0, %%mm2\n\t"
270                 "movq   %%mm3, %%mm4\n\t"
271                 "movq   %%mm3, %%mm5\n\t"
272                 "psrlq  $3, %%mm0\n\t"
273                 "psrlq  $3, %%mm3\n\t"
274                 "pand   %2, %%mm0\n\t"
275                 "pand   %2, %%mm3\n\t"
276                 "psrlq  $5, %%mm1\n\t"
277                 "psrlq  $5, %%mm4\n\t"
278                 "pand   %%mm6, %%mm1\n\t"
279                 "pand   %%mm6, %%mm4\n\t"
280                 "psrlq  $8, %%mm2\n\t"
281                 "psrlq  $8, %%mm5\n\t"
282                 "pand   %%mm7, %%mm2\n\t"
283                 "pand   %%mm7, %%mm5\n\t"
284                 "por    %%mm1, %%mm0\n\t"
285                 "por    %%mm4, %%mm3\n\t"
286                 "por    %%mm2, %%mm0\n\t"
287                 "por    %%mm5, %%mm3\n\t"
288                 "psllq  $16, %%mm3\n\t"
289                 "por    %%mm3, %%mm0\n\t"
290                 MOVNTQ" %%mm0, %0\n\t"
291                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
292                 d += 4;
293                 s += 16;
294         }
295         while(s < end)
296         {
297                 const int b= *s++;
298                 const int g= *s++;
299                 const int r= *s++;
300                 s++;
301                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
302         }
303         __asm __volatile(SFENCE:::"memory");
304         __asm __volatile(EMMS:::"memory");
305 #else
306         unsigned j,i,num_pixels=src_size/4;
307         uint16_t *d = (uint16_t *)dst;
308         for(i=0,j=0; j<num_pixels; i+=4,j++)
309         {
310                 const int b= src[i+0];
311                 const int g= src[i+1];
312                 const int r= src[i+2];
313
314                 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
315         }
316 #endif
317 }
318
319 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
320 {
321 #ifdef HAVE_MMX
322         const uint8_t *s = src;
323         const uint8_t *end,*mm_end;
324         uint16_t *d = (uint16_t *)dst;
325         end = s + src_size;
326         mm_end = end - 15;
327         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
328         __asm __volatile(
329             "movq       %0, %%mm7\n\t"
330             "movq       %1, %%mm6\n\t"
331             ::"m"(red_15mask),"m"(green_15mask));
332         while(s < mm_end)
333         {
334             __asm __volatile(
335                 PREFETCH" 32%1\n\t"
336                 "movd   %1, %%mm0\n\t"
337                 "movd   4%1, %%mm3\n\t"
338                 "punpckldq 8%1, %%mm0\n\t"
339                 "punpckldq 12%1, %%mm3\n\t"
340                 "movq   %%mm0, %%mm1\n\t"
341                 "movq   %%mm0, %%mm2\n\t"
342                 "movq   %%mm3, %%mm4\n\t"
343                 "movq   %%mm3, %%mm5\n\t"
344                 "psrlq  $3, %%mm0\n\t"
345                 "psrlq  $3, %%mm3\n\t"
346                 "pand   %2, %%mm0\n\t"
347                 "pand   %2, %%mm3\n\t"
348                 "psrlq  $6, %%mm1\n\t"
349                 "psrlq  $6, %%mm4\n\t"
350                 "pand   %%mm6, %%mm1\n\t"
351                 "pand   %%mm6, %%mm4\n\t"
352                 "psrlq  $9, %%mm2\n\t"
353                 "psrlq  $9, %%mm5\n\t"
354                 "pand   %%mm7, %%mm2\n\t"
355                 "pand   %%mm7, %%mm5\n\t"
356                 "por    %%mm1, %%mm0\n\t"
357                 "por    %%mm4, %%mm3\n\t"
358                 "por    %%mm2, %%mm0\n\t"
359                 "por    %%mm5, %%mm3\n\t"
360                 "psllq  $16, %%mm3\n\t"
361                 "por    %%mm3, %%mm0\n\t"
362                 MOVNTQ" %%mm0, %0\n\t"
363                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
364                 d += 4;
365                 s += 16;
366         }
367         while(s < end)
368         {
369                 const int b= *s++;
370                 const int g= *s++;
371                 const int r= *s++;
372                 s++;
373                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
374         }
375         __asm __volatile(SFENCE:::"memory");
376         __asm __volatile(EMMS:::"memory");
377 #else
378         unsigned j,i,num_pixels=src_size/4;
379         uint16_t *d = (uint16_t *)dst;
380         for(i=0,j=0; j<num_pixels; i+=4,j++)
381         {
382                 const int b= src[i+0];
383                 const int g= src[i+1];
384                 const int r= src[i+2];
385
386                 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
387         }
388 #endif
389 }
390
391 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
392 {
393 #ifdef HAVE_MMX
394         const uint8_t *s = src;
395         const uint8_t *end,*mm_end;
396         uint16_t *d = (uint16_t *)dst;
397         end = s + src_size;
398         mm_end = end - 11;
399         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
400         __asm __volatile(
401             "movq       %0, %%mm7\n\t"
402             "movq       %1, %%mm6\n\t"
403             ::"m"(red_16mask),"m"(green_16mask));
404         while(s < mm_end)
405         {
406             __asm __volatile(
407                 PREFETCH" 32%1\n\t"
408                 "movd   %1, %%mm0\n\t"
409                 "movd   3%1, %%mm3\n\t"
410                 "punpckldq 6%1, %%mm0\n\t"
411                 "punpckldq 9%1, %%mm3\n\t"
412                 "movq   %%mm0, %%mm1\n\t"
413                 "movq   %%mm0, %%mm2\n\t"
414                 "movq   %%mm3, %%mm4\n\t"
415                 "movq   %%mm3, %%mm5\n\t"
416                 "psrlq  $3, %%mm0\n\t"
417                 "psrlq  $3, %%mm3\n\t"
418                 "pand   %2, %%mm0\n\t"
419                 "pand   %2, %%mm3\n\t"
420                 "psrlq  $5, %%mm1\n\t"
421                 "psrlq  $5, %%mm4\n\t"
422                 "pand   %%mm6, %%mm1\n\t"
423                 "pand   %%mm6, %%mm4\n\t"
424                 "psrlq  $8, %%mm2\n\t"
425                 "psrlq  $8, %%mm5\n\t"
426                 "pand   %%mm7, %%mm2\n\t"
427                 "pand   %%mm7, %%mm5\n\t"
428                 "por    %%mm1, %%mm0\n\t"
429                 "por    %%mm4, %%mm3\n\t"
430                 "por    %%mm2, %%mm0\n\t"
431                 "por    %%mm5, %%mm3\n\t"
432                 "psllq  $16, %%mm3\n\t"
433                 "por    %%mm3, %%mm0\n\t"
434                 MOVNTQ" %%mm0, %0\n\t"
435                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
436                 d += 4;
437                 s += 12;
438         }
439         while(s < end)
440         {
441                 const int b= *s++;
442                 const int g= *s++;
443                 const int r= *s++;
444                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
445         }
446         __asm __volatile(SFENCE:::"memory");
447         __asm __volatile(EMMS:::"memory");
448 #else
449         unsigned j,i,num_pixels=src_size/3;
450         uint16_t *d = (uint16_t *)dst;
451         for(i=0,j=0; j<num_pixels; i+=3,j++)
452         {
453                 const int b= src[i+0];
454                 const int g= src[i+1];
455                 const int r= src[i+2];
456
457                 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
458         }
459 #endif
460 }
461
462 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
463 {
464 #ifdef HAVE_MMX
465         const uint8_t *s = src;
466         const uint8_t *end,*mm_end;
467         uint16_t *d = (uint16_t *)dst;
468         end = s + src_size;
469         mm_end = end -11;
470         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
471         __asm __volatile(
472             "movq       %0, %%mm7\n\t"
473             "movq       %1, %%mm6\n\t"
474             ::"m"(red_15mask),"m"(green_15mask));
475         while(s < mm_end)
476         {
477             __asm __volatile(
478                 PREFETCH" 32%1\n\t"
479                 "movd   %1, %%mm0\n\t"
480                 "movd   3%1, %%mm3\n\t"
481                 "punpckldq 6%1, %%mm0\n\t"
482                 "punpckldq 9%1, %%mm3\n\t"
483                 "movq   %%mm0, %%mm1\n\t"
484                 "movq   %%mm0, %%mm2\n\t"
485                 "movq   %%mm3, %%mm4\n\t"
486                 "movq   %%mm3, %%mm5\n\t"
487                 "psrlq  $3, %%mm0\n\t"
488                 "psrlq  $3, %%mm3\n\t"
489                 "pand   %2, %%mm0\n\t"
490                 "pand   %2, %%mm3\n\t"
491                 "psrlq  $6, %%mm1\n\t"
492                 "psrlq  $6, %%mm4\n\t"
493                 "pand   %%mm6, %%mm1\n\t"
494                 "pand   %%mm6, %%mm4\n\t"
495                 "psrlq  $9, %%mm2\n\t"
496                 "psrlq  $9, %%mm5\n\t"
497                 "pand   %%mm7, %%mm2\n\t"
498                 "pand   %%mm7, %%mm5\n\t"
499                 "por    %%mm1, %%mm0\n\t"
500                 "por    %%mm4, %%mm3\n\t"
501                 "por    %%mm2, %%mm0\n\t"
502                 "por    %%mm5, %%mm3\n\t"
503                 "psllq  $16, %%mm3\n\t"
504                 "por    %%mm3, %%mm0\n\t"
505                 MOVNTQ" %%mm0, %0\n\t"
506                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
507                 d += 4;
508                 s += 12;
509         }
510         while(s < end)
511         {
512                 const int b= *s++;
513                 const int g= *s++;
514                 const int r= *s++;
515                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
516         }
517         __asm __volatile(SFENCE:::"memory");
518         __asm __volatile(EMMS:::"memory");
519 #else
520         unsigned j,i,num_pixels=src_size/3;
521         uint16_t *d = (uint16_t *)dst;
522         for(i=0,j=0; j<num_pixels; i+=3,j++)
523         {
524                 const int b= src[i+0];
525                 const int g= src[i+1];
526                 const int r= src[i+2];
527
528                 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
529         }
530 #endif
531 }
532
533 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
534 {
535 #ifdef HAVE_MMX
536         asm volatile (
537                 "xorl %%eax, %%eax              \n\t"
538                 ".balign 16                     \n\t"
539                 "1:                             \n\t"
540                 PREFETCH" 32(%0, %%eax)         \n\t"
541                 "movq (%0, %%eax), %%mm0        \n\t"
542                 "movq %%mm0, %%mm1              \n\t"
543                 "movq %%mm0, %%mm2              \n\t"
544                 "pslld $16, %%mm0               \n\t"
545                 "psrld $16, %%mm1               \n\t"
546                 "pand "MANGLE(mask32r)", %%mm0          \n\t"
547                 "pand "MANGLE(mask32g)", %%mm2          \n\t"
548                 "pand "MANGLE(mask32b)", %%mm1          \n\t"
549                 "por %%mm0, %%mm2               \n\t"
550                 "por %%mm1, %%mm2               \n\t"
551                 MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
552                 "addl $8, %%eax                 \n\t"
553                 "cmpl %2, %%eax                 \n\t"
554                 " jb 1b                         \n\t"
555                 :: "r" (src), "r"(dst), "r" (src_size)
556                 : "%eax"
557         );
558
559         __asm __volatile(SFENCE:::"memory");
560         __asm __volatile(EMMS:::"memory");
561 #else
562         int i;
563         int num_pixels= src_size >> 2;
564         for(i=0; i<num_pixels; i++)
565         {
566                 dst[4*i + 0] = src[4*i + 2];
567                 dst[4*i + 1] = src[4*i + 1];
568                 dst[4*i + 2] = src[4*i + 0];
569         }
570 #endif
571 }
572
573 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
574 {
575         int i;
576 #ifdef HAVE_MMX
577         int mmx_size= 23 - src_size;
578         asm volatile (
579                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
580                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
581                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
582                 ".balign 16                     \n\t"
583                 "1:                             \n\t"
584                 PREFETCH" 32(%1, %%eax)         \n\t"
585                 "movq   (%1, %%eax), %%mm0      \n\t" // BGR BGR BG
586                 "movq   (%1, %%eax), %%mm1      \n\t" // BGR BGR BG
587                 "movq  2(%1, %%eax), %%mm2      \n\t" // R BGR BGR B
588                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
589                 "pand %%mm5, %%mm0              \n\t"
590                 "pand %%mm6, %%mm1              \n\t"
591                 "pand %%mm7, %%mm2              \n\t"
592                 "por %%mm0, %%mm1               \n\t"
593                 "por %%mm2, %%mm1               \n\t"                
594                 "movq  6(%1, %%eax), %%mm0      \n\t" // BGR BGR BG
595                 MOVNTQ" %%mm1,   (%2, %%eax)    \n\t" // RGB RGB RG
596                 "movq  8(%1, %%eax), %%mm1      \n\t" // R BGR BGR B
597                 "movq 10(%1, %%eax), %%mm2      \n\t" // GR BGR BGR
598                 "pand %%mm7, %%mm0              \n\t"
599                 "pand %%mm5, %%mm1              \n\t"
600                 "pand %%mm6, %%mm2              \n\t"
601                 "por %%mm0, %%mm1               \n\t"
602                 "por %%mm2, %%mm1               \n\t"                
603                 "movq 14(%1, %%eax), %%mm0      \n\t" // R BGR BGR B
604                 MOVNTQ" %%mm1,  8(%2, %%eax)    \n\t" // B RGB RGB R
605                 "movq 16(%1, %%eax), %%mm1      \n\t" // GR BGR BGR
606                 "movq 18(%1, %%eax), %%mm2      \n\t" // BGR BGR BG
607                 "pand %%mm6, %%mm0              \n\t"
608                 "pand %%mm7, %%mm1              \n\t"
609                 "pand %%mm5, %%mm2              \n\t"
610                 "por %%mm0, %%mm1               \n\t"
611                 "por %%mm2, %%mm1               \n\t"                
612                 MOVNTQ" %%mm1, 16(%2, %%eax)    \n\t"
613                 "addl $24, %%eax                \n\t"
614                 " js 1b                         \n\t"
615                 : "+a" (mmx_size)
616                 : "r" (src-mmx_size), "r"(dst-mmx_size)
617         );
618
619         __asm __volatile(SFENCE:::"memory");
620         __asm __volatile(EMMS:::"memory");
621
622         if(mmx_size==23) return; //finihsed, was multiple of 8
623         src+= src_size;
624         dst+= src_size;
625         src_size= 23 - mmx_size;
626         src-= src_size;
627         dst-= src_size;
628 #endif
629         for(i=0; i<src_size; i+=3)
630         {
631                 register int x;
632                 x          = src[i + 2];
633                 dst[i + 1] = src[i + 1];
634                 dst[i + 2] = src[i + 0];
635                 dst[i + 0] = x;
636         }
637 }
638
639 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
640         unsigned int width, unsigned int height,
641         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
642 {
643         int y;
644         const int chromWidth= width>>1;
645         for(y=0; y<height; y++)
646         {
647 #ifdef HAVE_MMX
648 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
649                 asm volatile(
650                         "xorl %%eax, %%eax              \n\t"
651                         ".balign 16                     \n\t"
652                         "1:                             \n\t"
653                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
654                         PREFETCH" 32(%2, %%eax)         \n\t"
655                         PREFETCH" 32(%3, %%eax)         \n\t"
656                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
657                         "movq %%mm0, %%mm2              \n\t" // U(0)
658                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
659                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
660                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
661
662                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
663                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
664                         "movq %%mm3, %%mm4              \n\t" // Y(0)
665                         "movq %%mm5, %%mm6              \n\t" // Y(8)
666                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
667                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
668                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
669                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
670
671                         MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
672                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
673                         MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
674                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
675
676                         "addl $8, %%eax                 \n\t"
677                         "cmpl %4, %%eax                 \n\t"
678                         " jb 1b                         \n\t"
679                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
680                         : "%eax"
681                 );
682 #else
683                 int i;
684                 for(i=0; i<chromWidth; i++)
685                 {
686                         dst[4*i+0] = ysrc[2*i+0];
687                         dst[4*i+1] = usrc[i];
688                         dst[4*i+2] = ysrc[2*i+1];
689                         dst[4*i+3] = vsrc[i];
690                 }
691 #endif
692                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
693                 {
694                         usrc += chromStride;
695                         vsrc += chromStride;
696                 }
697                 ysrc += lumStride;
698                 dst += dstStride;
699         }
700 #ifdef HAVE_MMX
701 asm(    EMMS" \n\t"
702         SFENCE" \n\t"
703         :::"memory");
704 #endif
705 }
706
707 /**
708  *
709  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
710  * problem for anyone then tell me, and ill fix it)
711  */
712 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
713         unsigned int width, unsigned int height,
714         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
715 {
716         //FIXME interpolate chroma
717         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
718 }
719
720 /**
721  *
722  * width should be a multiple of 16
723  */
724 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
725         unsigned int width, unsigned int height,
726         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
727 {
728         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
729 }
730
731 /**
732  *
733  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
734  * problem for anyone then tell me, and ill fix it)
735  */
736 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
737         unsigned int width, unsigned int height,
738         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
739 {
740         int y;
741         const int chromWidth= width>>1;
742         for(y=0; y<height; y+=2)
743         {
744 #ifdef HAVE_MMX
745                 asm volatile(
746                         "xorl %%eax, %%eax              \n\t"
747                         "pcmpeqw %%mm7, %%mm7           \n\t"
748                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
749                         ".balign 16                     \n\t"
750                         "1:                             \n\t"
751                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
752                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
753                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
754                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
755                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
756                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
757                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
758                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
759                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
760                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
761                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
762
763                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
764
765                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
766                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
767                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
768                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
769                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
770                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
771                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
772                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
773                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
774                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
775
776                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
777
778                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
779                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
780                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
781                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
782                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
783                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
784                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
785                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
786
787                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
788                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
789
790                         "addl $8, %%eax                 \n\t"
791                         "cmpl %4, %%eax                 \n\t"
792                         " jb 1b                         \n\t"
793                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
794                         : "memory", "%eax"
795                 );
796
797                 ydst += lumStride;
798                 src  += srcStride;
799
800                 asm volatile(
801                         "xorl %%eax, %%eax              \n\t"
802                         ".balign 16                     \n\t"
803                         "1:                             \n\t"
804                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
805                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
806                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
807                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
808                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
809                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
810                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
811                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
812                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
813                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
814                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
815
816                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
817                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
818
819                         "addl $8, %%eax                 \n\t"
820                         "cmpl %4, %%eax                 \n\t"
821                         " jb 1b                         \n\t"
822
823                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
824                         : "memory", "%eax"
825                 );
826 #else
827                 int i;
828                 for(i=0; i<chromWidth; i++)
829                 {
830                         ydst[2*i+0]     = src[4*i+0];
831                         udst[i]         = src[4*i+1];
832                         ydst[2*i+1]     = src[4*i+2];
833                         vdst[i]         = src[4*i+3];
834                 }
835                 ydst += lumStride;
836                 src  += srcStride;
837
838                 for(i=0; i<chromWidth; i++)
839                 {
840                         ydst[2*i+0]     = src[4*i+0];
841                         ydst[2*i+1]     = src[4*i+2];
842                 }
843 #endif
844                 udst += chromStride;
845                 vdst += chromStride;
846                 ydst += lumStride;
847                 src  += srcStride;
848         }
849 #ifdef HAVE_MMX
850 asm volatile(   EMMS" \n\t"
851                 SFENCE" \n\t"
852                 :::"memory");
853 #endif
854 }
855
856 /**
857  * Converts packed UYVY data (byte order U Y V Y for every pair of pixels) into three
     * separate planes: luma goes to ydst, chroma to udst and vdst.
     *
     * Source lines are consumed in pairs. The first line of a pair supplies Y, U and V,
     * the second line supplies Y only. src advances by srcStride and ydst by lumStride
     * after every line; udst and vdst advance by chromStride after every line pair.
     *
     * The MMX loops handle 8 chroma samples (16 pixels, 32 source bytes) per iteration
     * and test their counter after the body, hence the width requirement below.
     *
858  * Height should be a multiple of 2 and width should be a multiple of 16 (if this is a
859  * problem for anyone then tell me, and I'll fix it).
860  * Chrominance data is only taken from every second line, the others are ignored. FIXME write HQ version
861  */
862 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
863         unsigned int width, unsigned int height,
864         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
865 {
866         int y;
867         const int chromWidth= width>>1; // one U and one V sample for every two pixels
868         for(y=0; y<height; y+=2) // each iteration converts one pair of source lines
869         {
870 #ifdef HAVE_MMX
871                 asm volatile( // first line of the pair: split into Y, U and V
872                         "xorl %%eax, %%eax              \n\t" // eax = chroma sample index
873                         "pcmpeqw %%mm7, %%mm7           \n\t" // mm7 = all ones
874                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
875                         ".balign 16                     \n\t"
876                         "1:                             \n\t"
877                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
878                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
879                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
880                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
881                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
882                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
883                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
884                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
885                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
886                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
887                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
888
889                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t" // store Y for pixels 0..7
890
891                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
892                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
893                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
894                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
895                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
896                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
897                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
898                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
899                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
900                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
901
902                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t" // store Y for pixels 8..15
903
904                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
905                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
906                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
907                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
908                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
909                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
910                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
911                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
912
913                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t" // store 8 V samples
914                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t" // store 8 U samples
915
916                         "addl $8, %%eax                 \n\t" // 8 chroma samples (16 pixels) done
917                         "cmpl %4, %%eax                 \n\t"
918                         " jb 1b                         \n\t" // repeat while eax < chromWidth (tested after the body)
919                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
920                         : "memory", "%eax"
921                 );
922
923                 ydst += lumStride;
924                 src  += srcStride;
925
926                 asm volatile( // second line of the pair: only Y is extracted, %2 and %3 are not referenced
927                         "xorl %%eax, %%eax              \n\t" // eax = chroma sample index
928                         ".balign 16                     \n\t"
929                         "1:                             \n\t"
930                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
931                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
932                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
933                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(8)
934                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // UYVY UYVY(12)
935                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
936                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
937                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
938                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
939                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
940                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
941
942                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
943                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
944
945                         "addl $8, %%eax                 \n\t"
946                         "cmpl %4, %%eax                 \n\t"
947                         " jb 1b                         \n\t" // repeat while eax < chromWidth (tested after the body)
948
949                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
950                         : "memory", "%eax"
951                 );
952 #else
953                 int i;
954                 for(i=0; i<chromWidth; i++) // first line of the pair: Y, U and V
955                 {
956                         udst[i]         = src[4*i+0];
957                         ydst[2*i+0]     = src[4*i+1];
958                         vdst[i]         = src[4*i+2];
959                         ydst[2*i+1]     = src[4*i+3];
960                 }
961                 ydst += lumStride;
962                 src  += srcStride;
963
964                 for(i=0; i<chromWidth; i++) // second line of the pair: Y only, its chroma bytes are skipped
965                 {
966                         ydst[2*i+0]     = src[4*i+1];
967                         ydst[2*i+1]     = src[4*i+3];
968                 }
969 #endif
970                 udst += chromStride;
971                 vdst += chromStride;
972                 ydst += lumStride; // step past the second line of the pair
973                 src  += srcStride;
974         }
975 #ifdef HAVE_MMX
976 asm volatile(   EMMS" \n\t" // leave MMX state once all lines are done
977                 SFENCE" \n\t" // "sfence" when MOVNTQ is a non-temporal store (MMX2), otherwise an asm comment
978                 :::"memory");
979 #endif
980 }
981
982 /**
983  *
984  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
985  * problem for anyone then tell me, and ill fix it)
986  * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
987  */
988 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
989         unsigned int width, unsigned int height,
990         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
991 {
992         int y;
993         const int chromWidth= width>>1;
994 #ifdef HAVE_MMX
995         for(y=0; y<height-2; y+=2)
996         {
997                 int i;
998                 for(i=0; i<2; i++)
999                 {
1000                         asm volatile(
1001                                 "movl %2, %%eax                 \n\t"
1002                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1003                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1004                                 "pxor %%mm7, %%mm7              \n\t"
1005                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1006                                 ".balign 16                     \n\t"
1007                                 "1:                             \n\t"
1008                                 PREFETCH" 64(%0, %%ebx)         \n\t"
1009                                 "movd (%0, %%ebx), %%mm0        \n\t"
1010                                 "movd 3(%0, %%ebx), %%mm1       \n\t"
1011                                 "punpcklbw %%mm7, %%mm0         \n\t"
1012                                 "punpcklbw %%mm7, %%mm1         \n\t"
1013                                 "movd 6(%0, %%ebx), %%mm2       \n\t"
1014                                 "movd 9(%0, %%ebx), %%mm3       \n\t"
1015                                 "punpcklbw %%mm7, %%mm2         \n\t"
1016                                 "punpcklbw %%mm7, %%mm3         \n\t"
1017                                 "pmaddwd %%mm6, %%mm0           \n\t"
1018                                 "pmaddwd %%mm6, %%mm1           \n\t"
1019                                 "pmaddwd %%mm6, %%mm2           \n\t"
1020                                 "pmaddwd %%mm6, %%mm3           \n\t"
1021 #ifndef FAST_BGR2YV12
1022                                 "psrad $8, %%mm0                \n\t"
1023                                 "psrad $8, %%mm1                \n\t"
1024                                 "psrad $8, %%mm2                \n\t"
1025                                 "psrad $8, %%mm3                \n\t"
1026 #endif
1027                                 "packssdw %%mm1, %%mm0          \n\t"
1028                                 "packssdw %%mm3, %%mm2          \n\t"
1029                                 "pmaddwd %%mm5, %%mm0           \n\t"
1030                                 "pmaddwd %%mm5, %%mm2           \n\t"
1031                                 "packssdw %%mm2, %%mm0          \n\t"
1032                                 "psraw $7, %%mm0                \n\t"
1033
1034                                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1035                                 "movd 15(%0, %%ebx), %%mm1      \n\t"
1036                                 "punpcklbw %%mm7, %%mm4         \n\t"
1037                                 "punpcklbw %%mm7, %%mm1         \n\t"
1038                                 "movd 18(%0, %%ebx), %%mm2      \n\t"
1039                                 "movd 21(%0, %%ebx), %%mm3      \n\t"
1040                                 "punpcklbw %%mm7, %%mm2         \n\t"
1041                                 "punpcklbw %%mm7, %%mm3         \n\t"
1042                                 "pmaddwd %%mm6, %%mm4           \n\t"
1043                                 "pmaddwd %%mm6, %%mm1           \n\t"
1044                                 "pmaddwd %%mm6, %%mm2           \n\t"
1045                                 "pmaddwd %%mm6, %%mm3           \n\t"
1046 #ifndef FAST_BGR2YV12
1047                                 "psrad $8, %%mm4                \n\t"
1048                                 "psrad $8, %%mm1                \n\t"
1049                                 "psrad $8, %%mm2                \n\t"
1050                                 "psrad $8, %%mm3                \n\t"
1051 #endif
1052                                 "packssdw %%mm1, %%mm4          \n\t"
1053                                 "packssdw %%mm3, %%mm2          \n\t"
1054                                 "pmaddwd %%mm5, %%mm4           \n\t"
1055                                 "pmaddwd %%mm5, %%mm2           \n\t"
1056                                 "addl $24, %%ebx                \n\t"
1057                                 "packssdw %%mm2, %%mm4          \n\t"
1058                                 "psraw $7, %%mm4                \n\t"
1059
1060                                 "packuswb %%mm4, %%mm0          \n\t"
1061                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1062
1063                                 MOVNTQ" %%mm0, (%1, %%eax)      \n\t"
1064                                 "addl $8, %%eax                 \n\t"
1065                                 " js 1b                         \n\t"
1066                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
1067                                 : "%eax", "%ebx"
1068                         );
1069                         ydst += lumStride;
1070                         src  += srcStride;
1071                 }
1072                 src -= srcStride*2;
1073                 asm volatile(
1074                         "movl %4, %%eax                 \n\t"
1075                         "movq "MANGLE(w1111)", %%mm5            \n\t"
1076                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1077                         "pxor %%mm7, %%mm7              \n\t"
1078                         "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1079                         "addl %%ebx, %%ebx              \n\t"
1080                         ".balign 16                     \n\t"
1081                         "1:                             \n\t"
1082                         PREFETCH" 64(%0, %%ebx)         \n\t"
1083                         PREFETCH" 64(%1, %%ebx)         \n\t"
1084 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1085                         "movq (%0, %%ebx), %%mm0        \n\t"
1086                         "movq (%1, %%ebx), %%mm1        \n\t"
1087                         "movq 6(%0, %%ebx), %%mm2       \n\t"
1088                         "movq 6(%1, %%ebx), %%mm3       \n\t"
1089                         PAVGB" %%mm1, %%mm0             \n\t"
1090                         PAVGB" %%mm3, %%mm2             \n\t"
1091                         "movq %%mm0, %%mm1              \n\t"
1092                         "movq %%mm2, %%mm3              \n\t"
1093                         "psrlq $24, %%mm0               \n\t"
1094                         "psrlq $24, %%mm2               \n\t"
1095                         PAVGB" %%mm1, %%mm0             \n\t"
1096                         PAVGB" %%mm3, %%mm2             \n\t"
1097                         "punpcklbw %%mm7, %%mm0         \n\t"
1098                         "punpcklbw %%mm7, %%mm2         \n\t"
1099 #else
1100                         "movd (%0, %%ebx), %%mm0        \n\t"
1101                         "movd (%1, %%ebx), %%mm1        \n\t"
1102                         "movd 3(%0, %%ebx), %%mm2       \n\t"
1103                         "movd 3(%1, %%ebx), %%mm3       \n\t"
1104                         "punpcklbw %%mm7, %%mm0         \n\t"
1105                         "punpcklbw %%mm7, %%mm1         \n\t"
1106                         "punpcklbw %%mm7, %%mm2         \n\t"
1107                         "punpcklbw %%mm7, %%mm3         \n\t"
1108                         "paddw %%mm1, %%mm0             \n\t"
1109                         "paddw %%mm3, %%mm2             \n\t"
1110                         "paddw %%mm2, %%mm0             \n\t"
1111                         "movd 6(%0, %%ebx), %%mm4       \n\t"
1112                         "movd 6(%1, %%ebx), %%mm1       \n\t"
1113                         "movd 9(%0, %%ebx), %%mm2       \n\t"
1114                         "movd 9(%1, %%ebx), %%mm3       \n\t"
1115                         "punpcklbw %%mm7, %%mm4         \n\t"
1116                         "punpcklbw %%mm7, %%mm1         \n\t"
1117                         "punpcklbw %%mm7, %%mm2         \n\t"
1118                         "punpcklbw %%mm7, %%mm3         \n\t"
1119                         "paddw %%mm1, %%mm4             \n\t"
1120                         "paddw %%mm3, %%mm2             \n\t"
1121                         "paddw %%mm4, %%mm2             \n\t"
1122                         "psrlw $2, %%mm0                \n\t"
1123                         "psrlw $2, %%mm2                \n\t"
1124 #endif
1125                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1126                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1127
1128                         "pmaddwd %%mm0, %%mm1           \n\t"
1129                         "pmaddwd %%mm2, %%mm3           \n\t"
1130                         "pmaddwd %%mm6, %%mm0           \n\t"
1131                         "pmaddwd %%mm6, %%mm2           \n\t"
1132 #ifndef FAST_BGR2YV12
1133                         "psrad $8, %%mm0                \n\t"
1134                         "psrad $8, %%mm1                \n\t"
1135                         "psrad $8, %%mm2                \n\t"
1136                         "psrad $8, %%mm3                \n\t"
1137 #endif
1138                         "packssdw %%mm2, %%mm0          \n\t"
1139                         "packssdw %%mm3, %%mm1          \n\t"
1140                         "pmaddwd %%mm5, %%mm0           \n\t"
1141                         "pmaddwd %%mm5, %%mm1           \n\t"
1142                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1143                         "psraw $7, %%mm0                \n\t"
1144
1145 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1146                         "movq 12(%0, %%ebx), %%mm4      \n\t"
1147                         "movq 12(%1, %%ebx), %%mm1      \n\t"
1148                         "movq 18(%0, %%ebx), %%mm2      \n\t"
1149                         "movq 18(%1, %%ebx), %%mm3      \n\t"
1150                         PAVGB" %%mm1, %%mm4             \n\t"
1151                         PAVGB" %%mm3, %%mm2             \n\t"
1152                         "movq %%mm4, %%mm1              \n\t"
1153                         "movq %%mm2, %%mm3              \n\t"
1154                         "psrlq $24, %%mm4               \n\t"
1155                         "psrlq $24, %%mm2               \n\t"
1156                         PAVGB" %%mm1, %%mm4             \n\t"
1157                         PAVGB" %%mm3, %%mm2             \n\t"
1158                         "punpcklbw %%mm7, %%mm4         \n\t"
1159                         "punpcklbw %%mm7, %%mm2         \n\t"
1160 #else
1161                         "movd 12(%0, %%ebx), %%mm4      \n\t"
1162                         "movd 12(%1, %%ebx), %%mm1      \n\t"
1163                         "movd 15(%0, %%ebx), %%mm2      \n\t"
1164                         "movd 15(%1, %%ebx), %%mm3      \n\t"
1165                         "punpcklbw %%mm7, %%mm4         \n\t"
1166                         "punpcklbw %%mm7, %%mm1         \n\t"
1167                         "punpcklbw %%mm7, %%mm2         \n\t"
1168                         "punpcklbw %%mm7, %%mm3         \n\t"
1169                         "paddw %%mm1, %%mm4             \n\t"
1170                         "paddw %%mm3, %%mm2             \n\t"
1171                         "paddw %%mm2, %%mm4             \n\t"
1172                         "movd 18(%0, %%ebx), %%mm5      \n\t"
1173                         "movd 18(%1, %%ebx), %%mm1      \n\t"
1174                         "movd 21(%0, %%ebx), %%mm2      \n\t"
1175                         "movd 21(%1, %%ebx), %%mm3      \n\t"
1176                         "punpcklbw %%mm7, %%mm5         \n\t"
1177                         "punpcklbw %%mm7, %%mm1         \n\t"
1178                         "punpcklbw %%mm7, %%mm2         \n\t"
1179                         "punpcklbw %%mm7, %%mm3         \n\t"
1180                         "paddw %%mm1, %%mm5             \n\t"
1181                         "paddw %%mm3, %%mm2             \n\t"
1182                         "paddw %%mm5, %%mm2             \n\t"
1183                         "movq "MANGLE(w1111)", %%mm5            \n\t"
1184                         "psrlw $2, %%mm4                \n\t"
1185                         "psrlw $2, %%mm2                \n\t"
1186 #endif
1187                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1188                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1189
1190                         "pmaddwd %%mm4, %%mm1           \n\t"
1191                         "pmaddwd %%mm2, %%mm3           \n\t"
1192                         "pmaddwd %%mm6, %%mm4           \n\t"
1193                         "pmaddwd %%mm6, %%mm2           \n\t"
1194 #ifndef FAST_BGR2YV12
1195                         "psrad $8, %%mm4                \n\t"
1196                         "psrad $8, %%mm1                \n\t"
1197                         "psrad $8, %%mm2                \n\t"
1198                         "psrad $8, %%mm3                \n\t"
1199 #endif
1200                         "packssdw %%mm2, %%mm4          \n\t"
1201                         "packssdw %%mm3, %%mm1          \n\t"
1202                         "pmaddwd %%mm5, %%mm4           \n\t"
1203                         "pmaddwd %%mm5, %%mm1           \n\t"
1204                         "addl $24, %%ebx                \n\t"
1205                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1206                         "psraw $7, %%mm4                \n\t"
1207
1208                         "movq %%mm0, %%mm1              \n\t"
1209                         "punpckldq %%mm4, %%mm0         \n\t"
1210                         "punpckhdq %%mm4, %%mm1         \n\t"
1211                         "packsswb %%mm1, %%mm0          \n\t"
1212                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
1213
1214                         "movd %%mm0, (%2, %%eax)        \n\t"
1215                         "punpckhdq %%mm0, %%mm0         \n\t"
1216                         "movd %%mm0, (%3, %%eax)        \n\t"
1217                         "addl $4, %%eax                 \n\t"
1218                         " js 1b                         \n\t"
1219                         : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
1220                         : "%eax", "%ebx"
1221                 );
1222
1223                 udst += chromStride;
1224                 vdst += chromStride;
1225                 src  += srcStride*2;
1226         }
1227
1228         asm volatile(   EMMS" \n\t"
1229                         SFENCE" \n\t"
1230                         :::"memory");
1231 #else
1232         y=0;
1233 #endif
1234         for(; y<height; y+=2)
1235         {
1236                 int i;
1237                 for(i=0; i<chromWidth; i++)
1238                 {
1239                         unsigned int b= src[6*i+0];
1240                         unsigned int g= src[6*i+1];
1241                         unsigned int r= src[6*i+2];
1242
1243                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1244                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
1245                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
1246
1247                         udst[i]         = U;
1248                         vdst[i]         = V;
1249                         ydst[2*i]       = Y;
1250
1251                         b= src[6*i+3];
1252                         g= src[6*i+4];
1253                         r= src[6*i+5];
1254
1255                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1256                         ydst[2*i+1]     = Y;
1257                 }
1258                 ydst += lumStride;
1259                 src  += srcStride;
1260
1261                 for(i=0; i<chromWidth; i++)
1262                 {
1263                         unsigned int b= src[6*i+0];
1264                         unsigned int g= src[6*i+1];
1265                         unsigned int r= src[6*i+2];
1266
1267                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1268
1269                         ydst[2*i]       = Y;
1270
1271                         b= src[6*i+3];
1272                         g= src[6*i+4];
1273                         r= src[6*i+5];
1274
1275                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1276                         ydst[2*i+1]     = Y;
1277                 }
1278                 udst += chromStride;
1279                 vdst += chromStride;
1280                 ydst += lumStride;
1281                 src  += srcStride;
1282         }
1283 }
1284
1285 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
1286                             int width, int height, int src1Stride, int src2Stride, int dstStride){
1287         int h;
1288
1289         for(h=0; h < height; h++)
1290         {
1291                 int w;
1292
1293 #ifdef HAVE_MMX
1294 #ifdef HAVE_SSE2
1295                 asm(
1296                         "xorl %%eax, %%eax              \n\t"
1297                         "1:                             \n\t"
1298                         PREFETCH" 64(%1, %%eax)         \n\t"
1299                         PREFETCH" 64(%2, %%eax)         \n\t"
1300                         "movdqa (%1, %%eax), %%xmm0     \n\t"
1301                         "movdqa (%1, %%eax), %%xmm1     \n\t"
1302                         "movdqa (%2, %%eax), %%xmm2     \n\t"
1303                         "punpcklbw %%xmm2, %%xmm0       \n\t"
1304                         "punpckhbw %%xmm2, %%xmm1       \n\t"
1305                         "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
1306                         "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
1307                         "addl $16, %%eax                        \n\t"
1308                         "cmpl %3, %%eax                 \n\t"
1309                         " jb 1b                         \n\t"
1310                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
1311                         : "memory", "%eax"
1312                 );
1313 #else
1314                 asm(
1315                         "xorl %%eax, %%eax              \n\t"
1316                         "1:                             \n\t"
1317                         PREFETCH" 64(%1, %%eax)         \n\t"
1318                         PREFETCH" 64(%2, %%eax)         \n\t"
1319                         "movq (%1, %%eax), %%mm0        \n\t"
1320                         "movq 8(%1, %%eax), %%mm2       \n\t"
1321                         "movq %%mm0, %%mm1              \n\t"
1322                         "movq %%mm2, %%mm3              \n\t"
1323                         "movq (%2, %%eax), %%mm4        \n\t"
1324                         "movq 8(%2, %%eax), %%mm5       \n\t"
1325                         "punpcklbw %%mm4, %%mm0         \n\t"
1326                         "punpckhbw %%mm4, %%mm1         \n\t"
1327                         "punpcklbw %%mm5, %%mm2         \n\t"
1328                         "punpckhbw %%mm5, %%mm3         \n\t"
1329                         MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
1330                         MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
1331                         MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
1332                         MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
1333                         "addl $16, %%eax                        \n\t"
1334                         "cmpl %3, %%eax                 \n\t"
1335                         " jb 1b                         \n\t"
1336                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
1337                         : "memory", "%eax"
1338                 );
1339 #endif
1340                 for(w= (width&(~15)); w < width; w++)
1341                 {
1342                         dest[2*w+0] = src1[w];
1343                         dest[2*w+1] = src2[w];
1344                 }
1345 #else
1346                 for(w=0; w < width; w++)
1347                 {
1348                         dest[2*w+0] = src1[w];
1349                         dest[2*w+1] = src2[w];
1350                 }
1351 #endif
1352                 dest += dstStride;
1353                 src1 += src1Stride;
1354                 src2 += src2Stride;
1355         }
1356 #ifdef HAVE_MMX
1357         asm(
1358                 EMMS" \n\t"
1359                 SFENCE" \n\t"
1360                 ::: "memory"
1361                 );
1362 #endif
1363 }