]> git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb_template.c
rgb24toyv12 bugfix
[ffmpeg] / postproc / rgb2rgb_template.c
1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB convertor
4  *  pluralize by Software PAL8 to RGB convertor
5  *               Software YUV to YUV convertor
6  *               Software YUV to RGB convertor
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  */
10
/*
 * Per-variant instruction selection for the inline assembly below.
 * Every helper macro is undefined first and then redefined from the HAVE_*
 * feature macros -- presumably so this template can be included more than
 * once with different settings (TODO confirm against the including file).
 */
11 #undef PREFETCH
12 #undef MOVNTQ
13 #undef EMMS
14 #undef SFENCE
15 #undef MMREG_SIZE
16 #undef PREFETCHW
17 #undef PAVGB
18
/* Register width in bytes; the functions below use it to size their SIMD loops. */
19 #ifdef HAVE_SSE2
20 #define MMREG_SIZE 16
21 #else
22 #define MMREG_SIZE 8
23 #endif
24
/*
 * Prefetch and packed-average mnemonics. PAVGB stays undefined when neither
 * HAVE_3DNOW nor HAVE_MMX2 is set.
 * NOTE(review): the "/nop" fallback presumably assembles to nothing because
 * the assembler treats '/' as a comment start -- confirm for the target
 * assembler.
 */
25 #ifdef HAVE_3DNOW
26 #define PREFETCH  "prefetch"
27 #define PREFETCHW "prefetchw"
28 #define PAVGB     "pavgusb"
29 #elif defined ( HAVE_MMX2 )
30 #define PREFETCH "prefetchnta"
31 #define PREFETCHW "prefetcht0"
32 #define PAVGB     "pavgb"
33 #else
34 #define PREFETCH "/nop"
35 #define PREFETCHW "/nop"
36 #endif
37
/* Instruction that leaves MMX state after a block of MMX code. */
38 #ifdef HAVE_3DNOW
39 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
40 #define EMMS     "femms"
41 #else
42 #define EMMS     "emms"
43 #endif
44
/*
 * Store instruction used for the output: a non-temporal store followed by a
 * store fence with MMX2, otherwise a plain movq and the "/nop" placeholder.
 */
45 #ifdef HAVE_MMX2
46 #define MOVNTQ "movntq"
47 #define SFENCE "sfence"
48 #else
49 #define MOVNTQ "movq"
50 #define SFENCE "/nop"
51 #endif
52
/**
 * Expand packed 24-bit pixels to 32-bit pixels.
 *
 * The three colour bytes of every pixel are copied unchanged. The scalar loop
 * stores 0 in the fourth byte; the MMX loop ANDs every output pixel with
 * mask32 (defined outside this template).
 *
 * @param src      source pixels, 3 bytes each
 * @param dst      destination, 4 bytes per pixel; needs (src_size / 3) * 4 bytes
 * @param src_size size of src in bytes; a trailing partial pixel
 *                 (src_size % 3 bytes) is ignored
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  /*
   * One pass converts 8 pixels (24 source bytes -> 32 destination bytes).
   * The last 4-byte load starts at offset 21, so 25 source bytes must be
   * readable. The bound is computed from the bytes that remain, not from
   * the alignment of the end address, so it holds for any src alignment.
   */
  while(end - s >= 25)
  {
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movd   %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd   6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd   12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd   18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand   %%mm7, %%mm0\n\t"
        "pand   %%mm7, %%mm1\n\t"
        "pand   %%mm7, %%mm2\n\t"
        "pand   %%mm7, %%mm3\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm2, 16%0\n\t"
        MOVNTQ" %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar path: whole pixels only, so a ragged src_size never overreads. */
  while(end - s >= 3)
  {
    dest[0] = s[0];
    dest[1] = s[1];
    dest[2] = s[2];
    dest[3] = 0;
    dest += 4;
    s += 3;
  }
}
104
/**
 * Pack 32-bit pixels into 24-bit pixels by dropping the fourth byte of each.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination, 3 bytes per pixel; needs (src_size / 4) * 3 bytes
 * @param src_size size of src in bytes; a trailing partial pixel
 *                 (src_size % 4 bytes) is ignored
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  /*
   * One pass converts 8 pixels: it loads 32 source bytes and stores 24.
   * The bound is taken from the bytes that remain instead of the alignment
   * of the end address, so an unaligned src can no longer be overread.
   */
  while(end - s >= 32)
  {
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movq   %1, %%mm0\n\t"
        "movq   8%1, %%mm1\n\t"
        "movq   16%1, %%mm4\n\t"
        "movq   24%1, %%mm5\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "movq   %%mm4, %%mm6\n\t"
        "movq   %%mm5, %%mm7\n\t"
        "psrlq  $8, %%mm2\n\t"
        "psrlq  $8, %%mm3\n\t"
        "psrlq  $8, %%mm6\n\t"
        "psrlq  $8, %%mm7\n\t"
        "pand   %2, %%mm0\n\t"
        "pand   %2, %%mm1\n\t"
        "pand   %2, %%mm4\n\t"
        "pand   %2, %%mm5\n\t"
        "pand   %3, %%mm2\n\t"
        "pand   %3, %%mm3\n\t"
        "pand   %3, %%mm6\n\t"
        "pand   %3, %%mm7\n\t"
        "por    %%mm2, %%mm0\n\t"
        "por    %%mm3, %%mm1\n\t"
        "por    %%mm6, %%mm4\n\t"
        "por    %%mm7, %%mm5\n\t"

        "movq   %%mm1, %%mm2\n\t"
        "movq   %%mm4, %%mm3\n\t"
        "psllq  $48, %%mm2\n\t"
        "psllq  $32, %%mm3\n\t"
        "pand   %4, %%mm2\n\t"
        "pand   %5, %%mm3\n\t"
        "por    %%mm2, %%mm0\n\t"
        "psrlq  $16, %%mm1\n\t"
        "psrlq  $32, %%mm4\n\t"
        "psllq  $16, %%mm5\n\t"
        "por    %%mm3, %%mm1\n\t"
        "pand   %6, %%mm5\n\t"
        "por    %%mm5, %%mm4\n\t"

        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
    dest += 24;
    s += 32;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar path: whole pixels only, so a ragged src_size never overreads. */
  while(end - s >= 4)
  {
    dest[0] = s[0];
    dest[1] = s[1];
    dest[2] = s[2];
    dest += 3;
    s += 4;
  }
}
181
182 /*
183  Original by Strepto/Astral
184  ported to gcc & bugfixed : A'rpi
185  MMX2, 3DNOW optimization by Nick Kurshev
186  32bit c version, and and&add trick by Michael Niedermayer
187 */
/**
 * Convert 15-bit pixels (0RRRRRGGGGGBBBBB) to 16-bit pixels (RRRRRGGGGGGBBBBB).
 *
 * The red and green fields are moved up by one bit by adding the masked
 * value to the pixel itself; the blue field is unchanged. Pixels are read
 * and written as native-endian 16-bit values.
 *
 * @param src      source pixels, 2 bytes each
 * @param dst      destination, src_size bytes
 * @param src_size size of src in bytes; an odd trailing byte is ignored
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  const uint8_t *s = src;
  const uint8_t *end = s + src_size;
  uint8_t *d = dst;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s));
  __asm __volatile(
        "movq   %0, %%mm4\n\t"
        ::"m"(mask15s));
  /*
   * 16 bytes (8 pixels) per pass. Only full 16-byte groups are handled here
   * so that a src_size that is not a multiple of 16 cannot run past either
   * buffer; the remainder is converted by the scalar loops below.
   * NOTE(review): this path computes x + (x & mask15s) while the scalar code
   * also clears bit 15 first -- results may differ for pixels with bit 15
   * set, depending on mask15s (defined outside this template).
   */
  while(end - s >= 16)
  {
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "pand   %%mm4, %%mm0\n\t"
                "pand   %%mm4, %%mm2\n\t"
                "paddw  %%mm1, %%mm0\n\t"
                "paddw  %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                :"memory");
        s += 16;
        d += 16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /*
   * Two pixels per 32-bit word. Bit 15 of each pixel is cleared before the
   * add, so the low pixel can never carry into the high one; unsigned
   * arithmetic keeps the sum well defined.
   */
  while(end - s >= 4)
  {
        const uint32_t x = *(const uint32_t *)s;
        *(uint32_t *)d = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        s += 4;
        d += 4;
  }
  /* A single left-over pixel when the pixel count is odd. */
  if(end - s >= 2)
  {
        const unsigned x = *(const uint16_t *)s;
        *(uint16_t *)d = (uint16_t)((x&0x7FFF) + (x&0x7FE0));
  }
}
247
/**
 * Convert 32-bit pixels to 16-bit 5-6-5 pixels.
 *
 * For each source pixel, byte 0 supplies the low 5 bits of the result,
 * byte 1 the middle 6 bits and byte 2 the top 5 bits; byte 3 is skipped.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination, written as 16-bit values; needs
 *                 (src_size / 4) * 2 bytes
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /*
         * 4 pixels (16 source bytes) per pass. The bound uses the bytes that
         * remain rather than the alignment of the end address, so an
         * unaligned src cannot be read past its end.
         */
        while(end - s >= 16)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /*
         * Scalar path, also used for what the MMX loop leaves over. The
         * source advances by 4 bytes per pixel so the unused fourth byte is
         * skipped (the former MMX tail advanced by only 3).
         */
        while(end - s >= 4)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
                s += 4;
        }
}
318
/**
 * Convert 32-bit pixels to 15-bit 5-5-5 pixels stored in 16 bits.
 *
 * For each source pixel, byte 0 supplies bits 0-4 of the result, byte 1
 * bits 5-9 and byte 2 bits 10-14; byte 3 is skipped.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination, written as 16-bit values; needs
 *                 (src_size / 4) * 2 bytes
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /*
         * 4 pixels (16 source bytes) per pass. The bound uses the bytes that
         * remain rather than the alignment of the end address, so an
         * unaligned src cannot be read past its end.
         */
        while(end - s >= 16)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /*
         * Scalar path, also used for what the MMX loop leaves over. The
         * source advances by 4 bytes per pixel so the unused fourth byte is
         * skipped (the former MMX tail advanced by only 3).
         */
        while(end - s >= 4)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
                s += 4;
        }
}
389
/**
 * Convert packed 24-bit pixels to 16-bit 5-6-5 pixels.
 *
 * For each source pixel, byte 0 supplies the low 5 bits of the result,
 * byte 1 the middle 6 bits and byte 2 the top 5 bits.
 *
 * @param src      source pixels, 3 bytes each
 * @param dst      destination, written as 16-bit values; needs
 *                 (src_size / 3) * 2 bytes
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /*
         * 4 pixels (12 source bytes) per pass, but the last 4-byte load
         * starts at offset 9, so 13 source bytes must be readable. The bound
         * uses the bytes that remain rather than the alignment of the end
         * address.
         */
        while(end - s >= 13)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path, also used for what the MMX loop leaves over. */
        while(end - s >= 3)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
                s += 3;
        }
}
461
/**
 * Convert packed 24-bit pixels to 15-bit 5-5-5 pixels stored in 16 bits.
 *
 * For each source pixel, byte 0 supplies bits 0-4 of the result, byte 1
 * bits 5-9 and byte 2 bits 10-14.
 *
 * @param src      source pixels, 3 bytes each
 * @param dst      destination, written as 16-bit values; needs
 *                 (src_size / 3) * 2 bytes
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end = s + src_size;
        uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /*
         * 4 pixels (12 source bytes) per pass, but the last 4-byte load
         * starts at offset 9, so 13 source bytes must be readable. The bound
         * uses the bytes that remain rather than the alignment of the end
         * address.
         */
        while(end - s >= 13)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path, also used for what the MMX loop leaves over. */
        while(end - s >= 3)
        {
                const int b= s[0];
                const int g= s[1];
                const int r= s[2];
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
                s += 3;
        }
}
533
/**
 * Swap bytes 0 and 2 of every 32-bit pixel (RGB32 <-> BGR32).
 *
 * The scalar loop copies byte 1, exchanges bytes 0 and 2 and leaves byte 3
 * of the destination untouched. The MMX loop builds each output pixel from
 * the mask32r / mask32g / mask32b constants, which are defined outside this
 * template. src and dst may be the same buffer.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination, 4 bytes per pixel
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        const int num_pixels= src_size >> 2;
        int i= 0;
#ifdef HAVE_MMX
        /* The MMX loop handles pixel pairs (8 bytes); an odd last pixel is left to C. */
        const int mmx_bytes= (num_pixels & ~1) * 4;
        if(mmx_bytes > 0)
        {
                /*
                 * %eax is the byte offset into both buffers, so it advances
                 * by 8 per pass and is compared against a byte count (it was
                 * advanced by 2 and compared against a pixel count before).
                 */
                asm volatile (
                        "xorl %%eax, %%eax              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        PREFETCH" 32(%0, %%eax)         \n\t"
                        "movq (%0, %%eax), %%mm0        \n\t"
                        "movq %%mm0, %%mm1              \n\t"
                        "movq %%mm0, %%mm2              \n\t"
                        "pslld $16, %%mm0               \n\t"
                        "psrld $16, %%mm1               \n\t"
                        "pand mask32r, %%mm0            \n\t"
                        "pand mask32g, %%mm2            \n\t"
                        "pand mask32b, %%mm1            \n\t"
                        "por %%mm0, %%mm2               \n\t"
                        "por %%mm1, %%mm2               \n\t"
                        MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
                        "addl $8, %%eax                 \n\t"
                        "cmpl %2, %%eax                 \n\t"
                        " jb 1b                         \n\t"
                        :: "r" (src), "r"(dst), "r" (mmx_bytes)
                        : "%eax", "memory"
                );
                i= num_pixels & ~1;
        }

        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        for(; i<num_pixels; i++)
        {
                /* Read the whole pixel before writing so src == dst works. */
                const uint8_t c0= src[4*i + 0];
                const uint8_t c1= src[4*i + 1];
                const uint8_t c2= src[4*i + 2];
                dst[4*i + 0] = c2;
                dst[4*i + 1] = c1;
                dst[4*i + 2] = c0;
        }
}
573
/**
 * Interleave planar Y, U and V data into packed YUY2 (byte order Y0 U Y1 V).
 *
 * Each chroma line is used for two consecutive luma lines: the chroma
 * pointers advance only after every odd line. width >> 1 sample pairs are
 * written per line, so an odd last luma column is not converted. The MMX
 * loop handles groups of 16 luma samples and the scalar loop finishes the
 * rest of the line, so width does not have to be a multiple of 16.
 *
 * @param ysrc        luma plane
 * @param usrc        U plane
 * @param vsrc        V plane
 * @param dst         packed output, 2 bytes per luma sample
 * @param width       luma samples per line
 * @param height      number of lines
 * @param lumStride   bytes between luma lines
 * @param chromStride bytes between chroma lines
 * @param dstStride   bytes between output lines
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
        unsigned int y;
        const int chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
                int i= 0;
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
                /* 8 chroma pairs per pass; the loop body runs at least once, hence the guard */
                const int mmxWidth= chromWidth & ~7;
                if(mmxWidth > 0)
                {
                        asm volatile(
                                "xorl %%eax, %%eax              \n\t"
                                ".balign 16                     \n\t"
                                "1:                             \n\t"
                                PREFETCH" 32(%1, %%eax, 2)      \n\t"
                                PREFETCH" 32(%2, %%eax)         \n\t"
                                PREFETCH" 32(%3, %%eax)         \n\t"
                                "movq (%2, %%eax), %%mm0        \n\t" // U(0)
                                "movq %%mm0, %%mm2              \n\t" // U(0)
                                "movq (%3, %%eax), %%mm1        \n\t" // V(0)
                                "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
                                "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)

                                "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
                                "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
                                "movq %%mm3, %%mm4              \n\t" // Y(0)
                                "movq %%mm5, %%mm6              \n\t" // Y(8)
                                "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
                                "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
                                "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
                                "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)

                                MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
                                MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
                                MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
                                MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

                                "addl $8, %%eax                 \n\t"
                                "cmpl %4, %%eax                 \n\t"
                                " jb 1b                         \n\t"
                                ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (mmxWidth)
                                : "%eax", "memory" /* the asm stores through dst */
                        );
                        i= mmxWidth;
                }
#endif
                /* whole line without MMX, otherwise only the columns MMX left over */
                for(; i<chromWidth; i++)
                {
                        dst[4*i+0] = ysrc[2*i+0];
                        dst[4*i+1] = usrc[i];
                        dst[4*i+2] = ysrc[2*i+1];
                        dst[4*i+3] = vsrc[i];
                }
                if(y&1)
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
646
/**
 * Split packed YUY2 (byte order Y0 U Y1 V) into planar Y, U and V.
 *
 * Lines are processed in pairs: the first line of a pair supplies luma and
 * chroma, the second line supplies luma only, so chroma is taken from every
 * other line. When height is odd the last line is treated as the first line
 * of a pair and no further line is read or written. width >> 1 sample pairs
 * are converted per line. The MMX loops handle groups of 16 luma samples and
 * the scalar loops finish the rest of the line, so width does not have to be
 * a multiple of 16.
 *
 * @param src         packed input, 2 bytes per luma sample
 * @param ydst        luma plane
 * @param udst        U plane
 * @param vdst        V plane
 * @param width       luma samples per line
 * @param height      number of lines
 * @param lumStride   bytes between luma lines
 * @param chromStride bytes between chroma lines
 * @param srcStride   bytes between input lines
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
        unsigned int y;
        const int chromWidth= width>>1;
#ifdef HAVE_MMX
        /* 8 chroma pairs per pass; the asm loop bodies run at least once, hence the guards */
        const int mmxWidth= chromWidth & ~7;
#endif
        for(y=0; y<height; y+=2)
        {
                int i= 0;
#ifdef HAVE_MMX
                if(mmxWidth > 0)
                {
                        asm volatile(
                                "xorl %%eax, %%eax              \n\t"
                                "pcmpeqw %%mm7, %%mm7           \n\t"
                                "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                                ".balign 16                     \n\t"
                                "1:                             \n\t"
                                PREFETCH" 64(%0, %%eax, 4)      \n\t"
                                "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
                                "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
                                "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
                                "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
                                "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
                                "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
                                "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
                                "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
                                "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
                                "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)

                                MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"

                                "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
                                "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
                                "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
                                "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
                                "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
                                "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
                                "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
                                "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
                                "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
                                "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)

                                MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"

                                "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
                                "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
                                "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                                "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                                "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
                                "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
                                "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
                                "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)

                                MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
                                MOVNTQ" %%mm2, (%2, %%eax)      \n\t"

                                "addl $8, %%eax                 \n\t"
                                "cmpl %4, %%eax                 \n\t"
                                " jb 1b                         \n\t"
                                ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (mmxWidth)
                                : "memory", "%eax"
                        );
                        i= mmxWidth;
                }
#endif
                /* first line of the pair: luma and chroma */
                for(; i<chromWidth; i++)
                {
                        ydst[2*i+0]     = src[4*i+0];
                        udst[i]         = src[4*i+1];
                        ydst[2*i+1]     = src[4*i+2];
                        vdst[i]         = src[4*i+3];
                }
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;

                /* odd height: there is no second line in the last pair */
                if(height - y < 2)
                        break;

                i= 0;
#ifdef HAVE_MMX
                if(mmxWidth > 0)
                {
                        asm volatile(
                                "xorl %%eax, %%eax              \n\t"
                                /* rebuild the byte mask here instead of relying on
                                   mm7 surviving from the previous asm statement */
                                "pcmpeqw %%mm7, %%mm7           \n\t"
                                "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                                ".balign 16                     \n\t"
                                "1:                             \n\t"
                                PREFETCH" 64(%0, %%eax, 4)      \n\t"
                                "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
                                "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
                                "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
                                "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
                                "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
                                "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
                                "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
                                "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
                                "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
                                "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)

                                MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
                                MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"

                                "addl $8, %%eax                 \n\t"
                                "cmpl %4, %%eax                 \n\t"
                                " jb 1b                         \n\t"

                                ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (mmxWidth)
                                : "memory", "%eax"
                        );
                        i= mmxWidth;
                }
#endif
                /* second line of the pair: luma only, its chroma is discarded */
                for(; i<chromWidth; i++)
                {
                        ydst[2*i+0]     = src[4*i+0];
                        ydst[2*i+1]     = src[4*i+2];
                }
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
771
772 /**
773  *
774  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
775  * problem for anyone then tell me, and I'll fix it)
776  * chrominance data is only taken from every second line; the others are ignored. FIXME: write HQ version
777  */
778 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
779         unsigned int width, unsigned int height,
780         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
781 {
782         int y;
783         const int chromWidth= width>>1;
784         for(y=0; y<height; y+=2)
785         {
786 #ifdef HAVE_MMX
787                 asm volatile(
788                         "xorl %%eax, %%eax              \n\t"
789                         "pcmpeqw %%mm7, %%mm7           \n\t"
790                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
791                         ".balign 16                     \n\t"
792                         "1:                             \n\t"
793                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
794                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
795                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
796                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
797                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
798                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
799                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
800                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
801                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
802                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
803                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
804
805                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
806
807                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
808                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
809                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
810                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
811                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
812                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
813                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
814                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
815                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
816                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
817
818                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
819
820                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
821                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
822                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
823                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
824                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
825                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
826                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
827                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
828
829                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
830                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
831
832                         "addl $8, %%eax                 \n\t"
833                         "cmpl %4, %%eax                 \n\t"
834                         " jb 1b                         \n\t"
835                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
836                         : "memory", "%eax"
837                 );
838
839                 ydst += lumStride;
840                 src  += srcStride;
841
842                 asm volatile(
843                         "xorl %%eax, %%eax              \n\t"
844                         ".balign 16                     \n\t"
845                         "1:                             \n\t"
846                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
847                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
848                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
849                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
850                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
851                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
852                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
853                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
854                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
855                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
856                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
857
858                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
859                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
860
861                         "addl $8, %%eax                 \n\t"
862                         "cmpl %4, %%eax                 \n\t"
863                         " jb 1b                         \n\t"
864
865                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
866                         : "memory", "%eax"
867                 );
868 #else
869                 int i;
870                 for(i=0; i<chromWidth; i++)
871                 {
872                         udst[i]         = src[4*i+0];
873                         ydst[2*i+0]     = src[4*i+1];
874                         vdst[i]         = src[4*i+2];
875                         ydst[2*i+1]     = src[4*i+3];
876                 }
877                 ydst += lumStride;
878                 src  += srcStride;
879
880                 for(i=0; i<chromWidth; i++)
881                 {
882                         ydst[2*i+0]     = src[4*i+1];
883                         ydst[2*i+1]     = src[4*i+3];
884                 }
885 #endif
886                 udst += chromStride;
887                 vdst += chromStride;
888                 ydst += lumStride;
889                 src  += srcStride;
890         }
891 #ifdef HAVE_MMX
892 asm volatile(   EMMS" \n\t"
893                 SFENCE" \n\t"
894                 :::"memory");
895 #endif
896 }
897
898 /**
899  *
900  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
901  * problem for anyone then tell me, and ill fix it)
902  * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
903  */
904 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
905         unsigned int width, unsigned int height,
906         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
907 {
908         int y;
909         const int chromWidth= width>>1;
910         for(y=0; y<height; y+=2)
911         {
912                 int i;
913                 for(i=0; i<chromWidth; i++)
914                 {
915                         unsigned int b= src[6*i+0];
916                         unsigned int g= src[6*i+1];
917                         unsigned int r= src[6*i+2];
918
919                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
920                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
921                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
922
923                         udst[i]         = U;
924                         vdst[i]         = V;
925                         ydst[2*i]       = Y;
926
927                         b= src[6*i+3];
928                         g= src[6*i+4];
929                         r= src[6*i+5];
930
931                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
932                         ydst[2*i+1]     = Y;
933                 }
934                 ydst += lumStride;
935                 src  += srcStride;
936
937                 for(i=0; i<chromWidth; i++)
938                 {
939                         unsigned int b= src[6*i+0];
940                         unsigned int g= src[6*i+1];
941                         unsigned int r= src[6*i+2];
942
943                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
944
945                         ydst[2*i]       = Y;
946
947                         b= src[6*i+3];
948                         g= src[6*i+4];
949                         r= src[6*i+5];
950
951                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
952                         ydst[2*i+1]     = Y;
953                 }
954                 udst += chromStride;
955                 vdst += chromStride;
956                 ydst += lumStride;
957                 src  += srcStride;
958         }
959 }