]> git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb_template.c
fixing RGB32->RGB16 on big endian patch by (Colin Leroy <colin at colino dot net>)
[ffmpeg] / postproc / rgb2rgb_template.c
1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB convertor
4  *  pluralize by Software PAL8 to RGB convertor
5  *               Software YUV to YUV convertor
6  *               Software YUV to RGB convertor
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  */
10
11 #include <stddef.h>
12 #include <inttypes.h> /* for __WORDSIZE */
13
/* Fall back to MP_WORDSIZE (defined outside this file) when the system
   headers did not provide __WORDSIZE. */
14 #ifndef __WORDSIZE
15 // #warning You have misconfigured system and probably will lose performance!
16 #define __WORDSIZE MP_WORDSIZE
17 #endif
18
/* Drop any earlier definitions of the per-CPU helper macros so that they can
   be defined afresh below.
   NOTE(review): presumably needed because this template is included several
   times with different HAVE_* settings - confirm against the including file. */
19 #undef PREFETCH
20 #undef MOVNTQ
21 #undef EMMS
22 #undef SFENCE
23 #undef MMREG_SIZE
24 #undef PREFETCHW
25 #undef PAVGB
26
/* MMREG_SIZE is 16 when HAVE_SSE2 is defined and 8 otherwise
   (presumably the SIMD register width in bytes). */
27 #ifdef HAVE_SSE2
28 #define MMREG_SIZE 16
29 #else
30 #define MMREG_SIZE 8
31 #endif
32
/* Instruction mnemonics pasted into the inline-asm strings below.
   3DNow! and MMX2 each get their own prefetch / byte-average opcodes.
   In the fallback branch PREFETCH and PREFETCHW expand to "/nop" and PAVGB is
   left undefined.
   NOTE(review): "/nop" presumably relies on the assembler treating a line that
   starts with '/' as a comment, so the operand that follows is ignored -
   confirm for the target assembler. */
33 #ifdef HAVE_3DNOW
34 #define PREFETCH  "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB     "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #define PAVGB     "pavgb"
41 #else
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
44 #endif
45
/* Instruction used to leave MMX state after an asm loop. */
46 #ifdef HAVE_3DNOW
47 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
48 #define EMMS     "femms"
49 #else
50 #define EMMS     "emms"
51 #endif
52
/* Store instruction and store fence used by the asm loops: with MMX2 the
   non-temporal movntq plus sfence, otherwise a plain movq and the "/nop"
   placeholder. */
53 #ifdef HAVE_MMX2
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
56 #else
57 #define MOVNTQ "movq"
58 #define SFENCE "/nop"
59 #endif
60
/*
 * Expand packed 3-byte pixels into 4-byte pixels.
 *
 * src      - input buffer, src_size bytes are consumed
 * dst      - output buffer; 4 bytes are written for every 3 bytes read
 * src_size - number of input bytes
 *
 * The scalar loop copies three bytes and appends a zero byte.  With HAVE_MMX
 * the bulk of the data is handled 24 input bytes (32 output bytes) per
 * iteration and the scalar loop only finishes the remainder.
 */
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
62 {
63   uint8_t *dest = dst;
64   const uint8_t *s = src;
65   const uint8_t *end;
66 #ifdef HAVE_MMX
67   const uint8_t *mm_end;
68 #endif
69   end = s + src_size;
70 #ifdef HAVE_MMX
71   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  /* Run the asm loop only while more than 23 input bytes remain. */
72   mm_end = end - 23;
  /* mm7 keeps mask32 (defined outside this file) for the whole loop;
     presumably it clears the top byte of each dword - TODO confirm. */
73   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
74   while(s < mm_end)
75   {
76     __asm __volatile(
77         PREFETCH"       32%1\n\t"
        /* Build four qwords, each from two dwords loaded 3 bytes apart.
           NOTE(review): the load at offset 21 is a dword, so it touches
           byte 24, one past the 24 bytes consumed per iteration. */
78         "movd   %1, %%mm0\n\t"
79         "punpckldq 3%1, %%mm0\n\t"
80         "movd   6%1, %%mm1\n\t"
81         "punpckldq 9%1, %%mm1\n\t"
82         "movd   12%1, %%mm2\n\t"
83         "punpckldq 15%1, %%mm2\n\t"
84         "movd   18%1, %%mm3\n\t"
85         "punpckldq 21%1, %%mm3\n\t"
        /* Apply the mask held in mm7 to every qword. */
86         "pand   %%mm7, %%mm0\n\t"
87         "pand   %%mm7, %%mm1\n\t"
88         "pand   %%mm7, %%mm2\n\t"
89         "pand   %%mm7, %%mm3\n\t"
        /* Store 32 output bytes. */
90         MOVNTQ" %%mm0, %0\n\t"
91         MOVNTQ" %%mm1, 8%0\n\t"
92         MOVNTQ" %%mm2, 16%0\n\t"
93         MOVNTQ" %%mm3, 24%0"
94         :"=m"(*dest)
95         :"m"(*s)
96         :"memory");
97     dest += 32;
98     s += 24;
99   }
100   __asm __volatile(SFENCE:::"memory");
101   __asm __volatile(EMMS:::"memory");
102 #endif
  /* Scalar path: copy three bytes, then write a zero as the fourth. */
103   while(s < end)
104   {
105     *dest++ = *s++;
106     *dest++ = *s++;
107     *dest++ = *s++;
108     *dest++ = 0;
109   }
110 }
111
/*
 * Pack 4-byte pixels into 3-byte pixels by dropping every fourth byte.
 *
 * src      - input buffer, src_size bytes are consumed
 * dst      - output buffer; 3 bytes are written for every 4 bytes read
 * src_size - number of input bytes
 *
 * With HAVE_MMX the bulk is handled 32 input bytes (24 output bytes) per
 * iteration; the scalar loop finishes whatever is left.
 */
112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
113 {
114   uint8_t *dest = dst;
115   const uint8_t *s = src;
116   const uint8_t *end;
117 #ifdef HAVE_MMX
118   const uint8_t *mm_end;
119 #endif
120   end = s + src_size;
121 #ifdef HAVE_MMX
122   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  /* Run the asm loop only while more than 31 input bytes remain. */
123   mm_end = end - 31;
124   while(s < mm_end)
125   {
126     __asm __volatile(
127         PREFETCH"       32%1\n\t"
        /* Load 32 input bytes into mm0, mm1, mm4, mm5 and keep copies in
           mm2, mm3, mm6, mm7. */
128         "movq   %1, %%mm0\n\t"
129         "movq   8%1, %%mm1\n\t"
130         "movq   16%1, %%mm4\n\t"
131         "movq   24%1, %%mm5\n\t"
132         "movq   %%mm0, %%mm2\n\t"
133         "movq   %%mm1, %%mm3\n\t"
134         "movq   %%mm4, %%mm6\n\t"
135         "movq   %%mm5, %%mm7\n\t"
        /* Within each qword: combine the part selected by mask24l with the
           copy shifted right by 8 bits and selected by mask24h (both masks
           are defined outside this file). */
136         "psrlq  $8, %%mm2\n\t"
137         "psrlq  $8, %%mm3\n\t"
138         "psrlq  $8, %%mm6\n\t"
139         "psrlq  $8, %%mm7\n\t"
140         "pand   %2, %%mm0\n\t"
141         "pand   %2, %%mm1\n\t"
142         "pand   %2, %%mm4\n\t"
143         "pand   %2, %%mm5\n\t"
144         "pand   %3, %%mm2\n\t"
145         "pand   %3, %%mm3\n\t"
146         "pand   %3, %%mm6\n\t"
147         "pand   %3, %%mm7\n\t"
148         "por    %%mm2, %%mm0\n\t"
149         "por    %%mm3, %%mm1\n\t"
150         "por    %%mm6, %%mm4\n\t"
151         "por    %%mm7, %%mm5\n\t"
152
        /* Merge the four partially filled qwords into three full ones
           (mm0, mm1, mm4) using shifts and the mask24hh/hhh/hhhh masks. */
153         "movq   %%mm1, %%mm2\n\t"
154         "movq   %%mm4, %%mm3\n\t"
155         "psllq  $48, %%mm2\n\t"
156         "psllq  $32, %%mm3\n\t"
157         "pand   %4, %%mm2\n\t"
158         "pand   %5, %%mm3\n\t"
159         "por    %%mm2, %%mm0\n\t"
160         "psrlq  $16, %%mm1\n\t"
161         "psrlq  $32, %%mm4\n\t"
162         "psllq  $16, %%mm5\n\t"
163         "por    %%mm3, %%mm1\n\t"
164         "pand   %6, %%mm5\n\t"
165         "por    %%mm5, %%mm4\n\t"
166
        /* Store 24 output bytes. */
167         MOVNTQ" %%mm0, %0\n\t"
168         MOVNTQ" %%mm1, 8%0\n\t"
169         MOVNTQ" %%mm4, 16%0"
170         :"=m"(*dest)
171         :"m"(*s),"m"(mask24l),
172          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
173         :"memory");
174     dest += 24;
175     s += 32;
176   }
177   __asm __volatile(SFENCE:::"memory");
178   __asm __volatile(EMMS:::"memory");
179 #endif
  /* Scalar path: copy three bytes and skip the fourth. */
180   while(s < end)
181   {
182     *dest++ = *s++;
183     *dest++ = *s++;
184     *dest++ = *s++;
185     s++;
186   }
187 }
188
189 /*
190  Original by Strepto/Astral
191  ported to gcc & bugfixed : A'rpi
192  MMX2, 3DNOW optimization by Nick Kurshev
193  32bit c version, and and&add trick by Michael Niedermayer
194 */
/*
 * Convert 16-bit words from a 15-bit layout to a 16-bit layout.
 *
 * For every 16-bit word x the result is (x & 0x7FFF) + (x & 0x7FE0): the low
 * five bits are kept and bits 5..14 are added to themselves, i.e. moved up by
 * one bit position.  Bit 15 of the input is ignored.
 *
 * src      - input buffer of native-endian 16-bit words
 * dst      - output buffer, same size as the input
 * src_size - number of input bytes
 *
 * With HAVE_MMX the bulk is processed 16 bytes per iteration; afterwards two
 * words at a time are handled through 32-bit accesses and finally a single
 * trailing word, if any.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  register const uint8_t* s=src;
  register uint8_t* d=dst;
  register const uint8_t *end;
  const uint8_t *mm_end;
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s));
  /* mm4 keeps mask15s (defined outside this file) for the whole loop. */
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
  mm_end = end - 15;
  while(s<mm_end)
  {
        /*
         * The asm reads 16 bytes at s and writes 16 bytes at d although the
         * operands only name *s and *d, so a "memory" clobber is required to
         * keep the compiler from caching or reordering accesses around it
         * (the other converters in this file already declare it).
         */
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                /* x + (x & mask), per 16-bit lane */
                "pand   %%mm4, %%mm0\n\t"
                "pand   %%mm4, %%mm2\n\t"
                "paddw  %%mm1, %%mm0\n\t"
                "paddw  %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                :"memory");
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    /* Two words per step; the sum cannot carry from the low word into the
       high word because each lane is at most 0x7FFF + 0x7FE0. */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register unsigned x= *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one 16-bit word is left. */
    if(s < end)
    {
        register unsigned short x= *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
243
/*
 * Swap the first and third byte of every 3-byte pixel (B,G,R <-> R,G,B).
 *
 * src      - input buffer of packed 3-byte pixels
 * dst      - output buffer, same size as the input; may be equal to src
 * src_size - number of input bytes; trailing bytes that do not form a
 *            complete pixel are left untouched
 *
 * The previous loop compared a byte index that advances by 3 against the
 * pixel count (src_size/3), so only about a third of the buffer was
 * converted.  The loop now runs over every complete pixel.
 */
static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        unsigned i;
        const unsigned num_bytes = src_size - src_size%3;
        for(i=0; i<num_bytes; i+=3)
        {
                /* Read the third byte first so that src == dst works. */
                const uint8_t x = src[i+2];
                dst[i+1] = src[i+1];
                dst[i+2] = src[i+0];
                dst[i+0] = x;
        }
}
254
/*
 * Convert 16-bit words from a 16-bit layout to a 15-bit layout.
 *
 * For every 16-bit word x the result is ((x >> 1) & 0x7FE0) | (x & 0x001F):
 * the low five bits are kept, the bits above them are moved down by one
 * position and bit 5 of the input is dropped.
 *
 * src      - input buffer of native-endian 16-bit words
 * dst      - output buffer, same size as the input
 * src_size - number of input bytes
 *
 * With HAVE_MMX the bulk is processed 16 bytes per iteration; afterwards two
 * words at a time are handled through 32-bit accesses and finally a single
 * trailing word, if any.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  register const uint8_t* s=src;
  register uint8_t* d=dst;
  register const uint8_t *end;
  const uint8_t *mm_end;
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s));
  /* mm7 and mm6 keep mask15rg and mask15b (defined outside this file). */
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
  mm_end = end - 15;
  while(s<mm_end)
  {
        /*
         * The asm reads 16 bytes at s and writes 16 bytes at d although the
         * operands only name *s and *d, so a "memory" clobber is required to
         * keep the compiler from caching or reordering accesses around it
         * (the other converters in this file already declare it).
         */
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                /* ((x >> 1) & mm7) | (x & mm6) */
                "psrlq  $1, %%mm0\n\t"
                "psrlq  $1, %%mm2\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm3\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                :"memory");
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    /* Two words per step; the bit shifted from the high word into bit 15 of
       the low word is removed by the 0x7FE0 mask. */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register uint32_t x= *((uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* At most one 16-bit word is left; the pointers are not used afterwards,
       so they are no longer advanced here. */
    if(s < end)
    {
        register uint16_t x= *((uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
310
/*
 * Convert 4-byte pixels to 16-bit pixels.
 *
 * src      - input buffer, 4 bytes per pixel
 * dst      - output buffer, one native-endian uint16_t per pixel
 * src_size - number of input bytes
 *
 * Scalar path: without WORDS_BIGENDIAN the bytes are read as b, g, r and the
 * fourth byte is skipped; with WORDS_BIGENDIAN the first byte is skipped and
 * the following bytes are read as r, g, b.  The output word is
 * (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 * With HAVE_MMX the bulk is handled 16 input bytes (four pixels, 8 output
 * bytes) per iteration.
 */
311 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
312 {
313         const uint8_t *s = src;
314         const uint8_t *end;
315 #ifdef HAVE_MMX
316         const uint8_t *mm_end;
317 #endif
318         uint16_t *d = (uint16_t *)dst;
319         end = s + src_size;
320 #ifdef HAVE_MMX
321         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 and mm6 keep red_16mask and green_16mask (defined outside this
           file) for the whole loop. */
322         __asm __volatile(
323             "movq       %0, %%mm7\n\t"
324             "movq       %1, %%mm6\n\t"
325             ::"m"(red_16mask),"m"(green_16mask));
326         mm_end = end - 15;
327         while(s < mm_end)
328         {
329             __asm __volatile(
330                 PREFETCH" 32%1\n\t"
                /* mm0 = dwords at offsets 0 and 8, mm3 = dwords at 4 and 12 */
331                 "movd   %1, %%mm0\n\t"
332                 "movd   4%1, %%mm3\n\t"
333                 "punpckldq 8%1, %%mm0\n\t"
334                 "punpckldq 12%1, %%mm3\n\t"
335                 "movq   %%mm0, %%mm1\n\t"
336                 "movq   %%mm0, %%mm2\n\t"
337                 "movq   %%mm3, %%mm4\n\t"
338                 "movq   %%mm3, %%mm5\n\t"
                /* shift right by 3 and mask with blue_16mask */
339                 "psrlq  $3, %%mm0\n\t"
340                 "psrlq  $3, %%mm3\n\t"
341                 "pand   %2, %%mm0\n\t"
342                 "pand   %2, %%mm3\n\t"
                /* shift right by 5 and mask with green_16mask */
343                 "psrlq  $5, %%mm1\n\t"
344                 "psrlq  $5, %%mm4\n\t"
345                 "pand   %%mm6, %%mm1\n\t"
346                 "pand   %%mm6, %%mm4\n\t"
                /* shift right by 8 and mask with red_16mask */
347                 "psrlq  $8, %%mm2\n\t"
348                 "psrlq  $8, %%mm5\n\t"
349                 "pand   %%mm7, %%mm2\n\t"
350                 "pand   %%mm7, %%mm5\n\t"
351                 "por    %%mm1, %%mm0\n\t"
352                 "por    %%mm4, %%mm3\n\t"
353                 "por    %%mm2, %%mm0\n\t"
354                 "por    %%mm5, %%mm3\n\t"
                /* interleave the two register halves into four 16-bit results */
355                 "psllq  $16, %%mm3\n\t"
356                 "por    %%mm3, %%mm0\n\t"
357                 MOVNTQ" %%mm0, %0\n\t"
358                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
359                 d += 4;
360                 s += 16;
361         }
362         __asm __volatile(SFENCE:::"memory");
363         __asm __volatile(EMMS:::"memory");
364 #endif
        /* Scalar path for the remaining pixels (all pixels without HAVE_MMX). */
365         while(s < end)
366         {
367 #ifndef WORDS_BIGENDIAN
368                 const int b= *s++;
369                 const int g= *s++;
370                 const int r= *s++;
371 #else
372                 const int a= *s++; /*skip*/
373                 const int r= *s++;
374                 const int g= *s++;
375                 const int b= *s++;
376 #endif          
377                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
378 #ifndef WORDS_BIGENDIAN
                /* skip the fourth byte of the pixel */
379                 s++;
380 #endif
381         }
382 }
383
/*
 * Convert 4-byte pixels to 16-bit pixels with the first and third byte
 * swapped compared to rgb32to16.
 *
 * src      - input buffer, 4 bytes per pixel
 * dst      - output buffer, one native-endian uint16_t per pixel
 * src_size - number of input bytes
 *
 * Scalar path: the bytes are read as r, g, b, the fourth byte is skipped and
 * the output word is (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 * With HAVE_MMX the bulk is handled 16 input bytes (four pixels) per
 * iteration.
 * NOTE(review): unlike rgb32to16 there is no WORDS_BIGENDIAN variant of the
 * scalar path here - confirm whether one is needed.
 */
384 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
385 {
386         const uint8_t *s = src;
387         const uint8_t *end;
388 #ifdef HAVE_MMX
389         const uint8_t *mm_end;
390 #endif
391         uint16_t *d = (uint16_t *)dst;
392         end = s + src_size;
393 #ifdef HAVE_MMX
394         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 and mm6 keep red_16mask and green_16mask (defined outside this
           file) for the whole loop. */
395         __asm __volatile(
396             "movq       %0, %%mm7\n\t"
397             "movq       %1, %%mm6\n\t"
398             ::"m"(red_16mask),"m"(green_16mask));
399         mm_end = end - 15;
400         while(s < mm_end)
401         {
402             __asm __volatile(
403                 PREFETCH" 32%1\n\t"
                /* mm0 = dwords at offsets 0 and 8, mm3 = dwords at 4 and 12 */
404                 "movd   %1, %%mm0\n\t"
405                 "movd   4%1, %%mm3\n\t"
406                 "punpckldq 8%1, %%mm0\n\t"
407                 "punpckldq 12%1, %%mm3\n\t"
408                 "movq   %%mm0, %%mm1\n\t"
409                 "movq   %%mm0, %%mm2\n\t"
410                 "movq   %%mm3, %%mm4\n\t"
411                 "movq   %%mm3, %%mm5\n\t"
                /* shift left by 8 and mask with red_16mask */
412                 "psllq  $8, %%mm0\n\t"
413                 "psllq  $8, %%mm3\n\t"
414                 "pand   %%mm7, %%mm0\n\t"
415                 "pand   %%mm7, %%mm3\n\t"
                /* shift right by 5 and mask with green_16mask */
416                 "psrlq  $5, %%mm1\n\t"
417                 "psrlq  $5, %%mm4\n\t"
418                 "pand   %%mm6, %%mm1\n\t"
419                 "pand   %%mm6, %%mm4\n\t"
                /* shift right by 19 and mask with blue_16mask */
420                 "psrlq  $19, %%mm2\n\t"
421                 "psrlq  $19, %%mm5\n\t"
422                 "pand   %2, %%mm2\n\t"
423                 "pand   %2, %%mm5\n\t"
424                 "por    %%mm1, %%mm0\n\t"
425                 "por    %%mm4, %%mm3\n\t"
426                 "por    %%mm2, %%mm0\n\t"
427                 "por    %%mm5, %%mm3\n\t"
                /* interleave the two register halves into four 16-bit results */
428                 "psllq  $16, %%mm3\n\t"
429                 "por    %%mm3, %%mm0\n\t"
430                 MOVNTQ" %%mm0, %0\n\t"
431                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
432                 d += 4;
433                 s += 16;
434         }
435         __asm __volatile(SFENCE:::"memory");
436         __asm __volatile(EMMS:::"memory");
437 #endif
        /* Scalar path for the remaining pixels (all pixels without HAVE_MMX). */
438         while(s < end)
439         {
440                 const int r= *s++;
441                 const int g= *s++;
442                 const int b= *s++;
443                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
                /* skip the fourth byte of the pixel */
444                 s++;
445         }
446 }
447
/*
 * Convert 4-byte pixels to 15-bit pixels stored in 16-bit words.
 *
 * src      - input buffer, 4 bytes per pixel
 * dst      - output buffer, one native-endian uint16_t per pixel
 * src_size - number of input bytes
 *
 * Scalar path: the bytes are read as b, g, r, the fourth byte is skipped and
 * the output word is (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 * With HAVE_MMX the bulk is handled 16 input bytes (four pixels) per
 * iteration.
 * NOTE(review): unlike rgb32to16 there is no WORDS_BIGENDIAN variant of the
 * scalar path here - confirm whether one is needed.
 */
448 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
449 {
450         const uint8_t *s = src;
451         const uint8_t *end;
452 #ifdef HAVE_MMX
453         const uint8_t *mm_end;
454 #endif
455         uint16_t *d = (uint16_t *)dst;
456         end = s + src_size;
457 #ifdef HAVE_MMX
458         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 and mm6 keep red_15mask and green_15mask (defined outside this
           file) for the whole loop. */
459         __asm __volatile(
460             "movq       %0, %%mm7\n\t"
461             "movq       %1, %%mm6\n\t"
462             ::"m"(red_15mask),"m"(green_15mask));
463         mm_end = end - 15;
464         while(s < mm_end)
465         {
466             __asm __volatile(
467                 PREFETCH" 32%1\n\t"
                /* mm0 = dwords at offsets 0 and 8, mm3 = dwords at 4 and 12 */
468                 "movd   %1, %%mm0\n\t"
469                 "movd   4%1, %%mm3\n\t"
470                 "punpckldq 8%1, %%mm0\n\t"
471                 "punpckldq 12%1, %%mm3\n\t"
472                 "movq   %%mm0, %%mm1\n\t"
473                 "movq   %%mm0, %%mm2\n\t"
474                 "movq   %%mm3, %%mm4\n\t"
475                 "movq   %%mm3, %%mm5\n\t"
                /* shift right by 3 and mask with blue_15mask */
476                 "psrlq  $3, %%mm0\n\t"
477                 "psrlq  $3, %%mm3\n\t"
478                 "pand   %2, %%mm0\n\t"
479                 "pand   %2, %%mm3\n\t"
                /* shift right by 6 and mask with green_15mask */
480                 "psrlq  $6, %%mm1\n\t"
481                 "psrlq  $6, %%mm4\n\t"
482                 "pand   %%mm6, %%mm1\n\t"
483                 "pand   %%mm6, %%mm4\n\t"
                /* shift right by 9 and mask with red_15mask */
484                 "psrlq  $9, %%mm2\n\t"
485                 "psrlq  $9, %%mm5\n\t"
486                 "pand   %%mm7, %%mm2\n\t"
487                 "pand   %%mm7, %%mm5\n\t"
488                 "por    %%mm1, %%mm0\n\t"
489                 "por    %%mm4, %%mm3\n\t"
490                 "por    %%mm2, %%mm0\n\t"
491                 "por    %%mm5, %%mm3\n\t"
                /* interleave the two register halves into four 16-bit results */
492                 "psllq  $16, %%mm3\n\t"
493                 "por    %%mm3, %%mm0\n\t"
494                 MOVNTQ" %%mm0, %0\n\t"
495                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
496                 d += 4;
497                 s += 16;
498         }
499         __asm __volatile(SFENCE:::"memory");
500         __asm __volatile(EMMS:::"memory");
501 #endif
        /* Scalar path for the remaining pixels (all pixels without HAVE_MMX). */
502         while(s < end)
503         {
504                 const int b= *s++;
505                 const int g= *s++;
506                 const int r= *s++;
507                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
                /* skip the fourth byte of the pixel */
508                 s++;
509         }
510 }
511
/*
 * Convert 4-byte pixels to 15-bit pixels stored in 16-bit words, with the
 * first and third byte swapped compared to rgb32to15.
 *
 * src      - input buffer, 4 bytes per pixel
 * dst      - output buffer, one native-endian uint16_t per pixel
 * src_size - number of input bytes
 *
 * Scalar path: the bytes are read as r, g, b, the fourth byte is skipped and
 * the output word is (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 * With HAVE_MMX the bulk is handled 16 input bytes (four pixels) per
 * iteration.
 * NOTE(review): unlike rgb32to16 there is no WORDS_BIGENDIAN variant of the
 * scalar path here - confirm whether one is needed.
 */
512 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
513 {
514         const uint8_t *s = src;
515         const uint8_t *end;
516 #ifdef HAVE_MMX
517         const uint8_t *mm_end;
518 #endif
519         uint16_t *d = (uint16_t *)dst;
520         end = s + src_size;
521 #ifdef HAVE_MMX
522         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 and mm6 keep red_15mask and green_15mask (defined outside this
           file) for the whole loop. */
523         __asm __volatile(
524             "movq       %0, %%mm7\n\t"
525             "movq       %1, %%mm6\n\t"
526             ::"m"(red_15mask),"m"(green_15mask));
527         mm_end = end - 15;
528         while(s < mm_end)
529         {
530             __asm __volatile(
531                 PREFETCH" 32%1\n\t"
                /* mm0 = dwords at offsets 0 and 8, mm3 = dwords at 4 and 12 */
532                 "movd   %1, %%mm0\n\t"
533                 "movd   4%1, %%mm3\n\t"
534                 "punpckldq 8%1, %%mm0\n\t"
535                 "punpckldq 12%1, %%mm3\n\t"
536                 "movq   %%mm0, %%mm1\n\t"
537                 "movq   %%mm0, %%mm2\n\t"
538                 "movq   %%mm3, %%mm4\n\t"
539                 "movq   %%mm3, %%mm5\n\t"
                /* shift left by 7 and mask with red_15mask */
540                 "psllq  $7, %%mm0\n\t"
541                 "psllq  $7, %%mm3\n\t"
542                 "pand   %%mm7, %%mm0\n\t"
543                 "pand   %%mm7, %%mm3\n\t"
                /* shift right by 6 and mask with green_15mask */
544                 "psrlq  $6, %%mm1\n\t"
545                 "psrlq  $6, %%mm4\n\t"
546                 "pand   %%mm6, %%mm1\n\t"
547                 "pand   %%mm6, %%mm4\n\t"
                /* shift right by 19 and mask with blue_15mask */
548                 "psrlq  $19, %%mm2\n\t"
549                 "psrlq  $19, %%mm5\n\t"
550                 "pand   %2, %%mm2\n\t"
551                 "pand   %2, %%mm5\n\t"
552                 "por    %%mm1, %%mm0\n\t"
553                 "por    %%mm4, %%mm3\n\t"
554                 "por    %%mm2, %%mm0\n\t"
555                 "por    %%mm5, %%mm3\n\t"
                /* interleave the two register halves into four 16-bit results */
556                 "psllq  $16, %%mm3\n\t"
557                 "por    %%mm3, %%mm0\n\t"
558                 MOVNTQ" %%mm0, %0\n\t"
559                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
560                 d += 4;
561                 s += 16;
562         }
563         __asm __volatile(SFENCE:::"memory");
564         __asm __volatile(EMMS:::"memory");
565 #endif
        /* Scalar path for the remaining pixels (all pixels without HAVE_MMX). */
566         while(s < end)
567         {
568                 const int r= *s++;
569                 const int g= *s++;
570                 const int b= *s++;
571                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
                /* skip the fourth byte of the pixel */
572                 s++;
573         }
574 }
575
/*
 * Convert packed 3-byte pixels to 16-bit pixels.
 *
 * src      - input buffer, 3 bytes per pixel
 * dst      - output buffer, one native-endian uint16_t per pixel
 * src_size - number of input bytes
 *
 * Scalar path: the bytes are read as b, g, r and the output word is
 * (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 * With HAVE_MMX the bulk is handled 12 input bytes (four pixels, 8 output
 * bytes) per iteration.
 */
576 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
577 {
578         const uint8_t *s = src;
579         const uint8_t *end;
580 #ifdef HAVE_MMX
581         const uint8_t *mm_end;
582 #endif
583         uint16_t *d = (uint16_t *)dst;
584         end = s + src_size;
585 #ifdef HAVE_MMX
586         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 and mm6 keep red_16mask and green_16mask (defined outside this
           file) for the whole loop. */
587         __asm __volatile(
588             "movq       %0, %%mm7\n\t"
589             "movq       %1, %%mm6\n\t"
590             ::"m"(red_16mask),"m"(green_16mask));
        /* NOTE(review): the asm loads a dword at offset 9, i.e. touches byte
           12, while the loop only guarantees 12 readable bytes (offsets
           0..11); rgb24tobgr16 uses end - 15 instead - confirm which bound
           is intended. */
591         mm_end = end - 11;
592         while(s < mm_end)
593         {
594             __asm __volatile(
595                 PREFETCH" 32%1\n\t"
                /* mm0 = dwords at offsets 0 and 6, mm3 = dwords at 3 and 9 */
596                 "movd   %1, %%mm0\n\t"
597                 "movd   3%1, %%mm3\n\t"
598                 "punpckldq 6%1, %%mm0\n\t"
599                 "punpckldq 9%1, %%mm3\n\t"
600                 "movq   %%mm0, %%mm1\n\t"
601                 "movq   %%mm0, %%mm2\n\t"
602                 "movq   %%mm3, %%mm4\n\t"
603                 "movq   %%mm3, %%mm5\n\t"
                /* shift right by 3 and mask with blue_16mask */
604                 "psrlq  $3, %%mm0\n\t"
605                 "psrlq  $3, %%mm3\n\t"
606                 "pand   %2, %%mm0\n\t"
607                 "pand   %2, %%mm3\n\t"
                /* shift right by 5 and mask with green_16mask */
608                 "psrlq  $5, %%mm1\n\t"
609                 "psrlq  $5, %%mm4\n\t"
610                 "pand   %%mm6, %%mm1\n\t"
611                 "pand   %%mm6, %%mm4\n\t"
                /* shift right by 8 and mask with red_16mask */
612                 "psrlq  $8, %%mm2\n\t"
613                 "psrlq  $8, %%mm5\n\t"
614                 "pand   %%mm7, %%mm2\n\t"
615                 "pand   %%mm7, %%mm5\n\t"
616                 "por    %%mm1, %%mm0\n\t"
617                 "por    %%mm4, %%mm3\n\t"
618                 "por    %%mm2, %%mm0\n\t"
619                 "por    %%mm5, %%mm3\n\t"
                /* interleave the two register halves into four 16-bit results */
620                 "psllq  $16, %%mm3\n\t"
621                 "por    %%mm3, %%mm0\n\t"
622                 MOVNTQ" %%mm0, %0\n\t"
623                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
624                 d += 4;
625                 s += 12;
626         }
627         __asm __volatile(SFENCE:::"memory");
628         __asm __volatile(EMMS:::"memory");
629 #endif
        /* Scalar path for the remaining pixels (all pixels without HAVE_MMX). */
630         while(s < end)
631         {
632                 const int b= *s++;
633                 const int g= *s++;
634                 const int r= *s++;
635                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
636         }
637 }
638
/*
 * Convert packed 3-byte pixels to 16-bit pixels, with the first and third
 * byte swapped compared to rgb24to16.
 *
 * src      - input buffer, 3 bytes per pixel
 * dst      - output buffer, one native-endian uint16_t per pixel
 * src_size - number of input bytes
 *
 * Scalar path: the bytes are read as r, g, b and the output word is
 * (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 * With HAVE_MMX the bulk is handled 12 input bytes (four pixels) per
 * iteration; the asm loop stops while at least 15 bytes remain, the scalar
 * loop converts the rest.
 */
639 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
640 {
641         const uint8_t *s = src;
642         const uint8_t *end;
643 #ifdef HAVE_MMX
644         const uint8_t *mm_end;
645 #endif
646         uint16_t *d = (uint16_t *)dst;
647         end = s + src_size;
648 #ifdef HAVE_MMX
649         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 and mm6 keep red_16mask and green_16mask (defined outside this
           file) for the whole loop. */
650         __asm __volatile(
651             "movq       %0, %%mm7\n\t"
652             "movq       %1, %%mm6\n\t"
653             ::"m"(red_16mask),"m"(green_16mask));
654         mm_end = end - 15;
655         while(s < mm_end)
656         {
657             __asm __volatile(
658                 PREFETCH" 32%1\n\t"
                /* mm0 = dwords at offsets 0 and 6, mm3 = dwords at 3 and 9 */
659                 "movd   %1, %%mm0\n\t"
660                 "movd   3%1, %%mm3\n\t"
661                 "punpckldq 6%1, %%mm0\n\t"
662                 "punpckldq 9%1, %%mm3\n\t"
663                 "movq   %%mm0, %%mm1\n\t"
664                 "movq   %%mm0, %%mm2\n\t"
665                 "movq   %%mm3, %%mm4\n\t"
666                 "movq   %%mm3, %%mm5\n\t"
                /* shift left by 8 and mask with red_16mask */
667                 "psllq  $8, %%mm0\n\t"
668                 "psllq  $8, %%mm3\n\t"
669                 "pand   %%mm7, %%mm0\n\t"
670                 "pand   %%mm7, %%mm3\n\t"
                /* shift right by 5 and mask with green_16mask */
671                 "psrlq  $5, %%mm1\n\t"
672                 "psrlq  $5, %%mm4\n\t"
673                 "pand   %%mm6, %%mm1\n\t"
674                 "pand   %%mm6, %%mm4\n\t"
                /* shift right by 19 and mask with blue_16mask */
675                 "psrlq  $19, %%mm2\n\t"
676                 "psrlq  $19, %%mm5\n\t"
677                 "pand   %2, %%mm2\n\t"
678                 "pand   %2, %%mm5\n\t"
679                 "por    %%mm1, %%mm0\n\t"
680                 "por    %%mm4, %%mm3\n\t"
681                 "por    %%mm2, %%mm0\n\t"
682                 "por    %%mm5, %%mm3\n\t"
                /* interleave the two register halves into four 16-bit results */
683                 "psllq  $16, %%mm3\n\t"
684                 "por    %%mm3, %%mm0\n\t"
685                 MOVNTQ" %%mm0, %0\n\t"
686                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
687                 d += 4;
688                 s += 12;
689         }
690         __asm __volatile(SFENCE:::"memory");
691         __asm __volatile(EMMS:::"memory");
692 #endif
        /* Scalar path for the remaining pixels (all pixels without HAVE_MMX). */
693         while(s < end)
694         {
695                 const int r= *s++;
696                 const int g= *s++;
697                 const int b= *s++;
698                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
699         }
700 }
701
/*
 * Convert packed 3-byte pixels to 15-bit pixels stored in 16-bit words.
 *
 * src      - input buffer, 3 bytes per pixel
 * dst      - output buffer, one native-endian uint16_t per pixel
 * src_size - number of input bytes
 *
 * Scalar path: the bytes are read as b, g, r and the output word is
 * (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 * With HAVE_MMX the bulk is handled 12 input bytes (four pixels, 8 output
 * bytes) per iteration.
 */
702 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
703 {
704         const uint8_t *s = src;
705         const uint8_t *end;
706 #ifdef HAVE_MMX
707         const uint8_t *mm_end;
708 #endif
709         uint16_t *d = (uint16_t *)dst;
710         end = s + src_size;
711 #ifdef HAVE_MMX
712         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 and mm6 keep red_15mask and green_15mask (defined outside this
           file) for the whole loop. */
713         __asm __volatile(
714             "movq       %0, %%mm7\n\t"
715             "movq       %1, %%mm6\n\t"
716             ::"m"(red_15mask),"m"(green_15mask));
        /* NOTE(review): the asm loads a dword at offset 9, i.e. touches byte
           12, while the loop only guarantees 12 readable bytes (offsets
           0..11); rgb24tobgr15 uses end - 15 instead - confirm which bound
           is intended. */
717         mm_end = end - 11;
718         while(s < mm_end)
719         {
720             __asm __volatile(
721                 PREFETCH" 32%1\n\t"
                /* mm0 = dwords at offsets 0 and 6, mm3 = dwords at 3 and 9 */
722                 "movd   %1, %%mm0\n\t"
723                 "movd   3%1, %%mm3\n\t"
724                 "punpckldq 6%1, %%mm0\n\t"
725                 "punpckldq 9%1, %%mm3\n\t"
726                 "movq   %%mm0, %%mm1\n\t"
727                 "movq   %%mm0, %%mm2\n\t"
728                 "movq   %%mm3, %%mm4\n\t"
729                 "movq   %%mm3, %%mm5\n\t"
                /* shift right by 3 and mask with blue_15mask */
730                 "psrlq  $3, %%mm0\n\t"
731                 "psrlq  $3, %%mm3\n\t"
732                 "pand   %2, %%mm0\n\t"
733                 "pand   %2, %%mm3\n\t"
                /* shift right by 6 and mask with green_15mask */
734                 "psrlq  $6, %%mm1\n\t"
735                 "psrlq  $6, %%mm4\n\t"
736                 "pand   %%mm6, %%mm1\n\t"
737                 "pand   %%mm6, %%mm4\n\t"
                /* shift right by 9 and mask with red_15mask */
738                 "psrlq  $9, %%mm2\n\t"
739                 "psrlq  $9, %%mm5\n\t"
740                 "pand   %%mm7, %%mm2\n\t"
741                 "pand   %%mm7, %%mm5\n\t"
742                 "por    %%mm1, %%mm0\n\t"
743                 "por    %%mm4, %%mm3\n\t"
744                 "por    %%mm2, %%mm0\n\t"
745                 "por    %%mm5, %%mm3\n\t"
                /* interleave the two register halves into four 16-bit results */
746                 "psllq  $16, %%mm3\n\t"
747                 "por    %%mm3, %%mm0\n\t"
748                 MOVNTQ" %%mm0, %0\n\t"
749                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
750                 d += 4;
751                 s += 12;
752         }
753         __asm __volatile(SFENCE:::"memory");
754         __asm __volatile(EMMS:::"memory");
755 #endif
        /* Scalar path for the remaining pixels (all pixels without HAVE_MMX). */
756         while(s < end)
757         {
758                 const int b= *s++;
759                 const int g= *s++;
760                 const int r= *s++;
761                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
762         }
763 }
764
/*
 * Convert packed 3-byte pixels to 15-bit pixels stored in 16-bit words, with
 * the first and third byte swapped compared to rgb24to15.
 *
 * src      - input buffer, 3 bytes per pixel
 * dst      - output buffer, one native-endian uint16_t per pixel
 * src_size - number of input bytes
 *
 * Scalar path: the bytes are read as r, g, b and the output word is
 * (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 * With HAVE_MMX the bulk is handled 12 input bytes (four pixels) per
 * iteration; the asm loop stops while at least 15 bytes remain, the scalar
 * loop converts the rest.
 */
765 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
766 {
767         const uint8_t *s = src;
768         const uint8_t *end;
769 #ifdef HAVE_MMX
770         const uint8_t *mm_end;
771 #endif
772         uint16_t *d = (uint16_t *)dst;
773         end = s + src_size;
774 #ifdef HAVE_MMX
775         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 and mm6 keep red_15mask and green_15mask (defined outside this
           file) for the whole loop. */
776         __asm __volatile(
777             "movq       %0, %%mm7\n\t"
778             "movq       %1, %%mm6\n\t"
779             ::"m"(red_15mask),"m"(green_15mask));
780         mm_end = end - 15;
781         while(s < mm_end)
782         {
783             __asm __volatile(
784                 PREFETCH" 32%1\n\t"
                /* mm0 = dwords at offsets 0 and 6, mm3 = dwords at 3 and 9 */
785                 "movd   %1, %%mm0\n\t"
786                 "movd   3%1, %%mm3\n\t"
787                 "punpckldq 6%1, %%mm0\n\t"
788                 "punpckldq 9%1, %%mm3\n\t"
789                 "movq   %%mm0, %%mm1\n\t"
790                 "movq   %%mm0, %%mm2\n\t"
791                 "movq   %%mm3, %%mm4\n\t"
792                 "movq   %%mm3, %%mm5\n\t"
                /* shift left by 7 and mask with red_15mask */
793                 "psllq  $7, %%mm0\n\t"
794                 "psllq  $7, %%mm3\n\t"
795                 "pand   %%mm7, %%mm0\n\t"
796                 "pand   %%mm7, %%mm3\n\t"
                /* shift right by 6 and mask with green_15mask */
797                 "psrlq  $6, %%mm1\n\t"
798                 "psrlq  $6, %%mm4\n\t"
799                 "pand   %%mm6, %%mm1\n\t"
800                 "pand   %%mm6, %%mm4\n\t"
                /* shift right by 19 and mask with blue_15mask */
801                 "psrlq  $19, %%mm2\n\t"
802                 "psrlq  $19, %%mm5\n\t"
803                 "pand   %2, %%mm2\n\t"
804                 "pand   %2, %%mm5\n\t"
805                 "por    %%mm1, %%mm0\n\t"
806                 "por    %%mm4, %%mm3\n\t"
807                 "por    %%mm2, %%mm0\n\t"
808                 "por    %%mm5, %%mm3\n\t"
                /* interleave the two register halves into four 16-bit results */
809                 "psllq  $16, %%mm3\n\t"
810                 "por    %%mm3, %%mm0\n\t"
811                 MOVNTQ" %%mm0, %0\n\t"
812                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
813                 d += 4;
814                 s += 12;
815         }
816         __asm __volatile(SFENCE:::"memory");
817         __asm __volatile(EMMS:::"memory");
818 #endif
        /* Scalar path for the remaining pixels (all pixels without HAVE_MMX). */
819         while(s < end)
820         {
821                 const int r= *s++;
822                 const int g= *s++;
823                 const int b= *s++;
824                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
825         }
826 }
827
828 /*
829   This code uses a less accurate approximation: it simply
830   left-shifts the input
831   value and fills the low-order bits with
832   zeroes. This method improves PNG
833   compression, but it cannot reproduce white exactly, since it does not
834   generate an all-ones maximum value; the net effect is to darken the
835   image slightly.
836
837   The better method should be "left bit replication":
838
839    4 3 2 1 0
840    ---------
841    1 1 0 1 1
842
843    7 6 5 4 3  2 1 0
844    ----------------
845    1 1 0 1 1  1 1 0
846    |=======|  |===|
847        |      Leftmost Bits Repeated to Fill Open Bits
848        |
849    Original Bits
850 */
851 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
852 {
853         const uint16_t *end;
854 #ifdef HAVE_MMX
855         const uint16_t *mm_end;
856 #endif
857         uint8_t *d = (uint8_t *)dst;
858         const uint16_t *s = (uint16_t *)src;
859         end = s + src_size/2;
860 #ifdef HAVE_MMX
861         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
862         mm_end = end - 7;
863         while(s < mm_end)
864         {
865             __asm __volatile(
866                 PREFETCH" 32%1\n\t"
867                 "movq   %1, %%mm0\n\t"
868                 "movq   %1, %%mm1\n\t"
869                 "movq   %1, %%mm2\n\t"
870                 "pand   %2, %%mm0\n\t"
871                 "pand   %3, %%mm1\n\t"
872                 "pand   %4, %%mm2\n\t"
873                 "psllq  $3, %%mm0\n\t"
874                 "psrlq  $2, %%mm1\n\t"
875                 "psrlq  $7, %%mm2\n\t"
876                 "movq   %%mm0, %%mm3\n\t"
877                 "movq   %%mm1, %%mm4\n\t"
878                 "movq   %%mm2, %%mm5\n\t"
879                 "punpcklwd %5, %%mm0\n\t"
880                 "punpcklwd %5, %%mm1\n\t"
881                 "punpcklwd %5, %%mm2\n\t"
882                 "punpckhwd %5, %%mm3\n\t"
883                 "punpckhwd %5, %%mm4\n\t"
884                 "punpckhwd %5, %%mm5\n\t"
885                 "psllq  $8, %%mm1\n\t"
886                 "psllq  $16, %%mm2\n\t"
887                 "por    %%mm1, %%mm0\n\t"
888                 "por    %%mm2, %%mm0\n\t"
889                 "psllq  $8, %%mm4\n\t"
890                 "psllq  $16, %%mm5\n\t"
891                 "por    %%mm4, %%mm3\n\t"
892                 "por    %%mm5, %%mm3\n\t"
893
894                 "movq   %%mm0, %%mm6\n\t"
895                 "movq   %%mm3, %%mm7\n\t"
896                 
897                 "movq   8%1, %%mm0\n\t"
898                 "movq   8%1, %%mm1\n\t"
899                 "movq   8%1, %%mm2\n\t"
900                 "pand   %2, %%mm0\n\t"
901                 "pand   %3, %%mm1\n\t"
902                 "pand   %4, %%mm2\n\t"
903                 "psllq  $3, %%mm0\n\t"
904                 "psrlq  $2, %%mm1\n\t"
905                 "psrlq  $7, %%mm2\n\t"
906                 "movq   %%mm0, %%mm3\n\t"
907                 "movq   %%mm1, %%mm4\n\t"
908                 "movq   %%mm2, %%mm5\n\t"
909                 "punpcklwd %5, %%mm0\n\t"
910                 "punpcklwd %5, %%mm1\n\t"
911                 "punpcklwd %5, %%mm2\n\t"
912                 "punpckhwd %5, %%mm3\n\t"
913                 "punpckhwd %5, %%mm4\n\t"
914                 "punpckhwd %5, %%mm5\n\t"
915                 "psllq  $8, %%mm1\n\t"
916                 "psllq  $16, %%mm2\n\t"
917                 "por    %%mm1, %%mm0\n\t"
918                 "por    %%mm2, %%mm0\n\t"
919                 "psllq  $8, %%mm4\n\t"
920                 "psllq  $16, %%mm5\n\t"
921                 "por    %%mm4, %%mm3\n\t"
922                 "por    %%mm5, %%mm3\n\t"
923
924                 :"=m"(*d)
925                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
926                 :"memory");
927             /* Borrowed 32 to 24 */
928             __asm __volatile(
929                 "movq   %%mm0, %%mm4\n\t"
930                 "movq   %%mm3, %%mm5\n\t"
931                 "movq   %%mm6, %%mm0\n\t"
932                 "movq   %%mm7, %%mm1\n\t"
933                 
934                 "movq   %%mm4, %%mm6\n\t"
935                 "movq   %%mm5, %%mm7\n\t"
936                 "movq   %%mm0, %%mm2\n\t"
937                 "movq   %%mm1, %%mm3\n\t"
938
939                 "psrlq  $8, %%mm2\n\t"
940                 "psrlq  $8, %%mm3\n\t"
941                 "psrlq  $8, %%mm6\n\t"
942                 "psrlq  $8, %%mm7\n\t"
943                 "pand   %2, %%mm0\n\t"
944                 "pand   %2, %%mm1\n\t"
945                 "pand   %2, %%mm4\n\t"
946                 "pand   %2, %%mm5\n\t"
947                 "pand   %3, %%mm2\n\t"
948                 "pand   %3, %%mm3\n\t"
949                 "pand   %3, %%mm6\n\t"
950                 "pand   %3, %%mm7\n\t"
951                 "por    %%mm2, %%mm0\n\t"
952                 "por    %%mm3, %%mm1\n\t"
953                 "por    %%mm6, %%mm4\n\t"
954                 "por    %%mm7, %%mm5\n\t"
955
956                 "movq   %%mm1, %%mm2\n\t"
957                 "movq   %%mm4, %%mm3\n\t"
958                 "psllq  $48, %%mm2\n\t"
959                 "psllq  $32, %%mm3\n\t"
960                 "pand   %4, %%mm2\n\t"
961                 "pand   %5, %%mm3\n\t"
962                 "por    %%mm2, %%mm0\n\t"
963                 "psrlq  $16, %%mm1\n\t"
964                 "psrlq  $32, %%mm4\n\t"
965                 "psllq  $16, %%mm5\n\t"
966                 "por    %%mm3, %%mm1\n\t"
967                 "pand   %6, %%mm5\n\t"
968                 "por    %%mm5, %%mm4\n\t"
969
970                 MOVNTQ" %%mm0, %0\n\t"
971                 MOVNTQ" %%mm1, 8%0\n\t"
972                 MOVNTQ" %%mm4, 16%0"
973
974                 :"=m"(*d)
975                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
976                 :"memory");
977                 d += 24;
978                 s += 8;
979         }
980         __asm __volatile(SFENCE:::"memory");
981         __asm __volatile(EMMS:::"memory");
982 #endif
983         while(s < end)
984         {
985                 register uint16_t bgr;
986                 bgr = *s++;
987                 *d++ = (bgr&0x1F)<<3;
988                 *d++ = (bgr&0x3E0)>>2;
989                 *d++ = (bgr&0x7C00)>>7;
990         }
991 }
992
993 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
994 {
995         const uint16_t *end;
996 #ifdef HAVE_MMX
997         const uint16_t *mm_end;
998 #endif
999         uint8_t *d = (uint8_t *)dst;
1000         const uint16_t *s = (const uint16_t *)src;
1001         end = s + src_size/2;
1002 #ifdef HAVE_MMX
1003         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1004         mm_end = end - 7;
1005         while(s < mm_end)
1006         {
1007             __asm __volatile(
1008                 PREFETCH" 32%1\n\t"
1009                 "movq   %1, %%mm0\n\t"
1010                 "movq   %1, %%mm1\n\t"
1011                 "movq   %1, %%mm2\n\t"
1012                 "pand   %2, %%mm0\n\t"
1013                 "pand   %3, %%mm1\n\t"
1014                 "pand   %4, %%mm2\n\t"
1015                 "psllq  $3, %%mm0\n\t"
1016                 "psrlq  $3, %%mm1\n\t"
1017                 "psrlq  $8, %%mm2\n\t"
1018                 "movq   %%mm0, %%mm3\n\t"
1019                 "movq   %%mm1, %%mm4\n\t"
1020                 "movq   %%mm2, %%mm5\n\t"
1021                 "punpcklwd %5, %%mm0\n\t"
1022                 "punpcklwd %5, %%mm1\n\t"
1023                 "punpcklwd %5, %%mm2\n\t"
1024                 "punpckhwd %5, %%mm3\n\t"
1025                 "punpckhwd %5, %%mm4\n\t"
1026                 "punpckhwd %5, %%mm5\n\t"
1027                 "psllq  $8, %%mm1\n\t"
1028                 "psllq  $16, %%mm2\n\t"
1029                 "por    %%mm1, %%mm0\n\t"
1030                 "por    %%mm2, %%mm0\n\t"
1031                 "psllq  $8, %%mm4\n\t"
1032                 "psllq  $16, %%mm5\n\t"
1033                 "por    %%mm4, %%mm3\n\t"
1034                 "por    %%mm5, %%mm3\n\t"
1035                 
1036                 "movq   %%mm0, %%mm6\n\t"
1037                 "movq   %%mm3, %%mm7\n\t"
1038
1039                 "movq   8%1, %%mm0\n\t"
1040                 "movq   8%1, %%mm1\n\t"
1041                 "movq   8%1, %%mm2\n\t"
1042                 "pand   %2, %%mm0\n\t"
1043                 "pand   %3, %%mm1\n\t"
1044                 "pand   %4, %%mm2\n\t"
1045                 "psllq  $3, %%mm0\n\t"
1046                 "psrlq  $3, %%mm1\n\t"
1047                 "psrlq  $8, %%mm2\n\t"
1048                 "movq   %%mm0, %%mm3\n\t"
1049                 "movq   %%mm1, %%mm4\n\t"
1050                 "movq   %%mm2, %%mm5\n\t"
1051                 "punpcklwd %5, %%mm0\n\t"
1052                 "punpcklwd %5, %%mm1\n\t"
1053                 "punpcklwd %5, %%mm2\n\t"
1054                 "punpckhwd %5, %%mm3\n\t"
1055                 "punpckhwd %5, %%mm4\n\t"
1056                 "punpckhwd %5, %%mm5\n\t"
1057                 "psllq  $8, %%mm1\n\t"
1058                 "psllq  $16, %%mm2\n\t"
1059                 "por    %%mm1, %%mm0\n\t"
1060                 "por    %%mm2, %%mm0\n\t"
1061                 "psllq  $8, %%mm4\n\t"
1062                 "psllq  $16, %%mm5\n\t"
1063                 "por    %%mm4, %%mm3\n\t"
1064                 "por    %%mm5, %%mm3\n\t"
1065                 :"=m"(*d)
1066                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1067                 :"memory");
1068             /* Borrowed 32 to 24 */
1069             __asm __volatile(
1070                 "movq   %%mm0, %%mm4\n\t"
1071                 "movq   %%mm3, %%mm5\n\t"
1072                 "movq   %%mm6, %%mm0\n\t"
1073                 "movq   %%mm7, %%mm1\n\t"
1074                 
1075                 "movq   %%mm4, %%mm6\n\t"
1076                 "movq   %%mm5, %%mm7\n\t"
1077                 "movq   %%mm0, %%mm2\n\t"
1078                 "movq   %%mm1, %%mm3\n\t"
1079
1080                 "psrlq  $8, %%mm2\n\t"
1081                 "psrlq  $8, %%mm3\n\t"
1082                 "psrlq  $8, %%mm6\n\t"
1083                 "psrlq  $8, %%mm7\n\t"
1084                 "pand   %2, %%mm0\n\t"
1085                 "pand   %2, %%mm1\n\t"
1086                 "pand   %2, %%mm4\n\t"
1087                 "pand   %2, %%mm5\n\t"
1088                 "pand   %3, %%mm2\n\t"
1089                 "pand   %3, %%mm3\n\t"
1090                 "pand   %3, %%mm6\n\t"
1091                 "pand   %3, %%mm7\n\t"
1092                 "por    %%mm2, %%mm0\n\t"
1093                 "por    %%mm3, %%mm1\n\t"
1094                 "por    %%mm6, %%mm4\n\t"
1095                 "por    %%mm7, %%mm5\n\t"
1096
1097                 "movq   %%mm1, %%mm2\n\t"
1098                 "movq   %%mm4, %%mm3\n\t"
1099                 "psllq  $48, %%mm2\n\t"
1100                 "psllq  $32, %%mm3\n\t"
1101                 "pand   %4, %%mm2\n\t"
1102                 "pand   %5, %%mm3\n\t"
1103                 "por    %%mm2, %%mm0\n\t"
1104                 "psrlq  $16, %%mm1\n\t"
1105                 "psrlq  $32, %%mm4\n\t"
1106                 "psllq  $16, %%mm5\n\t"
1107                 "por    %%mm3, %%mm1\n\t"
1108                 "pand   %6, %%mm5\n\t"
1109                 "por    %%mm5, %%mm4\n\t"
1110
1111                 MOVNTQ" %%mm0, %0\n\t"
1112                 MOVNTQ" %%mm1, 8%0\n\t"
1113                 MOVNTQ" %%mm4, 16%0"
1114
1115                 :"=m"(*d)
1116                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1117                 :"memory");
1118                 d += 24;
1119                 s += 8;
1120         }
1121         __asm __volatile(SFENCE:::"memory");
1122         __asm __volatile(EMMS:::"memory");
1123 #endif
1124         while(s < end)
1125         {
1126                 register uint16_t bgr;
1127                 bgr = *s++;
1128                 *d++ = (bgr&0x1F)<<3;
1129                 *d++ = (bgr&0x7E0)>>3;
1130                 *d++ = (bgr&0xF800)>>8;
1131         }
1132 }
1133
1134 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1135 {
1136         const uint16_t *end;
1137 #ifdef HAVE_MMX
1138         const uint16_t *mm_end;
1139 #endif
1140         uint8_t *d = (uint8_t *)dst;
1141         const uint16_t *s = (const uint16_t *)src;
1142         end = s + src_size/2;
1143 #ifdef HAVE_MMX
1144         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1145         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1146         mm_end = end - 3;
1147         while(s < mm_end)
1148         {
1149             __asm __volatile(
1150                 PREFETCH" 32%1\n\t"
1151                 "movq   %1, %%mm0\n\t"
1152                 "movq   %1, %%mm1\n\t"
1153                 "movq   %1, %%mm2\n\t"
1154                 "pand   %2, %%mm0\n\t"
1155                 "pand   %3, %%mm1\n\t"
1156                 "pand   %4, %%mm2\n\t"
1157                 "psllq  $3, %%mm0\n\t"
1158                 "psrlq  $2, %%mm1\n\t"
1159                 "psrlq  $7, %%mm2\n\t"
1160                 "movq   %%mm0, %%mm3\n\t"
1161                 "movq   %%mm1, %%mm4\n\t"
1162                 "movq   %%mm2, %%mm5\n\t"
1163                 "punpcklwd %%mm7, %%mm0\n\t"
1164                 "punpcklwd %%mm7, %%mm1\n\t"
1165                 "punpcklwd %%mm7, %%mm2\n\t"
1166                 "punpckhwd %%mm7, %%mm3\n\t"
1167                 "punpckhwd %%mm7, %%mm4\n\t"
1168                 "punpckhwd %%mm7, %%mm5\n\t"
1169                 "psllq  $8, %%mm1\n\t"
1170                 "psllq  $16, %%mm2\n\t"
1171                 "por    %%mm1, %%mm0\n\t"
1172                 "por    %%mm2, %%mm0\n\t"
1173                 "psllq  $8, %%mm4\n\t"
1174                 "psllq  $16, %%mm5\n\t"
1175                 "por    %%mm4, %%mm3\n\t"
1176                 "por    %%mm5, %%mm3\n\t"
1177                 MOVNTQ" %%mm0, %0\n\t"
1178                 MOVNTQ" %%mm3, 8%0\n\t"
1179                 :"=m"(*d)
1180                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1181                 :"memory");
1182                 d += 16;
1183                 s += 4;
1184         }
1185         __asm __volatile(SFENCE:::"memory");
1186         __asm __volatile(EMMS:::"memory");
1187 #endif
1188         while(s < end)
1189         {
1190                 register uint16_t bgr;
1191                 bgr = *s++;
1192                 *d++ = (bgr&0x1F)<<3;
1193                 *d++ = (bgr&0x3E0)>>2;
1194                 *d++ = (bgr&0x7C00)>>7;
1195                 *d++ = 0;
1196         }
1197 }
1198
1199 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1200 {
1201         const uint16_t *end;
1202 #ifdef HAVE_MMX
1203         const uint16_t *mm_end;
1204 #endif
1205         uint8_t *d = (uint8_t *)dst;
1206         const uint16_t *s = (uint16_t *)src;
1207         end = s + src_size/2;
1208 #ifdef HAVE_MMX
1209         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1210         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1211         mm_end = end - 3;
1212         while(s < mm_end)
1213         {
1214             __asm __volatile(
1215                 PREFETCH" 32%1\n\t"
1216                 "movq   %1, %%mm0\n\t"
1217                 "movq   %1, %%mm1\n\t"
1218                 "movq   %1, %%mm2\n\t"
1219                 "pand   %2, %%mm0\n\t"
1220                 "pand   %3, %%mm1\n\t"
1221                 "pand   %4, %%mm2\n\t"
1222                 "psllq  $3, %%mm0\n\t"
1223                 "psrlq  $3, %%mm1\n\t"
1224                 "psrlq  $8, %%mm2\n\t"
1225                 "movq   %%mm0, %%mm3\n\t"
1226                 "movq   %%mm1, %%mm4\n\t"
1227                 "movq   %%mm2, %%mm5\n\t"
1228                 "punpcklwd %%mm7, %%mm0\n\t"
1229                 "punpcklwd %%mm7, %%mm1\n\t"
1230                 "punpcklwd %%mm7, %%mm2\n\t"
1231                 "punpckhwd %%mm7, %%mm3\n\t"
1232                 "punpckhwd %%mm7, %%mm4\n\t"
1233                 "punpckhwd %%mm7, %%mm5\n\t"
1234                 "psllq  $8, %%mm1\n\t"
1235                 "psllq  $16, %%mm2\n\t"
1236                 "por    %%mm1, %%mm0\n\t"
1237                 "por    %%mm2, %%mm0\n\t"
1238                 "psllq  $8, %%mm4\n\t"
1239                 "psllq  $16, %%mm5\n\t"
1240                 "por    %%mm4, %%mm3\n\t"
1241                 "por    %%mm5, %%mm3\n\t"
1242                 MOVNTQ" %%mm0, %0\n\t"
1243                 MOVNTQ" %%mm3, 8%0\n\t"
1244                 :"=m"(*d)
1245                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1246                 :"memory");
1247                 d += 16;
1248                 s += 4;
1249         }
1250         __asm __volatile(SFENCE:::"memory");
1251         __asm __volatile(EMMS:::"memory");
1252 #endif
1253         while(s < end)
1254         {
1255                 register uint16_t bgr;
1256                 bgr = *s++;
1257                 *d++ = (bgr&0x1F)<<3;
1258                 *d++ = (bgr&0x7E0)>>3;
1259                 *d++ = (bgr&0xF800)>>8;
1260                 *d++ = 0;
1261         }
1262 }
1263
1264 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1265 {
1266 #ifdef HAVE_MMX
1267 /* TODO: unroll this loop */
1268         asm volatile (
1269                 "xorl %%eax, %%eax              \n\t"
1270                 ".balign 16                     \n\t"
1271                 "1:                             \n\t"
1272                 PREFETCH" 32(%0, %%eax)         \n\t"
1273                 "movq (%0, %%eax), %%mm0        \n\t"
1274                 "movq %%mm0, %%mm1              \n\t"
1275                 "movq %%mm0, %%mm2              \n\t"
1276                 "pslld $16, %%mm0               \n\t"
1277                 "psrld $16, %%mm1               \n\t"
1278                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
1279                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
1280                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
1281                 "por %%mm0, %%mm2               \n\t"
1282                 "por %%mm1, %%mm2               \n\t"
1283                 MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
1284                 "addl $8, %%eax                 \n\t"
1285                 "cmpl %2, %%eax                 \n\t"
1286                 " jb 1b                         \n\t"
1287                 :: "r" (src), "r"(dst), "r" (src_size-7)
1288                 : "%eax"
1289         );
1290
1291         __asm __volatile(SFENCE:::"memory");
1292         __asm __volatile(EMMS:::"memory");
1293 #else
1294         unsigned i;
1295         unsigned num_pixels = src_size >> 2;
1296         for(i=0; i<num_pixels; i++)
1297         {
1298                 dst[4*i + 0] = src[4*i + 2];
1299                 dst[4*i + 1] = src[4*i + 1];
1300                 dst[4*i + 2] = src[4*i + 0];
1301         }
1302 #endif
1303 }
1304
1305 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1306 {
1307         unsigned i;
1308 #ifdef HAVE_MMX
1309         int mmx_size= 23 - src_size;
1310         asm volatile (
1311                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1312                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1313                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1314                 ".balign 16                     \n\t"
1315                 "1:                             \n\t"
1316                 PREFETCH" 32(%1, %%eax)         \n\t"
1317                 "movq   (%1, %%eax), %%mm0      \n\t" // BGR BGR BG
1318                 "movq   (%1, %%eax), %%mm1      \n\t" // BGR BGR BG
1319                 "movq  2(%1, %%eax), %%mm2      \n\t" // R BGR BGR B
1320                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1321                 "pand %%mm5, %%mm0              \n\t"
1322                 "pand %%mm6, %%mm1              \n\t"
1323                 "pand %%mm7, %%mm2              \n\t"
1324                 "por %%mm0, %%mm1               \n\t"
1325                 "por %%mm2, %%mm1               \n\t"                
1326                 "movq  6(%1, %%eax), %%mm0      \n\t" // BGR BGR BG
1327                 MOVNTQ" %%mm1,   (%2, %%eax)    \n\t" // RGB RGB RG
1328                 "movq  8(%1, %%eax), %%mm1      \n\t" // R BGR BGR B
1329                 "movq 10(%1, %%eax), %%mm2      \n\t" // GR BGR BGR
1330                 "pand %%mm7, %%mm0              \n\t"
1331                 "pand %%mm5, %%mm1              \n\t"
1332                 "pand %%mm6, %%mm2              \n\t"
1333                 "por %%mm0, %%mm1               \n\t"
1334                 "por %%mm2, %%mm1               \n\t"                
1335                 "movq 14(%1, %%eax), %%mm0      \n\t" // R BGR BGR B
1336                 MOVNTQ" %%mm1,  8(%2, %%eax)    \n\t" // B RGB RGB R
1337                 "movq 16(%1, %%eax), %%mm1      \n\t" // GR BGR BGR
1338                 "movq 18(%1, %%eax), %%mm2      \n\t" // BGR BGR BG
1339                 "pand %%mm6, %%mm0              \n\t"
1340                 "pand %%mm7, %%mm1              \n\t"
1341                 "pand %%mm5, %%mm2              \n\t"
1342                 "por %%mm0, %%mm1               \n\t"
1343                 "por %%mm2, %%mm1               \n\t"                
1344                 MOVNTQ" %%mm1, 16(%2, %%eax)    \n\t"
1345                 "addl $24, %%eax                \n\t"
1346                 " js 1b                         \n\t"
1347                 : "+a" (mmx_size)
1348                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1349         );
1350
1351         __asm __volatile(SFENCE:::"memory");
1352         __asm __volatile(EMMS:::"memory");
1353
1354         if(mmx_size==23) return; //finihsed, was multiple of 8
1355
1356         src+= src_size;
1357         dst+= src_size;
1358         src_size= 23-mmx_size;
1359         src-= src_size;
1360         dst-= src_size;
1361 #endif
1362         for(i=0; i<src_size; i+=3)
1363         {
1364                 register uint8_t x;
1365                 x          = src[i + 2];
1366                 dst[i + 1] = src[i + 1];
1367                 dst[i + 2] = src[i + 0];
1368                 dst[i + 0] = x;
1369         }
1370 }
1371
1372 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1373         unsigned int width, unsigned int height,
1374         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
1375 {
1376         unsigned y;
1377         const unsigned chromWidth= width>>1;
1378         for(y=0; y<height; y++)
1379         {
1380 #ifdef HAVE_MMX
1381 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1382                 asm volatile(
1383                         "xorl %%eax, %%eax              \n\t"
1384                         ".balign 16                     \n\t"
1385                         "1:                             \n\t"
1386                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
1387                         PREFETCH" 32(%2, %%eax)         \n\t"
1388                         PREFETCH" 32(%3, %%eax)         \n\t"
1389                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
1390                         "movq %%mm0, %%mm2              \n\t" // U(0)
1391                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
1392                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1393                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1394
1395                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
1396                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
1397                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1398                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1399                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1400                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1401                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1402                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1403
1404                         MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
1405                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
1406                         MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1407                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1408
1409                         "addl $8, %%eax                 \n\t"
1410                         "cmpl %4, %%eax                 \n\t"
1411                         " jb 1b                         \n\t"
1412                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
1413                         : "%eax"
1414                 );
1415 #else
1416 #if __WORDSIZE >= 64
1417                 int i;
1418                 uint64_t *ldst = (uint64_t *) dst;
1419                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1420                 for(i = 0; i < chromWidth; i += 2){
1421                         uint64_t k, l;
1422                         k = yc[0] + (uc[0] << 8) +
1423                             (yc[1] << 16) + (vc[0] << 24);
1424                         l = yc[2] + (uc[1] << 8) +
1425                             (yc[3] << 16) + (vc[1] << 24);
1426                         *ldst++ = k + (l << 32);
1427                         yc += 4;
1428                         uc += 2;
1429                         vc += 2;
1430                 }
1431
1432 #else
1433                 int i, *idst = (int32_t *) dst;
1434                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1435                 for(i = 0; i < chromWidth; i++){
1436                         *idst++ = yc[0] + (uc[0] << 8) +
1437                             (yc[1] << 16) + (vc[0] << 24);
1438                         yc += 2;
1439                         uc++;
1440                         vc++;
1441                 }
1442 #endif
1443 #endif
1444                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1445                 {
1446                         usrc += chromStride;
1447                         vsrc += chromStride;
1448                 }
1449                 ysrc += lumStride;
1450                 dst += dstStride;
1451         }
1452 #ifdef HAVE_MMX
1453 asm(    EMMS" \n\t"
1454         SFENCE" \n\t"
1455         :::"memory");
1456 #endif
1457 }
1458
1459 /**
1460  *
1461  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1462  * problem for anyone then tell me, and ill fix it)
1463  */
1464 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1465         unsigned int width, unsigned int height,
1466         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
1467 {
1468         //FIXME interpolate chroma
1469         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1470 }
1471
1472 /**
1473  *
1474  * width should be a multiple of 16
1475  */
1476 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1477         unsigned int width, unsigned int height,
1478         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
1479 {
1480         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1481 }
1482
1483 /**
1484  *
1485  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1486  * problem for anyone then tell me, and ill fix it)
1487  */
1488 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1489         unsigned int width, unsigned int height,
1490         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1491 {
1492         unsigned y;
1493         const unsigned chromWidth= width>>1;
1494         for(y=0; y<height; y+=2)
1495         {
1496 #ifdef HAVE_MMX
1497                 asm volatile(
1498                         "xorl %%eax, %%eax              \n\t"
1499                         "pcmpeqw %%mm7, %%mm7           \n\t"
1500                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1501                         ".balign 16                     \n\t"
1502                         "1:                             \n\t"
1503                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1504                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1505                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1506                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1507                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1508                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1509                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1510                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1511                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1512                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1513                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1514
1515                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1516
1517                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
1518                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
1519                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1520                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1521                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1522                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1523                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1524                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1525                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1526                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1527
1528                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
1529
1530                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1531                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1532                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1533                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1534                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1535                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1536                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1537                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1538
1539                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
1540                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
1541
1542                         "addl $8, %%eax                 \n\t"
1543                         "cmpl %4, %%eax                 \n\t"
1544                         " jb 1b                         \n\t"
1545                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1546                         : "memory", "%eax"
1547                 );
1548
1549                 ydst += lumStride;
1550                 src  += srcStride;
1551
1552                 asm volatile(
1553                         "xorl %%eax, %%eax              \n\t"
1554                         ".balign 16                     \n\t"
1555                         "1:                             \n\t"
1556                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1557                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1558                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1559                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
1560                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
1561                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1562                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1563                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1564                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1565                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1566                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1567
1568                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
1569                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
1570
1571                         "addl $8, %%eax                 \n\t"
1572                         "cmpl %4, %%eax                 \n\t"
1573                         " jb 1b                         \n\t"
1574
1575                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1576                         : "memory", "%eax"
1577                 );
1578 #else
1579                 unsigned i;
1580                 for(i=0; i<chromWidth; i++)
1581                 {
1582                         ydst[2*i+0]     = src[4*i+0];
1583                         udst[i]         = src[4*i+1];
1584                         ydst[2*i+1]     = src[4*i+2];
1585                         vdst[i]         = src[4*i+3];
1586                 }
1587                 ydst += lumStride;
1588                 src  += srcStride;
1589
1590                 for(i=0; i<chromWidth; i++)
1591                 {
1592                         ydst[2*i+0]     = src[4*i+0];
1593                         ydst[2*i+1]     = src[4*i+2];
1594                 }
1595 #endif
1596                 udst += chromStride;
1597                 vdst += chromStride;
1598                 ydst += lumStride;
1599                 src  += srcStride;
1600         }
1601 #ifdef HAVE_MMX
1602 asm volatile(   EMMS" \n\t"
1603                 SFENCE" \n\t"
1604                 :::"memory");
1605 #endif
1606 }
1607
/**
 * Copies the luma plane of a YVU9 picture into a YV12 picture.
 *
 * Only the Y plane is transferred, as one contiguous block of width*height
 * bytes. usrc, vsrc, udst, vdst, lumStride and chromStride are not used yet:
 * both luma planes are treated as tightly packed and the destination chroma
 * planes are left untouched.
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
{
        /* Y Plane */
        /* widen before multiplying, the product of two unsigned ints could wrap at 32 bits */
        memcpy(ydst, ysrc, (size_t)width*height);

        /* XXX: implement upscaling for U,V */
}
1617
/**
 * Doubles the size of an 8 bit plane in both directions.
 *
 * The first destination line repeats every pixel of the first source line and
 * the last destination line repeats every pixel of the last source line. Each
 * pair of destination lines in between is interpolated from two neighbouring
 * source lines: an output pixel is (3*near + far)>>2 of two diagonally
 * adjacent source pixels, and the outermost pixel on either side is copied
 * from the upper source line.
 *
 * src, srcStride: source plane and distance in bytes between its lines
 * dst, dstStride: destination plane of 2*srcWidth x 2*srcHeight pixels and
 *                 distance in bytes between its lines
 *
 * NOTE(review): the MMX2/3DNow loop consumes 8 source pixels per pass without
 * remainder handling and its first load starts at src-1, so that path looks
 * like it relies on srcWidth being a multiple of 8 - confirm with the callers.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
        int x,y;

        // an empty plane has neither a first/last line nor edge pixels to copy
        if(srcWidth<=0 || srcHeight<=0)
                return;

        // first line
        for(x=0; x<srcWidth; x++){
                dst[2*x+0]=
                dst[2*x+1]= src[x];
        }
        dst+= dstStride;

        // every pass reads source lines y-1 (src) and y (src+srcStride) and writes two destination lines
        for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                const int mmxSize= srcWidth;
                asm volatile(
                        "movl %4, %%eax                 \n\t" // negative offset, counts up to 0
                        "1:                             \n\t"
                        "movq (%0, %%eax), %%mm0        \n\t"
                        "movq (%1, %%eax), %%mm1        \n\t"
                        "movq 1(%0, %%eax), %%mm2       \n\t"
                        "movq 1(%1, %%eax), %%mm3       \n\t"
                        "movq %%mm0, %%mm4              \n\t"
                        "movq %%mm1, %%mm5              \n\t"
                        // averaging twice against the same register weights it 3:1
                        PAVGB" %%mm3, %%mm0             \n\t"
                        PAVGB" %%mm3, %%mm0             \n\t"
                        PAVGB" %%mm4, %%mm3             \n\t"
                        PAVGB" %%mm4, %%mm3             \n\t"
                        PAVGB" %%mm2, %%mm1             \n\t"
                        PAVGB" %%mm2, %%mm1             \n\t"
                        PAVGB" %%mm5, %%mm2             \n\t"
                        PAVGB" %%mm5, %%mm2             \n\t"
                        "movq %%mm3, %%mm4              \n\t"
                        "movq %%mm2, %%mm5              \n\t"
                        "punpcklbw %%mm1, %%mm3         \n\t"
                        "punpckhbw %%mm1, %%mm4         \n\t"
                        "punpcklbw %%mm0, %%mm2         \n\t"
                        "punpckhbw %%mm0, %%mm5         \n\t"
#if 1
                        MOVNTQ" %%mm3, (%2, %%eax, 2)   \n\t"
                        MOVNTQ" %%mm4, 8(%2, %%eax, 2)  \n\t"
                        MOVNTQ" %%mm2, (%3, %%eax, 2)   \n\t"
                        MOVNTQ" %%mm5, 8(%3, %%eax, 2)  \n\t"
#else
                        "movq %%mm3, (%2, %%eax, 2)     \n\t"
                        "movq %%mm4, 8(%2, %%eax, 2)    \n\t"
                        "movq %%mm2, (%3, %%eax, 2)     \n\t"
                        "movq %%mm5, 8(%3, %%eax, 2)    \n\t"
#endif
                        "addl $8, %%eax                 \n\t"
                        " js 1b                         \n\t"
                        :: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                           "g" (-mmxSize)
                        // the loop stores into dst, so the compiler has to be told about it
                        : "%eax", "memory"

                );
                dst[0]=
                dst[dstStride]= src[0];
#else
                dst[0]=
                dst[dstStride]= src[0];

                for(x=0; x<srcWidth-1; x++){
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
                }
#endif
                dst[srcWidth*2 -1]=
                dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];

                dst+=dstStride*2;
                src+=srcStride;
        }

        // last line
        // src already points at the last source line here; it must not be moved
        // back, otherwise the wrong line is repeated (and memory in front of the
        // plane is read when srcHeight is 1)
        for(x=0; x<srcWidth; x++){
                dst[2*x+0]=
                dst[2*x+1]= src[x];
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
1706
1707 /**
1708  *
      * Splits a packed UYVY picture (bytes U Y V Y per pixel pair) into the planes ydst, udst and vdst.
      * Two source lines are consumed per loop pass: both contribute luma, only the first one chroma.
      * src/srcStride, ydst/lumStride and udst,vdst/chromStride are base pointer and line distance in bytes.
1709  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1710  * problem for anyone then tell me, and I'll fix it)
1711  * chrominance data is only taken from every second line, others are ignored FIXME write HQ version
1712  */
1713 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1714         unsigned int width, unsigned int height,
1715         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1716 {
1717         unsigned y;
1718         const unsigned chromWidth= width>>1; // one U and one V sample per pair of pixels
1719         for(y=0; y<height; y+=2)
1720         {
1721 #ifdef HAVE_MMX
                 // first line of the pair: one pass turns 32 source bytes into 16 Y, 8 U and 8 V bytes;
                 // eax counts chroma samples and is only tested at the end, so the body runs at least once
1722                 asm volatile(
1723                         "xorl %%eax, %%eax              \n\t"
1724                         "pcmpeqw %%mm7, %%mm7           \n\t"
1725                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1726                         ".balign 16                     \n\t"
1727                         "1:                             \n\t"
1728                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1729                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1730                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1731                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
1732                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
1733                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
1734                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
1735                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1736                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1737                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1738                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1739
1740                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1741
1742                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
1743                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
1744                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
1745                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
1746                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
1747                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
1748                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1749                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1750                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1751                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1752
1753                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
1754
1755                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1756                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1757                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1758                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1759                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1760                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1761                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1762                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1763
1764                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
1765                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
1766
1767                         "addl $8, %%eax                 \n\t"
1768                         "cmpl %4, %%eax                 \n\t"
1769                         " jb 1b                         \n\t"
1770                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1771                         : "memory", "%eax"
1772                 );
1773
1774                 ydst += lumStride;
1775                 src  += srcStride;
1776
                 // second line of the pair: only the luma bytes are kept (mm7 still holds the mask but is
                 // not needed); the udst/vdst operands are passed but never referenced by this template
1777                 asm volatile(
1778                         "xorl %%eax, %%eax              \n\t"
1779                         ".balign 16                     \n\t"
1780                         "1:                             \n\t"
1781                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1782                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1783                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1784                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(8)
1785                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // UYVY UYVY(12)
1786                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1787                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1788                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1789                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1790                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1791                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1792
1793                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
1794                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
1795
1796                         "addl $8, %%eax                 \n\t"
1797                         "cmpl %4, %%eax                 \n\t"
1798                         " jb 1b                         \n\t"
1799
1800                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1801                         : "memory", "%eax"
1802                 );
1803 #else
1804                 unsigned i;
                 // first line of the pair: byte 0 is U, byte 1 and 3 are the two Y samples, byte 2 is V
1805                 for(i=0; i<chromWidth; i++)
1806                 {
1807                         udst[i]         = src[4*i+0];
1808                         ydst[2*i+0]     = src[4*i+1];
1809                         vdst[i]         = src[4*i+2];
1810                         ydst[2*i+1]     = src[4*i+3];
1811                 }
1812                 ydst += lumStride;
1813                 src  += srcStride;
1814
                 // second line of the pair: luma only, its chroma bytes are skipped
1815                 for(i=0; i<chromWidth; i++)
1816                 {
1817                         ydst[2*i+0]     = src[4*i+1];
1818                         ydst[2*i+1]     = src[4*i+3];
1819                 }
1820 #endif
                 // one chroma line and the second luma/source line are finished per pass
1821                 udst += chromStride;
1822                 vdst += chromStride;
1823                 ydst += lumStride;
1824                 src  += srcStride;
1825         }
1826 #ifdef HAVE_MMX
     // leave MMX state and order the MOVNTQ stores issued above
1827 asm volatile(   EMMS" \n\t"
1828                 SFENCE" \n\t"
1829                 :::"memory");
1830 #endif
1831 }
1832
1833 /**
1834  *
1835  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1836  * problem for anyone then tell me, and ill fix it)
1837  * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
1838  */
1839 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1840         unsigned int width, unsigned int height,
1841         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1842 {
1843         unsigned y;
1844         const unsigned chromWidth= width>>1;
1845 #ifdef HAVE_MMX
1846         for(y=0; y<height-2; y+=2)
1847         {
1848                 unsigned i;
1849                 for(i=0; i<2; i++)
1850                 {
1851                         asm volatile(
1852                                 "movl %2, %%eax                 \n\t"
1853                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1854                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1855                                 "pxor %%mm7, %%mm7              \n\t"
1856                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1857                                 ".balign 16                     \n\t"
1858                                 "1:                             \n\t"
1859                                 PREFETCH" 64(%0, %%ebx)         \n\t"
1860                                 "movd (%0, %%ebx), %%mm0        \n\t"
1861                                 "movd 3(%0, %%ebx), %%mm1       \n\t"
1862                                 "punpcklbw %%mm7, %%mm0         \n\t"
1863                                 "punpcklbw %%mm7, %%mm1         \n\t"
1864                                 "movd 6(%0, %%ebx), %%mm2       \n\t"
1865                                 "movd 9(%0, %%ebx), %%mm3       \n\t"
1866                                 "punpcklbw %%mm7, %%mm2         \n\t"
1867                                 "punpcklbw %%mm7, %%mm3         \n\t"
1868                                 "pmaddwd %%mm6, %%mm0           \n\t"
1869                                 "pmaddwd %%mm6, %%mm1           \n\t"
1870                                 "pmaddwd %%mm6, %%mm2           \n\t"
1871                                 "pmaddwd %%mm6, %%mm3           \n\t"
1872 #ifndef FAST_BGR2YV12
1873                                 "psrad $8, %%mm0                \n\t"
1874                                 "psrad $8, %%mm1                \n\t"
1875                                 "psrad $8, %%mm2                \n\t"
1876                                 "psrad $8, %%mm3                \n\t"
1877 #endif
1878                                 "packssdw %%mm1, %%mm0          \n\t"
1879                                 "packssdw %%mm3, %%mm2          \n\t"
1880                                 "pmaddwd %%mm5, %%mm0           \n\t"
1881                                 "pmaddwd %%mm5, %%mm2           \n\t"
1882                                 "packssdw %%mm2, %%mm0          \n\t"
1883                                 "psraw $7, %%mm0                \n\t"
1884
1885                                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1886                                 "movd 15(%0, %%ebx), %%mm1      \n\t"
1887                                 "punpcklbw %%mm7, %%mm4         \n\t"
1888                                 "punpcklbw %%mm7, %%mm1         \n\t"
1889                                 "movd 18(%0, %%ebx), %%mm2      \n\t"
1890                                 "movd 21(%0, %%ebx), %%mm3      \n\t"
1891                                 "punpcklbw %%mm7, %%mm2         \n\t"
1892                                 "punpcklbw %%mm7, %%mm3         \n\t"
1893                                 "pmaddwd %%mm6, %%mm4           \n\t"
1894                                 "pmaddwd %%mm6, %%mm1           \n\t"
1895                                 "pmaddwd %%mm6, %%mm2           \n\t"
1896                                 "pmaddwd %%mm6, %%mm3           \n\t"
1897 #ifndef FAST_BGR2YV12
1898                                 "psrad $8, %%mm4                \n\t"
1899                                 "psrad $8, %%mm1                \n\t"
1900                                 "psrad $8, %%mm2                \n\t"
1901                                 "psrad $8, %%mm3                \n\t"
1902 #endif
1903                                 "packssdw %%mm1, %%mm4          \n\t"
1904                                 "packssdw %%mm3, %%mm2          \n\t"
1905                                 "pmaddwd %%mm5, %%mm4           \n\t"
1906                                 "pmaddwd %%mm5, %%mm2           \n\t"
1907                                 "addl $24, %%ebx                \n\t"
1908                                 "packssdw %%mm2, %%mm4          \n\t"
1909                                 "psraw $7, %%mm4                \n\t"
1910
1911                                 "packuswb %%mm4, %%mm0          \n\t"
1912                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1913
1914                                 MOVNTQ" %%mm0, (%1, %%eax)      \n\t"
1915                                 "addl $8, %%eax                 \n\t"
1916                                 " js 1b                         \n\t"
1917                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
1918                                 : "%eax", "%ebx"
1919                         );
1920                         ydst += lumStride;
1921                         src  += srcStride;
1922                 }
1923                 src -= srcStride*2;
1924                 asm volatile(
1925                         "movl %4, %%eax                 \n\t"
1926                         "movq "MANGLE(w1111)", %%mm5            \n\t"
1927                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1928                         "pxor %%mm7, %%mm7              \n\t"
1929                         "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1930                         "addl %%ebx, %%ebx              \n\t"
1931                         ".balign 16                     \n\t"
1932                         "1:                             \n\t"
1933                         PREFETCH" 64(%0, %%ebx)         \n\t"
1934                         PREFETCH" 64(%1, %%ebx)         \n\t"
1935 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1936                         "movq (%0, %%ebx), %%mm0        \n\t"
1937                         "movq (%1, %%ebx), %%mm1        \n\t"
1938                         "movq 6(%0, %%ebx), %%mm2       \n\t"
1939                         "movq 6(%1, %%ebx), %%mm3       \n\t"
1940                         PAVGB" %%mm1, %%mm0             \n\t"
1941                         PAVGB" %%mm3, %%mm2             \n\t"
1942                         "movq %%mm0, %%mm1              \n\t"
1943                         "movq %%mm2, %%mm3              \n\t"
1944                         "psrlq $24, %%mm0               \n\t"
1945                         "psrlq $24, %%mm2               \n\t"
1946                         PAVGB" %%mm1, %%mm0             \n\t"
1947                         PAVGB" %%mm3, %%mm2             \n\t"
1948                         "punpcklbw %%mm7, %%mm0         \n\t"
1949                         "punpcklbw %%mm7, %%mm2         \n\t"
1950 #else
1951                         "movd (%0, %%ebx), %%mm0        \n\t"
1952                         "movd (%1, %%ebx), %%mm1        \n\t"
1953                         "movd 3(%0, %%ebx), %%mm2       \n\t"
1954                         "movd 3(%1, %%ebx), %%mm3       \n\t"
1955                         "punpcklbw %%mm7, %%mm0         \n\t"
1956                         "punpcklbw %%mm7, %%mm1         \n\t"
1957                         "punpcklbw %%mm7, %%mm2         \n\t"
1958                         "punpcklbw %%mm7, %%mm3         \n\t"
1959                         "paddw %%mm1, %%mm0             \n\t"
1960                         "paddw %%mm3, %%mm2             \n\t"
1961                         "paddw %%mm2, %%mm0             \n\t"
1962                         "movd 6(%0, %%ebx), %%mm4       \n\t"
1963                         "movd 6(%1, %%ebx), %%mm1       \n\t"
1964                         "movd 9(%0, %%ebx), %%mm2       \n\t"
1965                         "movd 9(%1, %%ebx), %%mm3       \n\t"
1966                         "punpcklbw %%mm7, %%mm4         \n\t"
1967                         "punpcklbw %%mm7, %%mm1         \n\t"
1968                         "punpcklbw %%mm7, %%mm2         \n\t"
1969                         "punpcklbw %%mm7, %%mm3         \n\t"
1970                         "paddw %%mm1, %%mm4             \n\t"
1971                         "paddw %%mm3, %%mm2             \n\t"
1972                         "paddw %%mm4, %%mm2             \n\t"
1973                         "psrlw $2, %%mm0                \n\t"
1974                         "psrlw $2, %%mm2                \n\t"
1975 #endif
1976                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1977                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1978
1979                         "pmaddwd %%mm0, %%mm1           \n\t"
1980                         "pmaddwd %%mm2, %%mm3           \n\t"
1981                         "pmaddwd %%mm6, %%mm0           \n\t"
1982                         "pmaddwd %%mm6, %%mm2           \n\t"
1983 #ifndef FAST_BGR2YV12
1984                         "psrad $8, %%mm0                \n\t"
1985                         "psrad $8, %%mm1                \n\t"
1986                         "psrad $8, %%mm2                \n\t"
1987                         "psrad $8, %%mm3                \n\t"
1988 #endif
1989                         "packssdw %%mm2, %%mm0          \n\t"
1990                         "packssdw %%mm3, %%mm1          \n\t"
1991                         "pmaddwd %%mm5, %%mm0           \n\t"
1992                         "pmaddwd %%mm5, %%mm1           \n\t"
1993                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1994                         "psraw $7, %%mm0                \n\t"
1995
1996 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1997                         "movq 12(%0, %%ebx), %%mm4      \n\t"
1998                         "movq 12(%1, %%ebx), %%mm1      \n\t"
1999                         "movq 18(%0, %%ebx), %%mm2      \n\t"
2000                         "movq 18(%1, %%ebx), %%mm3      \n\t"
2001                         PAVGB" %%mm1, %%mm4             \n\t"
2002                         PAVGB" %%mm3, %%mm2             \n\t"
2003                         "movq %%mm4, %%mm1              \n\t"
2004                         "movq %%mm2, %%mm3              \n\t"
2005                         "psrlq $24, %%mm4               \n\t"
2006                         "psrlq $24, %%mm2               \n\t"
2007                         PAVGB" %%mm1, %%mm4             \n\t"
2008                         PAVGB" %%mm3, %%mm2             \n\t"
2009                         "punpcklbw %%mm7, %%mm4         \n\t"
2010                         "punpcklbw %%mm7, %%mm2         \n\t"
2011 #else
2012                         "movd 12(%0, %%ebx), %%mm4      \n\t"
2013                         "movd 12(%1, %%ebx), %%mm1      \n\t"
2014                         "movd 15(%0, %%ebx), %%mm2      \n\t"
2015                         "movd 15(%1, %%ebx), %%mm3      \n\t"
2016                         "punpcklbw %%mm7, %%mm4         \n\t"
2017                         "punpcklbw %%mm7, %%mm1         \n\t"
2018                         "punpcklbw %%mm7, %%mm2         \n\t"
2019                         "punpcklbw %%mm7, %%mm3         \n\t"
2020                         "paddw %%mm1, %%mm4             \n\t"
2021                         "paddw %%mm3, %%mm2             \n\t"
2022                         "paddw %%mm2, %%mm4             \n\t"
2023                         "movd 18(%0, %%ebx), %%mm5      \n\t"
2024                         "movd 18(%1, %%ebx), %%mm1      \n\t"
2025                         "movd 21(%0, %%ebx), %%mm2      \n\t"
2026                         "movd 21(%1, %%ebx), %%mm3      \n\t"
2027                         "punpcklbw %%mm7, %%mm5         \n\t"
2028                         "punpcklbw %%mm7, %%mm1         \n\t"
2029                         "punpcklbw %%mm7, %%mm2         \n\t"
2030                         "punpcklbw %%mm7, %%mm3         \n\t"
2031                         "paddw %%mm1, %%mm5             \n\t"
2032                         "paddw %%mm3, %%mm2             \n\t"
2033                         "paddw %%mm5, %%mm2             \n\t"
2034                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2035                         "psrlw $2, %%mm4                \n\t"
2036                         "psrlw $2, %%mm2                \n\t"
2037 #endif
2038                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2039                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2040
2041                         "pmaddwd %%mm4, %%mm1           \n\t"
2042                         "pmaddwd %%mm2, %%mm3           \n\t"
2043                         "pmaddwd %%mm6, %%mm4           \n\t"
2044                         "pmaddwd %%mm6, %%mm2           \n\t"
2045 #ifndef FAST_BGR2YV12
2046                         "psrad $8, %%mm4                \n\t"
2047                         "psrad $8, %%mm1                \n\t"
2048                         "psrad $8, %%mm2                \n\t"
2049                         "psrad $8, %%mm3                \n\t"
2050 #endif
2051                         "packssdw %%mm2, %%mm4          \n\t"
2052                         "packssdw %%mm3, %%mm1          \n\t"
2053                         "pmaddwd %%mm5, %%mm4           \n\t"
2054                         "pmaddwd %%mm5, %%mm1           \n\t"
2055                         "addl $24, %%ebx                \n\t"
2056                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2057                         "psraw $7, %%mm4                \n\t"
2058
2059                         "movq %%mm0, %%mm1              \n\t"
2060                         "punpckldq %%mm4, %%mm0         \n\t"
2061                         "punpckhdq %%mm4, %%mm1         \n\t"
2062                         "packsswb %%mm1, %%mm0          \n\t"
2063                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2064
2065                         "movd %%mm0, (%2, %%eax)        \n\t"
2066                         "punpckhdq %%mm0, %%mm0         \n\t"
2067                         "movd %%mm0, (%3, %%eax)        \n\t"
2068                         "addl $4, %%eax                 \n\t"
2069                         " js 1b                         \n\t"
2070                         : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2071                         : "%eax", "%ebx"
2072                 );
2073
2074                 udst += chromStride;
2075                 vdst += chromStride;
2076                 src  += srcStride*2;
2077         }
2078
2079         asm volatile(   EMMS" \n\t"
2080                         SFENCE" \n\t"
2081                         :::"memory");
2082 #else
2083         y=0;
2084 #endif
2085         for(; y<height; y+=2)
2086         {
2087                 unsigned i;
2088                 for(i=0; i<chromWidth; i++)
2089                 {
2090                         unsigned int b= src[6*i+0];
2091                         unsigned int g= src[6*i+1];
2092                         unsigned int r= src[6*i+2];
2093
2094                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2095                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2096                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2097
2098                         udst[i]         = U;
2099                         vdst[i]         = V;
2100                         ydst[2*i]       = Y;
2101
2102                         b= src[6*i+3];
2103                         g= src[6*i+4];
2104                         r= src[6*i+5];
2105
2106                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2107                         ydst[2*i+1]     = Y;
2108                 }
2109                 ydst += lumStride;
2110                 src  += srcStride;
2111
2112                 for(i=0; i<chromWidth; i++)
2113                 {
2114                         unsigned int b= src[6*i+0];
2115                         unsigned int g= src[6*i+1];
2116                         unsigned int r= src[6*i+2];
2117
2118                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2119
2120                         ydst[2*i]       = Y;
2121
2122                         b= src[6*i+3];
2123                         g= src[6*i+4];
2124                         r= src[6*i+5];
2125
2126                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2127                         ydst[2*i+1]     = Y;
2128                 }
2129                 udst += chromStride;
2130                 vdst += chromStride;
2131                 ydst += lumStride;
2132                 src  += srcStride;
2133         }
2134 }
2135
/**
 * Interleaves two byte planes line by line: every destination line receives
 * src1[0], src2[0], src1[1], src2[1], ... for width bytes of each source,
 * i.e. 2*width bytes are written per line.
 *
 * src1/src1Stride, src2/src2Stride and dest/dstStride are base pointer and
 * distance in bytes between two lines; height is the number of lines.
 *
 * With HAVE_MMX the bulk of a line is handled 16 source bytes at a time and
 * the remaining width&15 bytes by the C loop.
 *
 * NOTE(review): the SSE2 variant uses movdqa/movntdq, which presumably need
 * all three line pointers to be 16 byte aligned - confirm that the callers
 * guarantee this.
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                            unsigned width, unsigned height, unsigned src1Stride,
                            unsigned src2Stride, unsigned dstStride){
        unsigned h;

        for(h=0; h < height; h++)
        {
                unsigned w;

#ifdef HAVE_MMX
                // The SIMD loops test their counter only after the first pass and
                // width-15 wraps around for width<15 (width is unsigned), so lines
                // narrower than one 16 byte chunk must bypass them.
                if(width >= 16)
                {
#ifdef HAVE_SSE2
                asm(
                        "xorl %%eax, %%eax              \n\t"
                        "1:                             \n\t"
                        PREFETCH" 64(%1, %%eax)         \n\t"
                        PREFETCH" 64(%2, %%eax)         \n\t"
                        "movdqa (%1, %%eax), %%xmm0     \n\t"
                        "movdqa (%1, %%eax), %%xmm1     \n\t"
                        "movdqa (%2, %%eax), %%xmm2     \n\t"
                        "punpcklbw %%xmm2, %%xmm0       \n\t"
                        "punpckhbw %%xmm2, %%xmm1       \n\t"
                        "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
                        "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
                        "addl $16, %%eax                        \n\t"
                        "cmpl %3, %%eax                 \n\t"
                        " jb 1b                         \n\t"
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
                        : "memory", "%eax"
                );
#else
                asm(
                        "xorl %%eax, %%eax              \n\t"
                        "1:                             \n\t"
                        PREFETCH" 64(%1, %%eax)         \n\t"
                        PREFETCH" 64(%2, %%eax)         \n\t"
                        "movq (%1, %%eax), %%mm0        \n\t"
                        "movq 8(%1, %%eax), %%mm2       \n\t"
                        "movq %%mm0, %%mm1              \n\t"
                        "movq %%mm2, %%mm3              \n\t"
                        "movq (%2, %%eax), %%mm4        \n\t"
                        "movq 8(%2, %%eax), %%mm5       \n\t"
                        "punpcklbw %%mm4, %%mm0         \n\t"
                        "punpckhbw %%mm4, %%mm1         \n\t"
                        "punpcklbw %%mm5, %%mm2         \n\t"
                        "punpckhbw %%mm5, %%mm3         \n\t"
                        MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
                        MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
                        MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
                        MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
                        "addl $16, %%eax                        \n\t"
                        "cmpl %3, %%eax                 \n\t"
                        " jb 1b                         \n\t"
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
                        : "memory", "%eax"
                );
#endif
                }
                // bytes behind the last complete 16 byte chunk
                for(w= (width&(~15)); w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#else
                for(w=0; w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#endif
                dest += dstStride;
                src1 += src1Stride;
                src2 += src2Stride;
        }
#ifdef HAVE_MMX
        // leave MMX state and order the non-temporal stores issued above
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}
2216
2217 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2218                         uint8_t *dst1, uint8_t *dst2,
2219                         unsigned width, unsigned height,
2220                         unsigned srcStride1, unsigned srcStride2,
2221                         unsigned dstStride1, unsigned dstStride2)
2222 {
2223     unsigned y,x,w,h;
2224     w=width/2; h=height/2;
2225 #ifdef HAVE_MMX
2226     asm volatile(
2227         PREFETCH" %0\n\t"
2228         PREFETCH" %1\n\t"
2229         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2230 #endif
2231     for(y=0;y<h;y++){
2232         const uint8_t* s1=src1+srcStride1*(y>>1);
2233         uint8_t* d=dst1+dstStride1*y;
2234         x=0;
2235 #ifdef HAVE_MMX
2236         if(w > 32)
2237         for(;x<w;x+=32)
2238         {
2239             asm volatile(
2240                 PREFETCH" 32%1\n\t"
2241                 "movq   %1, %%mm0\n\t"
2242                 "movq   8%1, %%mm2\n\t"
2243                 "movq   16%1, %%mm4\n\t"
2244                 "movq   24%1, %%mm6\n\t"
2245                 "movq   %%mm0, %%mm1\n\t"
2246                 "movq   %%mm2, %%mm3\n\t"
2247                 "movq   %%mm4, %%mm5\n\t"
2248                 "movq   %%mm6, %%mm7\n\t"
2249                 "punpcklbw %%mm0, %%mm0\n\t"
2250                 "punpckhbw %%mm1, %%mm1\n\t"
2251                 "punpcklbw %%mm2, %%mm2\n\t"
2252                 "punpckhbw %%mm3, %%mm3\n\t"
2253                 "punpcklbw %%mm4, %%mm4\n\t"
2254                 "punpckhbw %%mm5, %%mm5\n\t"
2255                 "punpcklbw %%mm6, %%mm6\n\t"
2256                 "punpckhbw %%mm7, %%mm7\n\t"
2257                 MOVNTQ" %%mm0, %0\n\t"
2258                 MOVNTQ" %%mm1, 8%0\n\t"
2259                 MOVNTQ" %%mm2, 16%0\n\t"
2260                 MOVNTQ" %%mm3, 24%0\n\t"
2261                 MOVNTQ" %%mm4, 32%0\n\t"
2262                 MOVNTQ" %%mm5, 40%0\n\t"
2263                 MOVNTQ" %%mm6, 48%0\n\t"
2264                 MOVNTQ" %%mm7, 56%0"
2265                 :"=m"(d[2*x])
2266                 :"m"(s1[x])
2267                 :"memory");
2268         }
2269 #endif
2270         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2271     }
2272     for(y=0;y<h;y++){
2273         const uint8_t* s2=src2+srcStride2*(y>>1);
2274         uint8_t* d=dst2+dstStride2*y;
2275         x=0;
2276 #ifdef HAVE_MMX
2277         if(w > 32)
2278         for(;x<w;x+=32)
2279         {
2280             asm volatile(
2281                 PREFETCH" 32%1\n\t"
2282                 "movq   %1, %%mm0\n\t"
2283                 "movq   8%1, %%mm2\n\t"
2284                 "movq   16%1, %%mm4\n\t"
2285                 "movq   24%1, %%mm6\n\t"
2286                 "movq   %%mm0, %%mm1\n\t"
2287                 "movq   %%mm2, %%mm3\n\t"
2288                 "movq   %%mm4, %%mm5\n\t"
2289                 "movq   %%mm6, %%mm7\n\t"
2290                 "punpcklbw %%mm0, %%mm0\n\t"
2291                 "punpckhbw %%mm1, %%mm1\n\t"
2292                 "punpcklbw %%mm2, %%mm2\n\t"
2293                 "punpckhbw %%mm3, %%mm3\n\t"
2294                 "punpcklbw %%mm4, %%mm4\n\t"
2295                 "punpckhbw %%mm5, %%mm5\n\t"
2296                 "punpcklbw %%mm6, %%mm6\n\t"
2297                 "punpckhbw %%mm7, %%mm7\n\t"
2298                 MOVNTQ" %%mm0, %0\n\t"
2299                 MOVNTQ" %%mm1, 8%0\n\t"
2300                 MOVNTQ" %%mm2, 16%0\n\t"
2301                 MOVNTQ" %%mm3, 24%0\n\t"
2302                 MOVNTQ" %%mm4, 32%0\n\t"
2303                 MOVNTQ" %%mm5, 40%0\n\t"
2304                 MOVNTQ" %%mm6, 48%0\n\t"
2305                 MOVNTQ" %%mm7, 56%0"
2306                 :"=m"(d[2*x])
2307                 :"m"(s2[x])
2308                 :"memory");
2309         }
2310 #endif
2311         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2312     }
2313 #ifdef HAVE_MMX
2314         asm(
2315                 EMMS" \n\t"
2316                 SFENCE" \n\t"
2317                 ::: "memory"
2318                 );
2319 #endif
2320 }
2321
2322 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2323                         uint8_t *dst,
2324                         unsigned width, unsigned height,
2325                         unsigned srcStride1, unsigned srcStride2,
2326                         unsigned srcStride3, unsigned dstStride)
2327 {
2328     unsigned y,x,x2,w,h;
2329     w=width/2; h=height;
2330 #ifdef HAVE_MMX
2331     asm volatile(
2332         PREFETCH" %0\n\t"
2333         PREFETCH" %1\n\t"
2334         PREFETCH" %2\n\t"
2335         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
2336 #endif
2337     for(y=0;y<h;y++){
2338         const uint8_t* yp=src1+srcStride1*y;
2339         const uint8_t* up=src2+srcStride2*(y>>2);
2340         const uint8_t* vp=src3+srcStride3*(y>>2);
2341         uint8_t* d=dst+dstStride*y;
2342         x2=0;
2343         x=0;
2344 #ifdef HAVE_MMX
2345         for(;x<w;x+=8,x2+=32)
2346         {
2347             asm volatile(
2348                 PREFETCH" 32%1\n\t"
2349                 PREFETCH" 32%2\n\t"
2350                 PREFETCH" 32%3\n\t"
2351                 "movq   %1, %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2352                 "movq   %2, %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2353                 "movq   %3, %%mm2\n\t"       /* V0V1V2V3V4V5V6V7 */
2354                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2355                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2356                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2357                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2358                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2359                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2360                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2361
2362                 "movq   %%mm1, %%mm6\n\t"
2363                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2364                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2365                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2366                 MOVNTQ" %%mm0, %0\n\t"
2367                 MOVNTQ" %%mm3, 8%0\n\t"
2368                 
2369                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2370                 "movq   8%1, %%mm0\n\t"
2371                 "movq   %%mm0, %%mm3\n\t"
2372                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2373                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2374                 MOVNTQ" %%mm0, 16%0\n\t"
2375                 MOVNTQ" %%mm3, 24%0\n\t"
2376
2377                 "movq   %%mm4, %%mm6\n\t"
2378                 "movq   16%1, %%mm0\n\t"
2379                 "movq   %%mm0, %%mm3\n\t"
2380                 "punpcklbw %%mm5, %%mm4\n\t"
2381                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2382                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2383                 MOVNTQ" %%mm0, 32%0\n\t"
2384                 MOVNTQ" %%mm3, 40%0\n\t"
2385                 
2386                 "punpckhbw %%mm5, %%mm6\n\t"
2387                 "movq   24%1, %%mm0\n\t"
2388                 "movq   %%mm0, %%mm3\n\t"
2389                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2390                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2391                 MOVNTQ" %%mm0, 48%0\n\t"
2392                 MOVNTQ" %%mm3, 56%0\n\t"
2393
2394                 :"=m"(d[8*x])
2395                 :"m"(yp[x2]),"m"(up[x]),"m"(vp[x])
2396                 :"memory");
2397         }
2398 #endif
2399         for(;x<w;x++,x2+=4)
2400         {
2401             d[8*x+0]=yp[x2];
2402             d[8*x+1]=up[x];
2403             d[8*x+2]=yp[x2+1];
2404             d[8*x+3]=vp[x];
2405             d[8*x+4]=yp[x2+2];
2406             d[8*x+5]=up[x];
2407             d[8*x+6]=yp[x2+3];
2408             d[8*x+7]=vp[x];
2409         }
2410     }
2411 #ifdef HAVE_MMX
2412         asm(
2413                 EMMS" \n\t"
2414                 SFENCE" \n\t"
2415                 ::: "memory"
2416                 );
2417 #endif
2418 }