]> git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb_template.c
cleanup (unsigned stride -> int stride)
[ffmpeg] / postproc / rgb2rgb_template.c
1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB convertor
4  *  pluralize by Software PAL8 to RGB convertor
5  *               Software YUV to YUV convertor
6  *               Software YUV to RGB convertor
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  */
10
11 #include <stddef.h>
12 #include <inttypes.h> /* for __WORDSIZE */
13
14 #ifndef __WORDSIZE
15 // #warning You have misconfigured system and probably will lose performance!
16 #define __WORDSIZE MP_WORDSIZE
17 #endif
18
19 #undef PREFETCH
20 #undef MOVNTQ
21 #undef EMMS
22 #undef SFENCE
23 #undef MMREG_SIZE
24 #undef PREFETCHW
25 #undef PAVGB
26
27 #ifdef HAVE_SSE2
28 #define MMREG_SIZE 16
29 #else
30 #define MMREG_SIZE 8
31 #endif
32
33 #ifdef HAVE_3DNOW
34 #define PREFETCH  "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB     "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #define PAVGB     "pavgb"
41 #else
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
44 #endif
45
46 #ifdef HAVE_3DNOW
47 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
48 #define EMMS     "femms"
49 #else
50 #define EMMS     "emms"
51 #endif
52
53 #ifdef HAVE_MMX2
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
56 #else
57 #define MOVNTQ "movq"
58 #define SFENCE "/nop"
59 #endif
60
/*
 * Expand packed 24-bit pixels to 32 bits per pixel.
 *
 * Every 3 source bytes are copied unchanged into the first 3 bytes of a
 * 4-byte destination pixel.  The scalar path stores 0 in the fourth byte;
 * the MMX path ANDs each pixel with mask32 (defined outside this block,
 * presumably clearing that same byte -- confirm against its definition).
 *
 * src       source pixels
 * dst       destination; 4 bytes are written for every 3 bytes consumed
 * src_size  source size in bytes; the scalar loop consumes 3 bytes per
 *           pass, so callers are expected to pass a multiple of 3
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  const uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  /*
   * One pass consumes 24 bytes, but its last load fetches 4 bytes at s+21
   * and therefore touches s[24].  Require 25 readable bytes so the final
   * pass never reads beyond the source; the scalar loop below converts
   * whatever is left.
   */
  mm_end = end - 24;
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  while(s < mm_end)
  {
    /* 8 pixels per pass: gather 4-byte groups at 3-byte steps, mask, store 32 bytes */
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movd   %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd   6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd   12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd   18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand   %%mm7, %%mm0\n\t"
        "pand   %%mm7, %%mm1\n\t"
        "pand   %%mm7, %%mm2\n\t"
        "pand   %%mm7, %%mm3\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm2, 16%0\n\t"
        MOVNTQ" %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* scalar path: also handles the pixels the MMX loop left over */
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = 0;
  }
}
111
/*
 * Pack 32-bit pixels down to 24 bits per pixel.
 *
 * The first 3 bytes of every 4-byte source pixel are copied to the
 * destination; the fourth byte is dropped.
 *
 * src       source pixels, 4 bytes each
 * dst       destination; 3 bytes are written for every 4 bytes consumed
 * src_size  source size in bytes
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  const uint8_t *in = src;
  const uint8_t *const in_end = src + src_size;
  uint8_t *out = dst;
#ifdef HAVE_MMX
  /* the MMX pass reads a full 32 bytes, so stop while that many remain */
  const uint8_t *const mmx_limit = in_end - 31;

  __asm __volatile(PREFETCH"    %0"::"m"(*in):"memory");
  /* 8 pixels per pass: 32 source bytes become 24 destination bytes */
  for(; in < mmx_limit; in += 32, out += 24)
  {
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movq   %1, %%mm0\n\t"
        "movq   8%1, %%mm1\n\t"
        "movq   16%1, %%mm4\n\t"
        "movq   24%1, %%mm5\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "movq   %%mm4, %%mm6\n\t"
        "movq   %%mm5, %%mm7\n\t"
        "psrlq  $8, %%mm2\n\t"
        "psrlq  $8, %%mm3\n\t"
        "psrlq  $8, %%mm6\n\t"
        "psrlq  $8, %%mm7\n\t"
        "pand   %2, %%mm0\n\t"
        "pand   %2, %%mm1\n\t"
        "pand   %2, %%mm4\n\t"
        "pand   %2, %%mm5\n\t"
        "pand   %3, %%mm2\n\t"
        "pand   %3, %%mm3\n\t"
        "pand   %3, %%mm6\n\t"
        "pand   %3, %%mm7\n\t"
        "por    %%mm2, %%mm0\n\t"
        "por    %%mm3, %%mm1\n\t"
        "por    %%mm6, %%mm4\n\t"
        "por    %%mm7, %%mm5\n\t"

        "movq   %%mm1, %%mm2\n\t"
        "movq   %%mm4, %%mm3\n\t"
        "psllq  $48, %%mm2\n\t"
        "psllq  $32, %%mm3\n\t"
        "pand   %4, %%mm2\n\t"
        "pand   %5, %%mm3\n\t"
        "por    %%mm2, %%mm0\n\t"
        "psrlq  $16, %%mm1\n\t"
        "psrlq  $32, %%mm4\n\t"
        "psllq  $16, %%mm5\n\t"
        "por    %%mm3, %%mm1\n\t"
        "pand   %6, %%mm5\n\t"
        "por    %%mm5, %%mm4\n\t"

        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm4, 16%0"
        :"=m"(*out)
        :"m"(*in),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* scalar path: copy three bytes, skip the fourth */
  for(; in < in_end; in += 4, out += 3)
  {
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
  }
}
188
189 /*
190  Original by Strepto/Astral
191  ported to gcc & bugfixed : A'rpi
192  MMX2, 3DNOW optimization by Nick Kurshev
193  32bit c version, and and&add trick by Michael Niedermayer
194 */
/*
 * Convert 15-bit pixels (stored in 16-bit words) to 16-bit pixels.
 *
 * For each 16-bit word the low 5 bits are kept and the next 10 bits are
 * moved up by one position.  This is done without a shift: adding the
 * value masked with 0x7FE0 to the value masked with 0x7FFF doubles the
 * upper 10-bit field.  Bit 15 of the input is discarded.
 *
 * src       source pixels, 2 bytes each
 * dst       destination, same size as the source
 * src_size  source size in bytes
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  register const uint8_t *in = src;
  register uint8_t *out = dst;
  register const uint8_t *const in_end = src + src_size;
  const uint8_t *limit;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*in));
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
  /* 8 pixels (16 bytes) per MMX pass */
  for(limit = in_end - 15; in < limit; in += 16, out += 16)
  {
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "pand   %%mm4, %%mm0\n\t"
                "pand   %%mm4, %%mm2\n\t"
                "paddw  %%mm1, %%mm0\n\t"
                "paddw  %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*out)
                :"m"(*in)
                );
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    /* two pixels at a time through one 32-bit word */
    for(limit = in_end - 3; in < limit; in += 4, out += 4)
    {
        register unsigned pair = *((const uint32_t *)in);
        *((uint32_t *)out) = (pair&0x7FFF7FFF) + (pair&0x7FE07FE0);
    }
    /* a single trailing pixel, if any */
    if(in < in_end)
    {
        register unsigned short px = *((const uint16_t *)in);
        *((uint16_t *)out) = (px&0x7FFF) + (px&0x7FE0);
    }
}
243
/*
 * Swap the first and third byte of every packed 24-bit pixel.
 *
 * src       source pixels, 3 bytes each
 * dst       destination, 3 bytes per pixel; may be the same buffer as src,
 *           because each pixel is read completely before it is written
 * src_size  source size in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const unsigned num_pixels = src_size/3;
        unsigned px;

        /* iterate over pixels, not bytes, so the whole buffer is converted */
        for(px=0; px<num_pixels; px++)
        {
                const unsigned off = 3*px;
                const uint8_t c0 = src[off+0];
                const uint8_t c1 = src[off+1];
                const uint8_t c2 = src[off+2];

                dst[off+0] = c2;
                dst[off+1] = c1;
                dst[off+2] = c0;
        }
}
254
/*
 * Convert 16-bit pixels to 15-bit pixels (stored in 16-bit words).
 *
 * For each 16-bit word the low 5 bits are kept and the upper bits are
 * shifted right by one and masked with 0x7FE0, so bit 5 of the input is
 * dropped and bit 15 of the output is always 0.
 *
 * src       source pixels, 2 bytes each
 * dst       destination, same size as the source
 * src_size  source size in bytes
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  register const uint8_t *in = src;
  register uint8_t *out = dst;
  register const uint8_t *const in_end = src + src_size;
  const uint8_t *limit;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*in));
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
  /* 8 pixels (16 bytes) per MMX pass */
  for(limit = in_end - 15; in < limit; in += 16, out += 16)
  {
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "psrlq  $1, %%mm0\n\t"
                "psrlq  $1, %%mm2\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm3\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*out)
                :"m"(*in)
                );
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    /* two pixels at a time through one 32-bit word */
    for(limit = in_end - 3; in < limit; in += 4, out += 4)
    {
        register uint32_t pair = *((const uint32_t *)in);
        *((uint32_t *)out) = ((pair>>1)&0x7FE07FE0) | (pair&0x001F001F);
    }
    /* a single trailing pixel, if any */
    if(in < in_end)
    {
        register uint16_t px = *((const uint16_t *)in);
        *((uint16_t *)out) = ((px>>1)&0x7FE0) | (px&0x001F);
    }
}
310
/*
 * Convert 32-bit pixels to 16-bit (5-6-5) pixels.
 *
 * Scalar path, without WORDS_BIGENDIAN: source byte 0 supplies the low
 * 5 bits, byte 1 the 6 bits starting at bit 5, byte 2 the 5 bits starting
 * at bit 11; byte 3 is ignored.  With WORDS_BIGENDIAN the source bytes are
 * taken in the opposite order (byte 0 ignored, byte 3 in the low bits).
 *
 * src       source pixels, 4 bytes each
 * dst       destination, written as 16-bit words (2 bytes per pixel)
 * src_size  source size in bytes
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *in = src;
        const uint8_t *const in_end = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *const mmx_end = in_end - 15;

        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /* 4 pixels per pass: 16 source bytes become four 16-bit words */
        for(; in < mmx_end; in += 16, out += 4)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: one pixel (4 source bytes) per pass */
        for(; in < in_end; in += 4)
        {
#ifndef WORDS_BIGENDIAN
                const int b = in[0];
                const int g = in[1];
                const int r = in[2];
#else
                const int r = in[1];
                const int g = in[2];
                const int b = in[3];
#endif
                *out++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
383
/*
 * Convert 32-bit pixels to 16-bit (5-6-5) pixels with the outer two
 * channels swapped relative to rgb32to16.
 *
 * Scalar path: source byte 0 supplies the 5 bits starting at bit 11,
 * byte 1 the 6 bits starting at bit 5, byte 2 the low 5 bits; byte 3 is
 * ignored.
 *
 * src       source pixels, 4 bytes each
 * dst       destination, written as 16-bit words (2 bytes per pixel)
 * src_size  source size in bytes
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        const uint8_t *in = src;
        const uint8_t *const in_end = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *const mmx_end = in_end - 15;

        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /* 4 pixels per pass: 16 source bytes become four 16-bit words */
        for(; in < mmx_end; in += 16, out += 4)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psllq  $8, %%mm0\n\t"
                "psllq  $8, %%mm3\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $19, %%mm2\n\t"
                "psrlq  $19, %%mm5\n\t"
                "pand   %2, %%mm2\n\t"
                "pand   %2, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: one pixel (4 source bytes) per pass */
        for(; in < in_end; in += 4)
        {
                const int top = in[0];
                const int mid = in[1];
                const int low = in[2];
                *out++ = (low>>3) | ((mid&0xFC)<<3) | ((top&0xF8)<<8);
        }
}
447
/*
 * Convert 32-bit pixels to 15-bit (5-5-5) pixels stored in 16-bit words.
 *
 * Scalar path: source byte 0 supplies the low 5 bits, byte 1 the 5 bits
 * starting at bit 5, byte 2 the 5 bits starting at bit 10; byte 3 is
 * ignored.
 *
 * src       source pixels, 4 bytes each
 * dst       destination, written as 16-bit words (2 bytes per pixel)
 * src_size  source size in bytes
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *in = src;
        const uint8_t *const in_end = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *const mmx_end = in_end - 15;

        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* 4 pixels per pass: 16 source bytes become four 16-bit words */
        for(; in < mmx_end; in += 16, out += 4)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: one pixel (4 source bytes) per pass */
        for(; in < in_end; in += 4)
        {
                const int b = in[0];
                const int g = in[1];
                const int r = in[2];
                *out++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
511
/*
 * Convert 32-bit pixels to 15-bit (5-5-5) pixels stored in 16-bit words,
 * with the outer two channels swapped relative to rgb32to15.
 *
 * Scalar path: source byte 0 supplies the 5 bits starting at bit 10,
 * byte 1 the 5 bits starting at bit 5, byte 2 the low 5 bits; byte 3 is
 * ignored.
 *
 * src       source pixels, 4 bytes each
 * dst       destination, written as 16-bit words (2 bytes per pixel)
 * src_size  source size in bytes
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *in = src;
        const uint8_t *const in_end = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *const mmx_end = in_end - 15;

        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* 4 pixels per pass: 16 source bytes become four 16-bit words */
        for(; in < mmx_end; in += 16, out += 4)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psllq  $7, %%mm0\n\t"
                "psllq  $7, %%mm3\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $19, %%mm2\n\t"
                "psrlq  $19, %%mm5\n\t"
                "pand   %2, %%mm2\n\t"
                "pand   %2, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: one pixel (4 source bytes) per pass */
        for(; in < in_end; in += 4)
        {
                const int top = in[0];
                const int mid = in[1];
                const int low = in[2];
                *out++ = (low>>3) | ((mid&0xF8)<<2) | ((top&0xF8)<<7);
        }
}
575
/*
 * Convert packed 24-bit pixels to 16-bit (5-6-5) pixels.
 *
 * Scalar path: source byte 0 supplies the low 5 bits, byte 1 the 6 bits
 * starting at bit 5, byte 2 the 5 bits starting at bit 11.
 *
 * src       source pixels, 3 bytes each
 * dst       destination, written as 16-bit words (2 bytes per pixel)
 * src_size  source size in bytes; the scalar loop consumes 3 bytes per
 *           pass, so callers are expected to pass a multiple of 3
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /*
         * One pass consumes 12 bytes, but its last load fetches 4 bytes at
         * s+9 and therefore touches s[12].  Require 13 readable bytes so the
         * final pass never reads beyond the source; the scalar loop below
         * converts whatever is left.
         */
        mm_end = end - 12;
        while(s < mm_end)
        {
            /* 4 pixels per pass: 12 source bytes become four 16-bit words */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: also handles the pixels the MMX loop left over */
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
638
/*
 * Convert packed 24-bit pixels to 16-bit (5-6-5) pixels with the outer
 * two channels swapped relative to rgb24to16.
 *
 * Scalar path: source byte 0 supplies the 5 bits starting at bit 11,
 * byte 1 the 6 bits starting at bit 5, byte 2 the low 5 bits.
 *
 * src       source pixels, 3 bytes each
 * dst       destination, written as 16-bit words (2 bytes per pixel)
 * src_size  source size in bytes
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        const uint8_t *in = src;
        const uint8_t *const in_end = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *const mmx_end = in_end - 15;

        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /* 4 pixels per pass: 12 source bytes become four 16-bit words */
        for(; in < mmx_end; in += 12, out += 4)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psllq  $8, %%mm0\n\t"
                "psllq  $8, %%mm3\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $19, %%mm2\n\t"
                "psrlq  $19, %%mm5\n\t"
                "pand   %2, %%mm2\n\t"
                "pand   %2, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: one pixel (3 source bytes) per pass */
        for(; in < in_end; in += 3)
        {
                const int top = in[0];
                const int mid = in[1];
                const int low = in[2];
                *out++ = (low>>3) | ((mid&0xFC)<<3) | ((top&0xF8)<<8);
        }
}
701
/*
 * Convert packed 24-bit pixels to 15-bit (5-5-5) pixels stored in 16-bit
 * words.
 *
 * Scalar path: source byte 0 supplies the low 5 bits, byte 1 the 5 bits
 * starting at bit 5, byte 2 the 5 bits starting at bit 10.
 *
 * src       source pixels, 3 bytes each
 * dst       destination, written as 16-bit words (2 bytes per pixel)
 * src_size  source size in bytes; the scalar loop consumes 3 bytes per
 *           pass, so callers are expected to pass a multiple of 3
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /*
         * One pass consumes 12 bytes, but its last load fetches 4 bytes at
         * s+9 and therefore touches s[12].  Require 13 readable bytes so the
         * final pass never reads beyond the source; the scalar loop below
         * converts whatever is left.
         */
        mm_end = end - 12;
        while(s < mm_end)
        {
            /* 4 pixels per pass: 12 source bytes become four 16-bit words */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: also handles the pixels the MMX loop left over */
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
764
/*
 * Convert packed 24-bit pixels to 15-bit (5-5-5) pixels stored in 16-bit
 * words, with the outer two channels swapped relative to rgb24to15.
 *
 * Scalar path: source byte 0 supplies the 5 bits starting at bit 10,
 * byte 1 the 5 bits starting at bit 5, byte 2 the low 5 bits.
 *
 * src       source pixels, 3 bytes each
 * dst       destination, written as 16-bit words (2 bytes per pixel)
 * src_size  source size in bytes
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *in = src;
        const uint8_t *const in_end = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *const mmx_end = in_end - 15;

        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* 4 pixels per pass: 12 source bytes become four 16-bit words */
        for(; in < mmx_end; in += 12, out += 4)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psllq  $7, %%mm0\n\t"
                "psllq  $7, %%mm3\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $19, %%mm2\n\t"
                "psrlq  $19, %%mm5\n\t"
                "pand   %2, %%mm2\n\t"
                "pand   %2, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: one pixel (3 source bytes) per pass */
        for(; in < in_end; in += 3)
        {
                const int top = in[0];
                const int mid = in[1];
                const int low = in[2];
                *out++ = (low>>3) | ((mid&0xF8)<<2) | ((top&0xF8)<<7);
        }
}
827
828 /*
829   This code uses a less accurate approximation: it simply
830   left-shifts the input
831   value and fills the low-order bits with
832   zeroes. This method improves PNG
833   compression, but it cannot reproduce white exactly, since it does not
834   generate an all-ones maximum value; the net effect is to darken the
835   image slightly.
836
837   The better method should be "left bit replication":
838
839    4 3 2 1 0
840    ---------
841    1 1 0 1 1
842
843    7 6 5 4 3  2 1 0
844    ----------------
845    1 1 0 1 1  1 1 0
846    |=======|  |===|
847        |      Leftmost Bits Repeated to Fill Open Bits
848        |
849    Original Bits
850 */
851 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
852 {
853         const uint16_t *end;
854 #ifdef HAVE_MMX
855         const uint16_t *mm_end;
856 #endif
857         uint8_t *d = (uint8_t *)dst;
858         const uint16_t *s = (uint16_t *)src;
859         end = s + src_size/2;
860 #ifdef HAVE_MMX
861         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
862         mm_end = end - 7;
863         while(s < mm_end)
864         {
865             __asm __volatile(
866                 PREFETCH" 32%1\n\t"
867                 "movq   %1, %%mm0\n\t"
868                 "movq   %1, %%mm1\n\t"
869                 "movq   %1, %%mm2\n\t"
870                 "pand   %2, %%mm0\n\t"
871                 "pand   %3, %%mm1\n\t"
872                 "pand   %4, %%mm2\n\t"
873                 "psllq  $3, %%mm0\n\t"
874                 "psrlq  $2, %%mm1\n\t"
875                 "psrlq  $7, %%mm2\n\t"
876                 "movq   %%mm0, %%mm3\n\t"
877                 "movq   %%mm1, %%mm4\n\t"
878                 "movq   %%mm2, %%mm5\n\t"
879                 "punpcklwd %5, %%mm0\n\t"
880                 "punpcklwd %5, %%mm1\n\t"
881                 "punpcklwd %5, %%mm2\n\t"
882                 "punpckhwd %5, %%mm3\n\t"
883                 "punpckhwd %5, %%mm4\n\t"
884                 "punpckhwd %5, %%mm5\n\t"
885                 "psllq  $8, %%mm1\n\t"
886                 "psllq  $16, %%mm2\n\t"
887                 "por    %%mm1, %%mm0\n\t"
888                 "por    %%mm2, %%mm0\n\t"
889                 "psllq  $8, %%mm4\n\t"
890                 "psllq  $16, %%mm5\n\t"
891                 "por    %%mm4, %%mm3\n\t"
892                 "por    %%mm5, %%mm3\n\t"
893
894                 "movq   %%mm0, %%mm6\n\t"
895                 "movq   %%mm3, %%mm7\n\t"
896                 
897                 "movq   8%1, %%mm0\n\t"
898                 "movq   8%1, %%mm1\n\t"
899                 "movq   8%1, %%mm2\n\t"
900                 "pand   %2, %%mm0\n\t"
901                 "pand   %3, %%mm1\n\t"
902                 "pand   %4, %%mm2\n\t"
903                 "psllq  $3, %%mm0\n\t"
904                 "psrlq  $2, %%mm1\n\t"
905                 "psrlq  $7, %%mm2\n\t"
906                 "movq   %%mm0, %%mm3\n\t"
907                 "movq   %%mm1, %%mm4\n\t"
908                 "movq   %%mm2, %%mm5\n\t"
909                 "punpcklwd %5, %%mm0\n\t"
910                 "punpcklwd %5, %%mm1\n\t"
911                 "punpcklwd %5, %%mm2\n\t"
912                 "punpckhwd %5, %%mm3\n\t"
913                 "punpckhwd %5, %%mm4\n\t"
914                 "punpckhwd %5, %%mm5\n\t"
915                 "psllq  $8, %%mm1\n\t"
916                 "psllq  $16, %%mm2\n\t"
917                 "por    %%mm1, %%mm0\n\t"
918                 "por    %%mm2, %%mm0\n\t"
919                 "psllq  $8, %%mm4\n\t"
920                 "psllq  $16, %%mm5\n\t"
921                 "por    %%mm4, %%mm3\n\t"
922                 "por    %%mm5, %%mm3\n\t"
923
924                 :"=m"(*d)
925                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
926                 :"memory");
927             /* Borrowed 32 to 24 */
928             __asm __volatile(
929                 "movq   %%mm0, %%mm4\n\t"
930                 "movq   %%mm3, %%mm5\n\t"
931                 "movq   %%mm6, %%mm0\n\t"
932                 "movq   %%mm7, %%mm1\n\t"
933                 
934                 "movq   %%mm4, %%mm6\n\t"
935                 "movq   %%mm5, %%mm7\n\t"
936                 "movq   %%mm0, %%mm2\n\t"
937                 "movq   %%mm1, %%mm3\n\t"
938
939                 "psrlq  $8, %%mm2\n\t"
940                 "psrlq  $8, %%mm3\n\t"
941                 "psrlq  $8, %%mm6\n\t"
942                 "psrlq  $8, %%mm7\n\t"
943                 "pand   %2, %%mm0\n\t"
944                 "pand   %2, %%mm1\n\t"
945                 "pand   %2, %%mm4\n\t"
946                 "pand   %2, %%mm5\n\t"
947                 "pand   %3, %%mm2\n\t"
948                 "pand   %3, %%mm3\n\t"
949                 "pand   %3, %%mm6\n\t"
950                 "pand   %3, %%mm7\n\t"
951                 "por    %%mm2, %%mm0\n\t"
952                 "por    %%mm3, %%mm1\n\t"
953                 "por    %%mm6, %%mm4\n\t"
954                 "por    %%mm7, %%mm5\n\t"
955
956                 "movq   %%mm1, %%mm2\n\t"
957                 "movq   %%mm4, %%mm3\n\t"
958                 "psllq  $48, %%mm2\n\t"
959                 "psllq  $32, %%mm3\n\t"
960                 "pand   %4, %%mm2\n\t"
961                 "pand   %5, %%mm3\n\t"
962                 "por    %%mm2, %%mm0\n\t"
963                 "psrlq  $16, %%mm1\n\t"
964                 "psrlq  $32, %%mm4\n\t"
965                 "psllq  $16, %%mm5\n\t"
966                 "por    %%mm3, %%mm1\n\t"
967                 "pand   %6, %%mm5\n\t"
968                 "por    %%mm5, %%mm4\n\t"
969
970                 MOVNTQ" %%mm0, %0\n\t"
971                 MOVNTQ" %%mm1, 8%0\n\t"
972                 MOVNTQ" %%mm4, 16%0"
973
974                 :"=m"(*d)
975                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
976                 :"memory");
977                 d += 24;
978                 s += 8;
979         }
980         __asm __volatile(SFENCE:::"memory");
981         __asm __volatile(EMMS:::"memory");
982 #endif
983         while(s < end)
984         {
985                 register uint16_t bgr;
986                 bgr = *s++;
987                 *d++ = (bgr&0x1F)<<3;
988                 *d++ = (bgr&0x3E0)>>2;
989                 *d++ = (bgr&0x7C00)>>7;
990         }
991 }
992
/**
 * Expands packed 16 bit pixels into 3 bytes per pixel.
 *
 * Every native-endian uint16_t p read from src produces three output bytes,
 * in this order: (p & 0x001F) << 3, (p & 0x07E0) >> 3, (p & 0xF800) >> 8.
 * The low bits of each output byte stay zero.
 *
 * @param src      input pixels, read as src_size/2 uint16_t values
 * @param dst      output buffer, receives 3 bytes per input pixel
 * @param src_size input size in bytes (an odd trailing byte is ignored)
 *
 * NOTE(review): src is accessed through a uint16_t pointer, so it presumably
 * has to be suitably aligned for that type -- confirm with the callers.
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
        /* 8 pixels (16 input bytes -> 24 output bytes) per iteration.
         * The remaining pixel count is compared instead of precomputing
         * "end - 7", which would point in front of the buffer for inputs
         * shorter than 7 pixels. */
        while(end - s >= 8)
        {
            /* Stage 1: widen 2x4 pixels to 32 bit each. The shifts are the
             * ones used by the scalar tail below; mask16b/g/r and mmx_null
             * are defined elsewhere (presumably the three field masks and an
             * all-zero quadword). Pixels 0-3 end up in mm6/mm7, pixels 4-7
             * in mm0/mm3. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   %1, %%mm1\n\t"
                "movq   %1, %%mm2\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %3, %%mm1\n\t"
                "pand   %4, %%mm2\n\t"
                "psllq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm1\n\t"
                "psrlq  $8, %%mm2\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "movq   %%mm1, %%mm4\n\t"
                "movq   %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq  $8, %%mm1\n\t"
                "psllq  $16, %%mm2\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psllq  $8, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm5, %%mm3\n\t"

                /* park pixels 0-3 while pixels 4-7 are converted */
                "movq   %%mm0, %%mm6\n\t"
                "movq   %%mm3, %%mm7\n\t"

                "movq   8%1, %%mm0\n\t"
                "movq   8%1, %%mm1\n\t"
                "movq   8%1, %%mm2\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %3, %%mm1\n\t"
                "pand   %4, %%mm2\n\t"
                "psllq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm1\n\t"
                "psrlq  $8, %%mm2\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "movq   %%mm1, %%mm4\n\t"
                "movq   %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq  $8, %%mm1\n\t"
                "psllq  $16, %%mm2\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psllq  $8, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm5, %%mm3\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
                :"memory");
            /* Stage 2 (borrowed from the 32 to 24 converter): squeeze the
             * eight 32 bit pixels left in mm0/mm3/mm6/mm7 by stage 1 into
             * 24 bytes and store them. This relies on the MMX registers
             * surviving between the two asm statements. */
            __asm __volatile(
                "movq   %%mm0, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "movq   %%mm6, %%mm0\n\t"
                "movq   %%mm7, %%mm1\n\t"

                "movq   %%mm4, %%mm6\n\t"
                "movq   %%mm5, %%mm7\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm1, %%mm3\n\t"

                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm3\n\t"
                "psrlq  $8, %%mm6\n\t"
                "psrlq  $8, %%mm7\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm1\n\t"
                "pand   %2, %%mm4\n\t"
                "pand   %2, %%mm5\n\t"
                "pand   %3, %%mm2\n\t"
                "pand   %3, %%mm3\n\t"
                "pand   %3, %%mm6\n\t"
                "pand   %3, %%mm7\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm3, %%mm1\n\t"
                "por    %%mm6, %%mm4\n\t"
                "por    %%mm7, %%mm5\n\t"

                "movq   %%mm1, %%mm2\n\t"
                "movq   %%mm4, %%mm3\n\t"
                "psllq  $48, %%mm2\n\t"
                "psllq  $32, %%mm3\n\t"
                "pand   %4, %%mm2\n\t"
                "pand   %5, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psrlq  $16, %%mm1\n\t"
                "psrlq  $32, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm3, %%mm1\n\t"
                "pand   %6, %%mm5\n\t"
                "por    %%mm5, %%mm4\n\t"

                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm1, 8%0\n\t"
                MOVNTQ" %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
            d += 24;
            s += 8;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar conversion: everything without MMX, the last 0-7 pixels with it */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0xF800)>>8;
        }
}
1133
/**
 * Expands packed 15 bit pixels into 4 bytes per pixel.
 *
 * Every native-endian uint16_t p read from src produces four output bytes,
 * in this order: (p & 0x001F) << 3, (p & 0x03E0) >> 2, (p & 0x7C00) >> 7, 0.
 *
 * @param src      input pixels, read as src_size/2 uint16_t values
 * @param dst      output buffer, receives 4 bytes per input pixel
 * @param src_size input size in bytes (an odd trailing byte is ignored)
 *
 * NOTE(review): src is accessed through a uint16_t pointer, so it presumably
 * has to be suitably aligned for that type -- confirm with the callers.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
        /* mm7 = 0; the loop below uses it to widen 16 bit lanes to 32 bit */
        __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
        /* 4 pixels (8 input bytes -> 16 output bytes) per iteration.
         * The remaining pixel count is compared instead of precomputing
         * "end - 3", which would point in front of the buffer for inputs
         * shorter than 3 pixels. */
        while(end - s >= 4)
        {
            /* mask15b/g/r are defined elsewhere (presumably the three field
             * masks); the shifts are the ones of the scalar tail below. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   %1, %%mm1\n\t"
                "movq   %1, %%mm2\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %3, %%mm1\n\t"
                "pand   %4, %%mm2\n\t"
                "psllq  $3, %%mm0\n\t"
                "psrlq  $2, %%mm1\n\t"
                "psrlq  $7, %%mm2\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "movq   %%mm1, %%mm4\n\t"
                "movq   %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq  $8, %%mm1\n\t"
                "psllq  $16, %%mm2\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psllq  $8, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm5, %%mm3\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
                :"memory");
            d += 16;
            s += 4;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar conversion: everything without MMX, the last 0-3 pixels with it */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
                *d++ = 0;
        }
}
1198
/**
 * Expands packed 16 bit pixels into 4 bytes per pixel.
 *
 * Every native-endian uint16_t p read from src produces four output bytes,
 * in this order: (p & 0x001F) << 3, (p & 0x07E0) >> 3, (p & 0xF800) >> 8, 0.
 *
 * @param src      input pixels, read as src_size/2 uint16_t values
 * @param dst      output buffer, receives 4 bytes per input pixel
 * @param src_size input size in bytes (an odd trailing byte is ignored)
 *
 * NOTE(review): src is accessed through a uint16_t pointer, so it presumably
 * has to be suitably aligned for that type -- confirm with the callers.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
        uint8_t *d = (uint8_t *)dst;
        /* keep the const qualifier of src in the cast */
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
        /* mm7 = 0; the loop below uses it to widen 16 bit lanes to 32 bit */
        __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
        /* 4 pixels (8 input bytes -> 16 output bytes) per iteration.
         * The remaining pixel count is compared instead of precomputing
         * "end - 3", which would point in front of the buffer for inputs
         * shorter than 3 pixels. */
        while(end - s >= 4)
        {
            /* mask16b/g/r are defined elsewhere (presumably the three field
             * masks); the shifts are the ones of the scalar tail below. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   %1, %%mm1\n\t"
                "movq   %1, %%mm2\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %3, %%mm1\n\t"
                "pand   %4, %%mm2\n\t"
                "psllq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm1\n\t"
                "psrlq  $8, %%mm2\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "movq   %%mm1, %%mm4\n\t"
                "movq   %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq  $8, %%mm1\n\t"
                "psllq  $16, %%mm2\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psllq  $8, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm5, %%mm3\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
                :"memory");
            d += 16;
            s += 4;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar conversion: everything without MMX, the last 0-3 pixels with it */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0xF800)>>8;
                *d++ = 0;
        }
}
1263
/**
 * Swaps the first and third byte of every 4 byte pixel.
 *
 * For each pixel, dst[0] = src[2], dst[1] = src[1], dst[2] = src[0] and
 * dst[3] = 0. src and dst may be the same buffer.
 *
 * @param src      input pixels, 4 bytes each
 * @param dst      output buffer of at least (src_size & ~3) bytes
 * @param src_size input size in bytes; trailing bytes that do not form a
 *                 complete pixel are ignored
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        unsigned i = 0;
        const unsigned num_pixels = src_size >> 2;
#ifdef HAVE_MMX
        /* Only complete 8 byte groups (2 pixels) go through the MMX loop.
         * The loop body always runs once before the bound is tested, so it
         * must not be entered for inputs shorter than 8 bytes, and the bound
         * must not be computed as src_size-7 (wraps for src_size < 7). */
        const unsigned mmx_bytes = src_size & ~7U;
        if(mmx_bytes)
        {
/* TODO: unroll this loop */
                /* mask32r/g/b are defined elsewhere; presumably each keeps
                 * one colour byte per pixel, which leaves the 4th byte 0. */
                asm volatile (
                        "xorl %%eax, %%eax              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        PREFETCH" 32(%0, %%eax)         \n\t"
                        "movq (%0, %%eax), %%mm0        \n\t"
                        "movq %%mm0, %%mm1              \n\t"
                        "movq %%mm0, %%mm2              \n\t"
                        "pslld $16, %%mm0               \n\t"
                        "psrld $16, %%mm1               \n\t"
                        "pand "MANGLE(mask32r)", %%mm0  \n\t"
                        "pand "MANGLE(mask32g)", %%mm2  \n\t"
                        "pand "MANGLE(mask32b)", %%mm1  \n\t"
                        "por %%mm0, %%mm2               \n\t"
                        "por %%mm1, %%mm2               \n\t"
                        MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
                        "addl $8, %%eax                 \n\t"
                        "cmpl %2, %%eax                 \n\t"
                        " jb 1b                         \n\t"
                        :: "r" (src), "r"(dst), "r" (mmx_bytes)
                        : "memory", "%eax" /* the asm stores through dst */
                );

                __asm __volatile(SFENCE:::"memory");
                __asm __volatile(EMMS:::"memory");

                i = mmx_bytes >> 2; /* pixels already converted */
        }
#endif
        /* scalar conversion: everything without MMX, a trailing odd pixel with it */
        for(; i<num_pixels; i++)
        {
                /* read byte 0 first so that src == dst works */
                const uint8_t first = src[4*i + 0];
                dst[4*i + 0] = src[4*i + 2];
                dst[4*i + 1] = src[4*i + 1];
                dst[4*i + 2] = first;
                dst[4*i + 3] = 0; /* do not leave the pad byte uninitialized */
        }
}
1304
/**
 * Swaps the first and third byte of every 3 byte pixel.
 *
 * src and dst may be the same buffer: each pixel (and, in the MMX loop,
 * each 8 byte group) is read before it is overwritten.
 *
 * @param src      input pixels, 3 bytes each
 * @param dst      output buffer
 * @param src_size input size in bytes; trailing bytes that do not form a
 *                 complete pixel are left untouched
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        unsigned i;
#ifdef HAVE_MMX
        /* One MMX iteration converts 24 bytes but loads bytes 0..25 of its
         * group, and the loop body runs once before its condition is tested.
         * So the loop is entered only while at least 26 bytes are left;
         * otherwise it would read (and for short inputs also write) past
         * the end of the buffers. */
        if(src_size >= 26)
        {
                /* negative byte offset relative to (end - 25); counts up to >= 0 */
                int mmx_size= 25 - (int)src_size;
                /* mask24r/g/b are defined elsewhere (presumably byte-select
                 * masks for the three phases of a 24 byte group). */
                asm volatile (
                        "movq "MANGLE(mask24r)", %%mm5  \n\t"
                        "movq "MANGLE(mask24g)", %%mm6  \n\t"
                        "movq "MANGLE(mask24b)", %%mm7  \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        PREFETCH" 32(%1, %%eax)         \n\t"
                        "movq   (%1, %%eax), %%mm0      \n\t" // BGR BGR BG
                        "movq   (%1, %%eax), %%mm1      \n\t" // BGR BGR BG
                        "movq  2(%1, %%eax), %%mm2      \n\t" // R BGR BGR B
                        "psllq $16, %%mm0               \n\t" // 00 BGR BGR
                        "pand %%mm5, %%mm0              \n\t"
                        "pand %%mm6, %%mm1              \n\t"
                        "pand %%mm7, %%mm2              \n\t"
                        "por %%mm0, %%mm1               \n\t"
                        "por %%mm2, %%mm1               \n\t"
                        "movq  6(%1, %%eax), %%mm0      \n\t" // BGR BGR BG
                        MOVNTQ" %%mm1,   (%2, %%eax)    \n\t" // RGB RGB RG
                        "movq  8(%1, %%eax), %%mm1      \n\t" // R BGR BGR B
                        "movq 10(%1, %%eax), %%mm2      \n\t" // GR BGR BGR
                        "pand %%mm7, %%mm0              \n\t"
                        "pand %%mm5, %%mm1              \n\t"
                        "pand %%mm6, %%mm2              \n\t"
                        "por %%mm0, %%mm1               \n\t"
                        "por %%mm2, %%mm1               \n\t"
                        "movq 14(%1, %%eax), %%mm0      \n\t" // R BGR BGR B
                        MOVNTQ" %%mm1,  8(%2, %%eax)    \n\t" // B RGB RGB R
                        "movq 16(%1, %%eax), %%mm1      \n\t" // GR BGR BGR
                        "movq 18(%1, %%eax), %%mm2      \n\t" // BGR BGR BG
                        "pand %%mm6, %%mm0              \n\t"
                        "pand %%mm7, %%mm1              \n\t"
                        "pand %%mm5, %%mm2              \n\t"
                        "por %%mm0, %%mm1               \n\t"
                        "por %%mm2, %%mm1               \n\t"
                        MOVNTQ" %%mm1, 16(%2, %%eax)    \n\t"
                        "addl $24, %%eax                \n\t"
                        " js 1b                         \n\t"
                        : "+a" (mmx_size)
                        : "r" (src-mmx_size), "r"(dst-mmx_size)
                        : "memory" /* the asm stores through dst */
                );

                __asm __volatile(SFENCE:::"memory");
                __asm __volatile(EMMS:::"memory");

                /* mmx_size is now 0..23, so 25-mmx_size (2..25) bytes are
                 * left at the end of the buffers for the scalar loop. */
                src+= src_size;
                dst+= src_size;
                src_size= 25-mmx_size;
                src-= src_size;
                dst-= src_size;
        }
#endif
        /* complete pixels only: never touch bytes beyond src_size */
        for(i=0; i+3<=src_size; i+=3)
        {
                register uint8_t x;
                x          = src[i + 2];
                dst[i + 1] = src[i + 1];
                dst[i + 2] = src[i + 0];
                dst[i + 0] = x;
        }
}
1371
/**
 * Interleaves planar Y, U and V into packed Y U Y V bytes.
 *
 * Each output line is built from one luma line and one chroma line as
 * Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... The chroma pointers advance after every
 * vertLumPerChroma luma lines.
 *
 * @param ysrc, usrc, vsrc  source planes
 * @param dst               packed output
 * @param width             luma width in pixels; width/2 chroma samples are
 *                          consumed per line. The MMX loop handles 8 chroma
 *                          samples per iteration and the 64 bit C loop 2, so
 *                          other widths are rounded up and touch memory
 *                          beyond the nominal line end.
 * @param height            number of luma lines
 * @param lumStride         byte distance between luma lines
 * @param chromStride       byte distance between chroma lines
 * @param dstStride         byte distance between output lines
 * @param vertLumPerChroma  luma lines per chroma line; the line test is a
 *                          bit mask, so this has to be a power of two
 *
 * NOTE(review): the C paths store through uint64_t / uint32_t pointers and
 * build the words with the first byte in the lowest bits, i.e. they
 * presumably expect a suitably aligned dst and a little-endian target.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
                asm volatile(
                        "xorl %%eax, %%eax              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        PREFETCH" 32(%1, %%eax, 2)      \n\t"
                        PREFETCH" 32(%2, %%eax)         \n\t"
                        PREFETCH" 32(%3, %%eax)         \n\t"
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2              \n\t" // U(0)
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)

                        "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
                        "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
                        "movq %%mm3, %%mm4              \n\t" // Y(0)
                        "movq %%mm5, %%mm6              \n\t" // Y(8)
                        "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
                        "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
                        "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
                        "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)

                        MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
                        MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

                        "addl $8, %%eax                 \n\t"
                        "cmpl %4, %%eax                 \n\t"
                        " jb 1b                         \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
                        : "memory", "%eax" /* the asm stores through dst */
                );
#else
#if __WORDSIZE >= 64
                unsigned i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        /* Widen before shifting: as plain int, a V sample
                         * >= 0x80 shifted by 24 becomes negative and would
                         * sign-extend into the upper half of the word. */
                        uint64_t k, l;
                        k = (uint64_t)yc[0] | ((uint64_t)uc[0] << 8) |
                            ((uint64_t)yc[1] << 16) | ((uint64_t)vc[0] << 24);
                        l = (uint64_t)yc[2] | ((uint64_t)uc[1] << 8) |
                            ((uint64_t)yc[3] << 16) | ((uint64_t)vc[1] << 24);
                        *ldst++ = k | (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                unsigned i;
                uint32_t *idst = (uint32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
                        /* unsigned arithmetic: no shift into the sign bit */
                        *idst++ = (uint32_t)yc[0] | ((uint32_t)uc[0] << 8) |
                            ((uint32_t)yc[1] << 16) | ((uint32_t)vc[0] << 24);
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                /* step to the next chroma line after the last luma line that shares it */
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1458
1459 /**
1460  *
1461  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1462  * problem for anyone then tell me, and ill fix it)
1463  */
1464 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1465         unsigned int width, unsigned int height,
1466         int lumStride, int chromStride, int dstStride)
1467 {
1468         //FIXME interpolate chroma
1469         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1470 }
1471
1472 /**
1473  *
1474  * width should be a multiple of 16
1475  */
1476 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1477         unsigned int width, unsigned int height,
1478         int lumStride, int chromStride, int dstStride)
1479 {
1480         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1481 }
1482
/**
 * Splits packed Y U Y V bytes into separate Y, U and V planes.
 *
 * Lines are handled in pairs: the first line of a pair supplies luma and
 * chroma, the second line supplies luma only (its chroma is dropped).
 *
 * height should be a multiple of 2 and width should be a multiple of 16.
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int srcStride)
{
        const unsigned chromSamples= width>>1;
        unsigned row;
#ifndef HAVE_MMX
        unsigned n;
        const uint8_t *in;
        uint8_t *lum;
#endif

        for(row=0; row<height; row+=2)
        {
                /* first line of the pair: Y, U and V */
#ifdef HAVE_MMX
                asm volatile(
                        "xorl %%eax, %%eax              \n\t"
                        "pcmpeqw %%mm7, %%mm7           \n\t"
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        PREFETCH" 64(%0, %%eax, 4)      \n\t"
                        "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
                        "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
                        "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
                        "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
                        "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
                        "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)

                        MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"

                        "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
                        "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
                        "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
                        "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
                        "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"

                        "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
                        "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                        "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
                        "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
                        "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
                        "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)

                        MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
                        MOVNTQ" %%mm2, (%2, %%eax)      \n\t"

                        "addl $8, %%eax                 \n\t"
                        "cmpl %4, %%eax                 \n\t"
                        " jb 1b                         \n\t"
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromSamples)
                        : "memory", "%eax"
                );
#else
                in  = src;
                lum = ydst;
                for(n=0; n<chromSamples; n++)
                {
                        lum[0]  = in[0];
                        udst[n] = in[1];
                        lum[1]  = in[2];
                        vdst[n] = in[3];
                        in  += 4;
                        lum += 2;
                }
#endif
                ydst += lumStride;
                src  += srcStride;

                /* second line of the pair: Y only */
#ifdef HAVE_MMX
                /* mm7 still holds the FF,00,FF,00... mask set up by the first block */
                asm volatile(
                        "xorl %%eax, %%eax              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        PREFETCH" 64(%0, %%eax, 4)      \n\t"
                        "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
                        "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
                        "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
                        "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
                        "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"

                        "addl $8, %%eax                 \n\t"
                        "cmpl %4, %%eax                 \n\t"
                        " jb 1b                         \n\t"

                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromSamples)
                        : "memory", "%eax"
                );
#else
                in  = src;
                lum = ydst;
                for(n=0; n<chromSamples; n++)
                {
                        lum[0] = in[0];
                        lum[1] = in[2];
                        in  += 4;
                        lum += 2;
                }
#endif
                /* on to the next pair of lines */
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
1607
/**
 * Copies the luma plane as one contiguous block of width*height bytes.
 *
 * Only ysrc, ydst, width and height are used: the chroma planes are not
 * converted yet and the stride arguments are ignored.
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height, int lumStride, int chromStride)
{
        /* Y Plane */
        /* multiply as size_t so the byte count cannot wrap at 32 bits
         * before memcpy() sees it */
        memcpy(ydst, ysrc, (size_t)width*height);

        /* XXX: implement upscaling for U,V */
}
1617
1618 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1619 {
1620         int x,y;
1621         
1622         dst[0]= src[0];
1623         
1624         // first line
1625         for(x=0; x<srcWidth-1; x++){
1626                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1627                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1628         }
1629         dst[2*srcWidth-1]= src[srcWidth-1];
1630         
1631         dst+= dstStride;
1632
1633         for(y=1; y<srcHeight; y++){
1634 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1635                 const int mmxSize= srcWidth&~15;
1636                 asm volatile(
1637                         "movl %4, %%eax                 \n\t"
1638                         "1:                             \n\t"
1639                         "movq (%0, %%eax), %%mm0        \n\t"
1640                         "movq (%1, %%eax), %%mm1        \n\t"
1641                         "movq 1(%0, %%eax), %%mm2       \n\t"
1642                         "movq 1(%1, %%eax), %%mm3       \n\t"
1643                         "movq -1(%0, %%eax), %%mm4      \n\t"
1644                         "movq -1(%1, %%eax), %%mm5      \n\t"
1645                         PAVGB" %%mm0, %%mm5             \n\t"
1646                         PAVGB" %%mm0, %%mm3             \n\t"
1647                         PAVGB" %%mm0, %%mm5             \n\t"
1648                         PAVGB" %%mm0, %%mm3             \n\t"
1649                         PAVGB" %%mm1, %%mm4             \n\t"
1650                         PAVGB" %%mm1, %%mm2             \n\t"
1651                         PAVGB" %%mm1, %%mm4             \n\t"
1652                         PAVGB" %%mm1, %%mm2             \n\t"
1653                         "movq %%mm5, %%mm7              \n\t"
1654                         "movq %%mm4, %%mm6              \n\t"
1655                         "punpcklbw %%mm3, %%mm5         \n\t"
1656                         "punpckhbw %%mm3, %%mm7         \n\t"
1657                         "punpcklbw %%mm2, %%mm4         \n\t"
1658                         "punpckhbw %%mm2, %%mm6         \n\t"
1659 #if 1
1660                         MOVNTQ" %%mm5, (%2, %%eax, 2)   \n\t"
1661                         MOVNTQ" %%mm7, 8(%2, %%eax, 2)  \n\t"
1662                         MOVNTQ" %%mm4, (%3, %%eax, 2)   \n\t"
1663                         MOVNTQ" %%mm6, 8(%3, %%eax, 2)  \n\t"
1664 #else
1665                         "movq %%mm5, (%2, %%eax, 2)     \n\t"
1666                         "movq %%mm7, 8(%2, %%eax, 2)    \n\t"
1667                         "movq %%mm4, (%3, %%eax, 2)     \n\t"
1668                         "movq %%mm6, 8(%3, %%eax, 2)    \n\t"
1669 #endif
1670                         "addl $8, %%eax                 \n\t"
1671                         " js 1b                         \n\t"
1672                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1673                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1674                            "g" (-mmxSize)
1675                         : "%eax"
1676
1677                 );
1678 #else
1679                 const int mmxSize=1;
1680 #endif
1681                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1682                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1683
1684                 for(x=mmxSize-1; x<srcWidth-1; x++){
1685                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1686                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1687                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1688                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1689                 }
1690                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1691                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1692
1693                 dst+=dstStride*2;
1694                 src+=srcStride;
1695         }
1696         
1697         // last line
1698 #if 1
1699         dst[0]= src[0];
1700         
1701         for(x=0; x<srcWidth-1; x++){
1702                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1703                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1704         }
1705         dst[2*srcWidth-1]= src[srcWidth-1];
1706 #else
1707         for(x=0; x<srcWidth; x++){
1708                 dst[2*x+0]=
1709                 dst[2*x+1]= src[x];
1710         }
1711 #endif
1712
1713 #ifdef HAVE_MMX
1714 asm volatile(   EMMS" \n\t"
1715                 SFENCE" \n\t"
1716                 :::"memory");
1717 #endif
1718 }
1719
1720 /**
1721  * Converts packed UYVY (bytes U Y0 V Y1 for every two pixels) to planar YV12.
1722  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1723  * problem for anyone then tell me, and I'll fix it)
1724  * chrominance data is only taken from every second line, the others are ignored. FIXME write HQ version
1725  */
1726 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1727         unsigned int width, unsigned int height,
1728         int lumStride, int chromStride, int srcStride)
1729 {
1730         unsigned y;
1731         const unsigned chromWidth= width>>1; // one U and one V sample per two pixels
1732         for(y=0; y<height; y+=2) // two source lines per iteration
1733         {
1734 #ifdef HAVE_MMX
1735                 asm volatile( // first line of the pair: extract Y, U and V
1736                         "xorl %%eax, %%eax              \n\t" // eax = chroma sample index
1737                         "pcmpeqw %%mm7, %%mm7           \n\t"
1738                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1739                         ".balign 16                     \n\t"
1740                         "1:                             \n\t"
1741                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1742                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1743                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1744                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
1745                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
1746                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
1747                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
1748                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1749                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1750                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1751                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1752
1753                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t" // store luma 0..7
1754
1755                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
1756                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
1757                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
1758                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
1759                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
1760                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
1761                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1762                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1763                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1764                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1765
1766                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t" // store luma 8..15
1767
1768                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1769                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1770                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1771                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1772                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1773                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1774                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1775                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1776
1777                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t" // store 8 V samples
1778                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t" // store 8 U samples
1779
1780                         "addl $8, %%eax                 \n\t" // 8 chroma samples = 16 pixels per iteration
1781                         "cmpl %4, %%eax                 \n\t"
1782                         " jb 1b                         \n\t"
1783                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1784                         : "memory", "%eax"
1785                 );
1786
1787                 ydst += lumStride;
1788                 src  += srcStride;
1789
1790                 asm volatile( // second line of the pair: only Y is extracted, chroma is dropped
1791                         "xorl %%eax, %%eax              \n\t"
1792                         ".balign 16                     \n\t"
1793                         "1:                             \n\t"
1794                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1795                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1796                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1797                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(8)
1798                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // UYVY UYVY(12)
1799                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1800                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1801                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1802                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1803                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1804                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1805
1806                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
1807                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
1808
1809                         "addl $8, %%eax                 \n\t"
1810                         "cmpl %4, %%eax                 \n\t"
1811                         " jb 1b                         \n\t"
1812
1813                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1814                         : "memory", "%eax"
1815                 );
1816 #else
1817                 unsigned i;
1818                 for(i=0; i<chromWidth; i++) // first line of the pair: Y, U and V
1819                 {
1820                         udst[i]         = src[4*i+0];
1821                         ydst[2*i+0]     = src[4*i+1];
1822                         vdst[i]         = src[4*i+2];
1823                         ydst[2*i+1]     = src[4*i+3];
1824                 }
1825                 ydst += lumStride;
1826                 src  += srcStride;
1827
1828                 for(i=0; i<chromWidth; i++) // second line of the pair: Y only
1829                 {
1830                         ydst[2*i+0]     = src[4*i+1];
1831                         ydst[2*i+1]     = src[4*i+3];
1832                 }
1833 #endif
1834                 udst += chromStride; // one chroma row per two luma rows
1835                 vdst += chromStride;
1836                 ydst += lumStride;
1837                 src  += srcStride;
1838         }
1839 #ifdef HAVE_MMX
1840 asm volatile(   EMMS" \n\t"
1841                 SFENCE" \n\t"
1842                 :::"memory");
1843 #endif
1844 }
1845
1846 /**
1847  *
1848  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1849  * problem for anyone then tell me, and ill fix it)
1850  * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
1851  */
1852 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1853         unsigned int width, unsigned int height,
1854         int lumStride, int chromStride, int srcStride)
1855 {
1856         unsigned y;
1857         const unsigned chromWidth= width>>1;
1858 #ifdef HAVE_MMX
1859         for(y=0; y<height-2; y+=2)
1860         {
1861                 unsigned i;
1862                 for(i=0; i<2; i++)
1863                 {
1864                         asm volatile(
1865                                 "movl %2, %%eax                 \n\t"
1866                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1867                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1868                                 "pxor %%mm7, %%mm7              \n\t"
1869                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1870                                 ".balign 16                     \n\t"
1871                                 "1:                             \n\t"
1872                                 PREFETCH" 64(%0, %%ebx)         \n\t"
1873                                 "movd (%0, %%ebx), %%mm0        \n\t"
1874                                 "movd 3(%0, %%ebx), %%mm1       \n\t"
1875                                 "punpcklbw %%mm7, %%mm0         \n\t"
1876                                 "punpcklbw %%mm7, %%mm1         \n\t"
1877                                 "movd 6(%0, %%ebx), %%mm2       \n\t"
1878                                 "movd 9(%0, %%ebx), %%mm3       \n\t"
1879                                 "punpcklbw %%mm7, %%mm2         \n\t"
1880                                 "punpcklbw %%mm7, %%mm3         \n\t"
1881                                 "pmaddwd %%mm6, %%mm0           \n\t"
1882                                 "pmaddwd %%mm6, %%mm1           \n\t"
1883                                 "pmaddwd %%mm6, %%mm2           \n\t"
1884                                 "pmaddwd %%mm6, %%mm3           \n\t"
1885 #ifndef FAST_BGR2YV12
1886                                 "psrad $8, %%mm0                \n\t"
1887                                 "psrad $8, %%mm1                \n\t"
1888                                 "psrad $8, %%mm2                \n\t"
1889                                 "psrad $8, %%mm3                \n\t"
1890 #endif
1891                                 "packssdw %%mm1, %%mm0          \n\t"
1892                                 "packssdw %%mm3, %%mm2          \n\t"
1893                                 "pmaddwd %%mm5, %%mm0           \n\t"
1894                                 "pmaddwd %%mm5, %%mm2           \n\t"
1895                                 "packssdw %%mm2, %%mm0          \n\t"
1896                                 "psraw $7, %%mm0                \n\t"
1897
1898                                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1899                                 "movd 15(%0, %%ebx), %%mm1      \n\t"
1900                                 "punpcklbw %%mm7, %%mm4         \n\t"
1901                                 "punpcklbw %%mm7, %%mm1         \n\t"
1902                                 "movd 18(%0, %%ebx), %%mm2      \n\t"
1903                                 "movd 21(%0, %%ebx), %%mm3      \n\t"
1904                                 "punpcklbw %%mm7, %%mm2         \n\t"
1905                                 "punpcklbw %%mm7, %%mm3         \n\t"
1906                                 "pmaddwd %%mm6, %%mm4           \n\t"
1907                                 "pmaddwd %%mm6, %%mm1           \n\t"
1908                                 "pmaddwd %%mm6, %%mm2           \n\t"
1909                                 "pmaddwd %%mm6, %%mm3           \n\t"
1910 #ifndef FAST_BGR2YV12
1911                                 "psrad $8, %%mm4                \n\t"
1912                                 "psrad $8, %%mm1                \n\t"
1913                                 "psrad $8, %%mm2                \n\t"
1914                                 "psrad $8, %%mm3                \n\t"
1915 #endif
1916                                 "packssdw %%mm1, %%mm4          \n\t"
1917                                 "packssdw %%mm3, %%mm2          \n\t"
1918                                 "pmaddwd %%mm5, %%mm4           \n\t"
1919                                 "pmaddwd %%mm5, %%mm2           \n\t"
1920                                 "addl $24, %%ebx                \n\t"
1921                                 "packssdw %%mm2, %%mm4          \n\t"
1922                                 "psraw $7, %%mm4                \n\t"
1923
1924                                 "packuswb %%mm4, %%mm0          \n\t"
1925                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1926
1927                                 MOVNTQ" %%mm0, (%1, %%eax)      \n\t"
1928                                 "addl $8, %%eax                 \n\t"
1929                                 " js 1b                         \n\t"
1930                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
1931                                 : "%eax", "%ebx"
1932                         );
1933                         ydst += lumStride;
1934                         src  += srcStride;
1935                 }
1936                 src -= srcStride*2;
1937                 asm volatile(
1938                         "movl %4, %%eax                 \n\t"
1939                         "movq "MANGLE(w1111)", %%mm5            \n\t"
1940                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1941                         "pxor %%mm7, %%mm7              \n\t"
1942                         "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1943                         "addl %%ebx, %%ebx              \n\t"
1944                         ".balign 16                     \n\t"
1945                         "1:                             \n\t"
1946                         PREFETCH" 64(%0, %%ebx)         \n\t"
1947                         PREFETCH" 64(%1, %%ebx)         \n\t"
1948 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1949                         "movq (%0, %%ebx), %%mm0        \n\t"
1950                         "movq (%1, %%ebx), %%mm1        \n\t"
1951                         "movq 6(%0, %%ebx), %%mm2       \n\t"
1952                         "movq 6(%1, %%ebx), %%mm3       \n\t"
1953                         PAVGB" %%mm1, %%mm0             \n\t"
1954                         PAVGB" %%mm3, %%mm2             \n\t"
1955                         "movq %%mm0, %%mm1              \n\t"
1956                         "movq %%mm2, %%mm3              \n\t"
1957                         "psrlq $24, %%mm0               \n\t"
1958                         "psrlq $24, %%mm2               \n\t"
1959                         PAVGB" %%mm1, %%mm0             \n\t"
1960                         PAVGB" %%mm3, %%mm2             \n\t"
1961                         "punpcklbw %%mm7, %%mm0         \n\t"
1962                         "punpcklbw %%mm7, %%mm2         \n\t"
1963 #else
1964                         "movd (%0, %%ebx), %%mm0        \n\t"
1965                         "movd (%1, %%ebx), %%mm1        \n\t"
1966                         "movd 3(%0, %%ebx), %%mm2       \n\t"
1967                         "movd 3(%1, %%ebx), %%mm3       \n\t"
1968                         "punpcklbw %%mm7, %%mm0         \n\t"
1969                         "punpcklbw %%mm7, %%mm1         \n\t"
1970                         "punpcklbw %%mm7, %%mm2         \n\t"
1971                         "punpcklbw %%mm7, %%mm3         \n\t"
1972                         "paddw %%mm1, %%mm0             \n\t"
1973                         "paddw %%mm3, %%mm2             \n\t"
1974                         "paddw %%mm2, %%mm0             \n\t"
1975                         "movd 6(%0, %%ebx), %%mm4       \n\t"
1976                         "movd 6(%1, %%ebx), %%mm1       \n\t"
1977                         "movd 9(%0, %%ebx), %%mm2       \n\t"
1978                         "movd 9(%1, %%ebx), %%mm3       \n\t"
1979                         "punpcklbw %%mm7, %%mm4         \n\t"
1980                         "punpcklbw %%mm7, %%mm1         \n\t"
1981                         "punpcklbw %%mm7, %%mm2         \n\t"
1982                         "punpcklbw %%mm7, %%mm3         \n\t"
1983                         "paddw %%mm1, %%mm4             \n\t"
1984                         "paddw %%mm3, %%mm2             \n\t"
1985                         "paddw %%mm4, %%mm2             \n\t"
1986                         "psrlw $2, %%mm0                \n\t"
1987                         "psrlw $2, %%mm2                \n\t"
1988 #endif
1989                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1990                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1991
1992                         "pmaddwd %%mm0, %%mm1           \n\t"
1993                         "pmaddwd %%mm2, %%mm3           \n\t"
1994                         "pmaddwd %%mm6, %%mm0           \n\t"
1995                         "pmaddwd %%mm6, %%mm2           \n\t"
1996 #ifndef FAST_BGR2YV12
1997                         "psrad $8, %%mm0                \n\t"
1998                         "psrad $8, %%mm1                \n\t"
1999                         "psrad $8, %%mm2                \n\t"
2000                         "psrad $8, %%mm3                \n\t"
2001 #endif
2002                         "packssdw %%mm2, %%mm0          \n\t"
2003                         "packssdw %%mm3, %%mm1          \n\t"
2004                         "pmaddwd %%mm5, %%mm0           \n\t"
2005                         "pmaddwd %%mm5, %%mm1           \n\t"
2006                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2007                         "psraw $7, %%mm0                \n\t"
2008
2009 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2010                         "movq 12(%0, %%ebx), %%mm4      \n\t"
2011                         "movq 12(%1, %%ebx), %%mm1      \n\t"
2012                         "movq 18(%0, %%ebx), %%mm2      \n\t"
2013                         "movq 18(%1, %%ebx), %%mm3      \n\t"
2014                         PAVGB" %%mm1, %%mm4             \n\t"
2015                         PAVGB" %%mm3, %%mm2             \n\t"
2016                         "movq %%mm4, %%mm1              \n\t"
2017                         "movq %%mm2, %%mm3              \n\t"
2018                         "psrlq $24, %%mm4               \n\t"
2019                         "psrlq $24, %%mm2               \n\t"
2020                         PAVGB" %%mm1, %%mm4             \n\t"
2021                         PAVGB" %%mm3, %%mm2             \n\t"
2022                         "punpcklbw %%mm7, %%mm4         \n\t"
2023                         "punpcklbw %%mm7, %%mm2         \n\t"
2024 #else
2025                         "movd 12(%0, %%ebx), %%mm4      \n\t"
2026                         "movd 12(%1, %%ebx), %%mm1      \n\t"
2027                         "movd 15(%0, %%ebx), %%mm2      \n\t"
2028                         "movd 15(%1, %%ebx), %%mm3      \n\t"
2029                         "punpcklbw %%mm7, %%mm4         \n\t"
2030                         "punpcklbw %%mm7, %%mm1         \n\t"
2031                         "punpcklbw %%mm7, %%mm2         \n\t"
2032                         "punpcklbw %%mm7, %%mm3         \n\t"
2033                         "paddw %%mm1, %%mm4             \n\t"
2034                         "paddw %%mm3, %%mm2             \n\t"
2035                         "paddw %%mm2, %%mm4             \n\t"
2036                         "movd 18(%0, %%ebx), %%mm5      \n\t"
2037                         "movd 18(%1, %%ebx), %%mm1      \n\t"
2038                         "movd 21(%0, %%ebx), %%mm2      \n\t"
2039                         "movd 21(%1, %%ebx), %%mm3      \n\t"
2040                         "punpcklbw %%mm7, %%mm5         \n\t"
2041                         "punpcklbw %%mm7, %%mm1         \n\t"
2042                         "punpcklbw %%mm7, %%mm2         \n\t"
2043                         "punpcklbw %%mm7, %%mm3         \n\t"
2044                         "paddw %%mm1, %%mm5             \n\t"
2045                         "paddw %%mm3, %%mm2             \n\t"
2046                         "paddw %%mm5, %%mm2             \n\t"
2047                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2048                         "psrlw $2, %%mm4                \n\t"
2049                         "psrlw $2, %%mm2                \n\t"
2050 #endif
2051                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2052                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2053
2054                         "pmaddwd %%mm4, %%mm1           \n\t"
2055                         "pmaddwd %%mm2, %%mm3           \n\t"
2056                         "pmaddwd %%mm6, %%mm4           \n\t"
2057                         "pmaddwd %%mm6, %%mm2           \n\t"
2058 #ifndef FAST_BGR2YV12
2059                         "psrad $8, %%mm4                \n\t"
2060                         "psrad $8, %%mm1                \n\t"
2061                         "psrad $8, %%mm2                \n\t"
2062                         "psrad $8, %%mm3                \n\t"
2063 #endif
2064                         "packssdw %%mm2, %%mm4          \n\t"
2065                         "packssdw %%mm3, %%mm1          \n\t"
2066                         "pmaddwd %%mm5, %%mm4           \n\t"
2067                         "pmaddwd %%mm5, %%mm1           \n\t"
2068                         "addl $24, %%ebx                \n\t"
2069                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2070                         "psraw $7, %%mm4                \n\t"
2071
2072                         "movq %%mm0, %%mm1              \n\t"
2073                         "punpckldq %%mm4, %%mm0         \n\t"
2074                         "punpckhdq %%mm4, %%mm1         \n\t"
2075                         "packsswb %%mm1, %%mm0          \n\t"
2076                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2077
2078                         "movd %%mm0, (%2, %%eax)        \n\t"
2079                         "punpckhdq %%mm0, %%mm0         \n\t"
2080                         "movd %%mm0, (%3, %%eax)        \n\t"
2081                         "addl $4, %%eax                 \n\t"
2082                         " js 1b                         \n\t"
2083                         : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2084                         : "%eax", "%ebx"
2085                 );
2086
2087                 udst += chromStride;
2088                 vdst += chromStride;
2089                 src  += srcStride*2;
2090         }
2091
2092         asm volatile(   EMMS" \n\t"
2093                         SFENCE" \n\t"
2094                         :::"memory");
2095 #else
2096         y=0;
2097 #endif
2098         for(; y<height; y+=2)
2099         {
2100                 unsigned i;
2101                 for(i=0; i<chromWidth; i++)
2102                 {
2103                         unsigned int b= src[6*i+0];
2104                         unsigned int g= src[6*i+1];
2105                         unsigned int r= src[6*i+2];
2106
2107                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2108                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2109                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2110
2111                         udst[i]         = U;
2112                         vdst[i]         = V;
2113                         ydst[2*i]       = Y;
2114
2115                         b= src[6*i+3];
2116                         g= src[6*i+4];
2117                         r= src[6*i+5];
2118
2119                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2120                         ydst[2*i+1]     = Y;
2121                 }
2122                 ydst += lumStride;
2123                 src  += srcStride;
2124
2125                 for(i=0; i<chromWidth; i++)
2126                 {
2127                         unsigned int b= src[6*i+0];
2128                         unsigned int g= src[6*i+1];
2129                         unsigned int r= src[6*i+2];
2130
2131                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2132
2133                         ydst[2*i]       = Y;
2134
2135                         b= src[6*i+3];
2136                         g= src[6*i+4];
2137                         r= src[6*i+5];
2138
2139                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2140                         ydst[2*i+1]     = Y;
2141                 }
2142                 udst += chromStride;
2143                 vdst += chromStride;
2144                 ydst += lumStride;
2145                 src  += srcStride;
2146         }
2147 }
2148
2149 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2150                             unsigned width, unsigned height, int src1Stride,
2151                             int src2Stride, int dstStride){
2152         unsigned h;
2153
2154         for(h=0; h < height; h++)
2155         {
2156                 unsigned w;
2157
2158 #ifdef HAVE_MMX
2159 #ifdef HAVE_SSE2
2160                 asm(
2161                         "xorl %%eax, %%eax              \n\t"
2162                         "1:                             \n\t"
2163                         PREFETCH" 64(%1, %%eax)         \n\t"
2164                         PREFETCH" 64(%2, %%eax)         \n\t"
2165                         "movdqa (%1, %%eax), %%xmm0     \n\t"
2166                         "movdqa (%1, %%eax), %%xmm1     \n\t"
2167                         "movdqa (%2, %%eax), %%xmm2     \n\t"
2168                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2169                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2170                         "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2171                         "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2172                         "addl $16, %%eax                        \n\t"
2173                         "cmpl %3, %%eax                 \n\t"
2174                         " jb 1b                         \n\t"
2175                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2176                         : "memory", "%eax"
2177                 );
2178 #else
2179                 asm(
2180                         "xorl %%eax, %%eax              \n\t"
2181                         "1:                             \n\t"
2182                         PREFETCH" 64(%1, %%eax)         \n\t"
2183                         PREFETCH" 64(%2, %%eax)         \n\t"
2184                         "movq (%1, %%eax), %%mm0        \n\t"
2185                         "movq 8(%1, %%eax), %%mm2       \n\t"
2186                         "movq %%mm0, %%mm1              \n\t"
2187                         "movq %%mm2, %%mm3              \n\t"
2188                         "movq (%2, %%eax), %%mm4        \n\t"
2189                         "movq 8(%2, %%eax), %%mm5       \n\t"
2190                         "punpcklbw %%mm4, %%mm0         \n\t"
2191                         "punpckhbw %%mm4, %%mm1         \n\t"
2192                         "punpcklbw %%mm5, %%mm2         \n\t"
2193                         "punpckhbw %%mm5, %%mm3         \n\t"
2194                         MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
2195                         MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
2196                         MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2197                         MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2198                         "addl $16, %%eax                        \n\t"
2199                         "cmpl %3, %%eax                 \n\t"
2200                         " jb 1b                         \n\t"
2201                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2202                         : "memory", "%eax"
2203                 );
2204 #endif
2205                 for(w= (width&(~15)); w < width; w++)
2206                 {
2207                         dest[2*w+0] = src1[w];
2208                         dest[2*w+1] = src2[w];
2209                 }
2210 #else
2211                 for(w=0; w < width; w++)
2212                 {
2213                         dest[2*w+0] = src1[w];
2214                         dest[2*w+1] = src2[w];
2215                 }
2216 #endif
2217                 dest += dstStride;
2218                 src1 += src1Stride;
2219                 src2 += src2Stride;
2220         }
2221 #ifdef HAVE_MMX
2222         asm(
2223                 EMMS" \n\t"
2224                 SFENCE" \n\t"
2225                 ::: "memory"
2226                 );
2227 #endif
2228 }
2229
/**
 * Doubles one byte plane horizontally and vertically by sample replication.
 *
 * For every destination row y in [0, h) the source row (y >> 1) is read, and
 * each of its first w bytes is written twice, so 2*w bytes are produced per
 * destination row.
 *
 * @param src        first source row
 * @param dst        first destination row
 * @param w          number of source bytes consumed per row
 * @param h          number of destination rows produced
 * @param srcStride  byte distance between source rows (may be negative)
 * @param dstStride  byte distance between destination rows (may be negative)
 *
 * The MMX path leaves the MMX state dirty and may use non-temporal stores;
 * the caller is responsible for issuing EMMS/SFENCE afterwards.
 */
static inline void RENAME(vu9_to_vu12_plane)(const uint8_t *src, uint8_t *dst,
                        int w, int h,
                        int srcStride, int dstStride)
{
    int x,y;
    for(y=0;y<h;y++){
        /* Row offsets are computed in signed, pointer-sized arithmetic so a
           negative stride steps backwards instead of wrapping to a huge
           unsigned offset. */
        const uint8_t* s=src+(ptrdiff_t)srcStride*(y>>1);
        uint8_t* d=dst+(ptrdiff_t)dstStride*y;
        x=0;
#ifdef HAVE_MMX
        /* 32 source bytes -> 64 destination bytes per iteration.  x and w are
           both signed, so for w < 31 the bound is negative and the loop is
           skipped rather than running off the end of the row. */
        for(;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   16%1, %%mm4\n\t"
                "movq   24%1, %%mm6\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "movq   %%mm4, %%mm5\n\t"
                "movq   %%mm6, %%mm7\n\t"
                /* unpacking a register with itself duplicates every byte */
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm1, 8%0\n\t"
                MOVNTQ" %%mm2, 16%0\n\t"
                MOVNTQ" %%mm3, 24%0\n\t"
                MOVNTQ" %%mm4, 32%0\n\t"
                MOVNTQ" %%mm5, 40%0\n\t"
                MOVNTQ" %%mm6, 48%0\n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s[x])
                :"memory");
        }
#endif
        /* scalar tail (and the whole row when MMX is unavailable) */
        for(;x<w;x++) d[2*x]=d[2*x+1]=s[x];
    }
}

/**
 * Upscales two independent byte planes by a factor of two in both directions
 * using sample replication (presumably the chroma planes of a 4x4-subsampled
 * picture being converted to 2x2 subsampling -- confirm against the callers).
 *
 * For each plane, height/2 destination rows are written; destination row y is
 * built from source row (y >> 1), and each of the first width/2 source bytes
 * is stored twice, giving 2*(width/2) bytes per destination row.
 *
 * @param src1,src2              first row of each source plane
 * @param dst1,dst2              first row of each destination plane
 * @param width,height           dimensions; only width/2 and height/2 are used
 * @param srcStride1,srcStride2  byte distance between source rows
 * @param dstStride1,dstStride2  byte distance between destination rows
 *
 * Strides may be negative.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                        uint8_t *dst1, uint8_t *dst2,
                        unsigned width, unsigned height,
                        int srcStride1, int srcStride2,
                        int dstStride1, int dstStride2)
{
    /* Halving an unsigned value always yields something that fits in int. */
    const int w=width/2;
    const int h=height/2;
#ifdef HAVE_MMX
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    RENAME(vu9_to_vu12_plane)(src1, dst1, w, h, srcStride1, dstStride1);
    RENAME(vu9_to_vu12_plane)(src2, dst2, w, h, srcStride2, dstStride2);
#ifdef HAVE_MMX
    /* leave MMX state and make the non-temporal stores globally visible */
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}
2333
/**
 * Interleaves one luma plane and two chroma planes into a packed
 * Y U Y V byte stream.
 *
 * For every row y in [0, height) and every x in [0, width/2) eight bytes are
 * written to dst:
 *     Y[4x] U[x] Y[4x+1] V[x] Y[4x+2] U[x] Y[4x+3] V[x]
 * where the luma bytes come from row y of src1 and the chroma bytes from row
 * (y >> 2) of src2 (U slot) and src3 (V slot).  Each row therefore reads
 * 4*(width/2) luma bytes and width/2 bytes from each chroma plane, and writes
 * 8*(width/2) bytes.
 * NOTE(review): that is 2*width luma bytes per row; check that callers pass a
 * width consistent with this.
 *
 * @param src1        first row of the luma plane
 * @param src2        first row of the plane placed in the U slots
 * @param src3        first row of the plane placed in the V slots
 * @param dst         first row of the packed output
 * @param width       only width/2 is used, see above
 * @param height      number of output rows
 * @param srcStride1  byte distance between rows of src1
 * @param srcStride2  byte distance between rows of src2
 * @param srcStride3  byte distance between rows of src3
 * @param dstStride   byte distance between output rows
 *
 * Strides may be negative.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                        uint8_t *dst,
                        unsigned width, unsigned height,
                        int srcStride1, int srcStride2,
                        int srcStride3, int dstStride)
{
    unsigned y;
    int x;
    /* Halving an unsigned value always yields something that fits in int. */
    const int w=width/2;
    const unsigned h=height;
#ifdef HAVE_MMX
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        PREFETCH" %2\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
#endif
    for(y=0;y<h;y++){
        /* Row offsets are computed in signed, pointer-sized arithmetic so a
           negative stride steps backwards instead of wrapping to a huge
           unsigned offset. */
        const uint8_t* yp=src1+(ptrdiff_t)srcStride1*(ptrdiff_t)y;
        const uint8_t* up=src2+(ptrdiff_t)srcStride2*(ptrdiff_t)(y>>2);
        const uint8_t* vp=src3+(ptrdiff_t)srcStride3*(ptrdiff_t)(y>>2);
        uint8_t* d=dst+(ptrdiff_t)dstStride*(ptrdiff_t)y;
        x=0;
#ifdef HAVE_MMX
        /* One iteration consumes 8 chroma samples per plane and 32 luma
           bytes and stores 64 bytes, so it may only run while at least 8
           chroma samples remain; the scalar loop below handles the rest. */
        for(;x<w-7;x+=8)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                PREFETCH" 32%2\n\t"
                PREFETCH" 32%3\n\t"
                "movq   %1, %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq   %2, %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
                "movq   %3, %%mm2\n\t"       /* V0V1V2V3V4V5V6V7 */
                "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
                "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq   %%mm1, %%mm6\n\t"
                "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm3, 8%0\n\t"

                "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq   8%1, %%mm0\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16%0\n\t"
                MOVNTQ" %%mm3, 24%0\n\t"

                "movq   %%mm4, %%mm6\n\t"
                "movq   16%1, %%mm0\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "punpcklbw %%mm5, %%mm4\n\t"
                "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32%0\n\t"
                MOVNTQ" %%mm3, 40%0\n\t"

                "punpckhbw %%mm5, %%mm6\n\t"
                "movq   24%1, %%mm0\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48%0\n\t"
                MOVNTQ" %%mm3, 56%0\n\t"

                :"=m"(d[8*x])
                :"m"(yp[4*x]),"m"(up[x]),"m"(vp[x])
                :"memory");
        }
#endif
        /* scalar tail (and the whole row when MMX is unavailable) */
        for(;x<w;x++)
        {
            const int x2=4*x; /* four luma bytes per chroma sample */
            d[8*x+0]=yp[x2];
            d[8*x+1]=up[x];
            d[8*x+2]=yp[x2+1];
            d[8*x+3]=vp[x];
            d[8*x+4]=yp[x2+2];
            d[8*x+5]=up[x];
            d[8*x+6]=yp[x2+3];
            d[8*x+7]=vp[x];
        }
    }
#ifdef HAVE_MMX
    /* leave MMX state and make the non-temporal stores globally visible */
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}