]> git.sesse.net Git - ffmpeg/blob - libswscale/rgb2rgb_template.c
f06e4ed72d07021e391ffd48aa5fc98cece998a6
[ffmpeg] / libswscale / rgb2rgb_template.c
1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB convertor
4  *  pluralize by Software PAL8 to RGB convertor
5  *               Software YUV to YUV convertor
6  *               Software YUV to RGB convertor
7  *  Written by Nick Kurshev.
8  *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
9  *  lot of big-endian byteorder fixes by Alex Beregszaszi
10  *
11  * This file is part of FFmpeg.
12  *
13  * FFmpeg is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * FFmpeg is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with FFmpeg; if not, write to the Free Software
25  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26  * 
27  * the C code (not assembly, mmx, ...) of this file can be used
28  * under the LGPL license too
29  */
30
31 #include <stddef.h>
32 #include <inttypes.h> /* for __WORDSIZE */
33
/* Use the build system's word size when the C library does not supply one. */
#if !defined(__WORDSIZE)
#define __WORDSIZE MP_WORDSIZE
#endif

/*
 * Drop any earlier definitions of the instruction-selection macros so they can
 * be redefined below (presumably because this template is included once per
 * CPU flavour -- TODO confirm against the including file).
 */
#undef MMREG_SIZE
#undef PREFETCH
#undef PREFETCHW
#undef PAVGB
#undef EMMS
#undef MOVNTQ
#undef SFENCE

/* Width in bytes of the widest SIMD register assumed by the code. */
#if defined(HAVE_SSE2)
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch, byte-average and MMX-state-reset mnemonics.
 * Without 3DNow! or MMX2 the prefetch macros expand to an assembler comment,
 * and PAVGB is left undefined.
 */
#if defined(HAVE_3DNOW)
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS      "femms"
#else
#define EMMS      "emms"
#if defined(HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#elif defined(__APPLE__)
#define PREFETCH  "#"
#define PREFETCHW "#"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
#endif

/* Quadword store and store fence: movntq/sfence with MMX2, plain movq and an assembler comment otherwise. */
#if defined(HAVE_MMX2)
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
85
86 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
87 {
88   uint8_t *dest = dst;
89   const uint8_t *s = src;
90   const uint8_t *end;
91 #ifdef HAVE_MMX
92   const uint8_t *mm_end;
93 #endif
94   end = s + src_size;
95 #ifdef HAVE_MMX
96   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
97   mm_end = end - 23;
98   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
99   while(s < mm_end)
100   {
101     __asm __volatile(
102         PREFETCH"       32%1\n\t"
103         "movd   %1, %%mm0\n\t"
104         "punpckldq 3%1, %%mm0\n\t"
105         "movd   6%1, %%mm1\n\t"
106         "punpckldq 9%1, %%mm1\n\t"
107         "movd   12%1, %%mm2\n\t"
108         "punpckldq 15%1, %%mm2\n\t"
109         "movd   18%1, %%mm3\n\t"
110         "punpckldq 21%1, %%mm3\n\t"
111         "pand   %%mm7, %%mm0\n\t"
112         "pand   %%mm7, %%mm1\n\t"
113         "pand   %%mm7, %%mm2\n\t"
114         "pand   %%mm7, %%mm3\n\t"
115         MOVNTQ" %%mm0, %0\n\t"
116         MOVNTQ" %%mm1, 8%0\n\t"
117         MOVNTQ" %%mm2, 16%0\n\t"
118         MOVNTQ" %%mm3, 24%0"
119         :"=m"(*dest)
120         :"m"(*s)
121         :"memory");
122     dest += 32;
123     s += 24;
124   }
125   __asm __volatile(SFENCE:::"memory");
126   __asm __volatile(EMMS:::"memory");
127 #endif
128   while(s < end)
129   {
130 #ifdef WORDS_BIGENDIAN
131     /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
132     *dest++ = 0;
133     *dest++ = s[2];
134     *dest++ = s[1];
135     *dest++ = s[0];
136     s+=3;
137 #else
138     *dest++ = *s++;
139     *dest++ = *s++;
140     *dest++ = *s++;
141     *dest++ = 0;
142 #endif
143   }
144 }
145
146 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
147 {
148   uint8_t *dest = dst;
149   const uint8_t *s = src;
150   const uint8_t *end;
151 #ifdef HAVE_MMX
152   const uint8_t *mm_end;
153 #endif
154   end = s + src_size;
155 #ifdef HAVE_MMX
156   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
157   mm_end = end - 31;
158   while(s < mm_end)
159   {
160     __asm __volatile(
161         PREFETCH"       32%1\n\t"
162         "movq   %1, %%mm0\n\t"
163         "movq   8%1, %%mm1\n\t"
164         "movq   16%1, %%mm4\n\t"
165         "movq   24%1, %%mm5\n\t"
166         "movq   %%mm0, %%mm2\n\t"
167         "movq   %%mm1, %%mm3\n\t"
168         "movq   %%mm4, %%mm6\n\t"
169         "movq   %%mm5, %%mm7\n\t"
170         "psrlq  $8, %%mm2\n\t"
171         "psrlq  $8, %%mm3\n\t"
172         "psrlq  $8, %%mm6\n\t"
173         "psrlq  $8, %%mm7\n\t"
174         "pand   %2, %%mm0\n\t"
175         "pand   %2, %%mm1\n\t"
176         "pand   %2, %%mm4\n\t"
177         "pand   %2, %%mm5\n\t"
178         "pand   %3, %%mm2\n\t"
179         "pand   %3, %%mm3\n\t"
180         "pand   %3, %%mm6\n\t"
181         "pand   %3, %%mm7\n\t"
182         "por    %%mm2, %%mm0\n\t"
183         "por    %%mm3, %%mm1\n\t"
184         "por    %%mm6, %%mm4\n\t"
185         "por    %%mm7, %%mm5\n\t"
186
187         "movq   %%mm1, %%mm2\n\t"
188         "movq   %%mm4, %%mm3\n\t"
189         "psllq  $48, %%mm2\n\t"
190         "psllq  $32, %%mm3\n\t"
191         "pand   %4, %%mm2\n\t"
192         "pand   %5, %%mm3\n\t"
193         "por    %%mm2, %%mm0\n\t"
194         "psrlq  $16, %%mm1\n\t"
195         "psrlq  $32, %%mm4\n\t"
196         "psllq  $16, %%mm5\n\t"
197         "por    %%mm3, %%mm1\n\t"
198         "pand   %6, %%mm5\n\t"
199         "por    %%mm5, %%mm4\n\t"
200
201         MOVNTQ" %%mm0, %0\n\t"
202         MOVNTQ" %%mm1, 8%0\n\t"
203         MOVNTQ" %%mm4, 16%0"
204         :"=m"(*dest)
205         :"m"(*s),"m"(mask24l),
206          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
207         :"memory");
208     dest += 24;
209     s += 32;
210   }
211   __asm __volatile(SFENCE:::"memory");
212   __asm __volatile(EMMS:::"memory");
213 #endif
214   while(s < end)
215   {
216 #ifdef WORDS_BIGENDIAN
217     /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
218     s++;
219     dest[2] = *s++;
220     dest[1] = *s++;
221     dest[0] = *s++;
222     dest += 3;
223 #else
224     *dest++ = *s++;
225     *dest++ = *s++;
226     *dest++ = *s++;
227     s++;
228 #endif
229   }
230 }
231
232 /*
233  Original by Strepto/Astral
234  ported to gcc & bugfixed : A'rpi
235  MMX2, 3DNOW optimization by Nick Kurshev
236  32bit c version, and and&add trick by Michael Niedermayer
237 */
238 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
239 {
240   register const uint8_t* s=src;
241   register uint8_t* d=dst;
242   register const uint8_t *end;
243   const uint8_t *mm_end;
244   end = s + src_size;
245 #ifdef HAVE_MMX
246   __asm __volatile(PREFETCH"    %0"::"m"(*s));
247   __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
248   mm_end = end - 15;
249   while(s<mm_end)
250   {
251         __asm __volatile(
252                 PREFETCH"       32%1\n\t"
253                 "movq   %1, %%mm0\n\t"
254                 "movq   8%1, %%mm2\n\t"
255                 "movq   %%mm0, %%mm1\n\t"
256                 "movq   %%mm2, %%mm3\n\t"
257                 "pand   %%mm4, %%mm0\n\t"
258                 "pand   %%mm4, %%mm2\n\t"
259                 "paddw  %%mm1, %%mm0\n\t"
260                 "paddw  %%mm3, %%mm2\n\t"
261                 MOVNTQ" %%mm0, %0\n\t"
262                 MOVNTQ" %%mm2, 8%0"
263                 :"=m"(*d)
264                 :"m"(*s)
265                 );
266         d+=16;
267         s+=16;
268   }
269   __asm __volatile(SFENCE:::"memory");
270   __asm __volatile(EMMS:::"memory");
271 #endif
272     mm_end = end - 3;
273     while(s < mm_end)
274     {
275         register unsigned x= *((uint32_t *)s);
276         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
277         d+=4;
278         s+=4;
279     }
280     if(s < end)
281     {
282         register unsigned short x= *((uint16_t *)s);
283         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
284     }
285 }
286
287 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
288 {
289   register const uint8_t* s=src;
290   register uint8_t* d=dst;
291   register const uint8_t *end;
292   const uint8_t *mm_end;
293   end = s + src_size;
294 #ifdef HAVE_MMX
295   __asm __volatile(PREFETCH"    %0"::"m"(*s));
296   __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
297   __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
298   mm_end = end - 15;
299   while(s<mm_end)
300   {
301         __asm __volatile(
302                 PREFETCH"       32%1\n\t"
303                 "movq   %1, %%mm0\n\t"
304                 "movq   8%1, %%mm2\n\t"
305                 "movq   %%mm0, %%mm1\n\t"
306                 "movq   %%mm2, %%mm3\n\t"
307                 "psrlq  $1, %%mm0\n\t"
308                 "psrlq  $1, %%mm2\n\t"
309                 "pand   %%mm7, %%mm0\n\t"
310                 "pand   %%mm7, %%mm2\n\t"
311                 "pand   %%mm6, %%mm1\n\t"
312                 "pand   %%mm6, %%mm3\n\t"
313                 "por    %%mm1, %%mm0\n\t"
314                 "por    %%mm3, %%mm2\n\t"
315                 MOVNTQ" %%mm0, %0\n\t"
316                 MOVNTQ" %%mm2, 8%0"
317                 :"=m"(*d)
318                 :"m"(*s)
319                 );
320         d+=16;
321         s+=16;
322   }
323   __asm __volatile(SFENCE:::"memory");
324   __asm __volatile(EMMS:::"memory");
325 #endif
326     mm_end = end - 3;
327     while(s < mm_end)
328     {
329         register uint32_t x= *((uint32_t *)s);
330         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
331         s+=4;
332         d+=4;
333     }
334     if(s < end)
335     {
336         register uint16_t x= *((uint16_t *)s);
337         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
338         s+=2;
339         d+=2;
340     }
341 }
342
343 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
344 {
345         const uint8_t *s = src;
346         const uint8_t *end;
347 #ifdef HAVE_MMX
348         const uint8_t *mm_end;
349 #endif
350         uint16_t *d = (uint16_t *)dst;
351         end = s + src_size;
352 #ifdef HAVE_MMX
353         mm_end = end - 15;
354 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
355         asm volatile(
356                 "movq %3, %%mm5                 \n\t"
357                 "movq %4, %%mm6                 \n\t"
358                 "movq %5, %%mm7                 \n\t"
359                 "jmp 2f                         \n\t"
360                 ASMALIGN(4)
361                 "1:                             \n\t"
362                 PREFETCH" 32(%1)                \n\t"
363                 "movd   (%1), %%mm0             \n\t"
364                 "movd   4(%1), %%mm3            \n\t"
365                 "punpckldq 8(%1), %%mm0         \n\t"
366                 "punpckldq 12(%1), %%mm3        \n\t"
367                 "movq %%mm0, %%mm1              \n\t"
368                 "movq %%mm3, %%mm4              \n\t"
369                 "pand %%mm6, %%mm0              \n\t"
370                 "pand %%mm6, %%mm3              \n\t"
371                 "pmaddwd %%mm7, %%mm0           \n\t"
372                 "pmaddwd %%mm7, %%mm3           \n\t"
373                 "pand %%mm5, %%mm1              \n\t"
374                 "pand %%mm5, %%mm4              \n\t"
375                 "por %%mm1, %%mm0               \n\t"   
376                 "por %%mm4, %%mm3               \n\t"
377                 "psrld $5, %%mm0                \n\t"
378                 "pslld $11, %%mm3               \n\t"
379                 "por %%mm3, %%mm0               \n\t"
380                 MOVNTQ" %%mm0, (%0)             \n\t"
381                 "add $16, %1                    \n\t"
382                 "add $8, %0                     \n\t"
383                 "2:                             \n\t"
384                 "cmp %2, %1                     \n\t"
385                 " jb 1b                         \n\t"
386                 : "+r" (d), "+r"(s)
387                 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
388         );
389 #else
390         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
391         __asm __volatile(
392             "movq       %0, %%mm7\n\t"
393             "movq       %1, %%mm6\n\t"
394             ::"m"(red_16mask),"m"(green_16mask));
395         while(s < mm_end)
396         {
397             __asm __volatile(
398                 PREFETCH" 32%1\n\t"
399                 "movd   %1, %%mm0\n\t"
400                 "movd   4%1, %%mm3\n\t"
401                 "punpckldq 8%1, %%mm0\n\t"
402                 "punpckldq 12%1, %%mm3\n\t"
403                 "movq   %%mm0, %%mm1\n\t"
404                 "movq   %%mm0, %%mm2\n\t"
405                 "movq   %%mm3, %%mm4\n\t"
406                 "movq   %%mm3, %%mm5\n\t"
407                 "psrlq  $3, %%mm0\n\t"
408                 "psrlq  $3, %%mm3\n\t"
409                 "pand   %2, %%mm0\n\t"
410                 "pand   %2, %%mm3\n\t"
411                 "psrlq  $5, %%mm1\n\t"
412                 "psrlq  $5, %%mm4\n\t"
413                 "pand   %%mm6, %%mm1\n\t"
414                 "pand   %%mm6, %%mm4\n\t"
415                 "psrlq  $8, %%mm2\n\t"
416                 "psrlq  $8, %%mm5\n\t"
417                 "pand   %%mm7, %%mm2\n\t"
418                 "pand   %%mm7, %%mm5\n\t"
419                 "por    %%mm1, %%mm0\n\t"
420                 "por    %%mm4, %%mm3\n\t"
421                 "por    %%mm2, %%mm0\n\t"
422                 "por    %%mm5, %%mm3\n\t"
423                 "psllq  $16, %%mm3\n\t"
424                 "por    %%mm3, %%mm0\n\t"
425                 MOVNTQ" %%mm0, %0\n\t"
426                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
427                 d += 4;
428                 s += 16;
429         }
430 #endif
431         __asm __volatile(SFENCE:::"memory");
432         __asm __volatile(EMMS:::"memory");
433 #endif
434         while(s < end)
435         {
436                 register int rgb = *(uint32_t*)s; s += 4;
437                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
438         }
439 }
440
441 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
442 {
443         const uint8_t *s = src;
444         const uint8_t *end;
445 #ifdef HAVE_MMX
446         const uint8_t *mm_end;
447 #endif
448         uint16_t *d = (uint16_t *)dst;
449         end = s + src_size;
450 #ifdef HAVE_MMX
451         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
452         __asm __volatile(
453             "movq       %0, %%mm7\n\t"
454             "movq       %1, %%mm6\n\t"
455             ::"m"(red_16mask),"m"(green_16mask));
456         mm_end = end - 15;
457         while(s < mm_end)
458         {
459             __asm __volatile(
460                 PREFETCH" 32%1\n\t"
461                 "movd   %1, %%mm0\n\t"
462                 "movd   4%1, %%mm3\n\t"
463                 "punpckldq 8%1, %%mm0\n\t"
464                 "punpckldq 12%1, %%mm3\n\t"
465                 "movq   %%mm0, %%mm1\n\t"
466                 "movq   %%mm0, %%mm2\n\t"
467                 "movq   %%mm3, %%mm4\n\t"
468                 "movq   %%mm3, %%mm5\n\t"
469                 "psllq  $8, %%mm0\n\t"
470                 "psllq  $8, %%mm3\n\t"
471                 "pand   %%mm7, %%mm0\n\t"
472                 "pand   %%mm7, %%mm3\n\t"
473                 "psrlq  $5, %%mm1\n\t"
474                 "psrlq  $5, %%mm4\n\t"
475                 "pand   %%mm6, %%mm1\n\t"
476                 "pand   %%mm6, %%mm4\n\t"
477                 "psrlq  $19, %%mm2\n\t"
478                 "psrlq  $19, %%mm5\n\t"
479                 "pand   %2, %%mm2\n\t"
480                 "pand   %2, %%mm5\n\t"
481                 "por    %%mm1, %%mm0\n\t"
482                 "por    %%mm4, %%mm3\n\t"
483                 "por    %%mm2, %%mm0\n\t"
484                 "por    %%mm5, %%mm3\n\t"
485                 "psllq  $16, %%mm3\n\t"
486                 "por    %%mm3, %%mm0\n\t"
487                 MOVNTQ" %%mm0, %0\n\t"
488                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
489                 d += 4;
490                 s += 16;
491         }
492         __asm __volatile(SFENCE:::"memory");
493         __asm __volatile(EMMS:::"memory");
494 #endif
495         while(s < end)
496         {
497                 register int rgb = *(uint32_t*)s; s += 4;
498                 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
499         }
500 }
501
502 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
503 {
504         const uint8_t *s = src;
505         const uint8_t *end;
506 #ifdef HAVE_MMX
507         const uint8_t *mm_end;
508 #endif
509         uint16_t *d = (uint16_t *)dst;
510         end = s + src_size;
511 #ifdef HAVE_MMX
512         mm_end = end - 15;
513 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
514         asm volatile(
515                 "movq %3, %%mm5                 \n\t"
516                 "movq %4, %%mm6                 \n\t"
517                 "movq %5, %%mm7                 \n\t"
518                 "jmp 2f                         \n\t"
519                 ASMALIGN(4)
520                 "1:                             \n\t"
521                 PREFETCH" 32(%1)                \n\t"
522                 "movd   (%1), %%mm0             \n\t"
523                 "movd   4(%1), %%mm3            \n\t"
524                 "punpckldq 8(%1), %%mm0         \n\t"
525                 "punpckldq 12(%1), %%mm3        \n\t"
526                 "movq %%mm0, %%mm1              \n\t"
527                 "movq %%mm3, %%mm4              \n\t"
528                 "pand %%mm6, %%mm0              \n\t"
529                 "pand %%mm6, %%mm3              \n\t"
530                 "pmaddwd %%mm7, %%mm0           \n\t"
531                 "pmaddwd %%mm7, %%mm3           \n\t"
532                 "pand %%mm5, %%mm1              \n\t"
533                 "pand %%mm5, %%mm4              \n\t"
534                 "por %%mm1, %%mm0               \n\t"   
535                 "por %%mm4, %%mm3               \n\t"
536                 "psrld $6, %%mm0                \n\t"
537                 "pslld $10, %%mm3               \n\t"
538                 "por %%mm3, %%mm0               \n\t"
539                 MOVNTQ" %%mm0, (%0)             \n\t"
540                 "add $16, %1                    \n\t"
541                 "add $8, %0                     \n\t"
542                 "2:                             \n\t"
543                 "cmp %2, %1                     \n\t"
544                 " jb 1b                         \n\t"
545                 : "+r" (d), "+r"(s)
546                 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
547         );
548 #else
549         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
550         __asm __volatile(
551             "movq       %0, %%mm7\n\t"
552             "movq       %1, %%mm6\n\t"
553             ::"m"(red_15mask),"m"(green_15mask));
554         while(s < mm_end)
555         {
556             __asm __volatile(
557                 PREFETCH" 32%1\n\t"
558                 "movd   %1, %%mm0\n\t"
559                 "movd   4%1, %%mm3\n\t"
560                 "punpckldq 8%1, %%mm0\n\t"
561                 "punpckldq 12%1, %%mm3\n\t"
562                 "movq   %%mm0, %%mm1\n\t"
563                 "movq   %%mm0, %%mm2\n\t"
564                 "movq   %%mm3, %%mm4\n\t"
565                 "movq   %%mm3, %%mm5\n\t"
566                 "psrlq  $3, %%mm0\n\t"
567                 "psrlq  $3, %%mm3\n\t"
568                 "pand   %2, %%mm0\n\t"
569                 "pand   %2, %%mm3\n\t"
570                 "psrlq  $6, %%mm1\n\t"
571                 "psrlq  $6, %%mm4\n\t"
572                 "pand   %%mm6, %%mm1\n\t"
573                 "pand   %%mm6, %%mm4\n\t"
574                 "psrlq  $9, %%mm2\n\t"
575                 "psrlq  $9, %%mm5\n\t"
576                 "pand   %%mm7, %%mm2\n\t"
577                 "pand   %%mm7, %%mm5\n\t"
578                 "por    %%mm1, %%mm0\n\t"
579                 "por    %%mm4, %%mm3\n\t"
580                 "por    %%mm2, %%mm0\n\t"
581                 "por    %%mm5, %%mm3\n\t"
582                 "psllq  $16, %%mm3\n\t"
583                 "por    %%mm3, %%mm0\n\t"
584                 MOVNTQ" %%mm0, %0\n\t"
585                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
586                 d += 4;
587                 s += 16;
588         }
589 #endif
590         __asm __volatile(SFENCE:::"memory");
591         __asm __volatile(EMMS:::"memory");
592 #endif
593         while(s < end)
594         {
595                 register int rgb = *(uint32_t*)s; s += 4;
596                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
597         }
598 }
599
600 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
601 {
602         const uint8_t *s = src;
603         const uint8_t *end;
604 #ifdef HAVE_MMX
605         const uint8_t *mm_end;
606 #endif
607         uint16_t *d = (uint16_t *)dst;
608         end = s + src_size;
609 #ifdef HAVE_MMX
610         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
611         __asm __volatile(
612             "movq       %0, %%mm7\n\t"
613             "movq       %1, %%mm6\n\t"
614             ::"m"(red_15mask),"m"(green_15mask));
615         mm_end = end - 15;
616         while(s < mm_end)
617         {
618             __asm __volatile(
619                 PREFETCH" 32%1\n\t"
620                 "movd   %1, %%mm0\n\t"
621                 "movd   4%1, %%mm3\n\t"
622                 "punpckldq 8%1, %%mm0\n\t"
623                 "punpckldq 12%1, %%mm3\n\t"
624                 "movq   %%mm0, %%mm1\n\t"
625                 "movq   %%mm0, %%mm2\n\t"
626                 "movq   %%mm3, %%mm4\n\t"
627                 "movq   %%mm3, %%mm5\n\t"
628                 "psllq  $7, %%mm0\n\t"
629                 "psllq  $7, %%mm3\n\t"
630                 "pand   %%mm7, %%mm0\n\t"
631                 "pand   %%mm7, %%mm3\n\t"
632                 "psrlq  $6, %%mm1\n\t"
633                 "psrlq  $6, %%mm4\n\t"
634                 "pand   %%mm6, %%mm1\n\t"
635                 "pand   %%mm6, %%mm4\n\t"
636                 "psrlq  $19, %%mm2\n\t"
637                 "psrlq  $19, %%mm5\n\t"
638                 "pand   %2, %%mm2\n\t"
639                 "pand   %2, %%mm5\n\t"
640                 "por    %%mm1, %%mm0\n\t"
641                 "por    %%mm4, %%mm3\n\t"
642                 "por    %%mm2, %%mm0\n\t"
643                 "por    %%mm5, %%mm3\n\t"
644                 "psllq  $16, %%mm3\n\t"
645                 "por    %%mm3, %%mm0\n\t"
646                 MOVNTQ" %%mm0, %0\n\t"
647                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
648                 d += 4;
649                 s += 16;
650         }
651         __asm __volatile(SFENCE:::"memory");
652         __asm __volatile(EMMS:::"memory");
653 #endif
654         while(s < end)
655         {
656                 register int rgb = *(uint32_t*)s; s += 4;
657                 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
658         }
659 }
660
661 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
662 {
663         const uint8_t *s = src;
664         const uint8_t *end;
665 #ifdef HAVE_MMX
666         const uint8_t *mm_end;
667 #endif
668         uint16_t *d = (uint16_t *)dst;
669         end = s + src_size;
670 #ifdef HAVE_MMX
671         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
672         __asm __volatile(
673             "movq       %0, %%mm7\n\t"
674             "movq       %1, %%mm6\n\t"
675             ::"m"(red_16mask),"m"(green_16mask));
676         mm_end = end - 11;
677         while(s < mm_end)
678         {
679             __asm __volatile(
680                 PREFETCH" 32%1\n\t"
681                 "movd   %1, %%mm0\n\t"
682                 "movd   3%1, %%mm3\n\t"
683                 "punpckldq 6%1, %%mm0\n\t"
684                 "punpckldq 9%1, %%mm3\n\t"
685                 "movq   %%mm0, %%mm1\n\t"
686                 "movq   %%mm0, %%mm2\n\t"
687                 "movq   %%mm3, %%mm4\n\t"
688                 "movq   %%mm3, %%mm5\n\t"
689                 "psrlq  $3, %%mm0\n\t"
690                 "psrlq  $3, %%mm3\n\t"
691                 "pand   %2, %%mm0\n\t"
692                 "pand   %2, %%mm3\n\t"
693                 "psrlq  $5, %%mm1\n\t"
694                 "psrlq  $5, %%mm4\n\t"
695                 "pand   %%mm6, %%mm1\n\t"
696                 "pand   %%mm6, %%mm4\n\t"
697                 "psrlq  $8, %%mm2\n\t"
698                 "psrlq  $8, %%mm5\n\t"
699                 "pand   %%mm7, %%mm2\n\t"
700                 "pand   %%mm7, %%mm5\n\t"
701                 "por    %%mm1, %%mm0\n\t"
702                 "por    %%mm4, %%mm3\n\t"
703                 "por    %%mm2, %%mm0\n\t"
704                 "por    %%mm5, %%mm3\n\t"
705                 "psllq  $16, %%mm3\n\t"
706                 "por    %%mm3, %%mm0\n\t"
707                 MOVNTQ" %%mm0, %0\n\t"
708                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
709                 d += 4;
710                 s += 12;
711         }
712         __asm __volatile(SFENCE:::"memory");
713         __asm __volatile(EMMS:::"memory");
714 #endif
715         while(s < end)
716         {
717                 const int b= *s++;
718                 const int g= *s++;
719                 const int r= *s++;
720                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
721         }
722 }
723
724 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
725 {
726         const uint8_t *s = src;
727         const uint8_t *end;
728 #ifdef HAVE_MMX
729         const uint8_t *mm_end;
730 #endif
731         uint16_t *d = (uint16_t *)dst;
732         end = s + src_size;
733 #ifdef HAVE_MMX
734         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
735         __asm __volatile(
736             "movq       %0, %%mm7\n\t"
737             "movq       %1, %%mm6\n\t"
738             ::"m"(red_16mask),"m"(green_16mask));
739         mm_end = end - 15;
740         while(s < mm_end)
741         {
742             __asm __volatile(
743                 PREFETCH" 32%1\n\t"
744                 "movd   %1, %%mm0\n\t"
745                 "movd   3%1, %%mm3\n\t"
746                 "punpckldq 6%1, %%mm0\n\t"
747                 "punpckldq 9%1, %%mm3\n\t"
748                 "movq   %%mm0, %%mm1\n\t"
749                 "movq   %%mm0, %%mm2\n\t"
750                 "movq   %%mm3, %%mm4\n\t"
751                 "movq   %%mm3, %%mm5\n\t"
752                 "psllq  $8, %%mm0\n\t"
753                 "psllq  $8, %%mm3\n\t"
754                 "pand   %%mm7, %%mm0\n\t"
755                 "pand   %%mm7, %%mm3\n\t"
756                 "psrlq  $5, %%mm1\n\t"
757                 "psrlq  $5, %%mm4\n\t"
758                 "pand   %%mm6, %%mm1\n\t"
759                 "pand   %%mm6, %%mm4\n\t"
760                 "psrlq  $19, %%mm2\n\t"
761                 "psrlq  $19, %%mm5\n\t"
762                 "pand   %2, %%mm2\n\t"
763                 "pand   %2, %%mm5\n\t"
764                 "por    %%mm1, %%mm0\n\t"
765                 "por    %%mm4, %%mm3\n\t"
766                 "por    %%mm2, %%mm0\n\t"
767                 "por    %%mm5, %%mm3\n\t"
768                 "psllq  $16, %%mm3\n\t"
769                 "por    %%mm3, %%mm0\n\t"
770                 MOVNTQ" %%mm0, %0\n\t"
771                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
772                 d += 4;
773                 s += 12;
774         }
775         __asm __volatile(SFENCE:::"memory");
776         __asm __volatile(EMMS:::"memory");
777 #endif
778         while(s < end)
779         {
780                 const int r= *s++;
781                 const int g= *s++;
782                 const int b= *s++;
783                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
784         }
785 }
786
787 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
788 {
789         const uint8_t *s = src;
790         const uint8_t *end;
791 #ifdef HAVE_MMX
792         const uint8_t *mm_end;
793 #endif
794         uint16_t *d = (uint16_t *)dst;
795         end = s + src_size;
796 #ifdef HAVE_MMX
797         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
798         __asm __volatile(
799             "movq       %0, %%mm7\n\t"
800             "movq       %1, %%mm6\n\t"
801             ::"m"(red_15mask),"m"(green_15mask));
802         mm_end = end - 11;
803         while(s < mm_end)
804         {
805             __asm __volatile(
806                 PREFETCH" 32%1\n\t"
807                 "movd   %1, %%mm0\n\t"
808                 "movd   3%1, %%mm3\n\t"
809                 "punpckldq 6%1, %%mm0\n\t"
810                 "punpckldq 9%1, %%mm3\n\t"
811                 "movq   %%mm0, %%mm1\n\t"
812                 "movq   %%mm0, %%mm2\n\t"
813                 "movq   %%mm3, %%mm4\n\t"
814                 "movq   %%mm3, %%mm5\n\t"
815                 "psrlq  $3, %%mm0\n\t"
816                 "psrlq  $3, %%mm3\n\t"
817                 "pand   %2, %%mm0\n\t"
818                 "pand   %2, %%mm3\n\t"
819                 "psrlq  $6, %%mm1\n\t"
820                 "psrlq  $6, %%mm4\n\t"
821                 "pand   %%mm6, %%mm1\n\t"
822                 "pand   %%mm6, %%mm4\n\t"
823                 "psrlq  $9, %%mm2\n\t"
824                 "psrlq  $9, %%mm5\n\t"
825                 "pand   %%mm7, %%mm2\n\t"
826                 "pand   %%mm7, %%mm5\n\t"
827                 "por    %%mm1, %%mm0\n\t"
828                 "por    %%mm4, %%mm3\n\t"
829                 "por    %%mm2, %%mm0\n\t"
830                 "por    %%mm5, %%mm3\n\t"
831                 "psllq  $16, %%mm3\n\t"
832                 "por    %%mm3, %%mm0\n\t"
833                 MOVNTQ" %%mm0, %0\n\t"
834                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
835                 d += 4;
836                 s += 12;
837         }
838         __asm __volatile(SFENCE:::"memory");
839         __asm __volatile(EMMS:::"memory");
840 #endif
841         while(s < end)
842         {
843                 const int b= *s++;
844                 const int g= *s++;
845                 const int r= *s++;
846                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
847         }
848 }
849
850 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
851 {
852         const uint8_t *s = src;
853         const uint8_t *end;
854 #ifdef HAVE_MMX
855         const uint8_t *mm_end;
856 #endif
857         uint16_t *d = (uint16_t *)dst;
858         end = s + src_size;
859 #ifdef HAVE_MMX
860         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
861         __asm __volatile(
862             "movq       %0, %%mm7\n\t"
863             "movq       %1, %%mm6\n\t"
864             ::"m"(red_15mask),"m"(green_15mask));
865         mm_end = end - 15;
866         while(s < mm_end)
867         {
868             __asm __volatile(
869                 PREFETCH" 32%1\n\t"
870                 "movd   %1, %%mm0\n\t"
871                 "movd   3%1, %%mm3\n\t"
872                 "punpckldq 6%1, %%mm0\n\t"
873                 "punpckldq 9%1, %%mm3\n\t"
874                 "movq   %%mm0, %%mm1\n\t"
875                 "movq   %%mm0, %%mm2\n\t"
876                 "movq   %%mm3, %%mm4\n\t"
877                 "movq   %%mm3, %%mm5\n\t"
878                 "psllq  $7, %%mm0\n\t"
879                 "psllq  $7, %%mm3\n\t"
880                 "pand   %%mm7, %%mm0\n\t"
881                 "pand   %%mm7, %%mm3\n\t"
882                 "psrlq  $6, %%mm1\n\t"
883                 "psrlq  $6, %%mm4\n\t"
884                 "pand   %%mm6, %%mm1\n\t"
885                 "pand   %%mm6, %%mm4\n\t"
886                 "psrlq  $19, %%mm2\n\t"
887                 "psrlq  $19, %%mm5\n\t"
888                 "pand   %2, %%mm2\n\t"
889                 "pand   %2, %%mm5\n\t"
890                 "por    %%mm1, %%mm0\n\t"
891                 "por    %%mm4, %%mm3\n\t"
892                 "por    %%mm2, %%mm0\n\t"
893                 "por    %%mm5, %%mm3\n\t"
894                 "psllq  $16, %%mm3\n\t"
895                 "por    %%mm3, %%mm0\n\t"
896                 MOVNTQ" %%mm0, %0\n\t"
897                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
898                 d += 4;
899                 s += 12;
900         }
901         __asm __volatile(SFENCE:::"memory");
902         __asm __volatile(EMMS:::"memory");
903 #endif
904         while(s < end)
905         {
906                 const int r= *s++;
907                 const int g= *s++;
908                 const int b= *s++;
909                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
910         }
911 }
912
913 /*
  This code uses a less accurate approximation: it simply left-shifts the
  input value and fills the low-order bits with zeroes. This method improves
  PNG compression, but it cannot reproduce white exactly, since it does not
  generate an all-ones maximum value; the net effect is to darken the
  image slightly.
921
922   The better method should be "left bit replication":
923
924    4 3 2 1 0
925    ---------
926    1 1 0 1 1
927
928    7 6 5 4 3  2 1 0
929    ----------------
930    1 1 0 1 1  1 1 0
931    |=======|  |===|
932        |      Leftmost Bits Repeated to Fill Open Bits
933        |
934    Original Bits
935 */
936 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
937 {
938         const uint16_t *end;
939 #ifdef HAVE_MMX
940         const uint16_t *mm_end;
941 #endif
942         uint8_t *d = (uint8_t *)dst;
943         const uint16_t *s = (uint16_t *)src;
944         end = s + src_size/2;
945 #ifdef HAVE_MMX
946         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
947         mm_end = end - 7;
948         while(s < mm_end)
949         {
950             __asm __volatile(
951                 PREFETCH" 32%1\n\t"
952                 "movq   %1, %%mm0\n\t"
953                 "movq   %1, %%mm1\n\t"
954                 "movq   %1, %%mm2\n\t"
955                 "pand   %2, %%mm0\n\t"
956                 "pand   %3, %%mm1\n\t"
957                 "pand   %4, %%mm2\n\t"
958                 "psllq  $3, %%mm0\n\t"
959                 "psrlq  $2, %%mm1\n\t"
960                 "psrlq  $7, %%mm2\n\t"
961                 "movq   %%mm0, %%mm3\n\t"
962                 "movq   %%mm1, %%mm4\n\t"
963                 "movq   %%mm2, %%mm5\n\t"
964                 "punpcklwd %5, %%mm0\n\t"
965                 "punpcklwd %5, %%mm1\n\t"
966                 "punpcklwd %5, %%mm2\n\t"
967                 "punpckhwd %5, %%mm3\n\t"
968                 "punpckhwd %5, %%mm4\n\t"
969                 "punpckhwd %5, %%mm5\n\t"
970                 "psllq  $8, %%mm1\n\t"
971                 "psllq  $16, %%mm2\n\t"
972                 "por    %%mm1, %%mm0\n\t"
973                 "por    %%mm2, %%mm0\n\t"
974                 "psllq  $8, %%mm4\n\t"
975                 "psllq  $16, %%mm5\n\t"
976                 "por    %%mm4, %%mm3\n\t"
977                 "por    %%mm5, %%mm3\n\t"
978
979                 "movq   %%mm0, %%mm6\n\t"
980                 "movq   %%mm3, %%mm7\n\t"
981                 
982                 "movq   8%1, %%mm0\n\t"
983                 "movq   8%1, %%mm1\n\t"
984                 "movq   8%1, %%mm2\n\t"
985                 "pand   %2, %%mm0\n\t"
986                 "pand   %3, %%mm1\n\t"
987                 "pand   %4, %%mm2\n\t"
988                 "psllq  $3, %%mm0\n\t"
989                 "psrlq  $2, %%mm1\n\t"
990                 "psrlq  $7, %%mm2\n\t"
991                 "movq   %%mm0, %%mm3\n\t"
992                 "movq   %%mm1, %%mm4\n\t"
993                 "movq   %%mm2, %%mm5\n\t"
994                 "punpcklwd %5, %%mm0\n\t"
995                 "punpcklwd %5, %%mm1\n\t"
996                 "punpcklwd %5, %%mm2\n\t"
997                 "punpckhwd %5, %%mm3\n\t"
998                 "punpckhwd %5, %%mm4\n\t"
999                 "punpckhwd %5, %%mm5\n\t"
1000                 "psllq  $8, %%mm1\n\t"
1001                 "psllq  $16, %%mm2\n\t"
1002                 "por    %%mm1, %%mm0\n\t"
1003                 "por    %%mm2, %%mm0\n\t"
1004                 "psllq  $8, %%mm4\n\t"
1005                 "psllq  $16, %%mm5\n\t"
1006                 "por    %%mm4, %%mm3\n\t"
1007                 "por    %%mm5, %%mm3\n\t"
1008
1009                 :"=m"(*d)
1010                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1011                 :"memory");
1012             /* Borrowed 32 to 24 */
1013             __asm __volatile(
1014                 "movq   %%mm0, %%mm4\n\t"
1015                 "movq   %%mm3, %%mm5\n\t"
1016                 "movq   %%mm6, %%mm0\n\t"
1017                 "movq   %%mm7, %%mm1\n\t"
1018                 
1019                 "movq   %%mm4, %%mm6\n\t"
1020                 "movq   %%mm5, %%mm7\n\t"
1021                 "movq   %%mm0, %%mm2\n\t"
1022                 "movq   %%mm1, %%mm3\n\t"
1023
1024                 "psrlq  $8, %%mm2\n\t"
1025                 "psrlq  $8, %%mm3\n\t"
1026                 "psrlq  $8, %%mm6\n\t"
1027                 "psrlq  $8, %%mm7\n\t"
1028                 "pand   %2, %%mm0\n\t"
1029                 "pand   %2, %%mm1\n\t"
1030                 "pand   %2, %%mm4\n\t"
1031                 "pand   %2, %%mm5\n\t"
1032                 "pand   %3, %%mm2\n\t"
1033                 "pand   %3, %%mm3\n\t"
1034                 "pand   %3, %%mm6\n\t"
1035                 "pand   %3, %%mm7\n\t"
1036                 "por    %%mm2, %%mm0\n\t"
1037                 "por    %%mm3, %%mm1\n\t"
1038                 "por    %%mm6, %%mm4\n\t"
1039                 "por    %%mm7, %%mm5\n\t"
1040
1041                 "movq   %%mm1, %%mm2\n\t"
1042                 "movq   %%mm4, %%mm3\n\t"
1043                 "psllq  $48, %%mm2\n\t"
1044                 "psllq  $32, %%mm3\n\t"
1045                 "pand   %4, %%mm2\n\t"
1046                 "pand   %5, %%mm3\n\t"
1047                 "por    %%mm2, %%mm0\n\t"
1048                 "psrlq  $16, %%mm1\n\t"
1049                 "psrlq  $32, %%mm4\n\t"
1050                 "psllq  $16, %%mm5\n\t"
1051                 "por    %%mm3, %%mm1\n\t"
1052                 "pand   %6, %%mm5\n\t"
1053                 "por    %%mm5, %%mm4\n\t"
1054
1055                 MOVNTQ" %%mm0, %0\n\t"
1056                 MOVNTQ" %%mm1, 8%0\n\t"
1057                 MOVNTQ" %%mm4, 16%0"
1058
1059                 :"=m"(*d)
1060                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1061                 :"memory");
1062                 d += 24;
1063                 s += 8;
1064         }
1065         __asm __volatile(SFENCE:::"memory");
1066         __asm __volatile(EMMS:::"memory");
1067 #endif
1068         while(s < end)
1069         {
1070                 register uint16_t bgr;
1071                 bgr = *s++;
1072                 *d++ = (bgr&0x1F)<<3;
1073                 *d++ = (bgr&0x3E0)>>2;
1074                 *d++ = (bgr&0x7C00)>>7;
1075         }
1076 }
1077
1078 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1079 {
1080         const uint16_t *end;
1081 #ifdef HAVE_MMX
1082         const uint16_t *mm_end;
1083 #endif
1084         uint8_t *d = (uint8_t *)dst;
1085         const uint16_t *s = (const uint16_t *)src;
1086         end = s + src_size/2;
1087 #ifdef HAVE_MMX
1088         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1089         mm_end = end - 7;
1090         while(s < mm_end)
1091         {
1092             __asm __volatile(
1093                 PREFETCH" 32%1\n\t"
1094                 "movq   %1, %%mm0\n\t"
1095                 "movq   %1, %%mm1\n\t"
1096                 "movq   %1, %%mm2\n\t"
1097                 "pand   %2, %%mm0\n\t"
1098                 "pand   %3, %%mm1\n\t"
1099                 "pand   %4, %%mm2\n\t"
1100                 "psllq  $3, %%mm0\n\t"
1101                 "psrlq  $3, %%mm1\n\t"
1102                 "psrlq  $8, %%mm2\n\t"
1103                 "movq   %%mm0, %%mm3\n\t"
1104                 "movq   %%mm1, %%mm4\n\t"
1105                 "movq   %%mm2, %%mm5\n\t"
1106                 "punpcklwd %5, %%mm0\n\t"
1107                 "punpcklwd %5, %%mm1\n\t"
1108                 "punpcklwd %5, %%mm2\n\t"
1109                 "punpckhwd %5, %%mm3\n\t"
1110                 "punpckhwd %5, %%mm4\n\t"
1111                 "punpckhwd %5, %%mm5\n\t"
1112                 "psllq  $8, %%mm1\n\t"
1113                 "psllq  $16, %%mm2\n\t"
1114                 "por    %%mm1, %%mm0\n\t"
1115                 "por    %%mm2, %%mm0\n\t"
1116                 "psllq  $8, %%mm4\n\t"
1117                 "psllq  $16, %%mm5\n\t"
1118                 "por    %%mm4, %%mm3\n\t"
1119                 "por    %%mm5, %%mm3\n\t"
1120                 
1121                 "movq   %%mm0, %%mm6\n\t"
1122                 "movq   %%mm3, %%mm7\n\t"
1123
1124                 "movq   8%1, %%mm0\n\t"
1125                 "movq   8%1, %%mm1\n\t"
1126                 "movq   8%1, %%mm2\n\t"
1127                 "pand   %2, %%mm0\n\t"
1128                 "pand   %3, %%mm1\n\t"
1129                 "pand   %4, %%mm2\n\t"
1130                 "psllq  $3, %%mm0\n\t"
1131                 "psrlq  $3, %%mm1\n\t"
1132                 "psrlq  $8, %%mm2\n\t"
1133                 "movq   %%mm0, %%mm3\n\t"
1134                 "movq   %%mm1, %%mm4\n\t"
1135                 "movq   %%mm2, %%mm5\n\t"
1136                 "punpcklwd %5, %%mm0\n\t"
1137                 "punpcklwd %5, %%mm1\n\t"
1138                 "punpcklwd %5, %%mm2\n\t"
1139                 "punpckhwd %5, %%mm3\n\t"
1140                 "punpckhwd %5, %%mm4\n\t"
1141                 "punpckhwd %5, %%mm5\n\t"
1142                 "psllq  $8, %%mm1\n\t"
1143                 "psllq  $16, %%mm2\n\t"
1144                 "por    %%mm1, %%mm0\n\t"
1145                 "por    %%mm2, %%mm0\n\t"
1146                 "psllq  $8, %%mm4\n\t"
1147                 "psllq  $16, %%mm5\n\t"
1148                 "por    %%mm4, %%mm3\n\t"
1149                 "por    %%mm5, %%mm3\n\t"
1150                 :"=m"(*d)
1151                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1152                 :"memory");
1153             /* Borrowed 32 to 24 */
1154             __asm __volatile(
1155                 "movq   %%mm0, %%mm4\n\t"
1156                 "movq   %%mm3, %%mm5\n\t"
1157                 "movq   %%mm6, %%mm0\n\t"
1158                 "movq   %%mm7, %%mm1\n\t"
1159                 
1160                 "movq   %%mm4, %%mm6\n\t"
1161                 "movq   %%mm5, %%mm7\n\t"
1162                 "movq   %%mm0, %%mm2\n\t"
1163                 "movq   %%mm1, %%mm3\n\t"
1164
1165                 "psrlq  $8, %%mm2\n\t"
1166                 "psrlq  $8, %%mm3\n\t"
1167                 "psrlq  $8, %%mm6\n\t"
1168                 "psrlq  $8, %%mm7\n\t"
1169                 "pand   %2, %%mm0\n\t"
1170                 "pand   %2, %%mm1\n\t"
1171                 "pand   %2, %%mm4\n\t"
1172                 "pand   %2, %%mm5\n\t"
1173                 "pand   %3, %%mm2\n\t"
1174                 "pand   %3, %%mm3\n\t"
1175                 "pand   %3, %%mm6\n\t"
1176                 "pand   %3, %%mm7\n\t"
1177                 "por    %%mm2, %%mm0\n\t"
1178                 "por    %%mm3, %%mm1\n\t"
1179                 "por    %%mm6, %%mm4\n\t"
1180                 "por    %%mm7, %%mm5\n\t"
1181
1182                 "movq   %%mm1, %%mm2\n\t"
1183                 "movq   %%mm4, %%mm3\n\t"
1184                 "psllq  $48, %%mm2\n\t"
1185                 "psllq  $32, %%mm3\n\t"
1186                 "pand   %4, %%mm2\n\t"
1187                 "pand   %5, %%mm3\n\t"
1188                 "por    %%mm2, %%mm0\n\t"
1189                 "psrlq  $16, %%mm1\n\t"
1190                 "psrlq  $32, %%mm4\n\t"
1191                 "psllq  $16, %%mm5\n\t"
1192                 "por    %%mm3, %%mm1\n\t"
1193                 "pand   %6, %%mm5\n\t"
1194                 "por    %%mm5, %%mm4\n\t"
1195
1196                 MOVNTQ" %%mm0, %0\n\t"
1197                 MOVNTQ" %%mm1, 8%0\n\t"
1198                 MOVNTQ" %%mm4, 16%0"
1199
1200                 :"=m"(*d)
1201                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1202                 :"memory");
1203                 d += 24;
1204                 s += 8;
1205         }
1206         __asm __volatile(SFENCE:::"memory");
1207         __asm __volatile(EMMS:::"memory");
1208 #endif
1209         while(s < end)
1210         {
1211                 register uint16_t bgr;
1212                 bgr = *s++;
1213                 *d++ = (bgr&0x1F)<<3;
1214                 *d++ = (bgr&0x7E0)>>3;
1215                 *d++ = (bgr&0xF800)>>8;
1216         }
1217 }
1218
1219 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1220 {
1221         const uint16_t *end;
1222 #ifdef HAVE_MMX
1223         const uint16_t *mm_end;
1224 #endif
1225         uint8_t *d = (uint8_t *)dst;
1226         const uint16_t *s = (const uint16_t *)src;
1227         end = s + src_size/2;
1228 #ifdef HAVE_MMX
1229         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1230         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1231         mm_end = end - 3;
1232         while(s < mm_end)
1233         {
1234             __asm __volatile(
1235                 PREFETCH" 32%1\n\t"
1236                 "movq   %1, %%mm0\n\t"
1237                 "movq   %1, %%mm1\n\t"
1238                 "movq   %1, %%mm2\n\t"
1239                 "pand   %2, %%mm0\n\t"
1240                 "pand   %3, %%mm1\n\t"
1241                 "pand   %4, %%mm2\n\t"
1242                 "psllq  $3, %%mm0\n\t"
1243                 "psrlq  $2, %%mm1\n\t"
1244                 "psrlq  $7, %%mm2\n\t"
1245                 "movq   %%mm0, %%mm3\n\t"
1246                 "movq   %%mm1, %%mm4\n\t"
1247                 "movq   %%mm2, %%mm5\n\t"
1248                 "punpcklwd %%mm7, %%mm0\n\t"
1249                 "punpcklwd %%mm7, %%mm1\n\t"
1250                 "punpcklwd %%mm7, %%mm2\n\t"
1251                 "punpckhwd %%mm7, %%mm3\n\t"
1252                 "punpckhwd %%mm7, %%mm4\n\t"
1253                 "punpckhwd %%mm7, %%mm5\n\t"
1254                 "psllq  $8, %%mm1\n\t"
1255                 "psllq  $16, %%mm2\n\t"
1256                 "por    %%mm1, %%mm0\n\t"
1257                 "por    %%mm2, %%mm0\n\t"
1258                 "psllq  $8, %%mm4\n\t"
1259                 "psllq  $16, %%mm5\n\t"
1260                 "por    %%mm4, %%mm3\n\t"
1261                 "por    %%mm5, %%mm3\n\t"
1262                 MOVNTQ" %%mm0, %0\n\t"
1263                 MOVNTQ" %%mm3, 8%0\n\t"
1264                 :"=m"(*d)
1265                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1266                 :"memory");
1267                 d += 16;
1268                 s += 4;
1269         }
1270         __asm __volatile(SFENCE:::"memory");
1271         __asm __volatile(EMMS:::"memory");
1272 #endif
1273         while(s < end)
1274         {
1275 #if 0 //slightly slower on athlon
1276                 int bgr= *s++;
1277                 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1278 #else
1279                 register uint16_t bgr;
1280                 bgr = *s++;
1281 #ifdef WORDS_BIGENDIAN
1282                 *d++ = 0;
1283                 *d++ = (bgr&0x7C00)>>7;
1284                 *d++ = (bgr&0x3E0)>>2;
1285                 *d++ = (bgr&0x1F)<<3;
1286 #else
1287                 *d++ = (bgr&0x1F)<<3;
1288                 *d++ = (bgr&0x3E0)>>2;
1289                 *d++ = (bgr&0x7C00)>>7;
1290                 *d++ = 0;
1291 #endif
1292
1293 #endif
1294         }
1295 }
1296
1297 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1298 {
1299         const uint16_t *end;
1300 #ifdef HAVE_MMX
1301         const uint16_t *mm_end;
1302 #endif
1303         uint8_t *d = (uint8_t *)dst;
1304         const uint16_t *s = (uint16_t *)src;
1305         end = s + src_size/2;
1306 #ifdef HAVE_MMX
1307         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1308         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1309         mm_end = end - 3;
1310         while(s < mm_end)
1311         {
1312             __asm __volatile(
1313                 PREFETCH" 32%1\n\t"
1314                 "movq   %1, %%mm0\n\t"
1315                 "movq   %1, %%mm1\n\t"
1316                 "movq   %1, %%mm2\n\t"
1317                 "pand   %2, %%mm0\n\t"
1318                 "pand   %3, %%mm1\n\t"
1319                 "pand   %4, %%mm2\n\t"
1320                 "psllq  $3, %%mm0\n\t"
1321                 "psrlq  $3, %%mm1\n\t"
1322                 "psrlq  $8, %%mm2\n\t"
1323                 "movq   %%mm0, %%mm3\n\t"
1324                 "movq   %%mm1, %%mm4\n\t"
1325                 "movq   %%mm2, %%mm5\n\t"
1326                 "punpcklwd %%mm7, %%mm0\n\t"
1327                 "punpcklwd %%mm7, %%mm1\n\t"
1328                 "punpcklwd %%mm7, %%mm2\n\t"
1329                 "punpckhwd %%mm7, %%mm3\n\t"
1330                 "punpckhwd %%mm7, %%mm4\n\t"
1331                 "punpckhwd %%mm7, %%mm5\n\t"
1332                 "psllq  $8, %%mm1\n\t"
1333                 "psllq  $16, %%mm2\n\t"
1334                 "por    %%mm1, %%mm0\n\t"
1335                 "por    %%mm2, %%mm0\n\t"
1336                 "psllq  $8, %%mm4\n\t"
1337                 "psllq  $16, %%mm5\n\t"
1338                 "por    %%mm4, %%mm3\n\t"
1339                 "por    %%mm5, %%mm3\n\t"
1340                 MOVNTQ" %%mm0, %0\n\t"
1341                 MOVNTQ" %%mm3, 8%0\n\t"
1342                 :"=m"(*d)
1343                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1344                 :"memory");
1345                 d += 16;
1346                 s += 4;
1347         }
1348         __asm __volatile(SFENCE:::"memory");
1349         __asm __volatile(EMMS:::"memory");
1350 #endif
1351         while(s < end)
1352         {
1353                 register uint16_t bgr;
1354                 bgr = *s++;
1355 #ifdef WORDS_BIGENDIAN
1356                 *d++ = 0;
1357                 *d++ = (bgr&0xF800)>>8;
1358                 *d++ = (bgr&0x7E0)>>3;
1359                 *d++ = (bgr&0x1F)<<3;
1360 #else
1361                 *d++ = (bgr&0x1F)<<3;
1362                 *d++ = (bgr&0x7E0)>>3;
1363                 *d++ = (bgr&0xF800)>>8;
1364                 *d++ = 0;
1365 #endif
1366         }
1367 }
1368
1369 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1370 {
1371         long idx = 15 - src_size;
1372         uint8_t *s = (uint8_t *) src-idx, *d = dst-idx;
1373 #ifdef HAVE_MMX
1374         __asm __volatile(
1375                 "       test %0, %0                     \n"
1376                 "       jns 2f                          \n"
1377                 "       "PREFETCH" (%1, %0)             \n"
1378                 "       movq %3, %%mm7                  \n"
1379                 "       pxor %4, %%mm7                  \n"
1380                 "       movq %%mm7, %%mm6               \n"
1381                 "       pxor %5, %%mm7                  \n"
1382                         ASMALIGN(4)
1383                 "1:                                     \n"
1384                 "       "PREFETCH" 32(%1, %0)           \n"
1385                 "       movq (%1, %0), %%mm0            \n"
1386                 "       movq 8(%1, %0), %%mm1           \n"
1387 # ifdef HAVE_MMX2
1388                 "       pshufw $177, %%mm0, %%mm3       \n"
1389                 "       pshufw $177, %%mm1, %%mm5       \n"
1390                 "       pand %%mm7, %%mm0               \n"
1391                 "       pand %%mm6, %%mm3               \n"
1392                 "       pand %%mm7, %%mm1               \n"
1393                 "       pand %%mm6, %%mm5               \n"
1394                 "       por %%mm3, %%mm0                \n"
1395                 "       por %%mm5, %%mm1                \n"
1396 # else
1397                 "       movq %%mm0, %%mm2               \n"
1398                 "       movq %%mm1, %%mm4               \n"
1399                 "       pand %%mm7, %%mm0               \n"
1400                 "       pand %%mm6, %%mm2               \n"
1401                 "       pand %%mm7, %%mm1               \n"
1402                 "       pand %%mm6, %%mm4               \n"
1403                 "       movq %%mm2, %%mm3               \n"
1404                 "       movq %%mm4, %%mm5               \n"
1405                 "       pslld $16, %%mm2                \n"
1406                 "       psrld $16, %%mm3                \n"
1407                 "       pslld $16, %%mm4                \n"
1408                 "       psrld $16, %%mm5                \n"
1409                 "       por %%mm2, %%mm0                \n"
1410                 "       por %%mm4, %%mm1                \n"
1411                 "       por %%mm3, %%mm0                \n"
1412                 "       por %%mm5, %%mm1                \n"
1413 # endif
1414                 "       "MOVNTQ" %%mm0, (%2, %0)        \n"
1415                 "       "MOVNTQ" %%mm1, 8(%2, %0)       \n"
1416                 "       add $16, %0                     \n"
1417                 "       js 1b                           \n"
1418                 "       "SFENCE"                        \n"
1419                 "       "EMMS"                          \n"
1420                 "2:                                     \n"
1421                 : "+&r"(idx)
1422                 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1423                 : "memory");
1424 #endif
1425         for (; idx<15; idx+=4) {
1426                 register int v = *(uint32_t *)&s[idx], g = v & 0xff00;
1427                 v &= 0xff00ff;
1428                 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1429         }
1430 }
1431
1432 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1433 {
1434         unsigned i;
1435 #ifdef HAVE_MMX
1436         long mmx_size= 23 - src_size;
1437         asm volatile (
1438                 "test %%"REG_a", %%"REG_a"      \n\t"
1439                 "jns 2f                         \n\t"
1440                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1441                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1442                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1443                 ASMALIGN(4)
1444                 "1:                             \n\t"
1445                 PREFETCH" 32(%1, %%"REG_a")     \n\t"
1446                 "movq   (%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1447                 "movq   (%1, %%"REG_a"), %%mm1  \n\t" // BGR BGR BG
1448                 "movq  2(%1, %%"REG_a"), %%mm2  \n\t" // R BGR BGR B
1449                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1450                 "pand %%mm5, %%mm0              \n\t"
1451                 "pand %%mm6, %%mm1              \n\t"
1452                 "pand %%mm7, %%mm2              \n\t"
1453                 "por %%mm0, %%mm1               \n\t"
1454                 "por %%mm2, %%mm1               \n\t"                
1455                 "movq  6(%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1456                 MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
1457                 "movq  8(%1, %%"REG_a"), %%mm1  \n\t" // R BGR BGR B
1458                 "movq 10(%1, %%"REG_a"), %%mm2  \n\t" // GR BGR BGR
1459                 "pand %%mm7, %%mm0              \n\t"
1460                 "pand %%mm5, %%mm1              \n\t"
1461                 "pand %%mm6, %%mm2              \n\t"
1462                 "por %%mm0, %%mm1               \n\t"
1463                 "por %%mm2, %%mm1               \n\t"                
1464                 "movq 14(%1, %%"REG_a"), %%mm0  \n\t" // R BGR BGR B
1465                 MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
1466                 "movq 16(%1, %%"REG_a"), %%mm1  \n\t" // GR BGR BGR
1467                 "movq 18(%1, %%"REG_a"), %%mm2  \n\t" // BGR BGR BG
1468                 "pand %%mm6, %%mm0              \n\t"
1469                 "pand %%mm7, %%mm1              \n\t"
1470                 "pand %%mm5, %%mm2              \n\t"
1471                 "por %%mm0, %%mm1               \n\t"
1472                 "por %%mm2, %%mm1               \n\t"                
1473                 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1474                 "add $24, %%"REG_a"             \n\t"
1475                 " js 1b                         \n\t"
1476                 "2:                             \n\t"
1477                 : "+a" (mmx_size)
1478                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1479         );
1480
1481         __asm __volatile(SFENCE:::"memory");
1482         __asm __volatile(EMMS:::"memory");
1483
1484         if(mmx_size==23) return; //finihsed, was multiple of 8
1485
1486         src+= src_size;
1487         dst+= src_size;
1488         src_size= 23-mmx_size;
1489         src-= src_size;
1490         dst-= src_size;
1491 #endif
1492         for(i=0; i<src_size; i+=3)
1493         {
1494                 register uint8_t x;
1495                 x          = src[i + 2];
1496                 dst[i + 1] = src[i + 1];
1497                 dst[i + 2] = src[i + 0];
1498                 dst[i + 0] = x;
1499         }
1500 }
1501
1502 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1503         long width, long height,
1504         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1505 {
1506         long y;
1507         const long chromWidth= width>>1;
1508         for(y=0; y<height; y++)
1509         {
1510 #ifdef HAVE_MMX
1511 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1512                 asm volatile(
1513                         "xor %%"REG_a", %%"REG_a"       \n\t"
1514                         ASMALIGN(4)
1515                         "1:                             \n\t"
1516                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1517                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1518                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1519                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1520                         "movq %%mm0, %%mm2              \n\t" // U(0)
1521                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1522                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1523                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1524
1525                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1526                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1527                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1528                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1529                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1530                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1531                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1532                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1533
1534                         MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1535                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1536                         MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1537                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1538
1539                         "add $8, %%"REG_a"              \n\t"
1540                         "cmp %4, %%"REG_a"              \n\t"
1541                         " jb 1b                         \n\t"
1542                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1543                         : "%"REG_a
1544                 );
1545 #else
1546
1547 #if defined ARCH_ALPHA && defined HAVE_MVI
1548 #define pl2yuy2(n)                                      \
1549         y1 = yc[n];                                     \
1550         y2 = yc2[n];                                    \
1551         u = uc[n];                                      \
1552         v = vc[n];                                      \
1553         asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
1554         asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
1555         asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1556         asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1557         yuv1 = (u << 8) + (v << 24);                    \
1558         yuv2 = yuv1 + y2;                               \
1559         yuv1 += y1;                                     \
1560         qdst[n] = yuv1;                                 \
1561         qdst2[n] = yuv2;
1562
1563                 int i;
1564                 uint64_t *qdst = (uint64_t *) dst;
1565                 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1566                 const uint32_t *yc = (uint32_t *) ysrc;
1567                 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1568                 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1569                 for(i = 0; i < chromWidth; i += 8){
1570                         uint64_t y1, y2, yuv1, yuv2;
1571                         uint64_t u, v;
1572                         /* Prefetch */
1573                         asm("ldq $31,64(%0)" :: "r"(yc));
1574                         asm("ldq $31,64(%0)" :: "r"(yc2));
1575                         asm("ldq $31,64(%0)" :: "r"(uc));
1576                         asm("ldq $31,64(%0)" :: "r"(vc));
1577
1578                         pl2yuy2(0);
1579                         pl2yuy2(1);
1580                         pl2yuy2(2);
1581                         pl2yuy2(3);
1582
1583                         yc += 4;
1584                         yc2 += 4;
1585                         uc += 4;
1586                         vc += 4;
1587                         qdst += 4;
1588                         qdst2 += 4;
1589                 }
1590                 y++;
1591                 ysrc += lumStride;
1592                 dst += dstStride;
1593
1594 #elif __WORDSIZE >= 64
1595                 int i;
1596                 uint64_t *ldst = (uint64_t *) dst;
1597                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1598                 for(i = 0; i < chromWidth; i += 2){
1599                         uint64_t k, l;
1600                         k = yc[0] + (uc[0] << 8) +
1601                             (yc[1] << 16) + (vc[0] << 24);
1602                         l = yc[2] + (uc[1] << 8) +
1603                             (yc[3] << 16) + (vc[1] << 24);
1604                         *ldst++ = k + (l << 32);
1605                         yc += 4;
1606                         uc += 2;
1607                         vc += 2;
1608                 }
1609
1610 #else
1611                 int i, *idst = (int32_t *) dst;
1612                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1613                 for(i = 0; i < chromWidth; i++){
1614 #ifdef WORDS_BIGENDIAN
1615                         *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1616                             (yc[1] << 8) + (vc[0] << 0);
1617 #else
1618                         *idst++ = yc[0] + (uc[0] << 8) +
1619                             (yc[1] << 16) + (vc[0] << 24);
1620 #endif
1621                         yc += 2;
1622                         uc++;
1623                         vc++;
1624                 }
1625 #endif
1626 #endif
1627                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1628                 {
1629                         usrc += chromStride;
1630                         vsrc += chromStride;
1631                 }
1632                 ysrc += lumStride;
1633                 dst += dstStride;
1634         }
1635 #ifdef HAVE_MMX
1636 asm(    EMMS" \n\t"
1637         SFENCE" \n\t"
1638         :::"memory");
1639 #endif
1640 }
1641
1642 /**
1643  *
1644  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1645  * problem for anyone then tell me, and ill fix it)
1646  */
1647 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1648         long width, long height,
1649         long lumStride, long chromStride, long dstStride)
1650 {
1651         //FIXME interpolate chroma
1652         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1653 }
1654
1655 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1656         long width, long height,
1657         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1658 {
1659         long y;
1660         const long chromWidth= width>>1;
1661         for(y=0; y<height; y++)
1662         {
1663 #ifdef HAVE_MMX
1664 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1665                 asm volatile(
1666                         "xor %%"REG_a", %%"REG_a"       \n\t"
1667                         ASMALIGN(4)
1668                         "1:                             \n\t"
1669                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1670                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1671                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1672                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1673                         "movq %%mm0, %%mm2              \n\t" // U(0)
1674                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1675                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1676                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1677
1678                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1679                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1680                         "movq %%mm0, %%mm4              \n\t" // Y(0)
1681                         "movq %%mm2, %%mm6              \n\t" // Y(8)
1682                         "punpcklbw %%mm3, %%mm0         \n\t" // YUYV YUYV(0)
1683                         "punpckhbw %%mm3, %%mm4         \n\t" // YUYV YUYV(4)
1684                         "punpcklbw %%mm5, %%mm2         \n\t" // YUYV YUYV(8)
1685                         "punpckhbw %%mm5, %%mm6         \n\t" // YUYV YUYV(12)
1686
1687                         MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1688                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1689                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1690                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1691
1692                         "add $8, %%"REG_a"              \n\t"
1693                         "cmp %4, %%"REG_a"              \n\t"
1694                         " jb 1b                         \n\t"
1695                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1696                         : "%"REG_a
1697                 );
1698 #else
1699 //FIXME adapt the alpha asm code from yv12->yuy2
1700
1701 #if __WORDSIZE >= 64
1702                 int i;
1703                 uint64_t *ldst = (uint64_t *) dst;
1704                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1705                 for(i = 0; i < chromWidth; i += 2){
1706                         uint64_t k, l;
1707                         k = uc[0] + (yc[0] << 8) +
1708                             (vc[0] << 16) + (yc[1] << 24);
1709                         l = uc[1] + (yc[2] << 8) +
1710                             (vc[1] << 16) + (yc[3] << 24);
1711                         *ldst++ = k + (l << 32);
1712                         yc += 4;
1713                         uc += 2;
1714                         vc += 2;
1715                 }
1716
1717 #else
1718                 int i, *idst = (int32_t *) dst;
1719                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1720                 for(i = 0; i < chromWidth; i++){
1721 #ifdef WORDS_BIGENDIAN
1722                         *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1723                             (vc[0] << 8) + (yc[1] << 0);
1724 #else
1725                         *idst++ = uc[0] + (yc[0] << 8) +
1726                             (vc[0] << 16) + (yc[1] << 24);
1727 #endif
1728                         yc += 2;
1729                         uc++;
1730                         vc++;
1731                 }
1732 #endif
1733 #endif
1734                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1735                 {
1736                         usrc += chromStride;
1737                         vsrc += chromStride;
1738                 }
1739                 ysrc += lumStride;
1740                 dst += dstStride;
1741         }
1742 #ifdef HAVE_MMX
1743 asm(    EMMS" \n\t"
1744         SFENCE" \n\t"
1745         :::"memory");
1746 #endif
1747 }
1748
1749 /**
1750  *
1751  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1752  * problem for anyone then tell me, and ill fix it)
1753  */
1754 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1755         long width, long height,
1756         long lumStride, long chromStride, long dstStride)
1757 {
1758         //FIXME interpolate chroma
1759         RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1760 }
1761
1762 /**
1763  *
1764  * width should be a multiple of 16
1765  */
1766 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1767         long width, long height,
1768         long lumStride, long chromStride, long dstStride)
1769 {
1770         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1771 }
1772
1773 /**
1774  *
1775  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1776  * problem for anyone then tell me, and ill fix it)
1777  */
1778 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1779         long width, long height,
1780         long lumStride, long chromStride, long srcStride)
1781 {
1782         long y;
1783         const long chromWidth= width>>1;
1784         for(y=0; y<height; y+=2)
1785         {
1786 #ifdef HAVE_MMX
1787                 asm volatile(
1788                         "xor %%"REG_a", %%"REG_a"       \n\t"
1789                         "pcmpeqw %%mm7, %%mm7           \n\t"
1790                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1791                         ASMALIGN(4)
1792                         "1:                             \n\t"
1793                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1794                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1795                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1796                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1797                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1798                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1799                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1800                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1801                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1802                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1803                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1804
1805                         MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1806
1807                         "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1808                         "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1809                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1810                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1811                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1812                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1813                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1814                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1815                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1816                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1817
1818                         MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1819
1820                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1821                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1822                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1823                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1824                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1825                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1826                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1827                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1828
1829                         MOVNTQ" %%mm0, (%3, %%"REG_a")  \n\t"
1830                         MOVNTQ" %%mm2, (%2, %%"REG_a")  \n\t"
1831
1832                         "add $8, %%"REG_a"              \n\t"
1833                         "cmp %4, %%"REG_a"              \n\t"
1834                         " jb 1b                         \n\t"
1835                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1836                         : "memory", "%"REG_a
1837                 );
1838
1839                 ydst += lumStride;
1840                 src  += srcStride;
1841
1842                 asm volatile(
1843                         "xor %%"REG_a", %%"REG_a"       \n\t"
1844                         ASMALIGN(4)
1845                         "1:                             \n\t"
1846                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1847                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1848                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1849                         "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1850                         "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1851                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1852                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1853                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1854                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1855                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1856                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1857
1858                         MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1859                         MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1860
1861                         "add $8, %%"REG_a"              \n\t"
1862                         "cmp %4, %%"REG_a"              \n\t"
1863                         " jb 1b                         \n\t"
1864
1865                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1866                         : "memory", "%"REG_a
1867                 );
1868 #else
1869                 long i;
1870                 for(i=0; i<chromWidth; i++)
1871                 {
1872                         ydst[2*i+0]     = src[4*i+0];
1873                         udst[i]         = src[4*i+1];
1874                         ydst[2*i+1]     = src[4*i+2];
1875                         vdst[i]         = src[4*i+3];
1876                 }
1877                 ydst += lumStride;
1878                 src  += srcStride;
1879
1880                 for(i=0; i<chromWidth; i++)
1881                 {
1882                         ydst[2*i+0]     = src[4*i+0];
1883                         ydst[2*i+1]     = src[4*i+2];
1884                 }
1885 #endif
1886                 udst += chromStride;
1887                 vdst += chromStride;
1888                 ydst += lumStride;
1889                 src  += srcStride;
1890         }
1891 #ifdef HAVE_MMX
1892 asm volatile(   EMMS" \n\t"
1893                 SFENCE" \n\t"
1894                 :::"memory");
1895 #endif
1896 }
1897
1898 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1899         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1900         long width, long height, long lumStride, long chromStride)
1901 {
1902         /* Y Plane */
1903         memcpy(ydst, ysrc, width*height);
1904
1905         /* XXX: implement upscaling for U,V */
1906 }
1907
1908 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1909 {
1910         long x,y;
1911         
1912         dst[0]= src[0];
1913         
1914         // first line
1915         for(x=0; x<srcWidth-1; x++){
1916                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1917                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1918         }
1919         dst[2*srcWidth-1]= src[srcWidth-1];
1920         
1921         dst+= dstStride;
1922
1923         for(y=1; y<srcHeight; y++){
1924 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1925                 const long mmxSize= srcWidth&~15;
1926                 asm volatile(
1927                         "mov %4, %%"REG_a"              \n\t"
1928                         "1:                             \n\t"
1929                         "movq (%0, %%"REG_a"), %%mm0    \n\t"
1930                         "movq (%1, %%"REG_a"), %%mm1    \n\t"
1931                         "movq 1(%0, %%"REG_a"), %%mm2   \n\t"
1932                         "movq 1(%1, %%"REG_a"), %%mm3   \n\t"
1933                         "movq -1(%0, %%"REG_a"), %%mm4  \n\t"
1934                         "movq -1(%1, %%"REG_a"), %%mm5  \n\t"
1935                         PAVGB" %%mm0, %%mm5             \n\t"
1936                         PAVGB" %%mm0, %%mm3             \n\t"
1937                         PAVGB" %%mm0, %%mm5             \n\t"
1938                         PAVGB" %%mm0, %%mm3             \n\t"
1939                         PAVGB" %%mm1, %%mm4             \n\t"
1940                         PAVGB" %%mm1, %%mm2             \n\t"
1941                         PAVGB" %%mm1, %%mm4             \n\t"
1942                         PAVGB" %%mm1, %%mm2             \n\t"
1943                         "movq %%mm5, %%mm7              \n\t"
1944                         "movq %%mm4, %%mm6              \n\t"
1945                         "punpcklbw %%mm3, %%mm5         \n\t"
1946                         "punpckhbw %%mm3, %%mm7         \n\t"
1947                         "punpcklbw %%mm2, %%mm4         \n\t"
1948                         "punpckhbw %%mm2, %%mm6         \n\t"
1949 #if 1
1950                         MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1951                         MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1952                         MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1953                         MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1954 #else
1955                         "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1956                         "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1957                         "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1958                         "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1959 #endif
1960                         "add $8, %%"REG_a"              \n\t"
1961                         " js 1b                         \n\t"
1962                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1963                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1964                            "g" (-mmxSize)
1965                         : "%"REG_a
1966
1967                 );
1968 #else
1969                 const long mmxSize=1;
1970 #endif
1971                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1972                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1973
1974                 for(x=mmxSize-1; x<srcWidth-1; x++){
1975                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1976                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1977                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1978                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1979                 }
1980                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1981                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1982
1983                 dst+=dstStride*2;
1984                 src+=srcStride;
1985         }
1986         
1987         // last line
1988 #if 1
1989         dst[0]= src[0];
1990         
1991         for(x=0; x<srcWidth-1; x++){
1992                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1993                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1994         }
1995         dst[2*srcWidth-1]= src[srcWidth-1];
1996 #else
1997         for(x=0; x<srcWidth; x++){
1998                 dst[2*x+0]=
1999                 dst[2*x+1]= src[x];
2000         }
2001 #endif
2002
2003 #ifdef HAVE_MMX
2004 asm volatile(   EMMS" \n\t"
2005                 SFENCE" \n\t"
2006                 :::"memory");
2007 #endif
2008 }
2009
2010 /**
2011  *
2012  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
2013  * problem for anyone then tell me, and ill fix it)
2014  * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
2015  */
2016 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2017         long width, long height,
2018         long lumStride, long chromStride, long srcStride)
2019 {
2020         long y;
2021         const long chromWidth= width>>1;
2022         for(y=0; y<height; y+=2)
2023         {
2024 #ifdef HAVE_MMX
2025                 asm volatile(
2026                         "xorl %%eax, %%eax              \n\t"
2027                         "pcmpeqw %%mm7, %%mm7           \n\t"
2028                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
2029                         ASMALIGN(4)
2030                         "1:                             \n\t"
2031                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
2032                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
2033                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
2034                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
2035                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
2036                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
2037                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
2038                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
2039                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
2040                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
2041                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
2042
2043                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
2044
2045                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
2046                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
2047                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
2048                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
2049                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
2050                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
2051                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
2052                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
2053                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
2054                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
2055
2056                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
2057
2058                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
2059                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
2060                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
2061                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
2062                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
2063                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
2064                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
2065                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
2066
2067                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
2068                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
2069
2070                         "addl $8, %%eax                 \n\t"
2071                         "cmpl %4, %%eax                 \n\t"
2072                         " jb 1b                         \n\t"
2073                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2074                         : "memory", "%eax"
2075                 );
2076
2077                 ydst += lumStride;
2078                 src  += srcStride;
2079
2080                 asm volatile(
2081                         "xorl %%eax, %%eax              \n\t"
2082                         ASMALIGN(4)
2083                         "1:                             \n\t"
2084                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
2085                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
2086                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
2087                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
2088                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
2089                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2090                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2091                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2092                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2093                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
2094                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
2095
2096                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
2097                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
2098
2099                         "addl $8, %%eax                 \n\t"
2100                         "cmpl %4, %%eax                 \n\t"
2101                         " jb 1b                         \n\t"
2102
2103                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2104                         : "memory", "%eax"
2105                 );
2106 #else
2107                 long i;
2108                 for(i=0; i<chromWidth; i++)
2109                 {
2110                         udst[i]         = src[4*i+0];
2111                         ydst[2*i+0]     = src[4*i+1];
2112                         vdst[i]         = src[4*i+2];
2113                         ydst[2*i+1]     = src[4*i+3];
2114                 }
2115                 ydst += lumStride;
2116                 src  += srcStride;
2117
2118                 for(i=0; i<chromWidth; i++)
2119                 {
2120                         ydst[2*i+0]     = src[4*i+1];
2121                         ydst[2*i+1]     = src[4*i+3];
2122                 }
2123 #endif
2124                 udst += chromStride;
2125                 vdst += chromStride;
2126                 ydst += lumStride;
2127                 src  += srcStride;
2128         }
2129 #ifdef HAVE_MMX
2130 asm volatile(   EMMS" \n\t"
2131                 SFENCE" \n\t"
2132                 :::"memory");
2133 #endif
2134 }
2135
2136 /**
2137  *
2138  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2139  * problem for anyone then tell me, and I'll fix it)
2140  * chrominance data is only taken from every second line; others are ignored in the C version. FIXME write HQ version
2141  */
2142 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2143         long width, long height,
2144         long lumStride, long chromStride, long srcStride)
2145 {
2146         long y;
2147         const long chromWidth= width>>1;
2148 #ifdef HAVE_MMX
2149         for(y=0; y<height-2; y+=2)
2150         {
2151                 long i;
2152                 for(i=0; i<2; i++)
2153                 {
2154                         asm volatile(
2155                                 "mov %2, %%"REG_a"              \n\t"
2156                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
2157                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
2158                                 "pxor %%mm7, %%mm7              \n\t"
2159                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2160                                 ASMALIGN(4)
2161                                 "1:                             \n\t"
2162                                 PREFETCH" 64(%0, %%"REG_d")     \n\t"
2163                                 "movd (%0, %%"REG_d"), %%mm0    \n\t"
2164                                 "movd 3(%0, %%"REG_d"), %%mm1   \n\t"
2165                                 "punpcklbw %%mm7, %%mm0         \n\t"
2166                                 "punpcklbw %%mm7, %%mm1         \n\t"
2167                                 "movd 6(%0, %%"REG_d"), %%mm2   \n\t"
2168                                 "movd 9(%0, %%"REG_d"), %%mm3   \n\t"
2169                                 "punpcklbw %%mm7, %%mm2         \n\t"
2170                                 "punpcklbw %%mm7, %%mm3         \n\t"
2171                                 "pmaddwd %%mm6, %%mm0           \n\t"
2172                                 "pmaddwd %%mm6, %%mm1           \n\t"
2173                                 "pmaddwd %%mm6, %%mm2           \n\t"
2174                                 "pmaddwd %%mm6, %%mm3           \n\t"
2175 #ifndef FAST_BGR2YV12
2176                                 "psrad $8, %%mm0                \n\t"
2177                                 "psrad $8, %%mm1                \n\t"
2178                                 "psrad $8, %%mm2                \n\t"
2179                                 "psrad $8, %%mm3                \n\t"
2180 #endif
2181                                 "packssdw %%mm1, %%mm0          \n\t"
2182                                 "packssdw %%mm3, %%mm2          \n\t"
2183                                 "pmaddwd %%mm5, %%mm0           \n\t"
2184                                 "pmaddwd %%mm5, %%mm2           \n\t"
2185                                 "packssdw %%mm2, %%mm0          \n\t"
2186                                 "psraw $7, %%mm0                \n\t"
2187
2188                                 "movd 12(%0, %%"REG_d"), %%mm4  \n\t"
2189                                 "movd 15(%0, %%"REG_d"), %%mm1  \n\t"
2190                                 "punpcklbw %%mm7, %%mm4         \n\t"
2191                                 "punpcklbw %%mm7, %%mm1         \n\t"
2192                                 "movd 18(%0, %%"REG_d"), %%mm2  \n\t"
2193                                 "movd 21(%0, %%"REG_d"), %%mm3  \n\t"
2194                                 "punpcklbw %%mm7, %%mm2         \n\t"
2195                                 "punpcklbw %%mm7, %%mm3         \n\t"
2196                                 "pmaddwd %%mm6, %%mm4           \n\t"
2197                                 "pmaddwd %%mm6, %%mm1           \n\t"
2198                                 "pmaddwd %%mm6, %%mm2           \n\t"
2199                                 "pmaddwd %%mm6, %%mm3           \n\t"
2200 #ifndef FAST_BGR2YV12
2201                                 "psrad $8, %%mm4                \n\t"
2202                                 "psrad $8, %%mm1                \n\t"
2203                                 "psrad $8, %%mm2                \n\t"
2204                                 "psrad $8, %%mm3                \n\t"
2205 #endif
2206                                 "packssdw %%mm1, %%mm4          \n\t"
2207                                 "packssdw %%mm3, %%mm2          \n\t"
2208                                 "pmaddwd %%mm5, %%mm4           \n\t"
2209                                 "pmaddwd %%mm5, %%mm2           \n\t"
2210                                 "add $24, %%"REG_d"             \n\t"
2211                                 "packssdw %%mm2, %%mm4          \n\t"
2212                                 "psraw $7, %%mm4                \n\t"
2213
2214                                 "packuswb %%mm4, %%mm0          \n\t"
2215                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
2216
2217                                 MOVNTQ" %%mm0, (%1, %%"REG_a")  \n\t"
2218                                 "add $8, %%"REG_a"              \n\t"
2219                                 " js 1b                         \n\t"
2220                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2221                                 : "%"REG_a, "%"REG_d
2222                         );
2223                         ydst += lumStride;
2224                         src  += srcStride;
2225                 }
2226                 src -= srcStride*2;
2227                 asm volatile(
2228                         "mov %4, %%"REG_a"              \n\t"
2229                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2230                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
2231                         "pxor %%mm7, %%mm7              \n\t"
2232                         "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2233                         "add %%"REG_d", %%"REG_d"       \n\t"
2234                         ASMALIGN(4)
2235                         "1:                             \n\t"
2236                         PREFETCH" 64(%0, %%"REG_d")     \n\t"
2237                         PREFETCH" 64(%1, %%"REG_d")     \n\t"
2238 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2239                         "movq (%0, %%"REG_d"), %%mm0    \n\t"
2240                         "movq (%1, %%"REG_d"), %%mm1    \n\t"
2241                         "movq 6(%0, %%"REG_d"), %%mm2   \n\t"
2242                         "movq 6(%1, %%"REG_d"), %%mm3   \n\t"
2243                         PAVGB" %%mm1, %%mm0             \n\t"
2244                         PAVGB" %%mm3, %%mm2             \n\t"
2245                         "movq %%mm0, %%mm1              \n\t"
2246                         "movq %%mm2, %%mm3              \n\t"
2247                         "psrlq $24, %%mm0               \n\t"
2248                         "psrlq $24, %%mm2               \n\t"
2249                         PAVGB" %%mm1, %%mm0             \n\t"
2250                         PAVGB" %%mm3, %%mm2             \n\t"
2251                         "punpcklbw %%mm7, %%mm0         \n\t"
2252                         "punpcklbw %%mm7, %%mm2         \n\t"
2253 #else
2254                         "movd (%0, %%"REG_d"), %%mm0    \n\t"
2255                         "movd (%1, %%"REG_d"), %%mm1    \n\t"
2256                         "movd 3(%0, %%"REG_d"), %%mm2   \n\t"
2257                         "movd 3(%1, %%"REG_d"), %%mm3   \n\t"
2258                         "punpcklbw %%mm7, %%mm0         \n\t"
2259                         "punpcklbw %%mm7, %%mm1         \n\t"
2260                         "punpcklbw %%mm7, %%mm2         \n\t"
2261                         "punpcklbw %%mm7, %%mm3         \n\t"
2262                         "paddw %%mm1, %%mm0             \n\t"
2263                         "paddw %%mm3, %%mm2             \n\t"
2264                         "paddw %%mm2, %%mm0             \n\t"
2265                         "movd 6(%0, %%"REG_d"), %%mm4   \n\t"
2266                         "movd 6(%1, %%"REG_d"), %%mm1   \n\t"
2267                         "movd 9(%0, %%"REG_d"), %%mm2   \n\t"
2268                         "movd 9(%1, %%"REG_d"), %%mm3   \n\t"
2269                         "punpcklbw %%mm7, %%mm4         \n\t"
2270                         "punpcklbw %%mm7, %%mm1         \n\t"
2271                         "punpcklbw %%mm7, %%mm2         \n\t"
2272                         "punpcklbw %%mm7, %%mm3         \n\t"
2273                         "paddw %%mm1, %%mm4             \n\t"
2274                         "paddw %%mm3, %%mm2             \n\t"
2275                         "paddw %%mm4, %%mm2             \n\t"
2276                         "psrlw $2, %%mm0                \n\t"
2277                         "psrlw $2, %%mm2                \n\t"
2278 #endif
2279                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2280                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2281
2282                         "pmaddwd %%mm0, %%mm1           \n\t"
2283                         "pmaddwd %%mm2, %%mm3           \n\t"
2284                         "pmaddwd %%mm6, %%mm0           \n\t"
2285                         "pmaddwd %%mm6, %%mm2           \n\t"
2286 #ifndef FAST_BGR2YV12
2287                         "psrad $8, %%mm0                \n\t"
2288                         "psrad $8, %%mm1                \n\t"
2289                         "psrad $8, %%mm2                \n\t"
2290                         "psrad $8, %%mm3                \n\t"
2291 #endif
2292                         "packssdw %%mm2, %%mm0          \n\t"
2293                         "packssdw %%mm3, %%mm1          \n\t"
2294                         "pmaddwd %%mm5, %%mm0           \n\t"
2295                         "pmaddwd %%mm5, %%mm1           \n\t"
2296                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2297                         "psraw $7, %%mm0                \n\t"
2298
2299 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2300                         "movq 12(%0, %%"REG_d"), %%mm4  \n\t"
2301                         "movq 12(%1, %%"REG_d"), %%mm1  \n\t"
2302                         "movq 18(%0, %%"REG_d"), %%mm2  \n\t"
2303                         "movq 18(%1, %%"REG_d"), %%mm3  \n\t"
2304                         PAVGB" %%mm1, %%mm4             \n\t"
2305                         PAVGB" %%mm3, %%mm2             \n\t"
2306                         "movq %%mm4, %%mm1              \n\t"
2307                         "movq %%mm2, %%mm3              \n\t"
2308                         "psrlq $24, %%mm4               \n\t"
2309                         "psrlq $24, %%mm2               \n\t"
2310                         PAVGB" %%mm1, %%mm4             \n\t"
2311                         PAVGB" %%mm3, %%mm2             \n\t"
2312                         "punpcklbw %%mm7, %%mm4         \n\t"
2313                         "punpcklbw %%mm7, %%mm2         \n\t"
2314 #else
2315                         "movd 12(%0, %%"REG_d"), %%mm4  \n\t"
2316                         "movd 12(%1, %%"REG_d"), %%mm1  \n\t"
2317                         "movd 15(%0, %%"REG_d"), %%mm2  \n\t"
2318                         "movd 15(%1, %%"REG_d"), %%mm3  \n\t"
2319                         "punpcklbw %%mm7, %%mm4         \n\t"
2320                         "punpcklbw %%mm7, %%mm1         \n\t"
2321                         "punpcklbw %%mm7, %%mm2         \n\t"
2322                         "punpcklbw %%mm7, %%mm3         \n\t"
2323                         "paddw %%mm1, %%mm4             \n\t"
2324                         "paddw %%mm3, %%mm2             \n\t"
2325                         "paddw %%mm2, %%mm4             \n\t"
2326                         "movd 18(%0, %%"REG_d"), %%mm5  \n\t"
2327                         "movd 18(%1, %%"REG_d"), %%mm1  \n\t"
2328                         "movd 21(%0, %%"REG_d"), %%mm2  \n\t"
2329                         "movd 21(%1, %%"REG_d"), %%mm3  \n\t"
2330                         "punpcklbw %%mm7, %%mm5         \n\t"
2331                         "punpcklbw %%mm7, %%mm1         \n\t"
2332                         "punpcklbw %%mm7, %%mm2         \n\t"
2333                         "punpcklbw %%mm7, %%mm3         \n\t"
2334                         "paddw %%mm1, %%mm5             \n\t"
2335                         "paddw %%mm3, %%mm2             \n\t"
2336                         "paddw %%mm5, %%mm2             \n\t"
2337                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2338                         "psrlw $2, %%mm4                \n\t"
2339                         "psrlw $2, %%mm2                \n\t"
2340 #endif
2341                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2342                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2343
2344                         "pmaddwd %%mm4, %%mm1           \n\t"
2345                         "pmaddwd %%mm2, %%mm3           \n\t"
2346                         "pmaddwd %%mm6, %%mm4           \n\t"
2347                         "pmaddwd %%mm6, %%mm2           \n\t"
2348 #ifndef FAST_BGR2YV12
2349                         "psrad $8, %%mm4                \n\t"
2350                         "psrad $8, %%mm1                \n\t"
2351                         "psrad $8, %%mm2                \n\t"
2352                         "psrad $8, %%mm3                \n\t"
2353 #endif
2354                         "packssdw %%mm2, %%mm4          \n\t"
2355                         "packssdw %%mm3, %%mm1          \n\t"
2356                         "pmaddwd %%mm5, %%mm4           \n\t"
2357                         "pmaddwd %%mm5, %%mm1           \n\t"
2358                         "add $24, %%"REG_d"             \n\t"
2359                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2360                         "psraw $7, %%mm4                \n\t"
2361
2362                         "movq %%mm0, %%mm1              \n\t"
2363                         "punpckldq %%mm4, %%mm0         \n\t"
2364                         "punpckhdq %%mm4, %%mm1         \n\t"
2365                         "packsswb %%mm1, %%mm0          \n\t"
2366                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2367                         "movd %%mm0, (%2, %%"REG_a")    \n\t"
2368                         "punpckhdq %%mm0, %%mm0         \n\t"
2369                         "movd %%mm0, (%3, %%"REG_a")    \n\t"
2370                         "add $4, %%"REG_a"              \n\t"
2371                         " js 1b                         \n\t"
2372                         : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2373                         : "%"REG_a, "%"REG_d
2374                 );
2375
2376                 udst += chromStride;
2377                 vdst += chromStride;
2378                 src  += srcStride*2;
2379         }
2380
2381         asm volatile(   EMMS" \n\t"
2382                         SFENCE" \n\t"
2383                         :::"memory");
2384 #else
2385         y=0;
2386 #endif
2387         for(; y<height; y+=2)
2388         {
2389                 long i;
2390                 for(i=0; i<chromWidth; i++)
2391                 {
2392                         unsigned int b= src[6*i+0];
2393                         unsigned int g= src[6*i+1];
2394                         unsigned int r= src[6*i+2];
2395
2396                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2397                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2398                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2399
2400                         udst[i]         = U;
2401                         vdst[i]         = V;
2402                         ydst[2*i]       = Y;
2403
2404                         b= src[6*i+3];
2405                         g= src[6*i+4];
2406                         r= src[6*i+5];
2407
2408                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2409                         ydst[2*i+1]     = Y;
2410                 }
2411                 ydst += lumStride;
2412                 src  += srcStride;
2413
2414                 for(i=0; i<chromWidth; i++)
2415                 {
2416                         unsigned int b= src[6*i+0];
2417                         unsigned int g= src[6*i+1];
2418                         unsigned int r= src[6*i+2];
2419
2420                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2421
2422                         ydst[2*i]       = Y;
2423
2424                         b= src[6*i+3];
2425                         g= src[6*i+4];
2426                         r= src[6*i+5];
2427
2428                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2429                         ydst[2*i+1]     = Y;
2430                 }
2431                 udst += chromStride;
2432                 vdst += chromStride;
2433                 ydst += lumStride;
2434                 src  += srcStride;
2435         }
2436 }
2437
2438 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2439                             long width, long height, long src1Stride,
2440                             long src2Stride, long dstStride){
2441         long h;
2442
2443         for(h=0; h < height; h++)
2444         {
2445                 long w;
2446
2447 #ifdef HAVE_MMX
2448 #ifdef HAVE_SSE2
2449                 asm(
2450                         "xor %%"REG_a", %%"REG_a"       \n\t"
2451                         "1:                             \n\t"
2452                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2453                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2454                         "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2455                         "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2456                         "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2457                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2458                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2459                         "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2460                         "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2461                         "add $16, %%"REG_a"             \n\t"
2462                         "cmp %3, %%"REG_a"              \n\t"
2463                         " jb 1b                         \n\t"
2464                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2465                         : "memory", "%"REG_a""
2466                 );
2467 #else
2468                 asm(
2469                         "xor %%"REG_a", %%"REG_a"       \n\t"
2470                         "1:                             \n\t"
2471                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2472                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2473                         "movq (%1, %%"REG_a"), %%mm0    \n\t"
2474                         "movq 8(%1, %%"REG_a"), %%mm2   \n\t"
2475                         "movq %%mm0, %%mm1              \n\t"
2476                         "movq %%mm2, %%mm3              \n\t"
2477                         "movq (%2, %%"REG_a"), %%mm4    \n\t"
2478                         "movq 8(%2, %%"REG_a"), %%mm5   \n\t"
2479                         "punpcklbw %%mm4, %%mm0         \n\t"
2480                         "punpckhbw %%mm4, %%mm1         \n\t"
2481                         "punpcklbw %%mm5, %%mm2         \n\t"
2482                         "punpckhbw %%mm5, %%mm3         \n\t"
2483                         MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2484                         MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2485                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2486                         MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2487                         "add $16, %%"REG_a"             \n\t"
2488                         "cmp %3, %%"REG_a"              \n\t"
2489                         " jb 1b                         \n\t"
2490                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2491                         : "memory", "%"REG_a
2492                 );
2493 #endif
2494                 for(w= (width&(~15)); w < width; w++)
2495                 {
2496                         dest[2*w+0] = src1[w];
2497                         dest[2*w+1] = src2[w];
2498                 }
2499 #else
2500                 for(w=0; w < width; w++)
2501                 {
2502                         dest[2*w+0] = src1[w];
2503                         dest[2*w+1] = src2[w];
2504                 }
2505 #endif
2506                 dest += dstStride;
2507                 src1 += src1Stride;
2508                 src2 += src2Stride;
2509         }
2510 #ifdef HAVE_MMX
2511         asm(
2512                 EMMS" \n\t"
2513                 SFENCE" \n\t"
2514                 ::: "memory"
2515                 );
2516 #endif
2517 }
2518
2519 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2520                         uint8_t *dst1, uint8_t *dst2,
2521                         long width, long height,
2522                         long srcStride1, long srcStride2,
2523                         long dstStride1, long dstStride2)
2524 {
2525     long y,x,w,h;
2526     w=width/2; h=height/2;
2527 #ifdef HAVE_MMX
2528     asm volatile(
2529         PREFETCH" %0\n\t"
2530         PREFETCH" %1\n\t"
2531         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2532 #endif
2533     for(y=0;y<h;y++){
2534         const uint8_t* s1=src1+srcStride1*(y>>1);
2535         uint8_t* d=dst1+dstStride1*y;
2536         x=0;
2537 #ifdef HAVE_MMX
2538         for(;x<w-31;x+=32)
2539         {
2540             asm volatile(
2541                 PREFETCH" 32%1\n\t"
2542                 "movq   %1, %%mm0\n\t"
2543                 "movq   8%1, %%mm2\n\t"
2544                 "movq   16%1, %%mm4\n\t"
2545                 "movq   24%1, %%mm6\n\t"
2546                 "movq   %%mm0, %%mm1\n\t"
2547                 "movq   %%mm2, %%mm3\n\t"
2548                 "movq   %%mm4, %%mm5\n\t"
2549                 "movq   %%mm6, %%mm7\n\t"
2550                 "punpcklbw %%mm0, %%mm0\n\t"
2551                 "punpckhbw %%mm1, %%mm1\n\t"
2552                 "punpcklbw %%mm2, %%mm2\n\t"
2553                 "punpckhbw %%mm3, %%mm3\n\t"
2554                 "punpcklbw %%mm4, %%mm4\n\t"
2555                 "punpckhbw %%mm5, %%mm5\n\t"
2556                 "punpcklbw %%mm6, %%mm6\n\t"
2557                 "punpckhbw %%mm7, %%mm7\n\t"
2558                 MOVNTQ" %%mm0, %0\n\t"
2559                 MOVNTQ" %%mm1, 8%0\n\t"
2560                 MOVNTQ" %%mm2, 16%0\n\t"
2561                 MOVNTQ" %%mm3, 24%0\n\t"
2562                 MOVNTQ" %%mm4, 32%0\n\t"
2563                 MOVNTQ" %%mm5, 40%0\n\t"
2564                 MOVNTQ" %%mm6, 48%0\n\t"
2565                 MOVNTQ" %%mm7, 56%0"
2566                 :"=m"(d[2*x])
2567                 :"m"(s1[x])
2568                 :"memory");
2569         }
2570 #endif
2571         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2572     }
2573     for(y=0;y<h;y++){
2574         const uint8_t* s2=src2+srcStride2*(y>>1);
2575         uint8_t* d=dst2+dstStride2*y;
2576         x=0;
2577 #ifdef HAVE_MMX
2578         for(;x<w-31;x+=32)
2579         {
2580             asm volatile(
2581                 PREFETCH" 32%1\n\t"
2582                 "movq   %1, %%mm0\n\t"
2583                 "movq   8%1, %%mm2\n\t"
2584                 "movq   16%1, %%mm4\n\t"
2585                 "movq   24%1, %%mm6\n\t"
2586                 "movq   %%mm0, %%mm1\n\t"
2587                 "movq   %%mm2, %%mm3\n\t"
2588                 "movq   %%mm4, %%mm5\n\t"
2589                 "movq   %%mm6, %%mm7\n\t"
2590                 "punpcklbw %%mm0, %%mm0\n\t"
2591                 "punpckhbw %%mm1, %%mm1\n\t"
2592                 "punpcklbw %%mm2, %%mm2\n\t"
2593                 "punpckhbw %%mm3, %%mm3\n\t"
2594                 "punpcklbw %%mm4, %%mm4\n\t"
2595                 "punpckhbw %%mm5, %%mm5\n\t"
2596                 "punpcklbw %%mm6, %%mm6\n\t"
2597                 "punpckhbw %%mm7, %%mm7\n\t"
2598                 MOVNTQ" %%mm0, %0\n\t"
2599                 MOVNTQ" %%mm1, 8%0\n\t"
2600                 MOVNTQ" %%mm2, 16%0\n\t"
2601                 MOVNTQ" %%mm3, 24%0\n\t"
2602                 MOVNTQ" %%mm4, 32%0\n\t"
2603                 MOVNTQ" %%mm5, 40%0\n\t"
2604                 MOVNTQ" %%mm6, 48%0\n\t"
2605                 MOVNTQ" %%mm7, 56%0"
2606                 :"=m"(d[2*x])
2607                 :"m"(s2[x])
2608                 :"memory");
2609         }
2610 #endif
2611         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2612     }
2613 #ifdef HAVE_MMX
2614         asm(
2615                 EMMS" \n\t"
2616                 SFENCE" \n\t"
2617                 ::: "memory"
2618                 );
2619 #endif
2620 }
2621
2622 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2623                         uint8_t *dst,
2624                         long width, long height,
2625                         long srcStride1, long srcStride2,
2626                         long srcStride3, long dstStride)
2627 {
2628     long y,x,w,h;
2629     w=width/2; h=height;
2630     for(y=0;y<h;y++){
2631         const uint8_t* yp=src1+srcStride1*y;
2632         const uint8_t* up=src2+srcStride2*(y>>2);
2633         const uint8_t* vp=src3+srcStride3*(y>>2);
2634         uint8_t* d=dst+dstStride*y;
2635         x=0;
2636 #ifdef HAVE_MMX
2637         for(;x<w-7;x+=8)
2638         {
2639             asm volatile(
2640                 PREFETCH" 32(%1, %0)\n\t"
2641                 PREFETCH" 32(%2, %0)\n\t"
2642                 PREFETCH" 32(%3, %0)\n\t"
2643                 "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2644                 "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2645                 "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
2646                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2647                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2648                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2649                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2650                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2651                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2652                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2653
2654                 "movq   %%mm1, %%mm6\n\t"
2655                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2656                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2657                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2658                 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2659                 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2660                 
2661                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2662                 "movq   8(%1, %0, 4), %%mm0\n\t"
2663                 "movq   %%mm0, %%mm3\n\t"
2664                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2665                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2666                 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2667                 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2668
2669                 "movq   %%mm4, %%mm6\n\t"
2670                 "movq   16(%1, %0, 4), %%mm0\n\t"
2671                 "movq   %%mm0, %%mm3\n\t"
2672                 "punpcklbw %%mm5, %%mm4\n\t"
2673                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2674                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2675                 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2676                 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2677                 
2678                 "punpckhbw %%mm5, %%mm6\n\t"
2679                 "movq   24(%1, %0, 4), %%mm0\n\t"
2680                 "movq   %%mm0, %%mm3\n\t"
2681                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2682                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2683                 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2684                 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2685
2686                 : "+r" (x)
2687                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2688                 :"memory");
2689         }
2690 #endif
2691         for(; x<w; x++)
2692         {
2693             const long x2= x<<2;
2694             d[8*x+0]=yp[x2];
2695             d[8*x+1]=up[x];
2696             d[8*x+2]=yp[x2+1];
2697             d[8*x+3]=vp[x];
2698             d[8*x+4]=yp[x2+2];
2699             d[8*x+5]=up[x];
2700             d[8*x+6]=yp[x2+3];
2701             d[8*x+7]=vp[x];
2702         }
2703     }
2704 #ifdef HAVE_MMX
2705         asm(
2706                 EMMS" \n\t"
2707                 SFENCE" \n\t"
2708                 ::: "memory"
2709                 );
2710 #endif
2711 }
2712
2713 static inline void RENAME(rgb2rgb_init)(void){
2714         rgb15to16= RENAME(rgb15to16);
2715         rgb15to24= RENAME(rgb15to24);
2716         rgb15to32= RENAME(rgb15to32);
2717         rgb16to24= RENAME(rgb16to24);
2718         rgb16to32= RENAME(rgb16to32);
2719         rgb16to15= RENAME(rgb16to15);
2720         rgb24to16= RENAME(rgb24to16);
2721         rgb24to15= RENAME(rgb24to15);
2722         rgb24to32= RENAME(rgb24to32);
2723         rgb32to16= RENAME(rgb32to16);
2724         rgb32to15= RENAME(rgb32to15);
2725         rgb32to24= RENAME(rgb32to24);
2726         rgb24tobgr15= RENAME(rgb24tobgr15);
2727         rgb24tobgr16= RENAME(rgb24tobgr16);
2728         rgb24tobgr24= RENAME(rgb24tobgr24);
2729         rgb32tobgr32= RENAME(rgb32tobgr32);
2730         rgb32tobgr16= RENAME(rgb32tobgr16);
2731         rgb32tobgr15= RENAME(rgb32tobgr15);
2732         yv12toyuy2= RENAME(yv12toyuy2);
2733         yv12touyvy= RENAME(yv12touyvy);
2734         yuv422ptoyuy2= RENAME(yuv422ptoyuy2);
2735         yuy2toyv12= RENAME(yuy2toyv12);
2736 //      uyvytoyv12= RENAME(uyvytoyv12);
2737 //      yvu9toyv12= RENAME(yvu9toyv12);
2738         planar2x= RENAME(planar2x);
2739         rgb24toyv12= RENAME(rgb24toyv12);
2740         interleaveBytes= RENAME(interleaveBytes);
2741         vu9_to_vu12= RENAME(vu9_to_vu12);
2742         yvu9_to_yuy2= RENAME(yvu9_to_yuy2);
2743 }