libswscale/rgb2rgb_template.c
1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB converter
4  *  plus          Software PAL8 to RGB converter
5  *                Software YUV to YUV converter
6  *                Software YUV to RGB converter
7  *  Written by Nick Kurshev.
8  *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
9  *  lots of big-endian byte-order fixes by Alex Beregszaszi
10  *
11  * This file is part of FFmpeg.
12  *
13  * FFmpeg is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * FFmpeg is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with FFmpeg; if not, write to the Free Software
25  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26  */
27
28 #include <stddef.h>
29 #include <inttypes.h> /* for __WORDSIZE */
30
31 #ifndef __WORDSIZE
32 // #warning You have a misconfigured system and will probably lose performance!
33 #define __WORDSIZE MP_WORDSIZE
34 #endif
35
36 #undef PREFETCH
37 #undef MOVNTQ
38 #undef EMMS
39 #undef SFENCE
40 #undef MMREG_SIZE
41 #undef PREFETCHW
42 #undef PAVGB
43
44 #ifdef HAVE_SSE2
45 #define MMREG_SIZE 16
46 #else
47 #define MMREG_SIZE 8
48 #endif
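/* width in bytes of one SIMD register: 16 for SSE2 (xmm), 8 for MMX (mm) */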
49
50 #ifdef HAVE_3DNOW
51 #define PREFETCH  "prefetch"
52 #define PREFETCHW "prefetchw"
53 #define PAVGB     "pavgusb"
54 #elif defined ( HAVE_MMX2 )
55 #define PREFETCH "prefetchnta"
56 #define PREFETCHW "prefetcht0"
57 #define PAVGB     "pavgb"
58 #else
59 #ifdef __APPLE__
60 #define PREFETCH "#"
61 #define PREFETCHW "#"
62 #else
63 #define PREFETCH  " # nop"
64 #define PREFETCHW " # nop"
65 #endif
66 #endif
67
68 #ifdef HAVE_3DNOW
69 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
70 #define EMMS     "femms"
71 #else
72 #define EMMS     "emms"
73 #endif
74
75 #ifdef HAVE_MMX2
76 #define MOVNTQ "movntq"
77 #define SFENCE "sfence"
78 #else
79 #define MOVNTQ "movq"
80 #define SFENCE " # nop"
81 #endif
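/* MOVNTQ is a non-temporal store that bypasses the cache, so it has to be
   followed by SFENCE before the written data is relied upon; without MMX2
   both degrade to a plain movq and a no-op. */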
82
83 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
84 {
85   uint8_t *dest = dst;
86   const uint8_t *s = src;
87   const uint8_t *end;
88 #ifdef HAVE_MMX
89   const uint8_t *mm_end;
90 #endif
91   end = s + src_size;
92 #ifdef HAVE_MMX
93   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
94   mm_end = end - 23;
95   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
96   while(s < mm_end)
97   {
98     __asm __volatile(
99         PREFETCH"       32%1\n\t"
100         "movd   %1, %%mm0\n\t"
101         "punpckldq 3%1, %%mm0\n\t"
102         "movd   6%1, %%mm1\n\t"
103         "punpckldq 9%1, %%mm1\n\t"
104         "movd   12%1, %%mm2\n\t"
105         "punpckldq 15%1, %%mm2\n\t"
106         "movd   18%1, %%mm3\n\t"
107         "punpckldq 21%1, %%mm3\n\t"
108         "pand   %%mm7, %%mm0\n\t"
109         "pand   %%mm7, %%mm1\n\t"
110         "pand   %%mm7, %%mm2\n\t"
111         "pand   %%mm7, %%mm3\n\t"
112         MOVNTQ" %%mm0, %0\n\t"
113         MOVNTQ" %%mm1, 8%0\n\t"
114         MOVNTQ" %%mm2, 16%0\n\t"
115         MOVNTQ" %%mm3, 24%0"
116         :"=m"(*dest)
117         :"m"(*s)
118         :"memory");
119     dest += 32;
120     s += 24;
121   }
122   __asm __volatile(SFENCE:::"memory");
123   __asm __volatile(EMMS:::"memory");
124 #endif
125   while(s < end)
126   {
127 #ifdef WORDS_BIGENDIAN
128     /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
129     *dest++ = 0;
130     *dest++ = s[2];
131     *dest++ = s[1];
132     *dest++ = s[0];
133     s+=3;
134 #else
135     *dest++ = *s++;
136     *dest++ = *s++;
137     *dest++ = *s++;
138     *dest++ = 0;
139 #endif
140   }
141 }
142
143 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
144 {
145   uint8_t *dest = dst;
146   const uint8_t *s = src;
147   const uint8_t *end;
148 #ifdef HAVE_MMX
149   const uint8_t *mm_end;
150 #endif
151   end = s + src_size;
152 #ifdef HAVE_MMX
153   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
154   mm_end = end - 31;
155   while(s < mm_end)
156   {
157     __asm __volatile(
158         PREFETCH"       32%1\n\t"
159         "movq   %1, %%mm0\n\t"
160         "movq   8%1, %%mm1\n\t"
161         "movq   16%1, %%mm4\n\t"
162         "movq   24%1, %%mm5\n\t"
163         "movq   %%mm0, %%mm2\n\t"
164         "movq   %%mm1, %%mm3\n\t"
165         "movq   %%mm4, %%mm6\n\t"
166         "movq   %%mm5, %%mm7\n\t"
167         "psrlq  $8, %%mm2\n\t"
168         "psrlq  $8, %%mm3\n\t"
169         "psrlq  $8, %%mm6\n\t"
170         "psrlq  $8, %%mm7\n\t"
171         "pand   %2, %%mm0\n\t"
172         "pand   %2, %%mm1\n\t"
173         "pand   %2, %%mm4\n\t"
174         "pand   %2, %%mm5\n\t"
175         "pand   %3, %%mm2\n\t"
176         "pand   %3, %%mm3\n\t"
177         "pand   %3, %%mm6\n\t"
178         "pand   %3, %%mm7\n\t"
179         "por    %%mm2, %%mm0\n\t"
180         "por    %%mm3, %%mm1\n\t"
181         "por    %%mm6, %%mm4\n\t"
182         "por    %%mm7, %%mm5\n\t"
183
184         "movq   %%mm1, %%mm2\n\t"
185         "movq   %%mm4, %%mm3\n\t"
186         "psllq  $48, %%mm2\n\t"
187         "psllq  $32, %%mm3\n\t"
188         "pand   %4, %%mm2\n\t"
189         "pand   %5, %%mm3\n\t"
190         "por    %%mm2, %%mm0\n\t"
191         "psrlq  $16, %%mm1\n\t"
192         "psrlq  $32, %%mm4\n\t"
193         "psllq  $16, %%mm5\n\t"
194         "por    %%mm3, %%mm1\n\t"
195         "pand   %6, %%mm5\n\t"
196         "por    %%mm5, %%mm4\n\t"
197
198         MOVNTQ" %%mm0, %0\n\t"
199         MOVNTQ" %%mm1, 8%0\n\t"
200         MOVNTQ" %%mm4, 16%0"
201         :"=m"(*dest)
202         :"m"(*s),"m"(mask24l),
203          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
204         :"memory");
205     dest += 24;
206     s += 32;
207   }
208   __asm __volatile(SFENCE:::"memory");
209   __asm __volatile(EMMS:::"memory");
210 #endif
211   while(s < end)
212   {
213 #ifdef WORDS_BIGENDIAN
214     /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
215     s++;
216     dest[2] = *s++;
217     dest[1] = *s++;
218     dest[0] = *s++;
219     dest += 3;
220 #else
221     *dest++ = *s++;
222     *dest++ = *s++;
223     *dest++ = *s++;
224     s++;
225 #endif
226   }
227 }
228
229 /*
230  Original by Strepto/Astral
231  ported to gcc & bugfixed: A'rpi
232  MMX2, 3DNOW optimization by Nick Kurshev
233  32-bit C version, and the and&add trick by Michael Niedermayer
234 */
235 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
236 {
237   register const uint8_t* s=src;
238   register uint8_t* d=dst;
239   register const uint8_t *end;
240   const uint8_t *mm_end;
241   end = s + src_size;
242 #ifdef HAVE_MMX
243   __asm __volatile(PREFETCH"    %0"::"m"(*s));
244   __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
245   mm_end = end - 15;
246   while(s<mm_end)
247   {
248         __asm __volatile(
249                 PREFETCH"       32%1\n\t"
250                 "movq   %1, %%mm0\n\t"
251                 "movq   8%1, %%mm2\n\t"
252                 "movq   %%mm0, %%mm1\n\t"
253                 "movq   %%mm2, %%mm3\n\t"
254                 "pand   %%mm4, %%mm0\n\t"
255                 "pand   %%mm4, %%mm2\n\t"
256                 "paddw  %%mm1, %%mm0\n\t"
257                 "paddw  %%mm3, %%mm2\n\t"
258                 MOVNTQ" %%mm0, %0\n\t"
259                 MOVNTQ" %%mm2, 8%0"
260                 :"=m"(*d)
261                 :"m"(*s)
262                 );
263         d+=16;
264         s+=16;
265   }
266   __asm __volatile(SFENCE:::"memory");
267   __asm __volatile(EMMS:::"memory");
268 #endif
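    /* and&add trick: x & 0x7FE0 selects the two upper 5-bit fields of an RGB15
       word, and adding that to x & 0x7FFF doubles it, i.e. shifts those fields
       up by one bit into the RGB16 layout while the low 5 bits stay in place. */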
269     mm_end = end - 3;
270     while(s < mm_end)
271     {
272         register unsigned x= *((uint32_t *)s);
273         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
274         d+=4;
275         s+=4;
276     }
277     if(s < end)
278     {
279         register unsigned short x= *((uint16_t *)s);
280         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
281     }
282 }
283
284 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
285 {
286   register const uint8_t* s=src;
287   register uint8_t* d=dst;
288   register const uint8_t *end;
289   const uint8_t *mm_end;
290   end = s + src_size;
291 #ifdef HAVE_MMX
292   __asm __volatile(PREFETCH"    %0"::"m"(*s));
293   __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
294   __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
295   mm_end = end - 15;
296   while(s<mm_end)
297   {
298         __asm __volatile(
299                 PREFETCH"       32%1\n\t"
300                 "movq   %1, %%mm0\n\t"
301                 "movq   8%1, %%mm2\n\t"
302                 "movq   %%mm0, %%mm1\n\t"
303                 "movq   %%mm2, %%mm3\n\t"
304                 "psrlq  $1, %%mm0\n\t"
305                 "psrlq  $1, %%mm2\n\t"
306                 "pand   %%mm7, %%mm0\n\t"
307                 "pand   %%mm7, %%mm2\n\t"
308                 "pand   %%mm6, %%mm1\n\t"
309                 "pand   %%mm6, %%mm3\n\t"
310                 "por    %%mm1, %%mm0\n\t"
311                 "por    %%mm3, %%mm2\n\t"
312                 MOVNTQ" %%mm0, %0\n\t"
313                 MOVNTQ" %%mm2, 8%0"
314                 :"=m"(*d)
315                 :"m"(*s)
316                 );
317         d+=16;
318         s+=16;
319   }
320   __asm __volatile(SFENCE:::"memory");
321   __asm __volatile(EMMS:::"memory");
322 #endif
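    /* 16 -> 15 bpp: shift the two upper fields right by one bit (dropping the
       least significant green bit) and keep the low 5-bit field unchanged. */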
323     mm_end = end - 3;
324     while(s < mm_end)
325     {
326         register uint32_t x= *((uint32_t *)s);
327         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
328         s+=4;
329         d+=4;
330     }
331     if(s < end)
332     {
333         register uint16_t x= *((uint16_t *)s);
334         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
335         s+=2;
336         d+=2;
337     }
338 }
339
340 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
341 {
342         const uint8_t *s = src;
343         const uint8_t *end;
344 #ifdef HAVE_MMX
345         const uint8_t *mm_end;
346 #endif
347         uint16_t *d = (uint16_t *)dst;
348         end = s + src_size;
349 #ifdef HAVE_MMX
350         mm_end = end - 15;
351 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
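        /* green is masked out separately; pmaddwd then scales the blue and red
           fields into place with a single multiply-add per register, replacing
           the longer shift/mask sequence of the #else branch below. */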
352         asm volatile(
353                 "movq %3, %%mm5                 \n\t"
354                 "movq %4, %%mm6                 \n\t"
355                 "movq %5, %%mm7                 \n\t"
356                 ASMALIGN(4)
357                 "1:                             \n\t"
358                 PREFETCH" 32(%1)                \n\t"
359                 "movd   (%1), %%mm0             \n\t"
360                 "movd   4(%1), %%mm3            \n\t"
361                 "punpckldq 8(%1), %%mm0         \n\t"
362                 "punpckldq 12(%1), %%mm3        \n\t"
363                 "movq %%mm0, %%mm1              \n\t"
364                 "movq %%mm3, %%mm4              \n\t"
365                 "pand %%mm6, %%mm0              \n\t"
366                 "pand %%mm6, %%mm3              \n\t"
367                 "pmaddwd %%mm7, %%mm0           \n\t"
368                 "pmaddwd %%mm7, %%mm3           \n\t"
369                 "pand %%mm5, %%mm1              \n\t"
370                 "pand %%mm5, %%mm4              \n\t"
371                 "por %%mm1, %%mm0               \n\t"   
372                 "por %%mm4, %%mm3               \n\t"
373                 "psrld $5, %%mm0                \n\t"
374                 "pslld $11, %%mm3               \n\t"
375                 "por %%mm3, %%mm0               \n\t"
376                 MOVNTQ" %%mm0, (%0)             \n\t"
377                 "add $16, %1                    \n\t"
378                 "add $8, %0                     \n\t"
379                 "cmp %2, %1                     \n\t"
380                 " jb 1b                         \n\t"
381                 : "+r" (d), "+r"(s)
382                 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
383         );
384 #else
385         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
386         __asm __volatile(
387             "movq       %0, %%mm7\n\t"
388             "movq       %1, %%mm6\n\t"
389             ::"m"(red_16mask),"m"(green_16mask));
390         while(s < mm_end)
391         {
392             __asm __volatile(
393                 PREFETCH" 32%1\n\t"
394                 "movd   %1, %%mm0\n\t"
395                 "movd   4%1, %%mm3\n\t"
396                 "punpckldq 8%1, %%mm0\n\t"
397                 "punpckldq 12%1, %%mm3\n\t"
398                 "movq   %%mm0, %%mm1\n\t"
399                 "movq   %%mm0, %%mm2\n\t"
400                 "movq   %%mm3, %%mm4\n\t"
401                 "movq   %%mm3, %%mm5\n\t"
402                 "psrlq  $3, %%mm0\n\t"
403                 "psrlq  $3, %%mm3\n\t"
404                 "pand   %2, %%mm0\n\t"
405                 "pand   %2, %%mm3\n\t"
406                 "psrlq  $5, %%mm1\n\t"
407                 "psrlq  $5, %%mm4\n\t"
408                 "pand   %%mm6, %%mm1\n\t"
409                 "pand   %%mm6, %%mm4\n\t"
410                 "psrlq  $8, %%mm2\n\t"
411                 "psrlq  $8, %%mm5\n\t"
412                 "pand   %%mm7, %%mm2\n\t"
413                 "pand   %%mm7, %%mm5\n\t"
414                 "por    %%mm1, %%mm0\n\t"
415                 "por    %%mm4, %%mm3\n\t"
416                 "por    %%mm2, %%mm0\n\t"
417                 "por    %%mm5, %%mm3\n\t"
418                 "psllq  $16, %%mm3\n\t"
419                 "por    %%mm3, %%mm0\n\t"
420                 MOVNTQ" %%mm0, %0\n\t"
421                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
422                 d += 4;
423                 s += 16;
424         }
425 #endif
426         __asm __volatile(SFENCE:::"memory");
427         __asm __volatile(EMMS:::"memory");
428 #endif
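        /* scalar tail: take the top 5, 6 and 5 bits of the three colour bytes
           and pack them into bits 0-4, 5-10 and 11-15 of the 16-bit result */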
429         while(s < end)
430         {
431                 register int rgb = *(uint32_t*)s; s += 4;
432                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
433         }
434 }
435
436 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
437 {
438         const uint8_t *s = src;
439         const uint8_t *end;
440 #ifdef HAVE_MMX
441         const uint8_t *mm_end;
442 #endif
443         uint16_t *d = (uint16_t *)dst;
444         end = s + src_size;
445 #ifdef HAVE_MMX
446         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
447         __asm __volatile(
448             "movq       %0, %%mm7\n\t"
449             "movq       %1, %%mm6\n\t"
450             ::"m"(red_16mask),"m"(green_16mask));
451         mm_end = end - 15;
452         while(s < mm_end)
453         {
454             __asm __volatile(
455                 PREFETCH" 32%1\n\t"
456                 "movd   %1, %%mm0\n\t"
457                 "movd   4%1, %%mm3\n\t"
458                 "punpckldq 8%1, %%mm0\n\t"
459                 "punpckldq 12%1, %%mm3\n\t"
460                 "movq   %%mm0, %%mm1\n\t"
461                 "movq   %%mm0, %%mm2\n\t"
462                 "movq   %%mm3, %%mm4\n\t"
463                 "movq   %%mm3, %%mm5\n\t"
464                 "psllq  $8, %%mm0\n\t"
465                 "psllq  $8, %%mm3\n\t"
466                 "pand   %%mm7, %%mm0\n\t"
467                 "pand   %%mm7, %%mm3\n\t"
468                 "psrlq  $5, %%mm1\n\t"
469                 "psrlq  $5, %%mm4\n\t"
470                 "pand   %%mm6, %%mm1\n\t"
471                 "pand   %%mm6, %%mm4\n\t"
472                 "psrlq  $19, %%mm2\n\t"
473                 "psrlq  $19, %%mm5\n\t"
474                 "pand   %2, %%mm2\n\t"
475                 "pand   %2, %%mm5\n\t"
476                 "por    %%mm1, %%mm0\n\t"
477                 "por    %%mm4, %%mm3\n\t"
478                 "por    %%mm2, %%mm0\n\t"
479                 "por    %%mm5, %%mm3\n\t"
480                 "psllq  $16, %%mm3\n\t"
481                 "por    %%mm3, %%mm0\n\t"
482                 MOVNTQ" %%mm0, %0\n\t"
483                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
484                 d += 4;
485                 s += 16;
486         }
487         __asm __volatile(SFENCE:::"memory");
488         __asm __volatile(EMMS:::"memory");
489 #endif
490         while(s < end)
491         {
492                 register int rgb = *(uint32_t*)s; s += 4;
493                 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
494         }
495 }
496
497 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
498 {
499         const uint8_t *s = src;
500         const uint8_t *end;
501 #ifdef HAVE_MMX
502         const uint8_t *mm_end;
503 #endif
504         uint16_t *d = (uint16_t *)dst;
505         end = s + src_size;
506 #ifdef HAVE_MMX
507         mm_end = end - 15;
508 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
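        /* same pmaddwd approach as in rgb32to16 above, but packing to 5:5:5 */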
509         asm volatile(
510                 "movq %3, %%mm5                 \n\t"
511                 "movq %4, %%mm6                 \n\t"
512                 "movq %5, %%mm7                 \n\t"
513                 ASMALIGN(4)
514                 "1:                             \n\t"
515                 PREFETCH" 32(%1)                \n\t"
516                 "movd   (%1), %%mm0             \n\t"
517                 "movd   4(%1), %%mm3            \n\t"
518                 "punpckldq 8(%1), %%mm0         \n\t"
519                 "punpckldq 12(%1), %%mm3        \n\t"
520                 "movq %%mm0, %%mm1              \n\t"
521                 "movq %%mm3, %%mm4              \n\t"
522                 "pand %%mm6, %%mm0              \n\t"
523                 "pand %%mm6, %%mm3              \n\t"
524                 "pmaddwd %%mm7, %%mm0           \n\t"
525                 "pmaddwd %%mm7, %%mm3           \n\t"
526                 "pand %%mm5, %%mm1              \n\t"
527                 "pand %%mm5, %%mm4              \n\t"
528                 "por %%mm1, %%mm0               \n\t"   
529                 "por %%mm4, %%mm3               \n\t"
530                 "psrld $6, %%mm0                \n\t"
531                 "pslld $10, %%mm3               \n\t"
532                 "por %%mm3, %%mm0               \n\t"
533                 MOVNTQ" %%mm0, (%0)             \n\t"
534                 "add $16, %1                    \n\t"
535                 "add $8, %0                     \n\t"
536                 "cmp %2, %1                     \n\t"
537                 " jb 1b                         \n\t"
538                 : "+r" (d), "+r"(s)
539                 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
540         );
541 #else
542         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
543         __asm __volatile(
544             "movq       %0, %%mm7\n\t"
545             "movq       %1, %%mm6\n\t"
546             ::"m"(red_15mask),"m"(green_15mask));
547         while(s < mm_end)
548         {
549             __asm __volatile(
550                 PREFETCH" 32%1\n\t"
551                 "movd   %1, %%mm0\n\t"
552                 "movd   4%1, %%mm3\n\t"
553                 "punpckldq 8%1, %%mm0\n\t"
554                 "punpckldq 12%1, %%mm3\n\t"
555                 "movq   %%mm0, %%mm1\n\t"
556                 "movq   %%mm0, %%mm2\n\t"
557                 "movq   %%mm3, %%mm4\n\t"
558                 "movq   %%mm3, %%mm5\n\t"
559                 "psrlq  $3, %%mm0\n\t"
560                 "psrlq  $3, %%mm3\n\t"
561                 "pand   %2, %%mm0\n\t"
562                 "pand   %2, %%mm3\n\t"
563                 "psrlq  $6, %%mm1\n\t"
564                 "psrlq  $6, %%mm4\n\t"
565                 "pand   %%mm6, %%mm1\n\t"
566                 "pand   %%mm6, %%mm4\n\t"
567                 "psrlq  $9, %%mm2\n\t"
568                 "psrlq  $9, %%mm5\n\t"
569                 "pand   %%mm7, %%mm2\n\t"
570                 "pand   %%mm7, %%mm5\n\t"
571                 "por    %%mm1, %%mm0\n\t"
572                 "por    %%mm4, %%mm3\n\t"
573                 "por    %%mm2, %%mm0\n\t"
574                 "por    %%mm5, %%mm3\n\t"
575                 "psllq  $16, %%mm3\n\t"
576                 "por    %%mm3, %%mm0\n\t"
577                 MOVNTQ" %%mm0, %0\n\t"
578                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
579                 d += 4;
580                 s += 16;
581         }
582 #endif
583         __asm __volatile(SFENCE:::"memory");
584         __asm __volatile(EMMS:::"memory");
585 #endif
586         while(s < end)
587         {
588                 register int rgb = *(uint32_t*)s; s += 4;
589                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
590         }
591 }
592
593 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
594 {
595         const uint8_t *s = src;
596         const uint8_t *end;
597 #ifdef HAVE_MMX
598         const uint8_t *mm_end;
599 #endif
600         uint16_t *d = (uint16_t *)dst;
601         end = s + src_size;
602 #ifdef HAVE_MMX
603         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
604         __asm __volatile(
605             "movq       %0, %%mm7\n\t"
606             "movq       %1, %%mm6\n\t"
607             ::"m"(red_15mask),"m"(green_15mask));
608         mm_end = end - 15;
609         while(s < mm_end)
610         {
611             __asm __volatile(
612                 PREFETCH" 32%1\n\t"
613                 "movd   %1, %%mm0\n\t"
614                 "movd   4%1, %%mm3\n\t"
615                 "punpckldq 8%1, %%mm0\n\t"
616                 "punpckldq 12%1, %%mm3\n\t"
617                 "movq   %%mm0, %%mm1\n\t"
618                 "movq   %%mm0, %%mm2\n\t"
619                 "movq   %%mm3, %%mm4\n\t"
620                 "movq   %%mm3, %%mm5\n\t"
621                 "psllq  $7, %%mm0\n\t"
622                 "psllq  $7, %%mm3\n\t"
623                 "pand   %%mm7, %%mm0\n\t"
624                 "pand   %%mm7, %%mm3\n\t"
625                 "psrlq  $6, %%mm1\n\t"
626                 "psrlq  $6, %%mm4\n\t"
627                 "pand   %%mm6, %%mm1\n\t"
628                 "pand   %%mm6, %%mm4\n\t"
629                 "psrlq  $19, %%mm2\n\t"
630                 "psrlq  $19, %%mm5\n\t"
631                 "pand   %2, %%mm2\n\t"
632                 "pand   %2, %%mm5\n\t"
633                 "por    %%mm1, %%mm0\n\t"
634                 "por    %%mm4, %%mm3\n\t"
635                 "por    %%mm2, %%mm0\n\t"
636                 "por    %%mm5, %%mm3\n\t"
637                 "psllq  $16, %%mm3\n\t"
638                 "por    %%mm3, %%mm0\n\t"
639                 MOVNTQ" %%mm0, %0\n\t"
640                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
641                 d += 4;
642                 s += 16;
643         }
644         __asm __volatile(SFENCE:::"memory");
645         __asm __volatile(EMMS:::"memory");
646 #endif
647         while(s < end)
648         {
649                 register int rgb = *(uint32_t*)s; s += 4;
650                 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
651         }
652 }
653
654 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
655 {
656         const uint8_t *s = src;
657         const uint8_t *end;
658 #ifdef HAVE_MMX
659         const uint8_t *mm_end;
660 #endif
661         uint16_t *d = (uint16_t *)dst;
662         end = s + src_size;
663 #ifdef HAVE_MMX
664         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
665         __asm __volatile(
666             "movq       %0, %%mm7\n\t"
667             "movq       %1, %%mm6\n\t"
668             ::"m"(red_16mask),"m"(green_16mask));
669         mm_end = end - 11;
670         while(s < mm_end)
671         {
672             __asm __volatile(
673                 PREFETCH" 32%1\n\t"
674                 "movd   %1, %%mm0\n\t"
675                 "movd   3%1, %%mm3\n\t"
676                 "punpckldq 6%1, %%mm0\n\t"
677                 "punpckldq 9%1, %%mm3\n\t"
678                 "movq   %%mm0, %%mm1\n\t"
679                 "movq   %%mm0, %%mm2\n\t"
680                 "movq   %%mm3, %%mm4\n\t"
681                 "movq   %%mm3, %%mm5\n\t"
682                 "psrlq  $3, %%mm0\n\t"
683                 "psrlq  $3, %%mm3\n\t"
684                 "pand   %2, %%mm0\n\t"
685                 "pand   %2, %%mm3\n\t"
686                 "psrlq  $5, %%mm1\n\t"
687                 "psrlq  $5, %%mm4\n\t"
688                 "pand   %%mm6, %%mm1\n\t"
689                 "pand   %%mm6, %%mm4\n\t"
690                 "psrlq  $8, %%mm2\n\t"
691                 "psrlq  $8, %%mm5\n\t"
692                 "pand   %%mm7, %%mm2\n\t"
693                 "pand   %%mm7, %%mm5\n\t"
694                 "por    %%mm1, %%mm0\n\t"
695                 "por    %%mm4, %%mm3\n\t"
696                 "por    %%mm2, %%mm0\n\t"
697                 "por    %%mm5, %%mm3\n\t"
698                 "psllq  $16, %%mm3\n\t"
699                 "por    %%mm3, %%mm0\n\t"
700                 MOVNTQ" %%mm0, %0\n\t"
701                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
702                 d += 4;
703                 s += 12;
704         }
705         __asm __volatile(SFENCE:::"memory");
706         __asm __volatile(EMMS:::"memory");
707 #endif
708         while(s < end)
709         {
710                 const int b= *s++;
711                 const int g= *s++;
712                 const int r= *s++;
713                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
714         }
715 }
716
717 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
718 {
719         const uint8_t *s = src;
720         const uint8_t *end;
721 #ifdef HAVE_MMX
722         const uint8_t *mm_end;
723 #endif
724         uint16_t *d = (uint16_t *)dst;
725         end = s + src_size;
726 #ifdef HAVE_MMX
727         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
728         __asm __volatile(
729             "movq       %0, %%mm7\n\t"
730             "movq       %1, %%mm6\n\t"
731             ::"m"(red_16mask),"m"(green_16mask));
732         mm_end = end - 15;
733         while(s < mm_end)
734         {
735             __asm __volatile(
736                 PREFETCH" 32%1\n\t"
737                 "movd   %1, %%mm0\n\t"
738                 "movd   3%1, %%mm3\n\t"
739                 "punpckldq 6%1, %%mm0\n\t"
740                 "punpckldq 9%1, %%mm3\n\t"
741                 "movq   %%mm0, %%mm1\n\t"
742                 "movq   %%mm0, %%mm2\n\t"
743                 "movq   %%mm3, %%mm4\n\t"
744                 "movq   %%mm3, %%mm5\n\t"
745                 "psllq  $8, %%mm0\n\t"
746                 "psllq  $8, %%mm3\n\t"
747                 "pand   %%mm7, %%mm0\n\t"
748                 "pand   %%mm7, %%mm3\n\t"
749                 "psrlq  $5, %%mm1\n\t"
750                 "psrlq  $5, %%mm4\n\t"
751                 "pand   %%mm6, %%mm1\n\t"
752                 "pand   %%mm6, %%mm4\n\t"
753                 "psrlq  $19, %%mm2\n\t"
754                 "psrlq  $19, %%mm5\n\t"
755                 "pand   %2, %%mm2\n\t"
756                 "pand   %2, %%mm5\n\t"
757                 "por    %%mm1, %%mm0\n\t"
758                 "por    %%mm4, %%mm3\n\t"
759                 "por    %%mm2, %%mm0\n\t"
760                 "por    %%mm5, %%mm3\n\t"
761                 "psllq  $16, %%mm3\n\t"
762                 "por    %%mm3, %%mm0\n\t"
763                 MOVNTQ" %%mm0, %0\n\t"
764                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
765                 d += 4;
766                 s += 12;
767         }
768         __asm __volatile(SFENCE:::"memory");
769         __asm __volatile(EMMS:::"memory");
770 #endif
771         while(s < end)
772         {
773                 const int r= *s++;
774                 const int g= *s++;
775                 const int b= *s++;
776                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
777         }
778 }
779
780 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
781 {
782         const uint8_t *s = src;
783         const uint8_t *end;
784 #ifdef HAVE_MMX
785         const uint8_t *mm_end;
786 #endif
787         uint16_t *d = (uint16_t *)dst;
788         end = s + src_size;
789 #ifdef HAVE_MMX
790         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
791         __asm __volatile(
792             "movq       %0, %%mm7\n\t"
793             "movq       %1, %%mm6\n\t"
794             ::"m"(red_15mask),"m"(green_15mask));
795         mm_end = end - 11;
796         while(s < mm_end)
797         {
798             __asm __volatile(
799                 PREFETCH" 32%1\n\t"
800                 "movd   %1, %%mm0\n\t"
801                 "movd   3%1, %%mm3\n\t"
802                 "punpckldq 6%1, %%mm0\n\t"
803                 "punpckldq 9%1, %%mm3\n\t"
804                 "movq   %%mm0, %%mm1\n\t"
805                 "movq   %%mm0, %%mm2\n\t"
806                 "movq   %%mm3, %%mm4\n\t"
807                 "movq   %%mm3, %%mm5\n\t"
808                 "psrlq  $3, %%mm0\n\t"
809                 "psrlq  $3, %%mm3\n\t"
810                 "pand   %2, %%mm0\n\t"
811                 "pand   %2, %%mm3\n\t"
812                 "psrlq  $6, %%mm1\n\t"
813                 "psrlq  $6, %%mm4\n\t"
814                 "pand   %%mm6, %%mm1\n\t"
815                 "pand   %%mm6, %%mm4\n\t"
816                 "psrlq  $9, %%mm2\n\t"
817                 "psrlq  $9, %%mm5\n\t"
818                 "pand   %%mm7, %%mm2\n\t"
819                 "pand   %%mm7, %%mm5\n\t"
820                 "por    %%mm1, %%mm0\n\t"
821                 "por    %%mm4, %%mm3\n\t"
822                 "por    %%mm2, %%mm0\n\t"
823                 "por    %%mm5, %%mm3\n\t"
824                 "psllq  $16, %%mm3\n\t"
825                 "por    %%mm3, %%mm0\n\t"
826                 MOVNTQ" %%mm0, %0\n\t"
827                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
828                 d += 4;
829                 s += 12;
830         }
831         __asm __volatile(SFENCE:::"memory");
832         __asm __volatile(EMMS:::"memory");
833 #endif
834         while(s < end)
835         {
836                 const int b= *s++;
837                 const int g= *s++;
838                 const int r= *s++;
839                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
840         }
841 }
842
843 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
844 {
845         const uint8_t *s = src;
846         const uint8_t *end;
847 #ifdef HAVE_MMX
848         const uint8_t *mm_end;
849 #endif
850         uint16_t *d = (uint16_t *)dst;
851         end = s + src_size;
852 #ifdef HAVE_MMX
853         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
854         __asm __volatile(
855             "movq       %0, %%mm7\n\t"
856             "movq       %1, %%mm6\n\t"
857             ::"m"(red_15mask),"m"(green_15mask));
858         mm_end = end - 15;
859         while(s < mm_end)
860         {
861             __asm __volatile(
862                 PREFETCH" 32%1\n\t"
863                 "movd   %1, %%mm0\n\t"
864                 "movd   3%1, %%mm3\n\t"
865                 "punpckldq 6%1, %%mm0\n\t"
866                 "punpckldq 9%1, %%mm3\n\t"
867                 "movq   %%mm0, %%mm1\n\t"
868                 "movq   %%mm0, %%mm2\n\t"
869                 "movq   %%mm3, %%mm4\n\t"
870                 "movq   %%mm3, %%mm5\n\t"
871                 "psllq  $7, %%mm0\n\t"
872                 "psllq  $7, %%mm3\n\t"
873                 "pand   %%mm7, %%mm0\n\t"
874                 "pand   %%mm7, %%mm3\n\t"
875                 "psrlq  $6, %%mm1\n\t"
876                 "psrlq  $6, %%mm4\n\t"
877                 "pand   %%mm6, %%mm1\n\t"
878                 "pand   %%mm6, %%mm4\n\t"
879                 "psrlq  $19, %%mm2\n\t"
880                 "psrlq  $19, %%mm5\n\t"
881                 "pand   %2, %%mm2\n\t"
882                 "pand   %2, %%mm5\n\t"
883                 "por    %%mm1, %%mm0\n\t"
884                 "por    %%mm4, %%mm3\n\t"
885                 "por    %%mm2, %%mm0\n\t"
886                 "por    %%mm5, %%mm3\n\t"
887                 "psllq  $16, %%mm3\n\t"
888                 "por    %%mm3, %%mm0\n\t"
889                 MOVNTQ" %%mm0, %0\n\t"
890                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
891                 d += 4;
892                 s += 12;
893         }
894         __asm __volatile(SFENCE:::"memory");
895         __asm __volatile(EMMS:::"memory");
896 #endif
897         while(s < end)
898         {
899                 const int r= *s++;
900                 const int g= *s++;
901                 const int b= *s++;
902                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
903         }
904 }
905
906 /*
907   I use here a less accurate approximation by simply
908  left-shifting the input
909   value and filling the low order bits with
910  zeroes. This method improves PNG
911   compression but this scheme cannot reproduce white exactly, since it does not
912   generate an all-ones maximum value; the net effect is to darken the
913   image slightly.
914
915   A better method would be "left bit replication":
916
917    4 3 2 1 0
918    ---------
919    1 1 0 1 1
920
921    7 6 5 4 3  2 1 0
922    ----------------
923    1 1 0 1 1  1 1 0
924    |=======|  |===|
925        |      Leftmost Bits Repeated to Fill Open Bits
926        |
927    Original Bits
928 */
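/* Illustrative sketch (not part of the original code): with bit replication a
   5-bit channel value x (0..31) would expand to 8 bits as
       (x << 3) | (x >> 2)
   whereas the conversions below only compute (x << 3), leaving the low bits 0. */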
929 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
930 {
931         const uint16_t *end;
932 #ifdef HAVE_MMX
933         const uint16_t *mm_end;
934 #endif
935         uint8_t *d = (uint8_t *)dst;
936         const uint16_t *s = (uint16_t *)src;
937         end = s + src_size/2;
938 #ifdef HAVE_MMX
939         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
940         mm_end = end - 7;
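        /* two stages per 8 pixels: first spread each 15 bpp pixel to one byte
           per channel (as in the 15-to-32 conversion), then repack those bytes
           to 24 bpp with the same shuffle as rgb32to24 ("Borrowed 32 to 24"). */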
941         while(s < mm_end)
942         {
943             __asm __volatile(
944                 PREFETCH" 32%1\n\t"
945                 "movq   %1, %%mm0\n\t"
946                 "movq   %1, %%mm1\n\t"
947                 "movq   %1, %%mm2\n\t"
948                 "pand   %2, %%mm0\n\t"
949                 "pand   %3, %%mm1\n\t"
950                 "pand   %4, %%mm2\n\t"
951                 "psllq  $3, %%mm0\n\t"
952                 "psrlq  $2, %%mm1\n\t"
953                 "psrlq  $7, %%mm2\n\t"
954                 "movq   %%mm0, %%mm3\n\t"
955                 "movq   %%mm1, %%mm4\n\t"
956                 "movq   %%mm2, %%mm5\n\t"
957                 "punpcklwd %5, %%mm0\n\t"
958                 "punpcklwd %5, %%mm1\n\t"
959                 "punpcklwd %5, %%mm2\n\t"
960                 "punpckhwd %5, %%mm3\n\t"
961                 "punpckhwd %5, %%mm4\n\t"
962                 "punpckhwd %5, %%mm5\n\t"
963                 "psllq  $8, %%mm1\n\t"
964                 "psllq  $16, %%mm2\n\t"
965                 "por    %%mm1, %%mm0\n\t"
966                 "por    %%mm2, %%mm0\n\t"
967                 "psllq  $8, %%mm4\n\t"
968                 "psllq  $16, %%mm5\n\t"
969                 "por    %%mm4, %%mm3\n\t"
970                 "por    %%mm5, %%mm3\n\t"
971
972                 "movq   %%mm0, %%mm6\n\t"
973                 "movq   %%mm3, %%mm7\n\t"
974                 
975                 "movq   8%1, %%mm0\n\t"
976                 "movq   8%1, %%mm1\n\t"
977                 "movq   8%1, %%mm2\n\t"
978                 "pand   %2, %%mm0\n\t"
979                 "pand   %3, %%mm1\n\t"
980                 "pand   %4, %%mm2\n\t"
981                 "psllq  $3, %%mm0\n\t"
982                 "psrlq  $2, %%mm1\n\t"
983                 "psrlq  $7, %%mm2\n\t"
984                 "movq   %%mm0, %%mm3\n\t"
985                 "movq   %%mm1, %%mm4\n\t"
986                 "movq   %%mm2, %%mm5\n\t"
987                 "punpcklwd %5, %%mm0\n\t"
988                 "punpcklwd %5, %%mm1\n\t"
989                 "punpcklwd %5, %%mm2\n\t"
990                 "punpckhwd %5, %%mm3\n\t"
991                 "punpckhwd %5, %%mm4\n\t"
992                 "punpckhwd %5, %%mm5\n\t"
993                 "psllq  $8, %%mm1\n\t"
994                 "psllq  $16, %%mm2\n\t"
995                 "por    %%mm1, %%mm0\n\t"
996                 "por    %%mm2, %%mm0\n\t"
997                 "psllq  $8, %%mm4\n\t"
998                 "psllq  $16, %%mm5\n\t"
999                 "por    %%mm4, %%mm3\n\t"
1000                 "por    %%mm5, %%mm3\n\t"
1001
1002                 :"=m"(*d)
1003                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1004                 :"memory");
1005             /* Borrowed 32 to 24 */
1006             __asm __volatile(
1007                 "movq   %%mm0, %%mm4\n\t"
1008                 "movq   %%mm3, %%mm5\n\t"
1009                 "movq   %%mm6, %%mm0\n\t"
1010                 "movq   %%mm7, %%mm1\n\t"
1011                 
1012                 "movq   %%mm4, %%mm6\n\t"
1013                 "movq   %%mm5, %%mm7\n\t"
1014                 "movq   %%mm0, %%mm2\n\t"
1015                 "movq   %%mm1, %%mm3\n\t"
1016
1017                 "psrlq  $8, %%mm2\n\t"
1018                 "psrlq  $8, %%mm3\n\t"
1019                 "psrlq  $8, %%mm6\n\t"
1020                 "psrlq  $8, %%mm7\n\t"
1021                 "pand   %2, %%mm0\n\t"
1022                 "pand   %2, %%mm1\n\t"
1023                 "pand   %2, %%mm4\n\t"
1024                 "pand   %2, %%mm5\n\t"
1025                 "pand   %3, %%mm2\n\t"
1026                 "pand   %3, %%mm3\n\t"
1027                 "pand   %3, %%mm6\n\t"
1028                 "pand   %3, %%mm7\n\t"
1029                 "por    %%mm2, %%mm0\n\t"
1030                 "por    %%mm3, %%mm1\n\t"
1031                 "por    %%mm6, %%mm4\n\t"
1032                 "por    %%mm7, %%mm5\n\t"
1033
1034                 "movq   %%mm1, %%mm2\n\t"
1035                 "movq   %%mm4, %%mm3\n\t"
1036                 "psllq  $48, %%mm2\n\t"
1037                 "psllq  $32, %%mm3\n\t"
1038                 "pand   %4, %%mm2\n\t"
1039                 "pand   %5, %%mm3\n\t"
1040                 "por    %%mm2, %%mm0\n\t"
1041                 "psrlq  $16, %%mm1\n\t"
1042                 "psrlq  $32, %%mm4\n\t"
1043                 "psllq  $16, %%mm5\n\t"
1044                 "por    %%mm3, %%mm1\n\t"
1045                 "pand   %6, %%mm5\n\t"
1046                 "por    %%mm5, %%mm4\n\t"
1047
1048                 MOVNTQ" %%mm0, %0\n\t"
1049                 MOVNTQ" %%mm1, 8%0\n\t"
1050                 MOVNTQ" %%mm4, 16%0"
1051
1052                 :"=m"(*d)
1053                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1054                 :"memory");
1055                 d += 24;
1056                 s += 8;
1057         }
1058         __asm __volatile(SFENCE:::"memory");
1059         __asm __volatile(EMMS:::"memory");
1060 #endif
1061         while(s < end)
1062         {
1063                 register uint16_t bgr;
1064                 bgr = *s++;
1065                 *d++ = (bgr&0x1F)<<3;
1066                 *d++ = (bgr&0x3E0)>>2;
1067                 *d++ = (bgr&0x7C00)>>7;
1068         }
1069 }
1070
1071 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1072 {
1073         const uint16_t *end;
1074 #ifdef HAVE_MMX
1075         const uint16_t *mm_end;
1076 #endif
1077         uint8_t *d = (uint8_t *)dst;
1078         const uint16_t *s = (const uint16_t *)src;
1079         end = s + src_size/2;
1080 #ifdef HAVE_MMX
1081         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1082         mm_end = end - 7;
1083         while(s < mm_end)
1084         {
1085             __asm __volatile(
1086                 PREFETCH" 32%1\n\t"
1087                 "movq   %1, %%mm0\n\t"
1088                 "movq   %1, %%mm1\n\t"
1089                 "movq   %1, %%mm2\n\t"
1090                 "pand   %2, %%mm0\n\t"
1091                 "pand   %3, %%mm1\n\t"
1092                 "pand   %4, %%mm2\n\t"
1093                 "psllq  $3, %%mm0\n\t"
1094                 "psrlq  $3, %%mm1\n\t"
1095                 "psrlq  $8, %%mm2\n\t"
1096                 "movq   %%mm0, %%mm3\n\t"
1097                 "movq   %%mm1, %%mm4\n\t"
1098                 "movq   %%mm2, %%mm5\n\t"
1099                 "punpcklwd %5, %%mm0\n\t"
1100                 "punpcklwd %5, %%mm1\n\t"
1101                 "punpcklwd %5, %%mm2\n\t"
1102                 "punpckhwd %5, %%mm3\n\t"
1103                 "punpckhwd %5, %%mm4\n\t"
1104                 "punpckhwd %5, %%mm5\n\t"
1105                 "psllq  $8, %%mm1\n\t"
1106                 "psllq  $16, %%mm2\n\t"
1107                 "por    %%mm1, %%mm0\n\t"
1108                 "por    %%mm2, %%mm0\n\t"
1109                 "psllq  $8, %%mm4\n\t"
1110                 "psllq  $16, %%mm5\n\t"
1111                 "por    %%mm4, %%mm3\n\t"
1112                 "por    %%mm5, %%mm3\n\t"
1113                 
1114                 "movq   %%mm0, %%mm6\n\t"
1115                 "movq   %%mm3, %%mm7\n\t"
1116
1117                 "movq   8%1, %%mm0\n\t"
1118                 "movq   8%1, %%mm1\n\t"
1119                 "movq   8%1, %%mm2\n\t"
1120                 "pand   %2, %%mm0\n\t"
1121                 "pand   %3, %%mm1\n\t"
1122                 "pand   %4, %%mm2\n\t"
1123                 "psllq  $3, %%mm0\n\t"
1124                 "psrlq  $3, %%mm1\n\t"
1125                 "psrlq  $8, %%mm2\n\t"
1126                 "movq   %%mm0, %%mm3\n\t"
1127                 "movq   %%mm1, %%mm4\n\t"
1128                 "movq   %%mm2, %%mm5\n\t"
1129                 "punpcklwd %5, %%mm0\n\t"
1130                 "punpcklwd %5, %%mm1\n\t"
1131                 "punpcklwd %5, %%mm2\n\t"
1132                 "punpckhwd %5, %%mm3\n\t"
1133                 "punpckhwd %5, %%mm4\n\t"
1134                 "punpckhwd %5, %%mm5\n\t"
1135                 "psllq  $8, %%mm1\n\t"
1136                 "psllq  $16, %%mm2\n\t"
1137                 "por    %%mm1, %%mm0\n\t"
1138                 "por    %%mm2, %%mm0\n\t"
1139                 "psllq  $8, %%mm4\n\t"
1140                 "psllq  $16, %%mm5\n\t"
1141                 "por    %%mm4, %%mm3\n\t"
1142                 "por    %%mm5, %%mm3\n\t"
1143                 :"=m"(*d)
1144                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1145                 :"memory");
1146             /* Borrowed 32 to 24 */
1147             __asm __volatile(
1148                 "movq   %%mm0, %%mm4\n\t"
1149                 "movq   %%mm3, %%mm5\n\t"
1150                 "movq   %%mm6, %%mm0\n\t"
1151                 "movq   %%mm7, %%mm1\n\t"
1152                 
1153                 "movq   %%mm4, %%mm6\n\t"
1154                 "movq   %%mm5, %%mm7\n\t"
1155                 "movq   %%mm0, %%mm2\n\t"
1156                 "movq   %%mm1, %%mm3\n\t"
1157
1158                 "psrlq  $8, %%mm2\n\t"
1159                 "psrlq  $8, %%mm3\n\t"
1160                 "psrlq  $8, %%mm6\n\t"
1161                 "psrlq  $8, %%mm7\n\t"
1162                 "pand   %2, %%mm0\n\t"
1163                 "pand   %2, %%mm1\n\t"
1164                 "pand   %2, %%mm4\n\t"
1165                 "pand   %2, %%mm5\n\t"
1166                 "pand   %3, %%mm2\n\t"
1167                 "pand   %3, %%mm3\n\t"
1168                 "pand   %3, %%mm6\n\t"
1169                 "pand   %3, %%mm7\n\t"
1170                 "por    %%mm2, %%mm0\n\t"
1171                 "por    %%mm3, %%mm1\n\t"
1172                 "por    %%mm6, %%mm4\n\t"
1173                 "por    %%mm7, %%mm5\n\t"
1174
1175                 "movq   %%mm1, %%mm2\n\t"
1176                 "movq   %%mm4, %%mm3\n\t"
1177                 "psllq  $48, %%mm2\n\t"
1178                 "psllq  $32, %%mm3\n\t"
1179                 "pand   %4, %%mm2\n\t"
1180                 "pand   %5, %%mm3\n\t"
1181                 "por    %%mm2, %%mm0\n\t"
1182                 "psrlq  $16, %%mm1\n\t"
1183                 "psrlq  $32, %%mm4\n\t"
1184                 "psllq  $16, %%mm5\n\t"
1185                 "por    %%mm3, %%mm1\n\t"
1186                 "pand   %6, %%mm5\n\t"
1187                 "por    %%mm5, %%mm4\n\t"
1188
1189                 MOVNTQ" %%mm0, %0\n\t"
1190                 MOVNTQ" %%mm1, 8%0\n\t"
1191                 MOVNTQ" %%mm4, 16%0"
1192
1193                 :"=m"(*d)
1194                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1195                 :"memory");
1196                 d += 24;
1197                 s += 8;
1198         }
1199         __asm __volatile(SFENCE:::"memory");
1200         __asm __volatile(EMMS:::"memory");
1201 #endif
1202         while(s < end)
1203         {
1204                 register uint16_t bgr;
1205                 bgr = *s++;
1206                 *d++ = (bgr&0x1F)<<3;
1207                 *d++ = (bgr&0x7E0)>>3;
1208                 *d++ = (bgr&0xF800)>>8;
1209         }
1210 }
1211
1212 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1213 {
1214         const uint16_t *end;
1215 #ifdef HAVE_MMX
1216         const uint16_t *mm_end;
1217 #endif
1218         uint8_t *d = (uint8_t *)dst;
1219         const uint16_t *s = (const uint16_t *)src;
1220         end = s + src_size/2;
1221 #ifdef HAVE_MMX
1222         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1223         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1224         mm_end = end - 3;
1225         while(s < mm_end)
1226         {
1227             __asm __volatile(
1228                 PREFETCH" 32%1\n\t"
1229                 "movq   %1, %%mm0\n\t"
1230                 "movq   %1, %%mm1\n\t"
1231                 "movq   %1, %%mm2\n\t"
1232                 "pand   %2, %%mm0\n\t"
1233                 "pand   %3, %%mm1\n\t"
1234                 "pand   %4, %%mm2\n\t"
1235                 "psllq  $3, %%mm0\n\t"
1236                 "psrlq  $2, %%mm1\n\t"
1237                 "psrlq  $7, %%mm2\n\t"
1238                 "movq   %%mm0, %%mm3\n\t"
1239                 "movq   %%mm1, %%mm4\n\t"
1240                 "movq   %%mm2, %%mm5\n\t"
1241                 "punpcklwd %%mm7, %%mm0\n\t"
1242                 "punpcklwd %%mm7, %%mm1\n\t"
1243                 "punpcklwd %%mm7, %%mm2\n\t"
1244                 "punpckhwd %%mm7, %%mm3\n\t"
1245                 "punpckhwd %%mm7, %%mm4\n\t"
1246                 "punpckhwd %%mm7, %%mm5\n\t"
1247                 "psllq  $8, %%mm1\n\t"
1248                 "psllq  $16, %%mm2\n\t"
1249                 "por    %%mm1, %%mm0\n\t"
1250                 "por    %%mm2, %%mm0\n\t"
1251                 "psllq  $8, %%mm4\n\t"
1252                 "psllq  $16, %%mm5\n\t"
1253                 "por    %%mm4, %%mm3\n\t"
1254                 "por    %%mm5, %%mm3\n\t"
1255                 MOVNTQ" %%mm0, %0\n\t"
1256                 MOVNTQ" %%mm3, 8%0\n\t"
1257                 :"=m"(*d)
1258                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1259                 :"memory");
1260                 d += 16;
1261                 s += 4;
1262         }
1263         __asm __volatile(SFENCE:::"memory");
1264         __asm __volatile(EMMS:::"memory");
1265 #endif
1266         while(s < end)
1267         {
1268 #if 0 //slightly slower on athlon
1269                 int bgr= *s++;
1270                 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1271 #else
1272                 register uint16_t bgr;
1273                 bgr = *s++;
1274 #ifdef WORDS_BIGENDIAN
1275                 *d++ = 0;
1276                 *d++ = (bgr&0x7C00)>>7;
1277                 *d++ = (bgr&0x3E0)>>2;
1278                 *d++ = (bgr&0x1F)<<3;
1279 #else
1280                 *d++ = (bgr&0x1F)<<3;
1281                 *d++ = (bgr&0x3E0)>>2;
1282                 *d++ = (bgr&0x7C00)>>7;
1283                 *d++ = 0;
1284 #endif
1285
1286 #endif
1287         }
1288 }
1289
1290 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1291 {
1292         const uint16_t *end;
1293 #ifdef HAVE_MMX
1294         const uint16_t *mm_end;
1295 #endif
1296         uint8_t *d = (uint8_t *)dst;
1297         const uint16_t *s = (uint16_t *)src;
1298         end = s + src_size/2;
1299 #ifdef HAVE_MMX
1300         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1301         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1302         mm_end = end - 3;
1303         while(s < mm_end)
1304         {
1305             __asm __volatile(
1306                 PREFETCH" 32%1\n\t"
1307                 "movq   %1, %%mm0\n\t"
1308                 "movq   %1, %%mm1\n\t"
1309                 "movq   %1, %%mm2\n\t"
1310                 "pand   %2, %%mm0\n\t"
1311                 "pand   %3, %%mm1\n\t"
1312                 "pand   %4, %%mm2\n\t"
1313                 "psllq  $3, %%mm0\n\t"
1314                 "psrlq  $3, %%mm1\n\t"
1315                 "psrlq  $8, %%mm2\n\t"
1316                 "movq   %%mm0, %%mm3\n\t"
1317                 "movq   %%mm1, %%mm4\n\t"
1318                 "movq   %%mm2, %%mm5\n\t"
1319                 "punpcklwd %%mm7, %%mm0\n\t"
1320                 "punpcklwd %%mm7, %%mm1\n\t"
1321                 "punpcklwd %%mm7, %%mm2\n\t"
1322                 "punpckhwd %%mm7, %%mm3\n\t"
1323                 "punpckhwd %%mm7, %%mm4\n\t"
1324                 "punpckhwd %%mm7, %%mm5\n\t"
1325                 "psllq  $8, %%mm1\n\t"
1326                 "psllq  $16, %%mm2\n\t"
1327                 "por    %%mm1, %%mm0\n\t"
1328                 "por    %%mm2, %%mm0\n\t"
1329                 "psllq  $8, %%mm4\n\t"
1330                 "psllq  $16, %%mm5\n\t"
1331                 "por    %%mm4, %%mm3\n\t"
1332                 "por    %%mm5, %%mm3\n\t"
1333                 MOVNTQ" %%mm0, %0\n\t"
1334                 MOVNTQ" %%mm3, 8%0\n\t"
1335                 :"=m"(*d)
1336                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1337                 :"memory");
1338                 d += 16;
1339                 s += 4;
1340         }
1341         __asm __volatile(SFENCE:::"memory");
1342         __asm __volatile(EMMS:::"memory");
1343 #endif
1344         while(s < end)
1345         {
1346                 register uint16_t bgr;
1347                 bgr = *s++;
1348 #ifdef WORDS_BIGENDIAN
1349                 *d++ = 0;
1350                 *d++ = (bgr&0xF800)>>8;
1351                 *d++ = (bgr&0x7E0)>>3;
1352                 *d++ = (bgr&0x1F)<<3;
1353 #else
1354                 *d++ = (bgr&0x1F)<<3;
1355                 *d++ = (bgr&0x7E0)>>3;
1356                 *d++ = (bgr&0xF800)>>8;
1357                 *d++ = 0;
1358 #endif
1359         }
1360 }
1361
1362 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1363 {
1364 #ifdef HAVE_MMX
1365 /* TODO: unroll this loop */
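        /* swap R and B of each 32-bit pixel: one copy is shifted left by 16 bits,
           one right by 16 bits, the channel masks keep the wanted byte of each
           copy, and the three results are OR'ed back together. */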
1366         asm volatile (
1367                 "xor %%"REG_a", %%"REG_a"       \n\t"
1368                 ASMALIGN(4)
1369                 "1:                             \n\t"
1370                 PREFETCH" 32(%0, %%"REG_a")     \n\t"
1371                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
1372                 "movq %%mm0, %%mm1              \n\t"
1373                 "movq %%mm0, %%mm2              \n\t"
1374                 "pslld $16, %%mm0               \n\t"
1375                 "psrld $16, %%mm1               \n\t"
1376                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
1377                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
1378                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
1379                 "por %%mm0, %%mm2               \n\t"
1380                 "por %%mm1, %%mm2               \n\t"
1381                 MOVNTQ" %%mm2, (%1, %%"REG_a")  \n\t"
1382                 "add $8, %%"REG_a"              \n\t"
1383                 "cmp %2, %%"REG_a"              \n\t"
1384                 " jb 1b                         \n\t"
1385                 :: "r" (src), "r"(dst), "r" (src_size-7)
1386                 : "%"REG_a
1387         );
1388
1389         __asm __volatile(SFENCE:::"memory");
1390         __asm __volatile(EMMS:::"memory");
1391 #else
1392         unsigned i;
1393         unsigned num_pixels = src_size >> 2;
1394         for(i=0; i<num_pixels; i++)
1395         {
1396 #ifdef WORDS_BIGENDIAN  
1397           dst[4*i + 1] = src[4*i + 3];
1398           dst[4*i + 2] = src[4*i + 2];
1399           dst[4*i + 3] = src[4*i + 1];
1400 #else
1401           dst[4*i + 0] = src[4*i + 2];
1402           dst[4*i + 1] = src[4*i + 1];
1403           dst[4*i + 2] = src[4*i + 0];
1404 #endif
1405         }
1406 #endif
1407 }
1408
1409 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1410 {
1411         unsigned i;
1412 #ifdef HAVE_MMX
1413         long mmx_size= 23 - src_size;
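        /* mmx_size starts out negative; the asm loop indexes src and dst relative
           to their ends and counts it up towards zero, so a single add/js pair
           advances the pointers and tests for termination. Whatever is left over
           (at most 23 bytes) is converted by the C loop at the bottom. */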
1414         asm volatile (
1415                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1416                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1417                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1418                 ASMALIGN(4)
1419                 "1:                             \n\t"
1420                 PREFETCH" 32(%1, %%"REG_a")     \n\t"
1421                 "movq   (%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1422                 "movq   (%1, %%"REG_a"), %%mm1  \n\t" // BGR BGR BG
1423                 "movq  2(%1, %%"REG_a"), %%mm2  \n\t" // R BGR BGR B
1424                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1425                 "pand %%mm5, %%mm0              \n\t"
1426                 "pand %%mm6, %%mm1              \n\t"
1427                 "pand %%mm7, %%mm2              \n\t"
1428                 "por %%mm0, %%mm1               \n\t"
1429                 "por %%mm2, %%mm1               \n\t"                
1430                 "movq  6(%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1431                 MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
1432                 "movq  8(%1, %%"REG_a"), %%mm1  \n\t" // R BGR BGR B
1433                 "movq 10(%1, %%"REG_a"), %%mm2  \n\t" // GR BGR BGR
1434                 "pand %%mm7, %%mm0              \n\t"
1435                 "pand %%mm5, %%mm1              \n\t"
1436                 "pand %%mm6, %%mm2              \n\t"
1437                 "por %%mm0, %%mm1               \n\t"
1438                 "por %%mm2, %%mm1               \n\t"                
1439                 "movq 14(%1, %%"REG_a"), %%mm0  \n\t" // R BGR BGR B
1440                 MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
1441                 "movq 16(%1, %%"REG_a"), %%mm1  \n\t" // GR BGR BGR
1442                 "movq 18(%1, %%"REG_a"), %%mm2  \n\t" // BGR BGR BG
1443                 "pand %%mm6, %%mm0              \n\t"
1444                 "pand %%mm7, %%mm1              \n\t"
1445                 "pand %%mm5, %%mm2              \n\t"
1446                 "por %%mm0, %%mm1               \n\t"
1447                 "por %%mm2, %%mm1               \n\t"                
1448                 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1449                 "add $24, %%"REG_a"             \n\t"
1450                 " js 1b                         \n\t"
1451                 : "+a" (mmx_size)
1452                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1453         );
1454
1455         __asm __volatile(SFENCE:::"memory");
1456         __asm __volatile(EMMS:::"memory");
1457
1458         if(mmx_size==23) return; //finished, was a multiple of 8
1459
1460         src+= src_size;
1461         dst+= src_size;
1462         src_size= 23-mmx_size;
1463         src-= src_size;
1464         dst-= src_size;
1465 #endif
1466         for(i=0; i<src_size; i+=3)
1467         {
1468                 register uint8_t x;
1469                 x          = src[i + 2];
1470                 dst[i + 1] = src[i + 1];
1471                 dst[i + 2] = src[i + 0];
1472                 dst[i + 0] = x;
1473         }
1474 }
1475
1476 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1477         long width, long height,
1478         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1479 {
1480         long y;
1481         const long chromWidth= width>>1;
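        /* output is packed YUYV (YUY2): on every line, each pair of horizontally
           adjacent luma samples shares one U and one V sample */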
1482         for(y=0; y<height; y++)
1483         {
1484 #ifdef HAVE_MMX
1485 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-bandwidth limited anyway)
1486                 asm volatile(
1487                         "xor %%"REG_a", %%"REG_a"       \n\t"
1488                         ASMALIGN(4)
1489                         "1:                             \n\t"
1490                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1491                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1492                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1493                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1494                         "movq %%mm0, %%mm2              \n\t" // U(0)
1495                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1496                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1497                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1498
1499                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1500                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1501                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1502                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1503                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1504                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1505                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1506                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1507
1508                         MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1509                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1510                         MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1511                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1512
1513                         "add $8, %%"REG_a"              \n\t"
1514                         "cmp %4, %%"REG_a"              \n\t"
1515                         " jb 1b                         \n\t"
1516                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1517                         : "%"REG_a
1518                 );
1519 #else
1520
1521 #if defined ARCH_ALPHA && defined HAVE_MVI
1522 #define pl2yuy2(n)                                      \
1523         y1 = yc[n];                                     \
1524         y2 = yc2[n];                                    \
1525         u = uc[n];                                      \
1526         v = vc[n];                                      \
1527         asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
1528         asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
1529         asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1530         asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1531         yuv1 = (u << 8) + (v << 24);                    \
1532         yuv2 = yuv1 + y2;                               \
1533         yuv1 += y1;                                     \
1534         qdst[n] = yuv1;                                 \
1535         qdst2[n] = yuv2;
1536
1537                 int i;
1538                 uint64_t *qdst = (uint64_t *) dst;
1539                 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1540                 const uint32_t *yc = (uint32_t *) ysrc;
1541                 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1542                 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1543                 for(i = 0; i < chromWidth; i += 8){
1544                         uint64_t y1, y2, yuv1, yuv2;
1545                         uint64_t u, v;
1546                         /* Prefetch */
1547                         asm("ldq $31,64(%0)" :: "r"(yc));
1548                         asm("ldq $31,64(%0)" :: "r"(yc2));
1549                         asm("ldq $31,64(%0)" :: "r"(uc));
1550                         asm("ldq $31,64(%0)" :: "r"(vc));
1551
1552                         pl2yuy2(0);
1553                         pl2yuy2(1);
1554                         pl2yuy2(2);
1555                         pl2yuy2(3);
1556
1557                         yc += 4;
1558                         yc2 += 4;
1559                         uc += 4;
1560                         vc += 4;
1561                         qdst += 4;
1562                         qdst2 += 4;
1563                 }
1564                 y++;
1565                 ysrc += lumStride;
1566                 dst += dstStride;
1567
1568 #elif __WORDSIZE >= 64
1569                 int i;
1570                 uint64_t *ldst = (uint64_t *) dst;
1571                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1572                 for(i = 0; i < chromWidth; i += 2){
1573                         uint64_t k, l;
1574                         k = yc[0] + (uc[0] << 8) +
1575                             (yc[1] << 16) + ((uint64_t)vc[0] << 24);
1576                         l = yc[2] + (uc[1] << 8) +
1577                             (yc[3] << 16) + ((uint64_t)vc[1] << 24);
1578                         *ldst++ = k + (l << 32);
1579                         yc += 4;
1580                         uc += 2;
1581                         vc += 2;
1582                 }
1583
1584 #else
1585                 int i, *idst = (int32_t *) dst;
1586                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1587                 for(i = 0; i < chromWidth; i++){
1588 #ifdef WORDS_BIGENDIAN
1589                         *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1590                             (yc[1] << 8) + (vc[0] << 0);
1591 #else
1592                         *idst++ = yc[0] + (uc[0] << 8) +
1593                             (yc[1] << 16) + (vc[0] << 24);
1594 #endif
1595                         yc += 2;
1596                         uc++;
1597                         vc++;
1598                 }
1599 #endif
1600 #endif
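                /* Advance the chroma pointers only once every
                   vertLumPerChroma luma lines; the mask trick below assumes
                   vertLumPerChroma is a power of two (1 or 2 here). */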
1601                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1602                 {
1603                         usrc += chromStride;
1604                         vsrc += chromStride;
1605                 }
1606                 ysrc += lumStride;
1607                 dst += dstStride;
1608         }
1609 #ifdef HAVE_MMX
1610 asm(    EMMS" \n\t"
1611         SFENCE" \n\t"
1612         :::"memory");
1613 #endif
1614 }
1615
1616 /**
1617  *
1618  * Height should be a multiple of 2 and width should be a multiple of 16.
1619  * (If this is a problem for anyone then tell me, and I'll fix it.)
1620  */
1621 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1622         long width, long height,
1623         long lumStride, long chromStride, long dstStride)
1624 {
1625         //FIXME interpolate chroma
1626         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1627 }
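/*
 * Illustrative usage sketch (not part of the original file; buffer names and
 * sizes are assumptions for this example only): packing a WxH YV12 image,
 * stored as contiguous planes, into a YUY2 buffer.  W is assumed to be a
 * multiple of 16 and H a multiple of 2, as required above.
 *
 *     uint8_t y[W*H], u[W*H/4], v[W*H/4], out[W*H*2];
 *     RENAME(yv12toyuy2)(y, u, v, out, W, H,
 *                        W,      // lumStride
 *                        W/2,    // chromStride
 *                        2*W);   // dstStride in bytes
 */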
1628
1629 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1630         long width, long height,
1631         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1632 {
1633         long y;
1634         const long chromWidth= width>>1;
1635         for(y=0; y<height; y++)
1636         {
1637 #ifdef HAVE_MMX
1638 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-bandwidth limited anyway)
1639                 asm volatile(
1640                         "xor %%"REG_a", %%"REG_a"       \n\t"
1641                         ASMALIGN(4)
1642                         "1:                             \n\t"
1643                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1644                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1645                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1646                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1647                         "movq %%mm0, %%mm2              \n\t" // U(0)
1648                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1649                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1650                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1651
1652                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1653                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1654                         "movq %%mm0, %%mm4              \n\t" // UVUV UVUV(0)
1655                         "movq %%mm2, %%mm6              \n\t" // UVUV UVUV(8)
1656                         "punpcklbw %%mm3, %%mm0         \n\t" // UYVY UYVY(0)
1657                         "punpckhbw %%mm3, %%mm4         \n\t" // UYVY UYVY(4)
1658                         "punpcklbw %%mm5, %%mm2         \n\t" // UYVY UYVY(8)
1659                         "punpckhbw %%mm5, %%mm6         \n\t" // UYVY UYVY(12)
1660
1661                         MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1662                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1663                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1664                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1665
1666                         "add $8, %%"REG_a"              \n\t"
1667                         "cmp %4, %%"REG_a"              \n\t"
1668                         " jb 1b                         \n\t"
1669                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1670                         : "%"REG_a
1671                 );
1672 #else
1673 //FIXME adapt the alpha asm code from yv12->yuy2
1674
1675 #if __WORDSIZE >= 64
1676                 int i;
1677                 uint64_t *ldst = (uint64_t *) dst;
1678                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1679                 for(i = 0; i < chromWidth; i += 2){
1680                         uint64_t k, l;
1681                         k = uc[0] + (yc[0] << 8) +
1682                             (vc[0] << 16) + ((uint64_t)yc[1] << 24);
1683                         l = uc[1] + (yc[2] << 8) +
1684                             (vc[1] << 16) + ((uint64_t)yc[3] << 24);
1685                         *ldst++ = k + (l << 32);
1686                         yc += 4;
1687                         uc += 2;
1688                         vc += 2;
1689                 }
1690
1691 #else
1692                 int i, *idst = (int32_t *) dst;
1693                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1694                 for(i = 0; i < chromWidth; i++){
1695 #ifdef WORDS_BIGENDIAN
1696                         *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1697                             (vc[0] << 8) + (yc[1] << 0);
1698 #else
1699                         *idst++ = uc[0] + (yc[0] << 8) +
1700                             (vc[0] << 16) + (yc[1] << 24);
1701 #endif
1702                         yc += 2;
1703                         uc++;
1704                         vc++;
1705                 }
1706 #endif
1707 #endif
1708                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1709                 {
1710                         usrc += chromStride;
1711                         vsrc += chromStride;
1712                 }
1713                 ysrc += lumStride;
1714                 dst += dstStride;
1715         }
1716 #ifdef HAVE_MMX
1717 asm(    EMMS" \n\t"
1718         SFENCE" \n\t"
1719         :::"memory");
1720 #endif
1721 }
1722
1723 /**
1724  *
1725  * Height should be a multiple of 2 and width should be a multiple of 16.
1726  * (If this is a problem for anyone then tell me, and I'll fix it.)
1727  */
1728 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1729         long width, long height,
1730         long lumStride, long chromStride, long dstStride)
1731 {
1732         //FIXME interpolate chroma
1733         RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1734 }
1735
1736 /**
1737  *
1738  * Width should be a multiple of 16.
1739  */
1740 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1741         long width, long height,
1742         long lumStride, long chromStride, long dstStride)
1743 {
1744         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1745 }
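/*
 * In the 4:2:2 case every luma line has its own chroma line, which is why
 * vertLumPerChroma is 1 here instead of the 2 used by the YV12 (4:2:0)
 * wrappers above.  A purely illustrative usage sketch with assumed buffer
 * names:
 *
 *     uint8_t y[W*H], u[W*H/2], v[W*H/2], out[W*H*2];
 *     RENAME(yuv422ptoyuy2)(y, u, v, out, W, H, W, W/2, 2*W);
 */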
1746
1747 /**
1748  *
1749  * Height should be a multiple of 2 and width should be a multiple of 16.
1750  * (If this is a problem for anyone then tell me, and I'll fix it.)
1751  */
1752 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1753         long width, long height,
1754         long lumStride, long chromStride, long srcStride)
1755 {
1756         long y;
1757         const long chromWidth= width>>1;
1758         for(y=0; y<height; y+=2)
1759         {
1760 #ifdef HAVE_MMX
1761                 asm volatile(
1762                         "xor %%"REG_a", %%"REG_a"       \n\t"
1763                         "pcmpeqw %%mm7, %%mm7           \n\t"
1764                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1765                         ASMALIGN(4)
1766                         "1:                             \n\t"
1767                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1768                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1769                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1770                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1771                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1772                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1773                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1774                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1775                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1776                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1777                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1778
1779                         MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1780
1781                         "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1782                         "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1783                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1784                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1785                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1786                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1787                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1788                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1789                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1790                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1791
1792                         MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1793
1794                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1795                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1796                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1797                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1798                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1799                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1800                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1801                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1802
1803                         MOVNTQ" %%mm0, (%3, %%"REG_a")  \n\t"
1804                         MOVNTQ" %%mm2, (%2, %%"REG_a")  \n\t"
1805
1806                         "add $8, %%"REG_a"              \n\t"
1807                         "cmp %4, %%"REG_a"              \n\t"
1808                         " jb 1b                         \n\t"
1809                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1810                         : "memory", "%"REG_a
1811                 );
1812
1813                 ydst += lumStride;
1814                 src  += srcStride;
1815
1816                 asm volatile(
1817                         "xor %%"REG_a", %%"REG_a"       \n\t"
1818                         ASMALIGN(4)
1819                         "1:                             \n\t"
1820                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1821                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1822                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1823                         "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1824                         "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1825                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1826                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1827                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1828                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1829                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1830                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1831
1832                         MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1833                         MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1834
1835                         "add $8, %%"REG_a"              \n\t"
1836                         "cmp %4, %%"REG_a"              \n\t"
1837                         " jb 1b                         \n\t"
1838
1839                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1840                         : "memory", "%"REG_a
1841                 );
1842 #else
1843                 long i;
1844                 for(i=0; i<chromWidth; i++)
1845                 {
1846                         ydst[2*i+0]     = src[4*i+0];
1847                         udst[i]         = src[4*i+1];
1848                         ydst[2*i+1]     = src[4*i+2];
1849                         vdst[i]         = src[4*i+3];
1850                 }
1851                 ydst += lumStride;
1852                 src  += srcStride;
1853
1854                 for(i=0; i<chromWidth; i++)
1855                 {
1856                         ydst[2*i+0]     = src[4*i+0];
1857                         ydst[2*i+1]     = src[4*i+2];
1858                 }
1859 #endif
1860                 udst += chromStride;
1861                 vdst += chromStride;
1862                 ydst += lumStride;
1863                 src  += srcStride;
1864         }
1865 #ifdef HAVE_MMX
1866 asm volatile(   EMMS" \n\t"
1867                 SFENCE" \n\t"
1868                 :::"memory");
1869 #endif
1870 }
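/*
 * Illustrative usage sketch (assumed buffer names, for this example only):
 * splitting a WxH YUY2 frame back into YV12 planes.
 *
 *     uint8_t yuy2[W*H*2], y[W*H], u[W*H/4], v[W*H/4];
 *     RENAME(yuy2toyv12)(yuy2, y, u, v, W, H,
 *                        W,      // lumStride
 *                        W/2,    // chromStride
 *                        2*W);   // srcStride in bytes
 */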
1871
1872 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1873         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1874         long width, long height, long lumStride, long chromStride)
1875 {
1876         /* Y Plane */
1877         memcpy(ydst, ysrc, width*height);
1878
1879         /* XXX: implement upscaling for U,V */
1880 }
1881
1882 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1883 {
1884         long x,y;
1885         
1886         dst[0]= src[0];
1887         
1888         // first line
1889         for(x=0; x<srcWidth-1; x++){
1890                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1891                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1892         }
1893         dst[2*srcWidth-1]= src[srcWidth-1];
1894         
1895         dst+= dstStride;
1896
1897         for(y=1; y<srcHeight; y++){
1898 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1899                 const long mmxSize= srcWidth&~15;
1900                 asm volatile(
1901                         "mov %4, %%"REG_a"              \n\t"
1902                         "1:                             \n\t"
1903                         "movq (%0, %%"REG_a"), %%mm0    \n\t"
1904                         "movq (%1, %%"REG_a"), %%mm1    \n\t"
1905                         "movq 1(%0, %%"REG_a"), %%mm2   \n\t"
1906                         "movq 1(%1, %%"REG_a"), %%mm3   \n\t"
1907                         "movq -1(%0, %%"REG_a"), %%mm4  \n\t"
1908                         "movq -1(%1, %%"REG_a"), %%mm5  \n\t"
1909                         PAVGB" %%mm0, %%mm5             \n\t"
1910                         PAVGB" %%mm0, %%mm3             \n\t"
1911                         PAVGB" %%mm0, %%mm5             \n\t"
1912                         PAVGB" %%mm0, %%mm3             \n\t"
1913                         PAVGB" %%mm1, %%mm4             \n\t"
1914                         PAVGB" %%mm1, %%mm2             \n\t"
1915                         PAVGB" %%mm1, %%mm4             \n\t"
1916                         PAVGB" %%mm1, %%mm2             \n\t"
1917                         "movq %%mm5, %%mm7              \n\t"
1918                         "movq %%mm4, %%mm6              \n\t"
1919                         "punpcklbw %%mm3, %%mm5         \n\t"
1920                         "punpckhbw %%mm3, %%mm7         \n\t"
1921                         "punpcklbw %%mm2, %%mm4         \n\t"
1922                         "punpckhbw %%mm2, %%mm6         \n\t"
1923 #if 1
1924                         MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1925                         MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1926                         MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1927                         MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1928 #else
1929                         "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1930                         "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1931                         "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1932                         "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1933 #endif
1934                         "add $8, %%"REG_a"              \n\t"
1935                         " js 1b                         \n\t"
1936                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1937                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1938                            "g" (-mmxSize)
1939                         : "%"REG_a
1940
1941                 );
1942 #else
1943                 const long mmxSize=1;
1944 #endif
1945                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1946                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1947
1948                 for(x=mmxSize-1; x<srcWidth-1; x++){
1949                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1950                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1951                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1952                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1953                 }
1954                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1955                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1956
1957                 dst+=dstStride*2;
1958                 src+=srcStride;
1959         }
1960         
1961         // last line
1962 #if 1
1963         dst[0]= src[0];
1964         
1965         for(x=0; x<srcWidth-1; x++){
1966                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1967                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1968         }
1969         dst[2*srcWidth-1]= src[srcWidth-1];
1970 #else
1971         for(x=0; x<srcWidth; x++){
1972                 dst[2*x+0]=
1973                 dst[2*x+1]= src[x];
1974         }
1975 #endif
1976
1977 #ifdef HAVE_MMX
1978 asm volatile(   EMMS" \n\t"
1979                 SFENCE" \n\t"
1980                 :::"memory");
1981 #endif
1982 }
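/*
 * The 2x upscaler above places each output sample at 1/4 and 3/4 of the way
 * between two input samples, hence the (3*a + b)>>2 weighting.  A purely
 * illustrative worked example: for neighbouring input values a=100, b=20 the
 * two output samples on the first/last line are (3*100 + 20)>>2 = 80 and
 * (100 + 3*20)>>2 = 40; the interior lines apply the same 3:1 weighting
 * between diagonal neighbours of the two adjacent source rows, approximated
 * by the repeated PAVGB averaging in the MMX path.
 */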
1983
1984 /**
1985  *
1986  * Height should be a multiple of 2 and width should be a multiple of 16.
1987  * (If this is a problem for anyone then tell me, and I'll fix it.)
1988  * Chrominance data is only taken from every second line; the other lines are ignored. FIXME: write an HQ version.
1989  */
1990 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1991         long width, long height,
1992         long lumStride, long chromStride, long srcStride)
1993 {
1994         long y;
1995         const long chromWidth= width>>1;
1996         for(y=0; y<height; y+=2)
1997         {
1998 #ifdef HAVE_MMX
1999                 asm volatile(
2000                         "xor %%"REG_a", %%"REG_a"       \n\t"
2001                         "pcmpeqw %%mm7, %%mm7           \n\t"
2002                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
2003                         ASMALIGN(4)
2004                         "1:                             \n\t"
2005                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
2006                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
2007                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // UYVY UYVY(4)
2008                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
2009                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
2010                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
2011                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
2012                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
2013                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
2014                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
2015                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
2016
2017                         MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
2018
2019                         "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // UYVY UYVY(8)
2020                         "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // UYVY UYVY(12)
2021                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
2022                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
2023                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
2024                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
2025                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
2026                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
2027                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
2028                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
2029
2030                         MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
2031
2032                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
2033                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
2034                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
2035                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
2036                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
2037                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
2038                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
2039                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
2040
2041                         MOVNTQ" %%mm0, (%3, %%"REG_a")  \n\t"
2042                         MOVNTQ" %%mm2, (%2, %%"REG_a")  \n\t"
2043
2044                         "add $8, %%"REG_a"              \n\t"
2045                         "cmp %4, %%"REG_a"              \n\t"
2046                         " jb 1b                         \n\t"
2047                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2048                         : "memory", "%"REG_a
2049                 );
2050
2051                 ydst += lumStride;
2052                 src  += srcStride;
2053
2054                 asm volatile(
2055                         "xor %%"REG_a", %%"REG_a"       \n\t"
2056                         ASMALIGN(4)
2057                         "1:                             \n\t"
2058                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
2059                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
2060                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // UYVY UYVY(4)
2061                         "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // UYVY UYVY(8)
2062                         "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // UYVY UYVY(12)
2063                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2064                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2065                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2066                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2067                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
2068                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
2069
2070                         MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
2071                         MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
2072
2073                         "add $8, %%"REG_a"              \n\t"
2074                         "cmp %4, %%"REG_a"              \n\t"
2075                         " jb 1b                         \n\t"
2076
2077                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2078                         : "memory", "%"REG_a
2079                 );
2080 #else
2081                 long i;
2082                 for(i=0; i<chromWidth; i++)
2083                 {
2084                         udst[i]         = src[4*i+0];
2085                         ydst[2*i+0]     = src[4*i+1];
2086                         vdst[i]         = src[4*i+2];
2087                         ydst[2*i+1]     = src[4*i+3];
2088                 }
2089                 ydst += lumStride;
2090                 src  += srcStride;
2091
2092                 for(i=0; i<chromWidth; i++)
2093                 {
2094                         ydst[2*i+0]     = src[4*i+1];
2095                         ydst[2*i+1]     = src[4*i+3];
2096                 }
2097 #endif
2098                 udst += chromStride;
2099                 vdst += chromStride;
2100                 ydst += lumStride;
2101                 src  += srcStride;
2102         }
2103 #ifdef HAVE_MMX
2104 asm volatile(   EMMS" \n\t"
2105                 SFENCE" \n\t"
2106                 :::"memory");
2107 #endif
2108 }
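/*
 * Note on the function above: chroma samples are taken only from the first
 * line of each line pair (the second asm/C block reads luma only), i.e. the
 * vertical chroma reduction is plain decimation rather than averaging.
 */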
2109
2110 /**
2111  *
2112  * Height should be a multiple of 2 and width should be a multiple of 2.
2113  * (If this is a problem for anyone then tell me, and I'll fix it.)
2114  * Chrominance data is only taken from every second line; the other lines are ignored in the C version. FIXME: write an HQ version.
2115  */
2116 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2117         long width, long height,
2118         long lumStride, long chromStride, long srcStride)
2119 {
2120         long y;
2121         const long chromWidth= width>>1;
2122 #ifdef HAVE_MMX
2123         for(y=0; y<height-2; y+=2)
2124         {
2125                 long i;
2126                 for(i=0; i<2; i++)
2127                 {
2128                         asm volatile(
2129                                 "mov %2, %%"REG_a"              \n\t"
2130                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
2131                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
2132                                 "pxor %%mm7, %%mm7              \n\t"
2133                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2134                                 ASMALIGN(4)
2135                                 "1:                             \n\t"
2136                                 PREFETCH" 64(%0, %%"REG_d")     \n\t"
2137                                 "movd (%0, %%"REG_d"), %%mm0    \n\t"
2138                                 "movd 3(%0, %%"REG_d"), %%mm1   \n\t"
2139                                 "punpcklbw %%mm7, %%mm0         \n\t"
2140                                 "punpcklbw %%mm7, %%mm1         \n\t"
2141                                 "movd 6(%0, %%"REG_d"), %%mm2   \n\t"
2142                                 "movd 9(%0, %%"REG_d"), %%mm3   \n\t"
2143                                 "punpcklbw %%mm7, %%mm2         \n\t"
2144                                 "punpcklbw %%mm7, %%mm3         \n\t"
2145                                 "pmaddwd %%mm6, %%mm0           \n\t"
2146                                 "pmaddwd %%mm6, %%mm1           \n\t"
2147                                 "pmaddwd %%mm6, %%mm2           \n\t"
2148                                 "pmaddwd %%mm6, %%mm3           \n\t"
2149 #ifndef FAST_BGR2YV12
2150                                 "psrad $8, %%mm0                \n\t"
2151                                 "psrad $8, %%mm1                \n\t"
2152                                 "psrad $8, %%mm2                \n\t"
2153                                 "psrad $8, %%mm3                \n\t"
2154 #endif
2155                                 "packssdw %%mm1, %%mm0          \n\t"
2156                                 "packssdw %%mm3, %%mm2          \n\t"
2157                                 "pmaddwd %%mm5, %%mm0           \n\t"
2158                                 "pmaddwd %%mm5, %%mm2           \n\t"
2159                                 "packssdw %%mm2, %%mm0          \n\t"
2160                                 "psraw $7, %%mm0                \n\t"
2161
2162                                 "movd 12(%0, %%"REG_d"), %%mm4  \n\t"
2163                                 "movd 15(%0, %%"REG_d"), %%mm1  \n\t"
2164                                 "punpcklbw %%mm7, %%mm4         \n\t"
2165                                 "punpcklbw %%mm7, %%mm1         \n\t"
2166                                 "movd 18(%0, %%"REG_d"), %%mm2  \n\t"
2167                                 "movd 21(%0, %%"REG_d"), %%mm3  \n\t"
2168                                 "punpcklbw %%mm7, %%mm2         \n\t"
2169                                 "punpcklbw %%mm7, %%mm3         \n\t"
2170                                 "pmaddwd %%mm6, %%mm4           \n\t"
2171                                 "pmaddwd %%mm6, %%mm1           \n\t"
2172                                 "pmaddwd %%mm6, %%mm2           \n\t"
2173                                 "pmaddwd %%mm6, %%mm3           \n\t"
2174 #ifndef FAST_BGR2YV12
2175                                 "psrad $8, %%mm4                \n\t"
2176                                 "psrad $8, %%mm1                \n\t"
2177                                 "psrad $8, %%mm2                \n\t"
2178                                 "psrad $8, %%mm3                \n\t"
2179 #endif
2180                                 "packssdw %%mm1, %%mm4          \n\t"
2181                                 "packssdw %%mm3, %%mm2          \n\t"
2182                                 "pmaddwd %%mm5, %%mm4           \n\t"
2183                                 "pmaddwd %%mm5, %%mm2           \n\t"
2184                                 "add $24, %%"REG_d"             \n\t"
2185                                 "packssdw %%mm2, %%mm4          \n\t"
2186                                 "psraw $7, %%mm4                \n\t"
2187
2188                                 "packuswb %%mm4, %%mm0          \n\t"
2189                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
2190
2191                                 MOVNTQ" %%mm0, (%1, %%"REG_a")  \n\t"
2192                                 "add $8, %%"REG_a"              \n\t"
2193                                 " js 1b                         \n\t"
2194                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2195                                 : "%"REG_a, "%"REG_d
2196                         );
2197                         ydst += lumStride;
2198                         src  += srcStride;
2199                 }
2200                 src -= srcStride*2;
2201                 asm volatile(
2202                         "mov %4, %%"REG_a"              \n\t"
2203                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2204                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
2205                         "pxor %%mm7, %%mm7              \n\t"
2206                         "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2207                         "add %%"REG_d", %%"REG_d"       \n\t"
2208                         ASMALIGN(4)
2209                         "1:                             \n\t"
2210                         PREFETCH" 64(%0, %%"REG_d")     \n\t"
2211                         PREFETCH" 64(%1, %%"REG_d")     \n\t"
2212 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2213                         "movq (%0, %%"REG_d"), %%mm0    \n\t"
2214                         "movq (%1, %%"REG_d"), %%mm1    \n\t"
2215                         "movq 6(%0, %%"REG_d"), %%mm2   \n\t"
2216                         "movq 6(%1, %%"REG_d"), %%mm3   \n\t"
2217                         PAVGB" %%mm1, %%mm0             \n\t"
2218                         PAVGB" %%mm3, %%mm2             \n\t"
2219                         "movq %%mm0, %%mm1              \n\t"
2220                         "movq %%mm2, %%mm3              \n\t"
2221                         "psrlq $24, %%mm0               \n\t"
2222                         "psrlq $24, %%mm2               \n\t"
2223                         PAVGB" %%mm1, %%mm0             \n\t"
2224                         PAVGB" %%mm3, %%mm2             \n\t"
2225                         "punpcklbw %%mm7, %%mm0         \n\t"
2226                         "punpcklbw %%mm7, %%mm2         \n\t"
2227 #else
2228                         "movd (%0, %%"REG_d"), %%mm0    \n\t"
2229                         "movd (%1, %%"REG_d"), %%mm1    \n\t"
2230                         "movd 3(%0, %%"REG_d"), %%mm2   \n\t"
2231                         "movd 3(%1, %%"REG_d"), %%mm3   \n\t"
2232                         "punpcklbw %%mm7, %%mm0         \n\t"
2233                         "punpcklbw %%mm7, %%mm1         \n\t"
2234                         "punpcklbw %%mm7, %%mm2         \n\t"
2235                         "punpcklbw %%mm7, %%mm3         \n\t"
2236                         "paddw %%mm1, %%mm0             \n\t"
2237                         "paddw %%mm3, %%mm2             \n\t"
2238                         "paddw %%mm2, %%mm0             \n\t"
2239                         "movd 6(%0, %%"REG_d"), %%mm4   \n\t"
2240                         "movd 6(%1, %%"REG_d"), %%mm1   \n\t"
2241                         "movd 9(%0, %%"REG_d"), %%mm2   \n\t"
2242                         "movd 9(%1, %%"REG_d"), %%mm3   \n\t"
2243                         "punpcklbw %%mm7, %%mm4         \n\t"
2244                         "punpcklbw %%mm7, %%mm1         \n\t"
2245                         "punpcklbw %%mm7, %%mm2         \n\t"
2246                         "punpcklbw %%mm7, %%mm3         \n\t"
2247                         "paddw %%mm1, %%mm4             \n\t"
2248                         "paddw %%mm3, %%mm2             \n\t"
2249                         "paddw %%mm4, %%mm2             \n\t"
2250                         "psrlw $2, %%mm0                \n\t"
2251                         "psrlw $2, %%mm2                \n\t"
2252 #endif
2253                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2254                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2255
2256                         "pmaddwd %%mm0, %%mm1           \n\t"
2257                         "pmaddwd %%mm2, %%mm3           \n\t"
2258                         "pmaddwd %%mm6, %%mm0           \n\t"
2259                         "pmaddwd %%mm6, %%mm2           \n\t"
2260 #ifndef FAST_BGR2YV12
2261                         "psrad $8, %%mm0                \n\t"
2262                         "psrad $8, %%mm1                \n\t"
2263                         "psrad $8, %%mm2                \n\t"
2264                         "psrad $8, %%mm3                \n\t"
2265 #endif
2266                         "packssdw %%mm2, %%mm0          \n\t"
2267                         "packssdw %%mm3, %%mm1          \n\t"
2268                         "pmaddwd %%mm5, %%mm0           \n\t"
2269                         "pmaddwd %%mm5, %%mm1           \n\t"
2270                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2271                         "psraw $7, %%mm0                \n\t"
2272
2273 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2274                         "movq 12(%0, %%"REG_d"), %%mm4  \n\t"
2275                         "movq 12(%1, %%"REG_d"), %%mm1  \n\t"
2276                         "movq 18(%0, %%"REG_d"), %%mm2  \n\t"
2277                         "movq 18(%1, %%"REG_d"), %%mm3  \n\t"
2278                         PAVGB" %%mm1, %%mm4             \n\t"
2279                         PAVGB" %%mm3, %%mm2             \n\t"
2280                         "movq %%mm4, %%mm1              \n\t"
2281                         "movq %%mm2, %%mm3              \n\t"
2282                         "psrlq $24, %%mm4               \n\t"
2283                         "psrlq $24, %%mm2               \n\t"
2284                         PAVGB" %%mm1, %%mm4             \n\t"
2285                         PAVGB" %%mm3, %%mm2             \n\t"
2286                         "punpcklbw %%mm7, %%mm4         \n\t"
2287                         "punpcklbw %%mm7, %%mm2         \n\t"
2288 #else
2289                         "movd 12(%0, %%"REG_d"), %%mm4  \n\t"
2290                         "movd 12(%1, %%"REG_d"), %%mm1  \n\t"
2291                         "movd 15(%0, %%"REG_d"), %%mm2  \n\t"
2292                         "movd 15(%1, %%"REG_d"), %%mm3  \n\t"
2293                         "punpcklbw %%mm7, %%mm4         \n\t"
2294                         "punpcklbw %%mm7, %%mm1         \n\t"
2295                         "punpcklbw %%mm7, %%mm2         \n\t"
2296                         "punpcklbw %%mm7, %%mm3         \n\t"
2297                         "paddw %%mm1, %%mm4             \n\t"
2298                         "paddw %%mm3, %%mm2             \n\t"
2299                         "paddw %%mm2, %%mm4             \n\t"
2300                         "movd 18(%0, %%"REG_d"), %%mm5  \n\t"
2301                         "movd 18(%1, %%"REG_d"), %%mm1  \n\t"
2302                         "movd 21(%0, %%"REG_d"), %%mm2  \n\t"
2303                         "movd 21(%1, %%"REG_d"), %%mm3  \n\t"
2304                         "punpcklbw %%mm7, %%mm5         \n\t"
2305                         "punpcklbw %%mm7, %%mm1         \n\t"
2306                         "punpcklbw %%mm7, %%mm2         \n\t"
2307                         "punpcklbw %%mm7, %%mm3         \n\t"
2308                         "paddw %%mm1, %%mm5             \n\t"
2309                         "paddw %%mm3, %%mm2             \n\t"
2310                         "paddw %%mm5, %%mm2             \n\t"
2311                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2312                         "psrlw $2, %%mm4                \n\t"
2313                         "psrlw $2, %%mm2                \n\t"
2314 #endif
2315                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2316                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2317
2318                         "pmaddwd %%mm4, %%mm1           \n\t"
2319                         "pmaddwd %%mm2, %%mm3           \n\t"
2320                         "pmaddwd %%mm6, %%mm4           \n\t"
2321                         "pmaddwd %%mm6, %%mm2           \n\t"
2322 #ifndef FAST_BGR2YV12
2323                         "psrad $8, %%mm4                \n\t"
2324                         "psrad $8, %%mm1                \n\t"
2325                         "psrad $8, %%mm2                \n\t"
2326                         "psrad $8, %%mm3                \n\t"
2327 #endif
2328                         "packssdw %%mm2, %%mm4          \n\t"
2329                         "packssdw %%mm3, %%mm1          \n\t"
2330                         "pmaddwd %%mm5, %%mm4           \n\t"
2331                         "pmaddwd %%mm5, %%mm1           \n\t"
2332                         "add $24, %%"REG_d"             \n\t"
2333                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2334                         "psraw $7, %%mm4                \n\t"
2335
2336                         "movq %%mm0, %%mm1              \n\t"
2337                         "punpckldq %%mm4, %%mm0         \n\t"
2338                         "punpckhdq %%mm4, %%mm1         \n\t"
2339                         "packsswb %%mm1, %%mm0          \n\t"
2340                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2341                         "movd %%mm0, (%2, %%"REG_a")    \n\t"
2342                         "punpckhdq %%mm0, %%mm0         \n\t"
2343                         "movd %%mm0, (%3, %%"REG_a")    \n\t"
2344                         "add $4, %%"REG_a"              \n\t"
2345                         " js 1b                         \n\t"
2346                         : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2347                         : "%"REG_a, "%"REG_d
2348                 );
2349
2350                 udst += chromStride;
2351                 vdst += chromStride;
2352                 src  += srcStride*2;
2353         }
2354
2355         asm volatile(   EMMS" \n\t"
2356                         SFENCE" \n\t"
2357                         :::"memory");
2358 #else
2359         y=0;
2360 #endif
2361         for(; y<height; y+=2)
2362         {
2363                 long i;
2364                 for(i=0; i<chromWidth; i++)
2365                 {
2366                         unsigned int b= src[6*i+0];
2367                         unsigned int g= src[6*i+1];
2368                         unsigned int r= src[6*i+2];
2369
2370                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2371                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2372                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2373
2374                         udst[i]         = U;
2375                         vdst[i]         = V;
2376                         ydst[2*i]       = Y;
2377
2378                         b= src[6*i+3];
2379                         g= src[6*i+4];
2380                         r= src[6*i+5];
2381
2382                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2383                         ydst[2*i+1]     = Y;
2384                 }
2385                 ydst += lumStride;
2386                 src  += srcStride;
2387
2388                 for(i=0; i<chromWidth; i++)
2389                 {
2390                         unsigned int b= src[6*i+0];
2391                         unsigned int g= src[6*i+1];
2392                         unsigned int r= src[6*i+2];
2393
2394                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2395
2396                         ydst[2*i]       = Y;
2397
2398                         b= src[6*i+3];
2399                         g= src[6*i+4];
2400                         r= src[6*i+5];
2401
2402                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2403                         ydst[2*i+1]     = Y;
2404                 }
2405                 udst += chromStride;
2406                 vdst += chromStride;
2407                 ydst += lumStride;
2408                 src  += srcStride;
2409         }
2410 }
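/*
 * The C fallback above uses the usual fixed-point limited-range conversion:
 * the +16 / +128 offsets put Y into 16..235 and U/V around 128.  As a rough,
 * purely illustrative check with the BT.601-style coefficients this library
 * uses elsewhere: for full white (r=g=b=255) the weighted sum
 * (RY+GY+BY)*255 >> RGB2YUV_SHIFT comes out near 219, so Y lands close to
 * 235, the top of the limited range, while U and V stay near 128.
 */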
2411
2412 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2413                             long width, long height, long src1Stride,
2414                             long src2Stride, long dstStride){
2415         long h;
2416
2417         for(h=0; h < height; h++)
2418         {
2419                 long w;
2420
2421 #ifdef HAVE_MMX
2422 #ifdef HAVE_SSE2
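                /* Note: the movdqa loads and movntdq stores below assume
                   src1, src2 and dest (and their strides) stay 16-byte
                   aligned. */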
2423                 asm(
2424                         "xor %%"REG_a", %%"REG_a"       \n\t"
2425                         "1:                             \n\t"
2426                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2427                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2428                         "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2429                         "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2430                         "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2431                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2432                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2433                         "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2434                         "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2435                         "add $16, %%"REG_a"             \n\t"
2436                         "cmp %3, %%"REG_a"              \n\t"
2437                         " jb 1b                         \n\t"
2438                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2439                         : "memory", "%"REG_a""
2440                 );
2441 #else
2442                 asm(
2443                         "xor %%"REG_a", %%"REG_a"       \n\t"
2444                         "1:                             \n\t"
2445                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2446                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2447                         "movq (%1, %%"REG_a"), %%mm0    \n\t"
2448                         "movq 8(%1, %%"REG_a"), %%mm2   \n\t"
2449                         "movq %%mm0, %%mm1              \n\t"
2450                         "movq %%mm2, %%mm3              \n\t"
2451                         "movq (%2, %%"REG_a"), %%mm4    \n\t"
2452                         "movq 8(%2, %%"REG_a"), %%mm5   \n\t"
2453                         "punpcklbw %%mm4, %%mm0         \n\t"
2454                         "punpckhbw %%mm4, %%mm1         \n\t"
2455                         "punpcklbw %%mm5, %%mm2         \n\t"
2456                         "punpckhbw %%mm5, %%mm3         \n\t"
2457                         MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2458                         MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2459                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2460                         MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2461                         "add $16, %%"REG_a"             \n\t"
2462                         "cmp %3, %%"REG_a"              \n\t"
2463                         " jb 1b                         \n\t"
2464                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2465                         : "memory", "%"REG_a
2466                 );
2467 #endif
2468                 for(w= (width&(~15)); w < width; w++)
2469                 {
2470                         dest[2*w+0] = src1[w];
2471                         dest[2*w+1] = src2[w];
2472                 }
2473 #else
2474                 for(w=0; w < width; w++)
2475                 {
2476                         dest[2*w+0] = src1[w];
2477                         dest[2*w+1] = src2[w];
2478                 }
2479 #endif
2480                 dest += dstStride;
2481                 src1 += src1Stride;
2482                 src2 += src2Stride;
2483         }
2484 #ifdef HAVE_MMX
2485         asm(
2486                 EMMS" \n\t"
2487                 SFENCE" \n\t"
2488                 ::: "memory"
2489                 );
2490 #endif
2491 }
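/*
 * Illustrative usage sketch (assumed buffer names, for this example only):
 * interleaving two separate W x H byte planes, e.g. U and V, into a single
 * UVUV... plane of 2*W bytes per row.
 *
 *     uint8_t u[W*H], v[W*H], uv[2*W*H];
 *     RENAME(interleaveBytes)(u, v, uv, W, H,
 *                             W,      // src1Stride
 *                             W,      // src2Stride
 *                             2*W);   // dstStride
 */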
2492
2493 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2494                         uint8_t *dst1, uint8_t *dst2,
2495                         long width, long height,
2496                         long srcStride1, long srcStride2,
2497                         long dstStride1, long dstStride2)
2498 {
2499     long y,x,w,h;
2500     w=width/2; h=height/2;
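    /* Each source chroma line feeds two destination lines (y>>1) and every
       source byte is doubled horizontally (punpcklbw of a register with
       itself in the MMX path, d[2*x] = d[2*x+1] = s[x] in the C path), i.e.
       simple pixel replication, not interpolation. */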
2501 #ifdef HAVE_MMX
2502     asm volatile(
2503         PREFETCH" %0\n\t"
2504         PREFETCH" %1\n\t"
2505         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2506 #endif
2507     for(y=0;y<h;y++){
2508         const uint8_t* s1=src1+srcStride1*(y>>1);
2509         uint8_t* d=dst1+dstStride1*y;
2510         x=0;
2511 #ifdef HAVE_MMX
2512         for(;x<w-31;x+=32)
2513         {
2514             asm volatile(
2515                 PREFETCH" 32%1\n\t"
2516                 "movq   %1, %%mm0\n\t"
2517                 "movq   8%1, %%mm2\n\t"
2518                 "movq   16%1, %%mm4\n\t"
2519                 "movq   24%1, %%mm6\n\t"
2520                 "movq   %%mm0, %%mm1\n\t"
2521                 "movq   %%mm2, %%mm3\n\t"
2522                 "movq   %%mm4, %%mm5\n\t"
2523                 "movq   %%mm6, %%mm7\n\t"
2524                 "punpcklbw %%mm0, %%mm0\n\t"
2525                 "punpckhbw %%mm1, %%mm1\n\t"
2526                 "punpcklbw %%mm2, %%mm2\n\t"
2527                 "punpckhbw %%mm3, %%mm3\n\t"
2528                 "punpcklbw %%mm4, %%mm4\n\t"
2529                 "punpckhbw %%mm5, %%mm5\n\t"
2530                 "punpcklbw %%mm6, %%mm6\n\t"
2531                 "punpckhbw %%mm7, %%mm7\n\t"
2532                 MOVNTQ" %%mm0, %0\n\t"
2533                 MOVNTQ" %%mm1, 8%0\n\t"
2534                 MOVNTQ" %%mm2, 16%0\n\t"
2535                 MOVNTQ" %%mm3, 24%0\n\t"
2536                 MOVNTQ" %%mm4, 32%0\n\t"
2537                 MOVNTQ" %%mm5, 40%0\n\t"
2538                 MOVNTQ" %%mm6, 48%0\n\t"
2539                 MOVNTQ" %%mm7, 56%0"
2540                 :"=m"(d[2*x])
2541                 :"m"(s1[x])
2542                 :"memory");
2543         }
2544 #endif
2545         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2546     }
2547     for(y=0;y<h;y++){
2548         const uint8_t* s2=src2+srcStride2*(y>>1);
2549         uint8_t* d=dst2+dstStride2*y;
2550         x=0;
2551 #ifdef HAVE_MMX
2552         for(;x<w-31;x+=32)
2553         {
2554             asm volatile(
2555                 PREFETCH" 32%1\n\t"
2556                 "movq   %1, %%mm0\n\t"
2557                 "movq   8%1, %%mm2\n\t"
2558                 "movq   16%1, %%mm4\n\t"
2559                 "movq   24%1, %%mm6\n\t"
2560                 "movq   %%mm0, %%mm1\n\t"
2561                 "movq   %%mm2, %%mm3\n\t"
2562                 "movq   %%mm4, %%mm5\n\t"
2563                 "movq   %%mm6, %%mm7\n\t"
2564                 "punpcklbw %%mm0, %%mm0\n\t"
2565                 "punpckhbw %%mm1, %%mm1\n\t"
2566                 "punpcklbw %%mm2, %%mm2\n\t"
2567                 "punpckhbw %%mm3, %%mm3\n\t"
2568                 "punpcklbw %%mm4, %%mm4\n\t"
2569                 "punpckhbw %%mm5, %%mm5\n\t"
2570                 "punpcklbw %%mm6, %%mm6\n\t"
2571                 "punpckhbw %%mm7, %%mm7\n\t"
2572                 MOVNTQ" %%mm0, %0\n\t"
2573                 MOVNTQ" %%mm1, 8%0\n\t"
2574                 MOVNTQ" %%mm2, 16%0\n\t"
2575                 MOVNTQ" %%mm3, 24%0\n\t"
2576                 MOVNTQ" %%mm4, 32%0\n\t"
2577                 MOVNTQ" %%mm5, 40%0\n\t"
2578                 MOVNTQ" %%mm6, 48%0\n\t"
2579                 MOVNTQ" %%mm7, 56%0"
2580                 :"=m"(d[2*x])
2581                 :"m"(s2[x])
2582                 :"memory");
2583         }
2584 #endif
2585         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2586     }
2587 #ifdef HAVE_MMX
2588         asm(
2589                 EMMS" \n\t"
2590                 SFENCE" \n\t"
2591                 ::: "memory"
2592                 );
2593 #endif
2594 }
2595
2596 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2597                         uint8_t *dst,
2598                         long width, long height,
2599                         long srcStride1, long srcStride2,
2600                         long srcStride3, long dstStride)
2601 {
2602     long y,x,w,h;
2603     w=width/2; h=height;
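    /* YVU9 carries one chroma sample per 4x4 luma block, so each chroma row
       is reused for four consecutive luma lines (y>>2) and each chroma byte
       is repeated across four horizontal luma samples when packing below. */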
2604     for(y=0;y<h;y++){
2605         const uint8_t* yp=src1+srcStride1*y;
2606         const uint8_t* up=src2+srcStride2*(y>>2);
2607         const uint8_t* vp=src3+srcStride3*(y>>2);
2608         uint8_t* d=dst+dstStride*y;
2609         x=0;
2610 #ifdef HAVE_MMX
2611         for(;x<w-7;x+=8)
2612         {
2613             asm volatile(
2614                 PREFETCH" 32(%1, %0)\n\t"
2615                 PREFETCH" 32(%2, %0)\n\t"
2616                 PREFETCH" 32(%3, %0)\n\t"
2617                 "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2618                 "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2619                 "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
2620                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2621                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2622                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2623                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2624                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2625                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2626                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2627
2628                 "movq   %%mm1, %%mm6\n\t"
2629                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2630                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2631                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2632                 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2633                 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2634                 
2635                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2636                 "movq   8(%1, %0, 4), %%mm0\n\t"
2637                 "movq   %%mm0, %%mm3\n\t"
2638                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2639                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2640                 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2641                 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2642
2643                 "movq   %%mm4, %%mm6\n\t"
2644                 "movq   16(%1, %0, 4), %%mm0\n\t"
2645                 "movq   %%mm0, %%mm3\n\t"
2646                 "punpcklbw %%mm5, %%mm4\n\t"
2647                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2648                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2649                 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2650                 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2651                 
2652                 "punpckhbw %%mm5, %%mm6\n\t"
2653                 "movq   24(%1, %0, 4), %%mm0\n\t"
2654                 "movq   %%mm0, %%mm3\n\t"
2655                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2656                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2657                 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2658                 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2659
2660                 : "+r" (x)
2661                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2662                 :"memory");
2663         }
2664 #endif
2665         for(; x<w; x++)
2666         {
2667             const long x2= x<<2;
2668             d[8*x+0]=yp[x2];
2669             d[8*x+1]=up[x];
2670             d[8*x+2]=yp[x2+1];
2671             d[8*x+3]=vp[x];
2672             d[8*x+4]=yp[x2+2];
2673             d[8*x+5]=up[x];
2674             d[8*x+6]=yp[x2+3];
2675             d[8*x+7]=vp[x];
2676         }
2677     }
2678 #ifdef HAVE_MMX
2679         asm(
2680                 EMMS" \n\t"
2681                 SFENCE" \n\t"
2682                 ::: "memory"
2683                 );
2684 #endif
2685 }