1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB converter
4  *               Software PAL8 to RGB converter
5  *               Software YUV to YUV converter
6  *               Software YUV to RGB converter
7  *  Written by Nick Kurshev.
8  *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
9  *  lots of big-endian byte-order fixes by Alex Beregszaszi
10  *
11  * This file is part of FFmpeg.
12  *
13  * FFmpeg is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * FFmpeg is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with FFmpeg; if not, write to the Free Software
25  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26  * 
27  * The C code (not assembly, MMX, ...) of this file can be used
28  * under the LGPL license too.
29  */
30
31 #include <stddef.h>
32 #include <inttypes.h> /* for __WORDSIZE */
33
34 #ifndef __WORDSIZE
35 // #warning You have a misconfigured system and will probably lose performance!
36 #define __WORDSIZE MP_WORDSIZE
37 #endif
38
39 #undef PREFETCH
40 #undef MOVNTQ
41 #undef EMMS
42 #undef SFENCE
43 #undef MMREG_SIZE
44 #undef PREFETCHW
45 #undef PAVGB
46
47 #ifdef HAVE_SSE2
48 #define MMREG_SIZE 16
49 #else
50 #define MMREG_SIZE 8
51 #endif
52
53 #ifdef HAVE_3DNOW
54 #define PREFETCH  "prefetch"
55 #define PREFETCHW "prefetchw"
56 #define PAVGB     "pavgusb"
57 #elif defined ( HAVE_MMX2 )
58 #define PREFETCH "prefetchnta"
59 #define PREFETCHW "prefetcht0"
60 #define PAVGB     "pavgb"
61 #else
62 #ifdef __APPLE__
63 #define PREFETCH "#"
64 #define PREFETCHW "#"
65 #else
66 #define PREFETCH  " # nop"
67 #define PREFETCHW " # nop"
68 #endif
69 #endif
70
71 #ifdef HAVE_3DNOW
72 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
73 #define EMMS     "femms"
74 #else
75 #define EMMS     "emms"
76 #endif
77
78 #ifdef HAVE_MMX2
79 #define MOVNTQ "movntq"
80 #define SFENCE "sfence"
81 #else
82 #define MOVNTQ "movq"
83 #define SFENCE " # nop"
84 #endif
85
86 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
87 {
88   uint8_t *dest = dst;
89   const uint8_t *s = src;
90   const uint8_t *end;
91 #ifdef HAVE_MMX
92   const uint8_t *mm_end;
93 #endif
94   end = s + src_size;
95 #ifdef HAVE_MMX
96   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
97   mm_end = end - 23;
98   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
99   while(s < mm_end)
100   {
101     __asm __volatile(
102         PREFETCH"       32%1\n\t"
103         "movd   %1, %%mm0\n\t"
104         "punpckldq 3%1, %%mm0\n\t"
105         "movd   6%1, %%mm1\n\t"
106         "punpckldq 9%1, %%mm1\n\t"
107         "movd   12%1, %%mm2\n\t"
108         "punpckldq 15%1, %%mm2\n\t"
109         "movd   18%1, %%mm3\n\t"
110         "punpckldq 21%1, %%mm3\n\t"
111         "pand   %%mm7, %%mm0\n\t"
112         "pand   %%mm7, %%mm1\n\t"
113         "pand   %%mm7, %%mm2\n\t"
114         "pand   %%mm7, %%mm3\n\t"
115         MOVNTQ" %%mm0, %0\n\t"
116         MOVNTQ" %%mm1, 8%0\n\t"
117         MOVNTQ" %%mm2, 16%0\n\t"
118         MOVNTQ" %%mm3, 24%0"
119         :"=m"(*dest)
120         :"m"(*s)
121         :"memory");
122     dest += 32;
123     s += 24;
124   }
125   __asm __volatile(SFENCE:::"memory");
126   __asm __volatile(EMMS:::"memory");
127 #endif
128   while(s < end)
129   {
130 #ifdef WORDS_BIGENDIAN
131     /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
132     *dest++ = 0;
133     *dest++ = s[2];
134     *dest++ = s[1];
135     *dest++ = s[0];
136     s+=3;
137 #else
138     *dest++ = *s++;
139     *dest++ = *s++;
140     *dest++ = *s++;
141     *dest++ = 0;
142 #endif
143   }
144 }
145
146 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
147 {
148   uint8_t *dest = dst;
149   const uint8_t *s = src;
150   const uint8_t *end;
151 #ifdef HAVE_MMX
152   const uint8_t *mm_end;
153 #endif
154   end = s + src_size;
155 #ifdef HAVE_MMX
156   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
157   mm_end = end - 31;
158   while(s < mm_end)
159   {
160     __asm __volatile(
161         PREFETCH"       32%1\n\t"
162         "movq   %1, %%mm0\n\t"
163         "movq   8%1, %%mm1\n\t"
164         "movq   16%1, %%mm4\n\t"
165         "movq   24%1, %%mm5\n\t"
166         "movq   %%mm0, %%mm2\n\t"
167         "movq   %%mm1, %%mm3\n\t"
168         "movq   %%mm4, %%mm6\n\t"
169         "movq   %%mm5, %%mm7\n\t"
170         "psrlq  $8, %%mm2\n\t"
171         "psrlq  $8, %%mm3\n\t"
172         "psrlq  $8, %%mm6\n\t"
173         "psrlq  $8, %%mm7\n\t"
174         "pand   %2, %%mm0\n\t"
175         "pand   %2, %%mm1\n\t"
176         "pand   %2, %%mm4\n\t"
177         "pand   %2, %%mm5\n\t"
178         "pand   %3, %%mm2\n\t"
179         "pand   %3, %%mm3\n\t"
180         "pand   %3, %%mm6\n\t"
181         "pand   %3, %%mm7\n\t"
182         "por    %%mm2, %%mm0\n\t"
183         "por    %%mm3, %%mm1\n\t"
184         "por    %%mm6, %%mm4\n\t"
185         "por    %%mm7, %%mm5\n\t"
186
187         "movq   %%mm1, %%mm2\n\t"
188         "movq   %%mm4, %%mm3\n\t"
189         "psllq  $48, %%mm2\n\t"
190         "psllq  $32, %%mm3\n\t"
191         "pand   %4, %%mm2\n\t"
192         "pand   %5, %%mm3\n\t"
193         "por    %%mm2, %%mm0\n\t"
194         "psrlq  $16, %%mm1\n\t"
195         "psrlq  $32, %%mm4\n\t"
196         "psllq  $16, %%mm5\n\t"
197         "por    %%mm3, %%mm1\n\t"
198         "pand   %6, %%mm5\n\t"
199         "por    %%mm5, %%mm4\n\t"
200
201         MOVNTQ" %%mm0, %0\n\t"
202         MOVNTQ" %%mm1, 8%0\n\t"
203         MOVNTQ" %%mm4, 16%0"
204         :"=m"(*dest)
205         :"m"(*s),"m"(mask24l),
206          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
207         :"memory");
208     dest += 24;
209     s += 32;
210   }
211   __asm __volatile(SFENCE:::"memory");
212   __asm __volatile(EMMS:::"memory");
213 #endif
214   while(s < end)
215   {
216 #ifdef WORDS_BIGENDIAN
217     /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
218     s++;
219     dest[2] = *s++;
220     dest[1] = *s++;
221     dest[0] = *s++;
222     dest += 3;
223 #else
224     *dest++ = *s++;
225     *dest++ = *s++;
226     *dest++ = *s++;
227     s++;
228 #endif
229   }
230 }
231
232 /*
233  Original by Strepto/Astral
234  ported to gcc & bugfixed: A'rpi
235  MMX2, 3DNOW optimization by Nick Kurshev
236  32-bit C version, and the and&add trick by Michael Niedermayer
237 */
238 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
239 {
240   register const uint8_t* s=src;
241   register uint8_t* d=dst;
242   register const uint8_t *end;
243   const uint8_t *mm_end;
244   end = s + src_size;
245 #ifdef HAVE_MMX
246   __asm __volatile(PREFETCH"    %0"::"m"(*s));
247   __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
248   mm_end = end - 15;
249   while(s<mm_end)
250   {
251         __asm __volatile(
252                 PREFETCH"       32%1\n\t"
253                 "movq   %1, %%mm0\n\t"
254                 "movq   8%1, %%mm2\n\t"
255                 "movq   %%mm0, %%mm1\n\t"
256                 "movq   %%mm2, %%mm3\n\t"
257                 "pand   %%mm4, %%mm0\n\t"
258                 "pand   %%mm4, %%mm2\n\t"
259                 "paddw  %%mm1, %%mm0\n\t"
260                 "paddw  %%mm3, %%mm2\n\t"
261                 MOVNTQ" %%mm0, %0\n\t"
262                 MOVNTQ" %%mm2, 8%0"
263                 :"=m"(*d)
264                 :"m"(*s)
265                 );
266         d+=16;
267         s+=16;
268   }
269   __asm __volatile(SFENCE:::"memory");
270   __asm __volatile(EMMS:::"memory");
271 #endif
272     mm_end = end - 3;
273     while(s < mm_end)
274     {
275         register unsigned x= *((uint32_t *)s);
276         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
277         d+=4;
278         s+=4;
279     }
280     if(s < end)
281     {
282         register unsigned short x= *((uint16_t *)s);
283         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
284     }
285 }
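/* A minimal scalar sketch (the helper name below is illustrative and not
 * part of this file) of the and&add trick used above: for a 15-bit pixel
 * x = 0rrrrrgggggbbbbb, (x & 0x7FE0) keeps only the red and green fields,
 * so adding it back to x shifts red and green up by one bit while leaving
 * blue untouched, giving rrrrrggggg0bbbbb (RGB565 with a zero green LSB).
 * Since bit 15 of each 16-bit half is zero, the sum never carries into the
 * neighbouring pixel, so two pixels fit in one 32-bit add (and eight in one
 * MMX paddw, as in the loop above).
 */
#if 0 /* example only; kept out of the build because this template is included more than once */
static inline uint16_t example_rgb15to16_pixel(uint16_t x)
{
    return (uint16_t)((x & 0x7FFF) + (x & 0x7FE0));
}
#endif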
286
287 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
288 {
289   register const uint8_t* s=src;
290   register uint8_t* d=dst;
291   register const uint8_t *end;
292   const uint8_t *mm_end;
293   end = s + src_size;
294 #ifdef HAVE_MMX
295   __asm __volatile(PREFETCH"    %0"::"m"(*s));
296   __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
297   __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
298   mm_end = end - 15;
299   while(s<mm_end)
300   {
301         __asm __volatile(
302                 PREFETCH"       32%1\n\t"
303                 "movq   %1, %%mm0\n\t"
304                 "movq   8%1, %%mm2\n\t"
305                 "movq   %%mm0, %%mm1\n\t"
306                 "movq   %%mm2, %%mm3\n\t"
307                 "psrlq  $1, %%mm0\n\t"
308                 "psrlq  $1, %%mm2\n\t"
309                 "pand   %%mm7, %%mm0\n\t"
310                 "pand   %%mm7, %%mm2\n\t"
311                 "pand   %%mm6, %%mm1\n\t"
312                 "pand   %%mm6, %%mm3\n\t"
313                 "por    %%mm1, %%mm0\n\t"
314                 "por    %%mm3, %%mm2\n\t"
315                 MOVNTQ" %%mm0, %0\n\t"
316                 MOVNTQ" %%mm2, 8%0"
317                 :"=m"(*d)
318                 :"m"(*s)
319                 );
320         d+=16;
321         s+=16;
322   }
323   __asm __volatile(SFENCE:::"memory");
324   __asm __volatile(EMMS:::"memory");
325 #endif
326     mm_end = end - 3;
327     while(s < mm_end)
328     {
329         register uint32_t x= *((uint32_t *)s);
330         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
331         s+=4;
332         d+=4;
333     }
334     if(s < end)
335     {
336         register uint16_t x= *((uint16_t *)s);
337         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
338         s+=2;
339         d+=2;
340     }
341 }
342
343 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
344 {
345         const uint8_t *s = src;
346         const uint8_t *end;
347 #ifdef HAVE_MMX
348         const uint8_t *mm_end;
349 #endif
350         uint16_t *d = (uint16_t *)dst;
351         end = s + src_size;
352 #ifdef HAVE_MMX
353         mm_end = end - 15;
354 #if 1 //is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
355         asm volatile(
356                 "movq %3, %%mm5                 \n\t"
357                 "movq %4, %%mm6                 \n\t"
358                 "movq %5, %%mm7                 \n\t"
359                 ASMALIGN(4)
360                 "1:                             \n\t"
361                 PREFETCH" 32(%1)                \n\t"
362                 "movd   (%1), %%mm0             \n\t"
363                 "movd   4(%1), %%mm3            \n\t"
364                 "punpckldq 8(%1), %%mm0         \n\t"
365                 "punpckldq 12(%1), %%mm3        \n\t"
366                 "movq %%mm0, %%mm1              \n\t"
367                 "movq %%mm3, %%mm4              \n\t"
368                 "pand %%mm6, %%mm0              \n\t"
369                 "pand %%mm6, %%mm3              \n\t"
370                 "pmaddwd %%mm7, %%mm0           \n\t"
371                 "pmaddwd %%mm7, %%mm3           \n\t"
372                 "pand %%mm5, %%mm1              \n\t"
373                 "pand %%mm5, %%mm4              \n\t"
374                 "por %%mm1, %%mm0               \n\t"   
375                 "por %%mm4, %%mm3               \n\t"
376                 "psrld $5, %%mm0                \n\t"
377                 "pslld $11, %%mm3               \n\t"
378                 "por %%mm3, %%mm0               \n\t"
379                 MOVNTQ" %%mm0, (%0)             \n\t"
380                 "add $16, %1                    \n\t"
381                 "add $8, %0                     \n\t"
382                 "cmp %2, %1                     \n\t"
383                 " jb 1b                         \n\t"
384                 : "+r" (d), "+r"(s)
385                 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
386         );
387 #else
388         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
389         __asm __volatile(
390             "movq       %0, %%mm7\n\t"
391             "movq       %1, %%mm6\n\t"
392             ::"m"(red_16mask),"m"(green_16mask));
393         while(s < mm_end)
394         {
395             __asm __volatile(
396                 PREFETCH" 32%1\n\t"
397                 "movd   %1, %%mm0\n\t"
398                 "movd   4%1, %%mm3\n\t"
399                 "punpckldq 8%1, %%mm0\n\t"
400                 "punpckldq 12%1, %%mm3\n\t"
401                 "movq   %%mm0, %%mm1\n\t"
402                 "movq   %%mm0, %%mm2\n\t"
403                 "movq   %%mm3, %%mm4\n\t"
404                 "movq   %%mm3, %%mm5\n\t"
405                 "psrlq  $3, %%mm0\n\t"
406                 "psrlq  $3, %%mm3\n\t"
407                 "pand   %2, %%mm0\n\t"
408                 "pand   %2, %%mm3\n\t"
409                 "psrlq  $5, %%mm1\n\t"
410                 "psrlq  $5, %%mm4\n\t"
411                 "pand   %%mm6, %%mm1\n\t"
412                 "pand   %%mm6, %%mm4\n\t"
413                 "psrlq  $8, %%mm2\n\t"
414                 "psrlq  $8, %%mm5\n\t"
415                 "pand   %%mm7, %%mm2\n\t"
416                 "pand   %%mm7, %%mm5\n\t"
417                 "por    %%mm1, %%mm0\n\t"
418                 "por    %%mm4, %%mm3\n\t"
419                 "por    %%mm2, %%mm0\n\t"
420                 "por    %%mm5, %%mm3\n\t"
421                 "psllq  $16, %%mm3\n\t"
422                 "por    %%mm3, %%mm0\n\t"
423                 MOVNTQ" %%mm0, %0\n\t"
424                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
425                 d += 4;
426                 s += 16;
427         }
428 #endif
429         __asm __volatile(SFENCE:::"memory");
430         __asm __volatile(EMMS:::"memory");
431 #endif
432         while(s < end)
433         {
434                 register int rgb = *(uint32_t*)s; s += 4;
435                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
436         }
437 }
438
439 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
440 {
441         const uint8_t *s = src;
442         const uint8_t *end;
443 #ifdef HAVE_MMX
444         const uint8_t *mm_end;
445 #endif
446         uint16_t *d = (uint16_t *)dst;
447         end = s + src_size;
448 #ifdef HAVE_MMX
449         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
450         __asm __volatile(
451             "movq       %0, %%mm7\n\t"
452             "movq       %1, %%mm6\n\t"
453             ::"m"(red_16mask),"m"(green_16mask));
454         mm_end = end - 15;
455         while(s < mm_end)
456         {
457             __asm __volatile(
458                 PREFETCH" 32%1\n\t"
459                 "movd   %1, %%mm0\n\t"
460                 "movd   4%1, %%mm3\n\t"
461                 "punpckldq 8%1, %%mm0\n\t"
462                 "punpckldq 12%1, %%mm3\n\t"
463                 "movq   %%mm0, %%mm1\n\t"
464                 "movq   %%mm0, %%mm2\n\t"
465                 "movq   %%mm3, %%mm4\n\t"
466                 "movq   %%mm3, %%mm5\n\t"
467                 "psllq  $8, %%mm0\n\t"
468                 "psllq  $8, %%mm3\n\t"
469                 "pand   %%mm7, %%mm0\n\t"
470                 "pand   %%mm7, %%mm3\n\t"
471                 "psrlq  $5, %%mm1\n\t"
472                 "psrlq  $5, %%mm4\n\t"
473                 "pand   %%mm6, %%mm1\n\t"
474                 "pand   %%mm6, %%mm4\n\t"
475                 "psrlq  $19, %%mm2\n\t"
476                 "psrlq  $19, %%mm5\n\t"
477                 "pand   %2, %%mm2\n\t"
478                 "pand   %2, %%mm5\n\t"
479                 "por    %%mm1, %%mm0\n\t"
480                 "por    %%mm4, %%mm3\n\t"
481                 "por    %%mm2, %%mm0\n\t"
482                 "por    %%mm5, %%mm3\n\t"
483                 "psllq  $16, %%mm3\n\t"
484                 "por    %%mm3, %%mm0\n\t"
485                 MOVNTQ" %%mm0, %0\n\t"
486                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
487                 d += 4;
488                 s += 16;
489         }
490         __asm __volatile(SFENCE:::"memory");
491         __asm __volatile(EMMS:::"memory");
492 #endif
493         while(s < end)
494         {
495                 register int rgb = *(uint32_t*)s; s += 4;
496                 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
497         }
498 }
499
500 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
501 {
502         const uint8_t *s = src;
503         const uint8_t *end;
504 #ifdef HAVE_MMX
505         const uint8_t *mm_end;
506 #endif
507         uint16_t *d = (uint16_t *)dst;
508         end = s + src_size;
509 #ifdef HAVE_MMX
510         mm_end = end - 15;
511 #if 1 //is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
512         asm volatile(
513                 "movq %3, %%mm5                 \n\t"
514                 "movq %4, %%mm6                 \n\t"
515                 "movq %5, %%mm7                 \n\t"
516                 ASMALIGN(4)
517                 "1:                             \n\t"
518                 PREFETCH" 32(%1)                \n\t"
519                 "movd   (%1), %%mm0             \n\t"
520                 "movd   4(%1), %%mm3            \n\t"
521                 "punpckldq 8(%1), %%mm0         \n\t"
522                 "punpckldq 12(%1), %%mm3        \n\t"
523                 "movq %%mm0, %%mm1              \n\t"
524                 "movq %%mm3, %%mm4              \n\t"
525                 "pand %%mm6, %%mm0              \n\t"
526                 "pand %%mm6, %%mm3              \n\t"
527                 "pmaddwd %%mm7, %%mm0           \n\t"
528                 "pmaddwd %%mm7, %%mm3           \n\t"
529                 "pand %%mm5, %%mm1              \n\t"
530                 "pand %%mm5, %%mm4              \n\t"
531                 "por %%mm1, %%mm0               \n\t"   
532                 "por %%mm4, %%mm3               \n\t"
533                 "psrld $6, %%mm0                \n\t"
534                 "pslld $10, %%mm3               \n\t"
535                 "por %%mm3, %%mm0               \n\t"
536                 MOVNTQ" %%mm0, (%0)             \n\t"
537                 "add $16, %1                    \n\t"
538                 "add $8, %0                     \n\t"
539                 "cmp %2, %1                     \n\t"
540                 " jb 1b                         \n\t"
541                 : "+r" (d), "+r"(s)
542                 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
543         );
544 #else
545         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
546         __asm __volatile(
547             "movq       %0, %%mm7\n\t"
548             "movq       %1, %%mm6\n\t"
549             ::"m"(red_15mask),"m"(green_15mask));
550         while(s < mm_end)
551         {
552             __asm __volatile(
553                 PREFETCH" 32%1\n\t"
554                 "movd   %1, %%mm0\n\t"
555                 "movd   4%1, %%mm3\n\t"
556                 "punpckldq 8%1, %%mm0\n\t"
557                 "punpckldq 12%1, %%mm3\n\t"
558                 "movq   %%mm0, %%mm1\n\t"
559                 "movq   %%mm0, %%mm2\n\t"
560                 "movq   %%mm3, %%mm4\n\t"
561                 "movq   %%mm3, %%mm5\n\t"
562                 "psrlq  $3, %%mm0\n\t"
563                 "psrlq  $3, %%mm3\n\t"
564                 "pand   %2, %%mm0\n\t"
565                 "pand   %2, %%mm3\n\t"
566                 "psrlq  $6, %%mm1\n\t"
567                 "psrlq  $6, %%mm4\n\t"
568                 "pand   %%mm6, %%mm1\n\t"
569                 "pand   %%mm6, %%mm4\n\t"
570                 "psrlq  $9, %%mm2\n\t"
571                 "psrlq  $9, %%mm5\n\t"
572                 "pand   %%mm7, %%mm2\n\t"
573                 "pand   %%mm7, %%mm5\n\t"
574                 "por    %%mm1, %%mm0\n\t"
575                 "por    %%mm4, %%mm3\n\t"
576                 "por    %%mm2, %%mm0\n\t"
577                 "por    %%mm5, %%mm3\n\t"
578                 "psllq  $16, %%mm3\n\t"
579                 "por    %%mm3, %%mm0\n\t"
580                 MOVNTQ" %%mm0, %0\n\t"
581                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
582                 d += 4;
583                 s += 16;
584         }
585 #endif
586         __asm __volatile(SFENCE:::"memory");
587         __asm __volatile(EMMS:::"memory");
588 #endif
589         while(s < end)
590         {
591                 register int rgb = *(uint32_t*)s; s += 4;
592                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
593         }
594 }
595
596 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
597 {
598         const uint8_t *s = src;
599         const uint8_t *end;
600 #ifdef HAVE_MMX
601         const uint8_t *mm_end;
602 #endif
603         uint16_t *d = (uint16_t *)dst;
604         end = s + src_size;
605 #ifdef HAVE_MMX
606         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
607         __asm __volatile(
608             "movq       %0, %%mm7\n\t"
609             "movq       %1, %%mm6\n\t"
610             ::"m"(red_15mask),"m"(green_15mask));
611         mm_end = end - 15;
612         while(s < mm_end)
613         {
614             __asm __volatile(
615                 PREFETCH" 32%1\n\t"
616                 "movd   %1, %%mm0\n\t"
617                 "movd   4%1, %%mm3\n\t"
618                 "punpckldq 8%1, %%mm0\n\t"
619                 "punpckldq 12%1, %%mm3\n\t"
620                 "movq   %%mm0, %%mm1\n\t"
621                 "movq   %%mm0, %%mm2\n\t"
622                 "movq   %%mm3, %%mm4\n\t"
623                 "movq   %%mm3, %%mm5\n\t"
624                 "psllq  $7, %%mm0\n\t"
625                 "psllq  $7, %%mm3\n\t"
626                 "pand   %%mm7, %%mm0\n\t"
627                 "pand   %%mm7, %%mm3\n\t"
628                 "psrlq  $6, %%mm1\n\t"
629                 "psrlq  $6, %%mm4\n\t"
630                 "pand   %%mm6, %%mm1\n\t"
631                 "pand   %%mm6, %%mm4\n\t"
632                 "psrlq  $19, %%mm2\n\t"
633                 "psrlq  $19, %%mm5\n\t"
634                 "pand   %2, %%mm2\n\t"
635                 "pand   %2, %%mm5\n\t"
636                 "por    %%mm1, %%mm0\n\t"
637                 "por    %%mm4, %%mm3\n\t"
638                 "por    %%mm2, %%mm0\n\t"
639                 "por    %%mm5, %%mm3\n\t"
640                 "psllq  $16, %%mm3\n\t"
641                 "por    %%mm3, %%mm0\n\t"
642                 MOVNTQ" %%mm0, %0\n\t"
643                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
644                 d += 4;
645                 s += 16;
646         }
647         __asm __volatile(SFENCE:::"memory");
648         __asm __volatile(EMMS:::"memory");
649 #endif
650         while(s < end)
651         {
652                 register int rgb = *(uint32_t*)s; s += 4;
653                 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
654         }
655 }
656
657 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
658 {
659         const uint8_t *s = src;
660         const uint8_t *end;
661 #ifdef HAVE_MMX
662         const uint8_t *mm_end;
663 #endif
664         uint16_t *d = (uint16_t *)dst;
665         end = s + src_size;
666 #ifdef HAVE_MMX
667         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
668         __asm __volatile(
669             "movq       %0, %%mm7\n\t"
670             "movq       %1, %%mm6\n\t"
671             ::"m"(red_16mask),"m"(green_16mask));
672         mm_end = end - 11;
673         while(s < mm_end)
674         {
675             __asm __volatile(
676                 PREFETCH" 32%1\n\t"
677                 "movd   %1, %%mm0\n\t"
678                 "movd   3%1, %%mm3\n\t"
679                 "punpckldq 6%1, %%mm0\n\t"
680                 "punpckldq 9%1, %%mm3\n\t"
681                 "movq   %%mm0, %%mm1\n\t"
682                 "movq   %%mm0, %%mm2\n\t"
683                 "movq   %%mm3, %%mm4\n\t"
684                 "movq   %%mm3, %%mm5\n\t"
685                 "psrlq  $3, %%mm0\n\t"
686                 "psrlq  $3, %%mm3\n\t"
687                 "pand   %2, %%mm0\n\t"
688                 "pand   %2, %%mm3\n\t"
689                 "psrlq  $5, %%mm1\n\t"
690                 "psrlq  $5, %%mm4\n\t"
691                 "pand   %%mm6, %%mm1\n\t"
692                 "pand   %%mm6, %%mm4\n\t"
693                 "psrlq  $8, %%mm2\n\t"
694                 "psrlq  $8, %%mm5\n\t"
695                 "pand   %%mm7, %%mm2\n\t"
696                 "pand   %%mm7, %%mm5\n\t"
697                 "por    %%mm1, %%mm0\n\t"
698                 "por    %%mm4, %%mm3\n\t"
699                 "por    %%mm2, %%mm0\n\t"
700                 "por    %%mm5, %%mm3\n\t"
701                 "psllq  $16, %%mm3\n\t"
702                 "por    %%mm3, %%mm0\n\t"
703                 MOVNTQ" %%mm0, %0\n\t"
704                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
705                 d += 4;
706                 s += 12;
707         }
708         __asm __volatile(SFENCE:::"memory");
709         __asm __volatile(EMMS:::"memory");
710 #endif
711         while(s < end)
712         {
713                 const int b= *s++;
714                 const int g= *s++;
715                 const int r= *s++;
716                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
717         }
718 }
719
720 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
721 {
722         const uint8_t *s = src;
723         const uint8_t *end;
724 #ifdef HAVE_MMX
725         const uint8_t *mm_end;
726 #endif
727         uint16_t *d = (uint16_t *)dst;
728         end = s + src_size;
729 #ifdef HAVE_MMX
730         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
731         __asm __volatile(
732             "movq       %0, %%mm7\n\t"
733             "movq       %1, %%mm6\n\t"
734             ::"m"(red_16mask),"m"(green_16mask));
735         mm_end = end - 15;
736         while(s < mm_end)
737         {
738             __asm __volatile(
739                 PREFETCH" 32%1\n\t"
740                 "movd   %1, %%mm0\n\t"
741                 "movd   3%1, %%mm3\n\t"
742                 "punpckldq 6%1, %%mm0\n\t"
743                 "punpckldq 9%1, %%mm3\n\t"
744                 "movq   %%mm0, %%mm1\n\t"
745                 "movq   %%mm0, %%mm2\n\t"
746                 "movq   %%mm3, %%mm4\n\t"
747                 "movq   %%mm3, %%mm5\n\t"
748                 "psllq  $8, %%mm0\n\t"
749                 "psllq  $8, %%mm3\n\t"
750                 "pand   %%mm7, %%mm0\n\t"
751                 "pand   %%mm7, %%mm3\n\t"
752                 "psrlq  $5, %%mm1\n\t"
753                 "psrlq  $5, %%mm4\n\t"
754                 "pand   %%mm6, %%mm1\n\t"
755                 "pand   %%mm6, %%mm4\n\t"
756                 "psrlq  $19, %%mm2\n\t"
757                 "psrlq  $19, %%mm5\n\t"
758                 "pand   %2, %%mm2\n\t"
759                 "pand   %2, %%mm5\n\t"
760                 "por    %%mm1, %%mm0\n\t"
761                 "por    %%mm4, %%mm3\n\t"
762                 "por    %%mm2, %%mm0\n\t"
763                 "por    %%mm5, %%mm3\n\t"
764                 "psllq  $16, %%mm3\n\t"
765                 "por    %%mm3, %%mm0\n\t"
766                 MOVNTQ" %%mm0, %0\n\t"
767                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
768                 d += 4;
769                 s += 12;
770         }
771         __asm __volatile(SFENCE:::"memory");
772         __asm __volatile(EMMS:::"memory");
773 #endif
774         while(s < end)
775         {
776                 const int r= *s++;
777                 const int g= *s++;
778                 const int b= *s++;
779                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
780         }
781 }
782
783 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
784 {
785         const uint8_t *s = src;
786         const uint8_t *end;
787 #ifdef HAVE_MMX
788         const uint8_t *mm_end;
789 #endif
790         uint16_t *d = (uint16_t *)dst;
791         end = s + src_size;
792 #ifdef HAVE_MMX
793         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
794         __asm __volatile(
795             "movq       %0, %%mm7\n\t"
796             "movq       %1, %%mm6\n\t"
797             ::"m"(red_15mask),"m"(green_15mask));
798         mm_end = end - 11;
799         while(s < mm_end)
800         {
801             __asm __volatile(
802                 PREFETCH" 32%1\n\t"
803                 "movd   %1, %%mm0\n\t"
804                 "movd   3%1, %%mm3\n\t"
805                 "punpckldq 6%1, %%mm0\n\t"
806                 "punpckldq 9%1, %%mm3\n\t"
807                 "movq   %%mm0, %%mm1\n\t"
808                 "movq   %%mm0, %%mm2\n\t"
809                 "movq   %%mm3, %%mm4\n\t"
810                 "movq   %%mm3, %%mm5\n\t"
811                 "psrlq  $3, %%mm0\n\t"
812                 "psrlq  $3, %%mm3\n\t"
813                 "pand   %2, %%mm0\n\t"
814                 "pand   %2, %%mm3\n\t"
815                 "psrlq  $6, %%mm1\n\t"
816                 "psrlq  $6, %%mm4\n\t"
817                 "pand   %%mm6, %%mm1\n\t"
818                 "pand   %%mm6, %%mm4\n\t"
819                 "psrlq  $9, %%mm2\n\t"
820                 "psrlq  $9, %%mm5\n\t"
821                 "pand   %%mm7, %%mm2\n\t"
822                 "pand   %%mm7, %%mm5\n\t"
823                 "por    %%mm1, %%mm0\n\t"
824                 "por    %%mm4, %%mm3\n\t"
825                 "por    %%mm2, %%mm0\n\t"
826                 "por    %%mm5, %%mm3\n\t"
827                 "psllq  $16, %%mm3\n\t"
828                 "por    %%mm3, %%mm0\n\t"
829                 MOVNTQ" %%mm0, %0\n\t"
830                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
831                 d += 4;
832                 s += 12;
833         }
834         __asm __volatile(SFENCE:::"memory");
835         __asm __volatile(EMMS:::"memory");
836 #endif
837         while(s < end)
838         {
839                 const int b= *s++;
840                 const int g= *s++;
841                 const int r= *s++;
842                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
843         }
844 }
845
846 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
847 {
848         const uint8_t *s = src;
849         const uint8_t *end;
850 #ifdef HAVE_MMX
851         const uint8_t *mm_end;
852 #endif
853         uint16_t *d = (uint16_t *)dst;
854         end = s + src_size;
855 #ifdef HAVE_MMX
856         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
857         __asm __volatile(
858             "movq       %0, %%mm7\n\t"
859             "movq       %1, %%mm6\n\t"
860             ::"m"(red_15mask),"m"(green_15mask));
861         mm_end = end - 15;
862         while(s < mm_end)
863         {
864             __asm __volatile(
865                 PREFETCH" 32%1\n\t"
866                 "movd   %1, %%mm0\n\t"
867                 "movd   3%1, %%mm3\n\t"
868                 "punpckldq 6%1, %%mm0\n\t"
869                 "punpckldq 9%1, %%mm3\n\t"
870                 "movq   %%mm0, %%mm1\n\t"
871                 "movq   %%mm0, %%mm2\n\t"
872                 "movq   %%mm3, %%mm4\n\t"
873                 "movq   %%mm3, %%mm5\n\t"
874                 "psllq  $7, %%mm0\n\t"
875                 "psllq  $7, %%mm3\n\t"
876                 "pand   %%mm7, %%mm0\n\t"
877                 "pand   %%mm7, %%mm3\n\t"
878                 "psrlq  $6, %%mm1\n\t"
879                 "psrlq  $6, %%mm4\n\t"
880                 "pand   %%mm6, %%mm1\n\t"
881                 "pand   %%mm6, %%mm4\n\t"
882                 "psrlq  $19, %%mm2\n\t"
883                 "psrlq  $19, %%mm5\n\t"
884                 "pand   %2, %%mm2\n\t"
885                 "pand   %2, %%mm5\n\t"
886                 "por    %%mm1, %%mm0\n\t"
887                 "por    %%mm4, %%mm3\n\t"
888                 "por    %%mm2, %%mm0\n\t"
889                 "por    %%mm5, %%mm3\n\t"
890                 "psllq  $16, %%mm3\n\t"
891                 "por    %%mm3, %%mm0\n\t"
892                 MOVNTQ" %%mm0, %0\n\t"
893                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
894                 d += 4;
895                 s += 12;
896         }
897         __asm __volatile(SFENCE:::"memory");
898         __asm __volatile(EMMS:::"memory");
899 #endif
900         while(s < end)
901         {
902                 const int r= *s++;
903                 const int g= *s++;
904                 const int b= *s++;
905                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
906         }
907 }
908
909 /*
910   Here I use a less accurate approximation by simply left-shifting
911   the input value and filling the low-order bits with zeroes. This
912   method improves PNG compression, but the scheme cannot reproduce
913   white exactly, since it does not generate an all-ones maximum
914   value; the net effect is to darken the image slightly. (For
915   example, a maximal 5-bit value of 0x1F becomes 0xF8 rather than
916   0xFF after a plain left shift by 3.)
917
918   The better method is "left bit replication":
919
920    4 3 2 1 0
921    ---------
922    1 1 0 1 1
923
924    7 6 5 4 3  2 1 0
925    ----------------
926    1 1 0 1 1  1 1 0
927    |=======|  |===|
928        |      Leftmost Bits Repeated to Fill Open Bits
929        |
930    Original Bits
931 */
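/* A minimal sketch of the "left bit replication" described above (the helper
 * names are illustrative and not part of this file): the top bits of the
 * small channel are copied into the freshly opened low bits, so the maximum
 * input maps to the maximum output and white stays white, e.g. 11011 (5 bits)
 * becomes 11011110 (8 bits), matching the diagram.
 */
#if 0 /* example only; kept out of the build because this template is included more than once */
static inline uint8_t example_expand5to8(uint8_t c) /* c in [0,31] */
{
    return (uint8_t)((c << 3) | (c >> 2)); /* replicate the top 3 bits */
}
static inline uint8_t example_expand6to8(uint8_t c) /* c in [0,63] */
{
    return (uint8_t)((c << 2) | (c >> 4)); /* replicate the top 2 bits */
}
#endif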
932 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
933 {
934         const uint16_t *end;
935 #ifdef HAVE_MMX
936         const uint16_t *mm_end;
937 #endif
938         uint8_t *d = (uint8_t *)dst;
939         const uint16_t *s = (uint16_t *)src;
940         end = s + src_size/2;
941 #ifdef HAVE_MMX
942         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
943         mm_end = end - 7;
944         while(s < mm_end)
945         {
946             __asm __volatile(
947                 PREFETCH" 32%1\n\t"
948                 "movq   %1, %%mm0\n\t"
949                 "movq   %1, %%mm1\n\t"
950                 "movq   %1, %%mm2\n\t"
951                 "pand   %2, %%mm0\n\t"
952                 "pand   %3, %%mm1\n\t"
953                 "pand   %4, %%mm2\n\t"
954                 "psllq  $3, %%mm0\n\t"
955                 "psrlq  $2, %%mm1\n\t"
956                 "psrlq  $7, %%mm2\n\t"
957                 "movq   %%mm0, %%mm3\n\t"
958                 "movq   %%mm1, %%mm4\n\t"
959                 "movq   %%mm2, %%mm5\n\t"
960                 "punpcklwd %5, %%mm0\n\t"
961                 "punpcklwd %5, %%mm1\n\t"
962                 "punpcklwd %5, %%mm2\n\t"
963                 "punpckhwd %5, %%mm3\n\t"
964                 "punpckhwd %5, %%mm4\n\t"
965                 "punpckhwd %5, %%mm5\n\t"
966                 "psllq  $8, %%mm1\n\t"
967                 "psllq  $16, %%mm2\n\t"
968                 "por    %%mm1, %%mm0\n\t"
969                 "por    %%mm2, %%mm0\n\t"
970                 "psllq  $8, %%mm4\n\t"
971                 "psllq  $16, %%mm5\n\t"
972                 "por    %%mm4, %%mm3\n\t"
973                 "por    %%mm5, %%mm3\n\t"
974
975                 "movq   %%mm0, %%mm6\n\t"
976                 "movq   %%mm3, %%mm7\n\t"
977                 
978                 "movq   8%1, %%mm0\n\t"
979                 "movq   8%1, %%mm1\n\t"
980                 "movq   8%1, %%mm2\n\t"
981                 "pand   %2, %%mm0\n\t"
982                 "pand   %3, %%mm1\n\t"
983                 "pand   %4, %%mm2\n\t"
984                 "psllq  $3, %%mm0\n\t"
985                 "psrlq  $2, %%mm1\n\t"
986                 "psrlq  $7, %%mm2\n\t"
987                 "movq   %%mm0, %%mm3\n\t"
988                 "movq   %%mm1, %%mm4\n\t"
989                 "movq   %%mm2, %%mm5\n\t"
990                 "punpcklwd %5, %%mm0\n\t"
991                 "punpcklwd %5, %%mm1\n\t"
992                 "punpcklwd %5, %%mm2\n\t"
993                 "punpckhwd %5, %%mm3\n\t"
994                 "punpckhwd %5, %%mm4\n\t"
995                 "punpckhwd %5, %%mm5\n\t"
996                 "psllq  $8, %%mm1\n\t"
997                 "psllq  $16, %%mm2\n\t"
998                 "por    %%mm1, %%mm0\n\t"
999                 "por    %%mm2, %%mm0\n\t"
1000                 "psllq  $8, %%mm4\n\t"
1001                 "psllq  $16, %%mm5\n\t"
1002                 "por    %%mm4, %%mm3\n\t"
1003                 "por    %%mm5, %%mm3\n\t"
1004
1005                 :"=m"(*d)
1006                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1007                 :"memory");
1008             /* Borrowed 32 to 24 */
1009             __asm __volatile(
1010                 "movq   %%mm0, %%mm4\n\t"
1011                 "movq   %%mm3, %%mm5\n\t"
1012                 "movq   %%mm6, %%mm0\n\t"
1013                 "movq   %%mm7, %%mm1\n\t"
1014                 
1015                 "movq   %%mm4, %%mm6\n\t"
1016                 "movq   %%mm5, %%mm7\n\t"
1017                 "movq   %%mm0, %%mm2\n\t"
1018                 "movq   %%mm1, %%mm3\n\t"
1019
1020                 "psrlq  $8, %%mm2\n\t"
1021                 "psrlq  $8, %%mm3\n\t"
1022                 "psrlq  $8, %%mm6\n\t"
1023                 "psrlq  $8, %%mm7\n\t"
1024                 "pand   %2, %%mm0\n\t"
1025                 "pand   %2, %%mm1\n\t"
1026                 "pand   %2, %%mm4\n\t"
1027                 "pand   %2, %%mm5\n\t"
1028                 "pand   %3, %%mm2\n\t"
1029                 "pand   %3, %%mm3\n\t"
1030                 "pand   %3, %%mm6\n\t"
1031                 "pand   %3, %%mm7\n\t"
1032                 "por    %%mm2, %%mm0\n\t"
1033                 "por    %%mm3, %%mm1\n\t"
1034                 "por    %%mm6, %%mm4\n\t"
1035                 "por    %%mm7, %%mm5\n\t"
1036
1037                 "movq   %%mm1, %%mm2\n\t"
1038                 "movq   %%mm4, %%mm3\n\t"
1039                 "psllq  $48, %%mm2\n\t"
1040                 "psllq  $32, %%mm3\n\t"
1041                 "pand   %4, %%mm2\n\t"
1042                 "pand   %5, %%mm3\n\t"
1043                 "por    %%mm2, %%mm0\n\t"
1044                 "psrlq  $16, %%mm1\n\t"
1045                 "psrlq  $32, %%mm4\n\t"
1046                 "psllq  $16, %%mm5\n\t"
1047                 "por    %%mm3, %%mm1\n\t"
1048                 "pand   %6, %%mm5\n\t"
1049                 "por    %%mm5, %%mm4\n\t"
1050
1051                 MOVNTQ" %%mm0, %0\n\t"
1052                 MOVNTQ" %%mm1, 8%0\n\t"
1053                 MOVNTQ" %%mm4, 16%0"
1054
1055                 :"=m"(*d)
1056                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1057                 :"memory");
1058                 d += 24;
1059                 s += 8;
1060         }
1061         __asm __volatile(SFENCE:::"memory");
1062         __asm __volatile(EMMS:::"memory");
1063 #endif
1064         while(s < end)
1065         {
1066                 register uint16_t bgr;
1067                 bgr = *s++;
1068                 *d++ = (bgr&0x1F)<<3;
1069                 *d++ = (bgr&0x3E0)>>2;
1070                 *d++ = (bgr&0x7C00)>>7;
1071         }
1072 }
1073
1074 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1075 {
1076         const uint16_t *end;
1077 #ifdef HAVE_MMX
1078         const uint16_t *mm_end;
1079 #endif
1080         uint8_t *d = (uint8_t *)dst;
1081         const uint16_t *s = (const uint16_t *)src;
1082         end = s + src_size/2;
1083 #ifdef HAVE_MMX
1084         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1085         mm_end = end - 7;
1086         while(s < mm_end)
1087         {
1088             __asm __volatile(
1089                 PREFETCH" 32%1\n\t"
1090                 "movq   %1, %%mm0\n\t"
1091                 "movq   %1, %%mm1\n\t"
1092                 "movq   %1, %%mm2\n\t"
1093                 "pand   %2, %%mm0\n\t"
1094                 "pand   %3, %%mm1\n\t"
1095                 "pand   %4, %%mm2\n\t"
1096                 "psllq  $3, %%mm0\n\t"
1097                 "psrlq  $3, %%mm1\n\t"
1098                 "psrlq  $8, %%mm2\n\t"
1099                 "movq   %%mm0, %%mm3\n\t"
1100                 "movq   %%mm1, %%mm4\n\t"
1101                 "movq   %%mm2, %%mm5\n\t"
1102                 "punpcklwd %5, %%mm0\n\t"
1103                 "punpcklwd %5, %%mm1\n\t"
1104                 "punpcklwd %5, %%mm2\n\t"
1105                 "punpckhwd %5, %%mm3\n\t"
1106                 "punpckhwd %5, %%mm4\n\t"
1107                 "punpckhwd %5, %%mm5\n\t"
1108                 "psllq  $8, %%mm1\n\t"
1109                 "psllq  $16, %%mm2\n\t"
1110                 "por    %%mm1, %%mm0\n\t"
1111                 "por    %%mm2, %%mm0\n\t"
1112                 "psllq  $8, %%mm4\n\t"
1113                 "psllq  $16, %%mm5\n\t"
1114                 "por    %%mm4, %%mm3\n\t"
1115                 "por    %%mm5, %%mm3\n\t"
1116                 
1117                 "movq   %%mm0, %%mm6\n\t"
1118                 "movq   %%mm3, %%mm7\n\t"
1119
1120                 "movq   8%1, %%mm0\n\t"
1121                 "movq   8%1, %%mm1\n\t"
1122                 "movq   8%1, %%mm2\n\t"
1123                 "pand   %2, %%mm0\n\t"
1124                 "pand   %3, %%mm1\n\t"
1125                 "pand   %4, %%mm2\n\t"
1126                 "psllq  $3, %%mm0\n\t"
1127                 "psrlq  $3, %%mm1\n\t"
1128                 "psrlq  $8, %%mm2\n\t"
1129                 "movq   %%mm0, %%mm3\n\t"
1130                 "movq   %%mm1, %%mm4\n\t"
1131                 "movq   %%mm2, %%mm5\n\t"
1132                 "punpcklwd %5, %%mm0\n\t"
1133                 "punpcklwd %5, %%mm1\n\t"
1134                 "punpcklwd %5, %%mm2\n\t"
1135                 "punpckhwd %5, %%mm3\n\t"
1136                 "punpckhwd %5, %%mm4\n\t"
1137                 "punpckhwd %5, %%mm5\n\t"
1138                 "psllq  $8, %%mm1\n\t"
1139                 "psllq  $16, %%mm2\n\t"
1140                 "por    %%mm1, %%mm0\n\t"
1141                 "por    %%mm2, %%mm0\n\t"
1142                 "psllq  $8, %%mm4\n\t"
1143                 "psllq  $16, %%mm5\n\t"
1144                 "por    %%mm4, %%mm3\n\t"
1145                 "por    %%mm5, %%mm3\n\t"
1146                 :"=m"(*d)
1147                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1148                 :"memory");
1149             /* Borrowed 32 to 24 */
1150             __asm __volatile(
1151                 "movq   %%mm0, %%mm4\n\t"
1152                 "movq   %%mm3, %%mm5\n\t"
1153                 "movq   %%mm6, %%mm0\n\t"
1154                 "movq   %%mm7, %%mm1\n\t"
1155                 
1156                 "movq   %%mm4, %%mm6\n\t"
1157                 "movq   %%mm5, %%mm7\n\t"
1158                 "movq   %%mm0, %%mm2\n\t"
1159                 "movq   %%mm1, %%mm3\n\t"
1160
1161                 "psrlq  $8, %%mm2\n\t"
1162                 "psrlq  $8, %%mm3\n\t"
1163                 "psrlq  $8, %%mm6\n\t"
1164                 "psrlq  $8, %%mm7\n\t"
1165                 "pand   %2, %%mm0\n\t"
1166                 "pand   %2, %%mm1\n\t"
1167                 "pand   %2, %%mm4\n\t"
1168                 "pand   %2, %%mm5\n\t"
1169                 "pand   %3, %%mm2\n\t"
1170                 "pand   %3, %%mm3\n\t"
1171                 "pand   %3, %%mm6\n\t"
1172                 "pand   %3, %%mm7\n\t"
1173                 "por    %%mm2, %%mm0\n\t"
1174                 "por    %%mm3, %%mm1\n\t"
1175                 "por    %%mm6, %%mm4\n\t"
1176                 "por    %%mm7, %%mm5\n\t"
1177
1178                 "movq   %%mm1, %%mm2\n\t"
1179                 "movq   %%mm4, %%mm3\n\t"
1180                 "psllq  $48, %%mm2\n\t"
1181                 "psllq  $32, %%mm3\n\t"
1182                 "pand   %4, %%mm2\n\t"
1183                 "pand   %5, %%mm3\n\t"
1184                 "por    %%mm2, %%mm0\n\t"
1185                 "psrlq  $16, %%mm1\n\t"
1186                 "psrlq  $32, %%mm4\n\t"
1187                 "psllq  $16, %%mm5\n\t"
1188                 "por    %%mm3, %%mm1\n\t"
1189                 "pand   %6, %%mm5\n\t"
1190                 "por    %%mm5, %%mm4\n\t"
1191
1192                 MOVNTQ" %%mm0, %0\n\t"
1193                 MOVNTQ" %%mm1, 8%0\n\t"
1194                 MOVNTQ" %%mm4, 16%0"
1195
1196                 :"=m"(*d)
1197                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1198                 :"memory");
1199                 d += 24;
1200                 s += 8;
1201         }
1202         __asm __volatile(SFENCE:::"memory");
1203         __asm __volatile(EMMS:::"memory");
1204 #endif
1205         while(s < end)
1206         {
1207                 register uint16_t bgr;
1208                 bgr = *s++;
1209                 *d++ = (bgr&0x1F)<<3;
1210                 *d++ = (bgr&0x7E0)>>3;
1211                 *d++ = (bgr&0xF800)>>8;
1212         }
1213 }
1214
1215 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1216 {
1217         const uint16_t *end;
1218 #ifdef HAVE_MMX
1219         const uint16_t *mm_end;
1220 #endif
1221         uint8_t *d = (uint8_t *)dst;
1222         const uint16_t *s = (const uint16_t *)src;
1223         end = s + src_size/2;
1224 #ifdef HAVE_MMX
1225         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1226         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1227         mm_end = end - 3;
1228         while(s < mm_end)
1229         {
1230             __asm __volatile(
1231                 PREFETCH" 32%1\n\t"
1232                 "movq   %1, %%mm0\n\t"
1233                 "movq   %1, %%mm1\n\t"
1234                 "movq   %1, %%mm2\n\t"
1235                 "pand   %2, %%mm0\n\t"
1236                 "pand   %3, %%mm1\n\t"
1237                 "pand   %4, %%mm2\n\t"
1238                 "psllq  $3, %%mm0\n\t"
1239                 "psrlq  $2, %%mm1\n\t"
1240                 "psrlq  $7, %%mm2\n\t"
1241                 "movq   %%mm0, %%mm3\n\t"
1242                 "movq   %%mm1, %%mm4\n\t"
1243                 "movq   %%mm2, %%mm5\n\t"
1244                 "punpcklwd %%mm7, %%mm0\n\t"
1245                 "punpcklwd %%mm7, %%mm1\n\t"
1246                 "punpcklwd %%mm7, %%mm2\n\t"
1247                 "punpckhwd %%mm7, %%mm3\n\t"
1248                 "punpckhwd %%mm7, %%mm4\n\t"
1249                 "punpckhwd %%mm7, %%mm5\n\t"
1250                 "psllq  $8, %%mm1\n\t"
1251                 "psllq  $16, %%mm2\n\t"
1252                 "por    %%mm1, %%mm0\n\t"
1253                 "por    %%mm2, %%mm0\n\t"
1254                 "psllq  $8, %%mm4\n\t"
1255                 "psllq  $16, %%mm5\n\t"
1256                 "por    %%mm4, %%mm3\n\t"
1257                 "por    %%mm5, %%mm3\n\t"
1258                 MOVNTQ" %%mm0, %0\n\t"
1259                 MOVNTQ" %%mm3, 8%0\n\t"
1260                 :"=m"(*d)
1261                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1262                 :"memory");
1263                 d += 16;
1264                 s += 4;
1265         }
1266         __asm __volatile(SFENCE:::"memory");
1267         __asm __volatile(EMMS:::"memory");
1268 #endif
1269         while(s < end)
1270         {
1271 #if 0 //slightly slower on athlon
1272                 int bgr= *s++;
1273                 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1274 #else
1275                 register uint16_t bgr;
1276                 bgr = *s++;
1277 #ifdef WORDS_BIGENDIAN
1278                 *d++ = 0;
1279                 *d++ = (bgr&0x7C00)>>7;
1280                 *d++ = (bgr&0x3E0)>>2;
1281                 *d++ = (bgr&0x1F)<<3;
1282 #else
1283                 *d++ = (bgr&0x1F)<<3;
1284                 *d++ = (bgr&0x3E0)>>2;
1285                 *d++ = (bgr&0x7C00)>>7;
1286                 *d++ = 0;
1287 #endif
1288
1289 #endif
1290         }
1291 }
1292
1293 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1294 {
1295         const uint16_t *end;
1296 #ifdef HAVE_MMX
1297         const uint16_t *mm_end;
1298 #endif
1299         uint8_t *d = (uint8_t *)dst;
1300         const uint16_t *s = (uint16_t *)src;
1301         end = s + src_size/2;
1302 #ifdef HAVE_MMX
1303         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1304         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1305         mm_end = end - 3;
1306         while(s < mm_end)
1307         {
1308             __asm __volatile(
1309                 PREFETCH" 32%1\n\t"
1310                 "movq   %1, %%mm0\n\t"
1311                 "movq   %1, %%mm1\n\t"
1312                 "movq   %1, %%mm2\n\t"
1313                 "pand   %2, %%mm0\n\t"
1314                 "pand   %3, %%mm1\n\t"
1315                 "pand   %4, %%mm2\n\t"
1316                 "psllq  $3, %%mm0\n\t"
1317                 "psrlq  $3, %%mm1\n\t"
1318                 "psrlq  $8, %%mm2\n\t"
1319                 "movq   %%mm0, %%mm3\n\t"
1320                 "movq   %%mm1, %%mm4\n\t"
1321                 "movq   %%mm2, %%mm5\n\t"
1322                 "punpcklwd %%mm7, %%mm0\n\t"
1323                 "punpcklwd %%mm7, %%mm1\n\t"
1324                 "punpcklwd %%mm7, %%mm2\n\t"
1325                 "punpckhwd %%mm7, %%mm3\n\t"
1326                 "punpckhwd %%mm7, %%mm4\n\t"
1327                 "punpckhwd %%mm7, %%mm5\n\t"
1328                 "psllq  $8, %%mm1\n\t"
1329                 "psllq  $16, %%mm2\n\t"
1330                 "por    %%mm1, %%mm0\n\t"
1331                 "por    %%mm2, %%mm0\n\t"
1332                 "psllq  $8, %%mm4\n\t"
1333                 "psllq  $16, %%mm5\n\t"
1334                 "por    %%mm4, %%mm3\n\t"
1335                 "por    %%mm5, %%mm3\n\t"
1336                 MOVNTQ" %%mm0, %0\n\t"
1337                 MOVNTQ" %%mm3, 8%0\n\t"
1338                 :"=m"(*d)
1339                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1340                 :"memory");
1341                 d += 16;
1342                 s += 4;
1343         }
1344         __asm __volatile(SFENCE:::"memory");
1345         __asm __volatile(EMMS:::"memory");
1346 #endif
1347         while(s < end)
1348         {
1349                 register uint16_t bgr;
1350                 bgr = *s++;
1351 #ifdef WORDS_BIGENDIAN
1352                 *d++ = 0;
1353                 *d++ = (bgr&0xF800)>>8;
1354                 *d++ = (bgr&0x7E0)>>3;
1355                 *d++ = (bgr&0x1F)<<3;
1356 #else
1357                 *d++ = (bgr&0x1F)<<3;
1358                 *d++ = (bgr&0x7E0)>>3;
1359                 *d++ = (bgr&0xF800)>>8;
1360                 *d++ = 0;
1361 #endif
1362         }
1363 }
1364
1365 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1366 {
1367 #ifdef HAVE_MMX
1368 /* TODO: unroll this loop */
1369         asm volatile (
1370                 "xor %%"REG_a", %%"REG_a"       \n\t"
1371                 ASMALIGN(4)
1372                 "1:                             \n\t"
1373                 PREFETCH" 32(%0, %%"REG_a")     \n\t"
1374                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
1375                 "movq %%mm0, %%mm1              \n\t"
1376                 "movq %%mm0, %%mm2              \n\t"
1377                 "pslld $16, %%mm0               \n\t"
1378                 "psrld $16, %%mm1               \n\t"
1379                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
1380                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
1381                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
1382                 "por %%mm0, %%mm2               \n\t"
1383                 "por %%mm1, %%mm2               \n\t"
1384                 MOVNTQ" %%mm2, (%1, %%"REG_a")  \n\t"
1385                 "add $8, %%"REG_a"              \n\t"
1386                 "cmp %2, %%"REG_a"              \n\t"
1387                 " jb 1b                         \n\t"
1388                 :: "r" (src), "r"(dst), "r" (src_size-7)
1389                 : "%"REG_a
1390         );
1391
1392         __asm __volatile(SFENCE:::"memory");
1393         __asm __volatile(EMMS:::"memory");
1394 #else
1395         unsigned i;
1396         unsigned num_pixels = src_size >> 2;
1397         for(i=0; i<num_pixels; i++)
1398         {
1399 #ifdef WORDS_BIGENDIAN  
1400           dst[4*i + 1] = src[4*i + 3];
1401           dst[4*i + 2] = src[4*i + 2];
1402           dst[4*i + 3] = src[4*i + 1];
1403 #else
1404           dst[4*i + 0] = src[4*i + 2];
1405           dst[4*i + 1] = src[4*i + 1];
1406           dst[4*i + 2] = src[4*i + 0];
1407 #endif
1408         }
1409 #endif
1410 }
1411
1412 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1413 {
1414         unsigned i;
1415 #ifdef HAVE_MMX
1416         long mmx_size= 23 - src_size;
1417         asm volatile (
1418                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1419                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1420                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1421                 ASMALIGN(4)
1422                 "1:                             \n\t"
1423                 PREFETCH" 32(%1, %%"REG_a")     \n\t"
1424                 "movq   (%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1425                 "movq   (%1, %%"REG_a"), %%mm1  \n\t" // BGR BGR BG
1426                 "movq  2(%1, %%"REG_a"), %%mm2  \n\t" // R BGR BGR B
1427                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1428                 "pand %%mm5, %%mm0              \n\t"
1429                 "pand %%mm6, %%mm1              \n\t"
1430                 "pand %%mm7, %%mm2              \n\t"
1431                 "por %%mm0, %%mm1               \n\t"
1432                 "por %%mm2, %%mm1               \n\t"                
1433                 "movq  6(%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1434                 MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
1435                 "movq  8(%1, %%"REG_a"), %%mm1  \n\t" // R BGR BGR B
1436                 "movq 10(%1, %%"REG_a"), %%mm2  \n\t" // GR BGR BGR
1437                 "pand %%mm7, %%mm0              \n\t"
1438                 "pand %%mm5, %%mm1              \n\t"
1439                 "pand %%mm6, %%mm2              \n\t"
1440                 "por %%mm0, %%mm1               \n\t"
1441                 "por %%mm2, %%mm1               \n\t"                
1442                 "movq 14(%1, %%"REG_a"), %%mm0  \n\t" // R BGR BGR B
1443                 MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
1444                 "movq 16(%1, %%"REG_a"), %%mm1  \n\t" // GR BGR BGR
1445                 "movq 18(%1, %%"REG_a"), %%mm2  \n\t" // BGR BGR BG
1446                 "pand %%mm6, %%mm0              \n\t"
1447                 "pand %%mm7, %%mm1              \n\t"
1448                 "pand %%mm5, %%mm2              \n\t"
1449                 "por %%mm0, %%mm1               \n\t"
1450                 "por %%mm2, %%mm1               \n\t"                
1451                 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1452                 "add $24, %%"REG_a"             \n\t"
1453                 " js 1b                         \n\t"
1454                 : "+a" (mmx_size)
1455                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1456         );
1457
1458         __asm __volatile(SFENCE:::"memory");
1459         __asm __volatile(EMMS:::"memory");
1460
1461         if(mmx_size==23) return; //finished, was a multiple of 8
1462
1463         src+= src_size;
1464         dst+= src_size;
1465         src_size= 23-mmx_size;
1466         src-= src_size;
1467         dst-= src_size;
1468 #endif
1469         for(i=0; i<src_size; i+=3)
1470         {
1471                 register uint8_t x;
1472                 x          = src[i + 2];
1473                 dst[i + 1] = src[i + 1];
1474                 dst[i + 2] = src[i + 0];
1475                 dst[i + 0] = x;
1476         }
1477 }
1478
1479 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1480         long width, long height,
1481         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1482 {
1483         long y;
1484         const long chromWidth= width>>1;
1485         for(y=0; y<height; y++)
1486         {
1487 #ifdef HAVE_MMX
1488 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1489                 asm volatile(
1490                         "xor %%"REG_a", %%"REG_a"       \n\t"
1491                         ASMALIGN(4)
1492                         "1:                             \n\t"
1493                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1494                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1495                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1496                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1497                         "movq %%mm0, %%mm2              \n\t" // U(0)
1498                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1499                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1500                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1501
1502                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1503                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1504                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1505                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1506                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1507                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1508                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1509                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1510
1511                         MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1512                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1513                         MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1514                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1515
1516                         "add $8, %%"REG_a"              \n\t"
1517                         "cmp %4, %%"REG_a"              \n\t"
1518                         " jb 1b                         \n\t"
1519                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1520                         : "%"REG_a
1521                 );
1522 #else
1523
1524 #if defined ARCH_ALPHA && defined HAVE_MVI
1525 #define pl2yuy2(n)                                      \
1526         y1 = yc[n];                                     \
1527         y2 = yc2[n];                                    \
1528         u = uc[n];                                      \
1529         v = vc[n];                                      \
1530         asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
1531         asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
1532         asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1533         asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1534         yuv1 = (u << 8) + (v << 24);                    \
1535         yuv2 = yuv1 + y2;                               \
1536         yuv1 += y1;                                     \
1537         qdst[n] = yuv1;                                 \
1538         qdst2[n] = yuv2;
1539
1540                 int i;
1541                 uint64_t *qdst = (uint64_t *) dst;
1542                 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1543                 const uint32_t *yc = (uint32_t *) ysrc;
1544                 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1545                 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1546                 for(i = 0; i < chromWidth; i += 8){
1547                         uint64_t y1, y2, yuv1, yuv2;
1548                         uint64_t u, v;
1549                         /* Prefetch */
1550                         asm("ldq $31,64(%0)" :: "r"(yc));
1551                         asm("ldq $31,64(%0)" :: "r"(yc2));
1552                         asm("ldq $31,64(%0)" :: "r"(uc));
1553                         asm("ldq $31,64(%0)" :: "r"(vc));
1554
1555                         pl2yuy2(0);
1556                         pl2yuy2(1);
1557                         pl2yuy2(2);
1558                         pl2yuy2(3);
1559
1560                         yc += 4;
1561                         yc2 += 4;
1562                         uc += 4;
1563                         vc += 4;
1564                         qdst += 4;
1565                         qdst2 += 4;
1566                 }
1567                 y++;
1568                 ysrc += lumStride;
1569                 dst += dstStride;
1570
1571 #elif __WORDSIZE >= 64
1572                 int i;
1573                 uint64_t *ldst = (uint64_t *) dst;
1574                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1575                 for(i = 0; i < chromWidth; i += 2){
1576                         uint64_t k, l;
1577                         k = yc[0] + (uc[0] << 8) +
1578                             (yc[1] << 16) + (vc[0] << 24);
1579                         l = yc[2] + (uc[1] << 8) +
1580                             (yc[3] << 16) + (vc[1] << 24);
1581                         *ldst++ = k + (l << 32);
1582                         yc += 4;
1583                         uc += 2;
1584                         vc += 2;
1585                 }
1586
1587 #else
1588                 int i; int32_t *idst = (int32_t *) dst;
1589                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1590                 for(i = 0; i < chromWidth; i++){
1591 #ifdef WORDS_BIGENDIAN
1592                         *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1593                             (yc[1] << 8) + (vc[0] << 0);
1594 #else
1595                         *idst++ = yc[0] + (uc[0] << 8) +
1596                             (yc[1] << 16) + (vc[0] << 24);
1597 #endif
1598                         yc += 2;
1599                         uc++;
1600                         vc++;
1601                 }
1602 #endif
1603 #endif
1604                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1605                 {
1606                         usrc += chromStride;
1607                         vsrc += chromStride;
1608                 }
1609                 ysrc += lumStride;
1610                 dst += dstStride;
1611         }
1612 #ifdef HAVE_MMX
1613 asm(    EMMS" \n\t"
1614         SFENCE" \n\t"
1615         :::"memory");
1616 #endif
1617 }
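
/*
 * Packing produced above (sketch, matching the C fallback): each pair of luma
 * samples consumes one chroma pair, giving the YUY2/YUYV byte order
 *
 *     dst[4*i + 0] = Y0;  dst[4*i + 1] = U;  dst[4*i + 2] = Y1;  dst[4*i + 3] = V;
 *
 * vertLumPerChroma selects how many luma lines share one chroma line
 * (2 for 4:2:0 input, 1 for 4:2:2 input), which is what the
 * (y & (vertLumPerChroma-1)) test implements.
 */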
1618
1619 /**
1620  *
1621  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1622  * problem for anyone then tell me, and I'll fix it)
1623  */
1624 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1625         long width, long height,
1626         long lumStride, long chromStride, long dstStride)
1627 {
1628         //FIXME interpolate chroma
1629         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1630 }
1631
1632 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1633         long width, long height,
1634         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1635 {
1636         long y;
1637         const long chromWidth= width>>1;
1638         for(y=0; y<height; y++)
1639         {
1640 #ifdef HAVE_MMX
1641 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1642                 asm volatile(
1643                         "xor %%"REG_a", %%"REG_a"       \n\t"
1644                         ASMALIGN(4)
1645                         "1:                             \n\t"
1646                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1647                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1648                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1649                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1650                         "movq %%mm0, %%mm2              \n\t" // U(0)
1651                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1652                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1653                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1654
1655                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1656                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1657                         "movq %%mm0, %%mm4              \n\t" // UVUV UVUV(0)
1658                         "movq %%mm2, %%mm6              \n\t" // UVUV UVUV(8)
1659                         "punpcklbw %%mm3, %%mm0         \n\t" // UYVY UYVY(0)
1660                         "punpckhbw %%mm3, %%mm4         \n\t" // UYVY UYVY(4)
1661                         "punpcklbw %%mm5, %%mm2         \n\t" // UYVY UYVY(8)
1662                         "punpckhbw %%mm5, %%mm6         \n\t" // UYVY UYVY(12)
1663
1664                         MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1665                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1666                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1667                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1668
1669                         "add $8, %%"REG_a"              \n\t"
1670                         "cmp %4, %%"REG_a"              \n\t"
1671                         " jb 1b                         \n\t"
1672                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1673                         : "%"REG_a
1674                 );
1675 #else
1676 //FIXME adapt the alpha asm code from yv12->yuy2
1677
1678 #if __WORDSIZE >= 64
1679                 int i;
1680                 uint64_t *ldst = (uint64_t *) dst;
1681                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1682                 for(i = 0; i < chromWidth; i += 2){
1683                         uint64_t k, l;
1684                         k = uc[0] + (yc[0] << 8) +
1685                             (vc[0] << 16) + (yc[1] << 24);
1686                         l = uc[1] + (yc[2] << 8) +
1687                             (vc[1] << 16) + (yc[3] << 24);
1688                         *ldst++ = k + (l << 32);
1689                         yc += 4;
1690                         uc += 2;
1691                         vc += 2;
1692                 }
1693
1694 #else
1695                 int i; int32_t *idst = (int32_t *) dst;
1696                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1697                 for(i = 0; i < chromWidth; i++){
1698 #ifdef WORDS_BIGENDIAN
1699                         *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1700                             (vc[0] << 8) + (yc[1] << 0);
1701 #else
1702                         *idst++ = uc[0] + (yc[0] << 8) +
1703                             (vc[0] << 16) + (yc[1] << 24);
1704 #endif
1705                         yc += 2;
1706                         uc++;
1707                         vc++;
1708                 }
1709 #endif
1710 #endif
1711                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1712                 {
1713                         usrc += chromStride;
1714                         vsrc += chromStride;
1715                 }
1716                 ysrc += lumStride;
1717                 dst += dstStride;
1718         }
1719 #ifdef HAVE_MMX
1720 asm(    EMMS" \n\t"
1721         SFENCE" \n\t"
1722         :::"memory");
1723 #endif
1724 }
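
/*
 * Same idea as yuvPlanartoyuy2 above, but with the chroma byte leading
 * (sketch, matching the C fallback):
 *
 *     dst[4*i + 0] = U;  dst[4*i + 1] = Y0;  dst[4*i + 2] = V;  dst[4*i + 3] = Y1;
 *
 * i.e. UYVY instead of YUYV; only the punpck*bw operand order differs in the
 * MMX version.
 */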
1725
1726 /**
1727  *
1728  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1729  * problem for anyone then tell me, and I'll fix it)
1730  */
1731 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1732         long width, long height,
1733         long lumStride, long chromStride, long dstStride)
1734 {
1735         //FIXME interpolate chroma
1736         RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1737 }
1738
1739 /**
1740  *
1741  * width should be a multiple of 16
1742  */
1743 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1744         long width, long height,
1745         long lumStride, long chromStride, long dstStride)
1746 {
1747         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1748 }
1749
1750 /**
1751  *
1752  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1753  * problem for anyone then tell me, and I'll fix it)
1754  */
1755 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1756         long width, long height,
1757         long lumStride, long chromStride, long srcStride)
1758 {
1759         long y;
1760         const long chromWidth= width>>1;
1761         for(y=0; y<height; y+=2)
1762         {
1763 #ifdef HAVE_MMX
1764                 asm volatile(
1765                         "xor %%"REG_a", %%"REG_a"       \n\t"
1766                         "pcmpeqw %%mm7, %%mm7           \n\t"
1767                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1768                         ASMALIGN(4)
1769                         "1:                             \n\t"
1770                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1771                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1772                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1773                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1774                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1775                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1776                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1777                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1778                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1779                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1780                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1781
1782                         MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1783
1784                         "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1785                         "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1786                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1787                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1788                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1789                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1790                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1791                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1792                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1793                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1794
1795                         MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1796
1797                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1798                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1799                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1800                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1801                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1802                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1803                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1804                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1805
1806                         MOVNTQ" %%mm0, (%3, %%"REG_a")  \n\t"
1807                         MOVNTQ" %%mm2, (%2, %%"REG_a")  \n\t"
1808
1809                         "add $8, %%"REG_a"              \n\t"
1810                         "cmp %4, %%"REG_a"              \n\t"
1811                         " jb 1b                         \n\t"
1812                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1813                         : "memory", "%"REG_a
1814                 );
1815
1816                 ydst += lumStride;
1817                 src  += srcStride;
1818
1819                 asm volatile(
1820                         "xor %%"REG_a", %%"REG_a"       \n\t"
1821                         ASMALIGN(4)
1822                         "1:                             \n\t"
1823                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1824                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1825                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1826                         "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1827                         "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1828                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1829                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1830                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1831                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1832                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1833                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1834
1835                         MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1836                         MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1837
1838                         "add $8, %%"REG_a"              \n\t"
1839                         "cmp %4, %%"REG_a"              \n\t"
1840                         " jb 1b                         \n\t"
1841
1842                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1843                         : "memory", "%"REG_a
1844                 );
1845 #else
1846                 long i;
1847                 for(i=0; i<chromWidth; i++)
1848                 {
1849                         ydst[2*i+0]     = src[4*i+0];
1850                         udst[i]         = src[4*i+1];
1851                         ydst[2*i+1]     = src[4*i+2];
1852                         vdst[i]         = src[4*i+3];
1853                 }
1854                 ydst += lumStride;
1855                 src  += srcStride;
1856
1857                 for(i=0; i<chromWidth; i++)
1858                 {
1859                         ydst[2*i+0]     = src[4*i+0];
1860                         ydst[2*i+1]     = src[4*i+2];
1861                 }
1862 #endif
1863                 udst += chromStride;
1864                 vdst += chromStride;
1865                 ydst += lumStride;
1866                 src  += srcStride;
1867         }
1868 #ifdef HAVE_MMX
1869 asm volatile(   EMMS" \n\t"
1870                 SFENCE" \n\t"
1871                 :::"memory");
1872 #endif
1873 }
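
/*
 * Reverse direction of yv12toyuy2 (sketch, matching the C fallback): each
 * YUYV quadruple yields two luma samples and one U/V pair, and the second
 * source line of every pair only contributes luma:
 *
 *     ydst[2*i + 0] = src[4*i + 0];
 *     udst[i]       = src[4*i + 1];   // even lines only
 *     ydst[2*i + 1] = src[4*i + 2];
 *     vdst[i]       = src[4*i + 3];   // even lines only
 */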
1874
1875 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1876         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1877         long width, long height, long lumStride, long chromStride)
1878 {
1879         /* Y Plane */
1880         memcpy(ydst, ysrc, width*height);
1881
1882         /* XXX: implement upscaling for U,V */
1883 }
1884
1885 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1886 {
1887         long x,y;
1888         
1889         dst[0]= src[0];
1890         
1891         // first line
1892         for(x=0; x<srcWidth-1; x++){
1893                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1894                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1895         }
1896         dst[2*srcWidth-1]= src[srcWidth-1];
1897         
1898         dst+= dstStride;
1899
1900         for(y=1; y<srcHeight; y++){
1901 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1902                 const long mmxSize= srcWidth&~15;
1903                 asm volatile(
1904                         "mov %4, %%"REG_a"              \n\t"
1905                         "1:                             \n\t"
1906                         "movq (%0, %%"REG_a"), %%mm0    \n\t"
1907                         "movq (%1, %%"REG_a"), %%mm1    \n\t"
1908                         "movq 1(%0, %%"REG_a"), %%mm2   \n\t"
1909                         "movq 1(%1, %%"REG_a"), %%mm3   \n\t"
1910                         "movq -1(%0, %%"REG_a"), %%mm4  \n\t"
1911                         "movq -1(%1, %%"REG_a"), %%mm5  \n\t"
1912                         PAVGB" %%mm0, %%mm5             \n\t"
1913                         PAVGB" %%mm0, %%mm3             \n\t"
1914                         PAVGB" %%mm0, %%mm5             \n\t"
1915                         PAVGB" %%mm0, %%mm3             \n\t"
1916                         PAVGB" %%mm1, %%mm4             \n\t"
1917                         PAVGB" %%mm1, %%mm2             \n\t"
1918                         PAVGB" %%mm1, %%mm4             \n\t"
1919                         PAVGB" %%mm1, %%mm2             \n\t"
1920                         "movq %%mm5, %%mm7              \n\t"
1921                         "movq %%mm4, %%mm6              \n\t"
1922                         "punpcklbw %%mm3, %%mm5         \n\t"
1923                         "punpckhbw %%mm3, %%mm7         \n\t"
1924                         "punpcklbw %%mm2, %%mm4         \n\t"
1925                         "punpckhbw %%mm2, %%mm6         \n\t"
1926 #if 1
1927                         MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1928                         MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1929                         MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1930                         MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1931 #else
1932                         "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1933                         "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1934                         "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1935                         "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1936 #endif
1937                         "add $8, %%"REG_a"              \n\t"
1938                         " js 1b                         \n\t"
1939                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1940                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1941                            "g" (-mmxSize)
1942                         : "%"REG_a
1943
1944                 );
1945 #else
1946                 const long mmxSize=1;
1947 #endif
1948                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1949                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1950
1951                 for(x=mmxSize-1; x<srcWidth-1; x++){
1952                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1953                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1954                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1955                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1956                 }
1957                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1958                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1959
1960                 dst+=dstStride*2;
1961                 src+=srcStride;
1962         }
1963         
1964         // last line
1965 #if 1
1966         dst[0]= src[0];
1967         
1968         for(x=0; x<srcWidth-1; x++){
1969                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1970                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1971         }
1972         dst[2*srcWidth-1]= src[srcWidth-1];
1973 #else
1974         for(x=0; x<srcWidth; x++){
1975                 dst[2*x+0]=
1976                 dst[2*x+1]= src[x];
1977         }
1978 #endif
1979
1980 #ifdef HAVE_MMX
1981 asm volatile(   EMMS" \n\t"
1982                 SFENCE" \n\t"
1983                 :::"memory");
1984 #endif
1985 }
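
/*
 * Upscaling used above (explanatory sketch): the first and last output lines
 * are interpolated horizontally only,
 *
 *     dst[2*x+1] = (3*src[x] +   src[x+1]) >> 2;
 *     dst[2*x+2] = (  src[x] + 3*src[x+1]) >> 2;
 *
 * while interior output samples are a 3:1 blend of the two diagonally nearest
 * source samples (see the C loop after the asm block); the MMX/3DNow path
 * approximates the same 3:1 weights with repeated PAVGB, which rounds.
 */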
1986
1987 /**
1988  *
1989  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1990  * problem for anyone then tell me, and I'll fix it)
1991  * chrominance data is only taken from every second line, the others are ignored. FIXME: write HQ version
1992  */
1993 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1994         long width, long height,
1995         long lumStride, long chromStride, long srcStride)
1996 {
1997         long y;
1998         const long chromWidth= width>>1;
1999         for(y=0; y<height; y+=2)
2000         {
2001 #ifdef HAVE_MMX
2002                 asm volatile(
2003                         "xorl %%eax, %%eax              \n\t"
2004                         "pcmpeqw %%mm7, %%mm7           \n\t"
2005                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
2006                         ASMALIGN(4)
2007                         "1:                             \n\t"
2008                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
2009                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
2010                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
2011                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
2012                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
2013                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
2014                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
2015                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
2016                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
2017                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
2018                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
2019
2020                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
2021
2022                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
2023                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
2024                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
2025                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
2026                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
2027                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
2028                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
2029                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
2030                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
2031                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
2032
2033                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
2034
2035                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
2036                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
2037                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
2038                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
2039                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
2040                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
2041                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
2042                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
2043
2044                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
2045                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
2046
2047                         "addl $8, %%eax                 \n\t"
2048                         "cmpl %4, %%eax                 \n\t"
2049                         " jb 1b                         \n\t"
2050                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2051                         : "memory", "%eax"
2052                 );
2053
2054                 ydst += lumStride;
2055                 src  += srcStride;
2056
2057                 asm volatile(
2058                         "xorl %%eax, %%eax              \n\t"
2059                         ASMALIGN(4)
2060                         "1:                             \n\t"
2061                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
2062                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
2063                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
2064                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(8)
2065                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // UYVY UYVY(12)
2066                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2067                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2068                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2069                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2070                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
2071                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
2072
2073                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
2074                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
2075
2076                         "addl $8, %%eax                 \n\t"
2077                         "cmpl %4, %%eax                 \n\t"
2078                         " jb 1b                         \n\t"
2079
2080                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2081                         : "memory", "%eax"
2082                 );
2083 #else
2084                 long i;
2085                 for(i=0; i<chromWidth; i++)
2086                 {
2087                         udst[i]         = src[4*i+0];
2088                         ydst[2*i+0]     = src[4*i+1];
2089                         vdst[i]         = src[4*i+2];
2090                         ydst[2*i+1]     = src[4*i+3];
2091                 }
2092                 ydst += lumStride;
2093                 src  += srcStride;
2094
2095                 for(i=0; i<chromWidth; i++)
2096                 {
2097                         ydst[2*i+0]     = src[4*i+1];
2098                         ydst[2*i+1]     = src[4*i+3];
2099                 }
2100 #endif
2101                 udst += chromStride;
2102                 vdst += chromStride;
2103                 ydst += lumStride;
2104                 src  += srcStride;
2105         }
2106 #ifdef HAVE_MMX
2107 asm volatile(   EMMS" \n\t"
2108                 SFENCE" \n\t"
2109                 :::"memory");
2110 #endif
2111 }
2112
2113 /**
2114  *
2115  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2116  * problem for anyone then tell me, and I'll fix it)
2117  * chrominance data is only taken from every second line, the others are ignored in the C version. FIXME: write HQ version
2118  */
2119 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2120         long width, long height,
2121         long lumStride, long chromStride, long srcStride)
2122 {
2123         long y;
2124         const long chromWidth= width>>1;
2125 #ifdef HAVE_MMX
2126         for(y=0; y<height-2; y+=2)
2127         {
2128                 long i;
2129                 for(i=0; i<2; i++)
2130                 {
2131                         asm volatile(
2132                                 "mov %2, %%"REG_a"              \n\t"
2133                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
2134                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
2135                                 "pxor %%mm7, %%mm7              \n\t"
2136                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2137                                 ASMALIGN(4)
2138                                 "1:                             \n\t"
2139                                 PREFETCH" 64(%0, %%"REG_d")     \n\t"
2140                                 "movd (%0, %%"REG_d"), %%mm0    \n\t"
2141                                 "movd 3(%0, %%"REG_d"), %%mm1   \n\t"
2142                                 "punpcklbw %%mm7, %%mm0         \n\t"
2143                                 "punpcklbw %%mm7, %%mm1         \n\t"
2144                                 "movd 6(%0, %%"REG_d"), %%mm2   \n\t"
2145                                 "movd 9(%0, %%"REG_d"), %%mm3   \n\t"
2146                                 "punpcklbw %%mm7, %%mm2         \n\t"
2147                                 "punpcklbw %%mm7, %%mm3         \n\t"
2148                                 "pmaddwd %%mm6, %%mm0           \n\t"
2149                                 "pmaddwd %%mm6, %%mm1           \n\t"
2150                                 "pmaddwd %%mm6, %%mm2           \n\t"
2151                                 "pmaddwd %%mm6, %%mm3           \n\t"
2152 #ifndef FAST_BGR2YV12
2153                                 "psrad $8, %%mm0                \n\t"
2154                                 "psrad $8, %%mm1                \n\t"
2155                                 "psrad $8, %%mm2                \n\t"
2156                                 "psrad $8, %%mm3                \n\t"
2157 #endif
2158                                 "packssdw %%mm1, %%mm0          \n\t"
2159                                 "packssdw %%mm3, %%mm2          \n\t"
2160                                 "pmaddwd %%mm5, %%mm0           \n\t"
2161                                 "pmaddwd %%mm5, %%mm2           \n\t"
2162                                 "packssdw %%mm2, %%mm0          \n\t"
2163                                 "psraw $7, %%mm0                \n\t"
2164
2165                                 "movd 12(%0, %%"REG_d"), %%mm4  \n\t"
2166                                 "movd 15(%0, %%"REG_d"), %%mm1  \n\t"
2167                                 "punpcklbw %%mm7, %%mm4         \n\t"
2168                                 "punpcklbw %%mm7, %%mm1         \n\t"
2169                                 "movd 18(%0, %%"REG_d"), %%mm2  \n\t"
2170                                 "movd 21(%0, %%"REG_d"), %%mm3  \n\t"
2171                                 "punpcklbw %%mm7, %%mm2         \n\t"
2172                                 "punpcklbw %%mm7, %%mm3         \n\t"
2173                                 "pmaddwd %%mm6, %%mm4           \n\t"
2174                                 "pmaddwd %%mm6, %%mm1           \n\t"
2175                                 "pmaddwd %%mm6, %%mm2           \n\t"
2176                                 "pmaddwd %%mm6, %%mm3           \n\t"
2177 #ifndef FAST_BGR2YV12
2178                                 "psrad $8, %%mm4                \n\t"
2179                                 "psrad $8, %%mm1                \n\t"
2180                                 "psrad $8, %%mm2                \n\t"
2181                                 "psrad $8, %%mm3                \n\t"
2182 #endif
2183                                 "packssdw %%mm1, %%mm4          \n\t"
2184                                 "packssdw %%mm3, %%mm2          \n\t"
2185                                 "pmaddwd %%mm5, %%mm4           \n\t"
2186                                 "pmaddwd %%mm5, %%mm2           \n\t"
2187                                 "add $24, %%"REG_d"             \n\t"
2188                                 "packssdw %%mm2, %%mm4          \n\t"
2189                                 "psraw $7, %%mm4                \n\t"
2190
2191                                 "packuswb %%mm4, %%mm0          \n\t"
2192                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
2193
2194                                 MOVNTQ" %%mm0, (%1, %%"REG_a")  \n\t"
2195                                 "add $8, %%"REG_a"              \n\t"
2196                                 " js 1b                         \n\t"
2197                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2198                                 : "%"REG_a, "%"REG_d
2199                         );
2200                         ydst += lumStride;
2201                         src  += srcStride;
2202                 }
2203                 src -= srcStride*2;
2204                 asm volatile(
2205                         "mov %4, %%"REG_a"              \n\t"
2206                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2207                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
2208                         "pxor %%mm7, %%mm7              \n\t"
2209                         "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2210                         "add %%"REG_d", %%"REG_d"       \n\t"
2211                         ASMALIGN(4)
2212                         "1:                             \n\t"
2213                         PREFETCH" 64(%0, %%"REG_d")     \n\t"
2214                         PREFETCH" 64(%1, %%"REG_d")     \n\t"
2215 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2216                         "movq (%0, %%"REG_d"), %%mm0    \n\t"
2217                         "movq (%1, %%"REG_d"), %%mm1    \n\t"
2218                         "movq 6(%0, %%"REG_d"), %%mm2   \n\t"
2219                         "movq 6(%1, %%"REG_d"), %%mm3   \n\t"
2220                         PAVGB" %%mm1, %%mm0             \n\t"
2221                         PAVGB" %%mm3, %%mm2             \n\t"
2222                         "movq %%mm0, %%mm1              \n\t"
2223                         "movq %%mm2, %%mm3              \n\t"
2224                         "psrlq $24, %%mm0               \n\t"
2225                         "psrlq $24, %%mm2               \n\t"
2226                         PAVGB" %%mm1, %%mm0             \n\t"
2227                         PAVGB" %%mm3, %%mm2             \n\t"
2228                         "punpcklbw %%mm7, %%mm0         \n\t"
2229                         "punpcklbw %%mm7, %%mm2         \n\t"
2230 #else
2231                         "movd (%0, %%"REG_d"), %%mm0    \n\t"
2232                         "movd (%1, %%"REG_d"), %%mm1    \n\t"
2233                         "movd 3(%0, %%"REG_d"), %%mm2   \n\t"
2234                         "movd 3(%1, %%"REG_d"), %%mm3   \n\t"
2235                         "punpcklbw %%mm7, %%mm0         \n\t"
2236                         "punpcklbw %%mm7, %%mm1         \n\t"
2237                         "punpcklbw %%mm7, %%mm2         \n\t"
2238                         "punpcklbw %%mm7, %%mm3         \n\t"
2239                         "paddw %%mm1, %%mm0             \n\t"
2240                         "paddw %%mm3, %%mm2             \n\t"
2241                         "paddw %%mm2, %%mm0             \n\t"
2242                         "movd 6(%0, %%"REG_d"), %%mm4   \n\t"
2243                         "movd 6(%1, %%"REG_d"), %%mm1   \n\t"
2244                         "movd 9(%0, %%"REG_d"), %%mm2   \n\t"
2245                         "movd 9(%1, %%"REG_d"), %%mm3   \n\t"
2246                         "punpcklbw %%mm7, %%mm4         \n\t"
2247                         "punpcklbw %%mm7, %%mm1         \n\t"
2248                         "punpcklbw %%mm7, %%mm2         \n\t"
2249                         "punpcklbw %%mm7, %%mm3         \n\t"
2250                         "paddw %%mm1, %%mm4             \n\t"
2251                         "paddw %%mm3, %%mm2             \n\t"
2252                         "paddw %%mm4, %%mm2             \n\t"
2253                         "psrlw $2, %%mm0                \n\t"
2254                         "psrlw $2, %%mm2                \n\t"
2255 #endif
2256                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2257                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2258
2259                         "pmaddwd %%mm0, %%mm1           \n\t"
2260                         "pmaddwd %%mm2, %%mm3           \n\t"
2261                         "pmaddwd %%mm6, %%mm0           \n\t"
2262                         "pmaddwd %%mm6, %%mm2           \n\t"
2263 #ifndef FAST_BGR2YV12
2264                         "psrad $8, %%mm0                \n\t"
2265                         "psrad $8, %%mm1                \n\t"
2266                         "psrad $8, %%mm2                \n\t"
2267                         "psrad $8, %%mm3                \n\t"
2268 #endif
2269                         "packssdw %%mm2, %%mm0          \n\t"
2270                         "packssdw %%mm3, %%mm1          \n\t"
2271                         "pmaddwd %%mm5, %%mm0           \n\t"
2272                         "pmaddwd %%mm5, %%mm1           \n\t"
2273                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2274                         "psraw $7, %%mm0                \n\t"
2275
2276 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2277                         "movq 12(%0, %%"REG_d"), %%mm4  \n\t"
2278                         "movq 12(%1, %%"REG_d"), %%mm1  \n\t"
2279                         "movq 18(%0, %%"REG_d"), %%mm2  \n\t"
2280                         "movq 18(%1, %%"REG_d"), %%mm3  \n\t"
2281                         PAVGB" %%mm1, %%mm4             \n\t"
2282                         PAVGB" %%mm3, %%mm2             \n\t"
2283                         "movq %%mm4, %%mm1              \n\t"
2284                         "movq %%mm2, %%mm3              \n\t"
2285                         "psrlq $24, %%mm4               \n\t"
2286                         "psrlq $24, %%mm2               \n\t"
2287                         PAVGB" %%mm1, %%mm4             \n\t"
2288                         PAVGB" %%mm3, %%mm2             \n\t"
2289                         "punpcklbw %%mm7, %%mm4         \n\t"
2290                         "punpcklbw %%mm7, %%mm2         \n\t"
2291 #else
2292                         "movd 12(%0, %%"REG_d"), %%mm4  \n\t"
2293                         "movd 12(%1, %%"REG_d"), %%mm1  \n\t"
2294                         "movd 15(%0, %%"REG_d"), %%mm2  \n\t"
2295                         "movd 15(%1, %%"REG_d"), %%mm3  \n\t"
2296                         "punpcklbw %%mm7, %%mm4         \n\t"
2297                         "punpcklbw %%mm7, %%mm1         \n\t"
2298                         "punpcklbw %%mm7, %%mm2         \n\t"
2299                         "punpcklbw %%mm7, %%mm3         \n\t"
2300                         "paddw %%mm1, %%mm4             \n\t"
2301                         "paddw %%mm3, %%mm2             \n\t"
2302                         "paddw %%mm2, %%mm4             \n\t"
2303                         "movd 18(%0, %%"REG_d"), %%mm5  \n\t"
2304                         "movd 18(%1, %%"REG_d"), %%mm1  \n\t"
2305                         "movd 21(%0, %%"REG_d"), %%mm2  \n\t"
2306                         "movd 21(%1, %%"REG_d"), %%mm3  \n\t"
2307                         "punpcklbw %%mm7, %%mm5         \n\t"
2308                         "punpcklbw %%mm7, %%mm1         \n\t"
2309                         "punpcklbw %%mm7, %%mm2         \n\t"
2310                         "punpcklbw %%mm7, %%mm3         \n\t"
2311                         "paddw %%mm1, %%mm5             \n\t"
2312                         "paddw %%mm3, %%mm2             \n\t"
2313                         "paddw %%mm5, %%mm2             \n\t"
2314                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2315                         "psrlw $2, %%mm4                \n\t"
2316                         "psrlw $2, %%mm2                \n\t"
2317 #endif
2318                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2319                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2320
2321                         "pmaddwd %%mm4, %%mm1           \n\t"
2322                         "pmaddwd %%mm2, %%mm3           \n\t"
2323                         "pmaddwd %%mm6, %%mm4           \n\t"
2324                         "pmaddwd %%mm6, %%mm2           \n\t"
2325 #ifndef FAST_BGR2YV12
2326                         "psrad $8, %%mm4                \n\t"
2327                         "psrad $8, %%mm1                \n\t"
2328                         "psrad $8, %%mm2                \n\t"
2329                         "psrad $8, %%mm3                \n\t"
2330 #endif
2331                         "packssdw %%mm2, %%mm4          \n\t"
2332                         "packssdw %%mm3, %%mm1          \n\t"
2333                         "pmaddwd %%mm5, %%mm4           \n\t"
2334                         "pmaddwd %%mm5, %%mm1           \n\t"
2335                         "add $24, %%"REG_d"             \n\t"
2336                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2337                         "psraw $7, %%mm4                \n\t"
2338
2339                         "movq %%mm0, %%mm1              \n\t"
2340                         "punpckldq %%mm4, %%mm0         \n\t"
2341                         "punpckhdq %%mm4, %%mm1         \n\t"
2342                         "packsswb %%mm1, %%mm0          \n\t"
2343                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2344                         "movd %%mm0, (%2, %%"REG_a")    \n\t"
2345                         "punpckhdq %%mm0, %%mm0         \n\t"
2346                         "movd %%mm0, (%3, %%"REG_a")    \n\t"
2347                         "add $4, %%"REG_a"              \n\t"
2348                         " js 1b                         \n\t"
2349                         : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2350                         : "%"REG_a, "%"REG_d
2351                 );
2352
2353                 udst += chromStride;
2354                 vdst += chromStride;
2355                 src  += srcStride*2;
2356         }
2357
2358         asm volatile(   EMMS" \n\t"
2359                         SFENCE" \n\t"
2360                         :::"memory");
2361 #else
2362         y=0;
2363 #endif
2364         for(; y<height; y+=2)
2365         {
2366                 long i;
2367                 for(i=0; i<chromWidth; i++)
2368                 {
2369                         unsigned int b= src[6*i+0];
2370                         unsigned int g= src[6*i+1];
2371                         unsigned int r= src[6*i+2];
2372
2373                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2374                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2375                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2376
2377                         udst[i]         = U;
2378                         vdst[i]         = V;
2379                         ydst[2*i]       = Y;
2380
2381                         b= src[6*i+3];
2382                         g= src[6*i+4];
2383                         r= src[6*i+5];
2384
2385                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2386                         ydst[2*i+1]     = Y;
2387                 }
2388                 ydst += lumStride;
2389                 src  += srcStride;
2390
2391                 for(i=0; i<chromWidth; i++)
2392                 {
2393                         unsigned int b= src[6*i+0];
2394                         unsigned int g= src[6*i+1];
2395                         unsigned int r= src[6*i+2];
2396
2397                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2398
2399                         ydst[2*i]       = Y;
2400
2401                         b= src[6*i+3];
2402                         g= src[6*i+4];
2403                         r= src[6*i+5];
2404
2405                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2406                         ydst[2*i+1]     = Y;
2407                 }
2408                 udst += chromStride;
2409                 vdst += chromStride;
2410                 ydst += lumStride;
2411                 src  += srcStride;
2412         }
2413 }
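
/*
 * Colour conversion used above (sketch, same formula as the C fallback):
 *
 *     Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16;
 *     U = ((RU*r + GU*g + BU*b) >> RGB2YUV_SHIFT) + 128;
 *     V = ((RV*r + GV*g + BV*b) >> RGB2YUV_SHIFT) + 128;
 *
 * with chroma computed only on even lines (and, in the MMX path, averaged
 * over a 2x2 block of pixels via PAVGB/paddw before the pmaddwd dot product).
 */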
2414
2415 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2416                             long width, long height, long src1Stride,
2417                             long src2Stride, long dstStride){
2418         long h;
2419
2420         for(h=0; h < height; h++)
2421         {
2422                 long w;
2423
2424 #ifdef HAVE_MMX
2425 #ifdef HAVE_SSE2
2426                 asm(
2427                         "xor %%"REG_a", %%"REG_a"       \n\t"
2428                         "1:                             \n\t"
2429                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2430                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2431                         "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2432                         "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2433                         "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2434                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2435                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2436                         "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2437                         "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2438                         "add $16, %%"REG_a"             \n\t"
2439                         "cmp %3, %%"REG_a"              \n\t"
2440                         " jb 1b                         \n\t"
2441                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2442                         : "memory", "%"REG_a""
2443                 );
2444 #else
2445                 asm(
2446                         "xor %%"REG_a", %%"REG_a"       \n\t"
2447                         "1:                             \n\t"
2448                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2449                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2450                         "movq (%1, %%"REG_a"), %%mm0    \n\t"
2451                         "movq 8(%1, %%"REG_a"), %%mm2   \n\t"
2452                         "movq %%mm0, %%mm1              \n\t"
2453                         "movq %%mm2, %%mm3              \n\t"
2454                         "movq (%2, %%"REG_a"), %%mm4    \n\t"
2455                         "movq 8(%2, %%"REG_a"), %%mm5   \n\t"
2456                         "punpcklbw %%mm4, %%mm0         \n\t"
2457                         "punpckhbw %%mm4, %%mm1         \n\t"
2458                         "punpcklbw %%mm5, %%mm2         \n\t"
2459                         "punpckhbw %%mm5, %%mm3         \n\t"
2460                         MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2461                         MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2462                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2463                         MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2464                         "add $16, %%"REG_a"             \n\t"
2465                         "cmp %3, %%"REG_a"              \n\t"
2466                         " jb 1b                         \n\t"
2467                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2468                         : "memory", "%"REG_a
2469                 );
2470 #endif
2471                 for(w= (width&(~15)); w < width; w++)
2472                 {
2473                         dest[2*w+0] = src1[w];
2474                         dest[2*w+1] = src2[w];
2475                 }
2476 #else
2477                 for(w=0; w < width; w++)
2478                 {
2479                         dest[2*w+0] = src1[w];
2480                         dest[2*w+1] = src2[w];
2481                 }
2482 #endif
2483                 dest += dstStride;
2484                 src1 += src1Stride;
2485                 src2 += src2Stride;
2486         }
2487 #ifdef HAVE_MMX
2488         asm(
2489                 EMMS" \n\t"
2490                 SFENCE" \n\t"
2491                 ::: "memory"
2492                 );
2493 #endif
2494 }
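
/*
 * Per-line behaviour of interleaveBytes (sketch, identical to the C
 * fallback): bytes from src1 land in the even output positions and bytes
 * from src2 in the odd ones,
 *
 *     dest[2*w + 0] = src1[w];
 *     dest[2*w + 1] = src2[w];
 *
 * the MMX/SSE2 paths do the same 16 bytes at a time with punpcklbw/punpckhbw
 * and the scalar loop handles the last width%16 bytes.
 */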
2495
2496 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2497                         uint8_t *dst1, uint8_t *dst2,
2498                         long width, long height,
2499                         long srcStride1, long srcStride2,
2500                         long dstStride1, long dstStride2)
2501 {
2502     long y,x,w,h;
2503     w=width/2; h=height/2;
2504 #ifdef HAVE_MMX
2505     asm volatile(
2506         PREFETCH" %0\n\t"
2507         PREFETCH" %1\n\t"
2508         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2509 #endif
2510     for(y=0;y<h;y++){
2511         const uint8_t* s1=src1+srcStride1*(y>>1);
2512         uint8_t* d=dst1+dstStride1*y;
2513         x=0;
2514 #ifdef HAVE_MMX
2515         for(;x<w-31;x+=32)
2516         {
2517             asm volatile(
2518                 PREFETCH" 32%1\n\t"
2519                 "movq   %1, %%mm0\n\t"
2520                 "movq   8%1, %%mm2\n\t"
2521                 "movq   16%1, %%mm4\n\t"
2522                 "movq   24%1, %%mm6\n\t"
2523                 "movq   %%mm0, %%mm1\n\t"
2524                 "movq   %%mm2, %%mm3\n\t"
2525                 "movq   %%mm4, %%mm5\n\t"
2526                 "movq   %%mm6, %%mm7\n\t"
2527                 "punpcklbw %%mm0, %%mm0\n\t"
2528                 "punpckhbw %%mm1, %%mm1\n\t"
2529                 "punpcklbw %%mm2, %%mm2\n\t"
2530                 "punpckhbw %%mm3, %%mm3\n\t"
2531                 "punpcklbw %%mm4, %%mm4\n\t"
2532                 "punpckhbw %%mm5, %%mm5\n\t"
2533                 "punpcklbw %%mm6, %%mm6\n\t"
2534                 "punpckhbw %%mm7, %%mm7\n\t"
2535                 MOVNTQ" %%mm0, %0\n\t"
2536                 MOVNTQ" %%mm1, 8%0\n\t"
2537                 MOVNTQ" %%mm2, 16%0\n\t"
2538                 MOVNTQ" %%mm3, 24%0\n\t"
2539                 MOVNTQ" %%mm4, 32%0\n\t"
2540                 MOVNTQ" %%mm5, 40%0\n\t"
2541                 MOVNTQ" %%mm6, 48%0\n\t"
2542                 MOVNTQ" %%mm7, 56%0"
2543                 :"=m"(d[2*x])
2544                 :"m"(s1[x])
2545                 :"memory");
2546         }
2547 #endif
2548         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2549     }
2550     for(y=0;y<h;y++){
2551         const uint8_t* s2=src2+srcStride2*(y>>1);
2552         uint8_t* d=dst2+dstStride2*y;
2553         x=0;
2554 #ifdef HAVE_MMX
2555         for(;x<w-31;x+=32)
2556         {
2557             asm volatile(
2558                 PREFETCH" 32%1\n\t"
2559                 "movq   %1, %%mm0\n\t"
2560                 "movq   8%1, %%mm2\n\t"
2561                 "movq   16%1, %%mm4\n\t"
2562                 "movq   24%1, %%mm6\n\t"
2563                 "movq   %%mm0, %%mm1\n\t"
2564                 "movq   %%mm2, %%mm3\n\t"
2565                 "movq   %%mm4, %%mm5\n\t"
2566                 "movq   %%mm6, %%mm7\n\t"
2567                 "punpcklbw %%mm0, %%mm0\n\t"
2568                 "punpckhbw %%mm1, %%mm1\n\t"
2569                 "punpcklbw %%mm2, %%mm2\n\t"
2570                 "punpckhbw %%mm3, %%mm3\n\t"
2571                 "punpcklbw %%mm4, %%mm4\n\t"
2572                 "punpckhbw %%mm5, %%mm5\n\t"
2573                 "punpcklbw %%mm6, %%mm6\n\t"
2574                 "punpckhbw %%mm7, %%mm7\n\t"
2575                 MOVNTQ" %%mm0, %0\n\t"
2576                 MOVNTQ" %%mm1, 8%0\n\t"
2577                 MOVNTQ" %%mm2, 16%0\n\t"
2578                 MOVNTQ" %%mm3, 24%0\n\t"
2579                 MOVNTQ" %%mm4, 32%0\n\t"
2580                 MOVNTQ" %%mm5, 40%0\n\t"
2581                 MOVNTQ" %%mm6, 48%0\n\t"
2582                 MOVNTQ" %%mm7, 56%0"
2583                 :"=m"(d[2*x])
2584                 :"m"(s2[x])
2585                 :"memory");
2586         }
2587 #endif
2588         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2589     }
2590 #ifdef HAVE_MMX
2591         asm(
2592                 EMMS" \n\t"
2593                 SFENCE" \n\t"
2594                 ::: "memory"
2595                 );
2596 #endif
2597 }
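
/*
 * vu9_to_vu12 (sketch): each source chroma sample is doubled horizontally and
 * each source line is reused for two destination lines,
 *
 *     d[2*x] = d[2*x + 1] = s1[x];    // with s1 = src1 + srcStride1*(y>>1)
 *
 * which the MMX path implements with punpcklbw/punpckhbw of a register with
 * itself (byte duplication), 32 source bytes per iteration.
 */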
2598
2599 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2600                         uint8_t *dst,
2601                         long width, long height,
2602                         long srcStride1, long srcStride2,
2603                         long srcStride3, long dstStride)
2604 {
2605     long y,x,w,h;
2606     w=width/2; h=height;
2607     for(y=0;y<h;y++){
2608         const uint8_t* yp=src1+srcStride1*y;
2609         const uint8_t* up=src2+srcStride2*(y>>2);
2610         const uint8_t* vp=src3+srcStride3*(y>>2);
2611         uint8_t* d=dst+dstStride*y;
2612         x=0;
2613 #ifdef HAVE_MMX
2614         for(;x<w-7;x+=8)
2615         {
2616             asm volatile(
2617                 PREFETCH" 32(%1, %0)\n\t"
2618                 PREFETCH" 32(%2, %0)\n\t"
2619                 PREFETCH" 32(%3, %0)\n\t"
2620                 "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2621                 "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2622                 "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
2623                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2624                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2625                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2626                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2627                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2628                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2629                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2630
2631                 "movq   %%mm1, %%mm6\n\t"
2632                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2633                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2634                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2635                 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2636                 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2637                 
2638                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2639                 "movq   8(%1, %0, 4), %%mm0\n\t"
2640                 "movq   %%mm0, %%mm3\n\t"
2641                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2642                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2643                 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2644                 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2645
2646                 "movq   %%mm4, %%mm6\n\t"
2647                 "movq   16(%1, %0, 4), %%mm0\n\t"
2648                 "movq   %%mm0, %%mm3\n\t"
2649                 "punpcklbw %%mm5, %%mm4\n\t"
2650                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2651                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2652                 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2653                 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2654                 
2655                 "punpckhbw %%mm5, %%mm6\n\t"
2656                 "movq   24(%1, %0, 4), %%mm0\n\t"
2657                 "movq   %%mm0, %%mm3\n\t"
2658                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2659                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2660                 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2661                 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2662
2663                 : "+r" (x)
2664                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2665                 :"memory");
2666         }
2667 #endif
2668         for(; x<w; x++)
2669         {
2670             const long x2= x<<2;
2671             d[8*x+0]=yp[x2];
2672             d[8*x+1]=up[x];
2673             d[8*x+2]=yp[x2+1];
2674             d[8*x+3]=vp[x];
2675             d[8*x+4]=yp[x2+2];
2676             d[8*x+5]=up[x];
2677             d[8*x+6]=yp[x2+3];
2678             d[8*x+7]=vp[x];
2679         }
2680     }
2681 #ifdef HAVE_MMX
2682         asm(
2683                 EMMS" \n\t"
2684                 SFENCE" \n\t"
2685                 ::: "memory"
2686                 );
2687 #endif
2688 }
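
/*
 * yvu9_to_yuy2 (sketch, matching the scalar tail loop): YVU9 chroma is
 * subsampled by 4 in both directions, so one U/V pair is repeated across four
 * consecutive luma samples of a line (and, via the y>>2 indexing, across four
 * lines):
 *
 *     d[8*x + 0] = yp[4*x + 0];  d[8*x + 1] = up[x];
 *     d[8*x + 2] = yp[4*x + 1];  d[8*x + 3] = vp[x];
 *     d[8*x + 4] = yp[4*x + 2];  d[8*x + 5] = up[x];
 *     d[8*x + 6] = yp[4*x + 3];  d[8*x + 7] = vp[x];
 */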