1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB converter
4  *  extended with: Software PAL8 to RGB converter
5  *                 Software YUV to YUV converter
6  *                 Software YUV to RGB converter
7  *  Written by Nick Kurshev.
8  *  Palette & YUV & runtime CPU code by Michael (michaelni@gmx.at) (under GPL)
9  *  Lots of big-endian byte-order fixes by Alex Beregszaszi
10  */
11
12 #include <stddef.h>
13 #include <inttypes.h> /* for __WORDSIZE */
14
15 #include "asmalign.h"
16
17 #ifndef __WORDSIZE
18 // #warning You have a misconfigured system and will probably lose performance!
19 #define __WORDSIZE MP_WORDSIZE
20 #endif
21
22 #undef PREFETCH
23 #undef MOVNTQ
24 #undef EMMS
25 #undef SFENCE
26 #undef MMREG_SIZE
27 #undef PREFETCHW
28 #undef PAVGB
29
30 #ifdef HAVE_SSE2
31 #define MMREG_SIZE 16
32 #else
33 #define MMREG_SIZE 8
34 #endif
35
36 #ifdef HAVE_3DNOW
37 #define PREFETCH  "prefetch"
38 #define PREFETCHW "prefetchw"
39 #define PAVGB     "pavgusb"
40 #elif defined ( HAVE_MMX2 )
41 #define PREFETCH "prefetchnta"
42 #define PREFETCHW "prefetcht0"
43 #define PAVGB     "pavgb"
44 #else
45 #ifdef __APPLE__
46 #define PREFETCH "#"
47 #define PREFETCHW "#"
48 #else
49 #define PREFETCH "/nop"
50 #define PREFETCHW "/nop"
51 #endif
52 #endif
53
54 #ifdef HAVE_3DNOW
55 /* On K6, femms is faster than emms. On K7, femms maps directly to emms. */
56 #define EMMS     "femms"
57 #else
58 #define EMMS     "emms"
59 #endif
60
61 #ifdef HAVE_MMX2
62 #define MOVNTQ "movntq"
63 #define SFENCE "sfence"
64 #else
65 #define MOVNTQ "movq"
66 #ifdef __APPLE__
67 #define SFENCE "#"
68 #else
69 #define SFENCE "/nop"
70 #endif
71 #endif
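/* With HAVE_MMX2, MOVNTQ is a non-temporal (cache-bypassing) store, which is
   why every MMX loop in this file is followed by an SFENCE; without MMX2 it
   falls back to a plain movq and SFENCE degrades to an assembler comment/nop
   as defined above. */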
72
73 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
74 {
75   uint8_t *dest = dst;
76   const uint8_t *s = src;
77   const uint8_t *end;
78 #ifdef HAVE_MMX
79   const uint8_t *mm_end;
80 #endif
81   end = s + src_size;
82 #ifdef HAVE_MMX
83   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
84   mm_end = end - 23;
85   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
86   while(s < mm_end)
87   {
88     __asm __volatile(
89         PREFETCH"       32%1\n\t"
90         "movd   %1, %%mm0\n\t"
91         "punpckldq 3%1, %%mm0\n\t"
92         "movd   6%1, %%mm1\n\t"
93         "punpckldq 9%1, %%mm1\n\t"
94         "movd   12%1, %%mm2\n\t"
95         "punpckldq 15%1, %%mm2\n\t"
96         "movd   18%1, %%mm3\n\t"
97         "punpckldq 21%1, %%mm3\n\t"
98         "pand   %%mm7, %%mm0\n\t"
99         "pand   %%mm7, %%mm1\n\t"
100         "pand   %%mm7, %%mm2\n\t"
101         "pand   %%mm7, %%mm3\n\t"
102         MOVNTQ" %%mm0, %0\n\t"
103         MOVNTQ" %%mm1, 8%0\n\t"
104         MOVNTQ" %%mm2, 16%0\n\t"
105         MOVNTQ" %%mm3, 24%0"
106         :"=m"(*dest)
107         :"m"(*s)
108         :"memory");
109     dest += 32;
110     s += 24;
111   }
112   __asm __volatile(SFENCE:::"memory");
113   __asm __volatile(EMMS:::"memory");
114 #endif
115   while(s < end)
116   {
117 #ifdef WORDS_BIGENDIAN
118     /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
119     *dest++ = 0;
120     *dest++ = s[2];
121     *dest++ = s[1];
122     *dest++ = s[0];
123     s+=3;
124 #else
125     *dest++ = *s++;
126     *dest++ = *s++;
127     *dest++ = *s++;
128     *dest++ = 0;
129 #endif
130   }
131 }
132
133 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
134 {
135   uint8_t *dest = dst;
136   const uint8_t *s = src;
137   const uint8_t *end;
138 #ifdef HAVE_MMX
139   const uint8_t *mm_end;
140 #endif
141   end = s + src_size;
142 #ifdef HAVE_MMX
143   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
144   mm_end = end - 31;
145   while(s < mm_end)
146   {
147     __asm __volatile(
148         PREFETCH"       32%1\n\t"
149         "movq   %1, %%mm0\n\t"
150         "movq   8%1, %%mm1\n\t"
151         "movq   16%1, %%mm4\n\t"
152         "movq   24%1, %%mm5\n\t"
153         "movq   %%mm0, %%mm2\n\t"
154         "movq   %%mm1, %%mm3\n\t"
155         "movq   %%mm4, %%mm6\n\t"
156         "movq   %%mm5, %%mm7\n\t"
157         "psrlq  $8, %%mm2\n\t"
158         "psrlq  $8, %%mm3\n\t"
159         "psrlq  $8, %%mm6\n\t"
160         "psrlq  $8, %%mm7\n\t"
161         "pand   %2, %%mm0\n\t"
162         "pand   %2, %%mm1\n\t"
163         "pand   %2, %%mm4\n\t"
164         "pand   %2, %%mm5\n\t"
165         "pand   %3, %%mm2\n\t"
166         "pand   %3, %%mm3\n\t"
167         "pand   %3, %%mm6\n\t"
168         "pand   %3, %%mm7\n\t"
169         "por    %%mm2, %%mm0\n\t"
170         "por    %%mm3, %%mm1\n\t"
171         "por    %%mm6, %%mm4\n\t"
172         "por    %%mm7, %%mm5\n\t"
173
174         "movq   %%mm1, %%mm2\n\t"
175         "movq   %%mm4, %%mm3\n\t"
176         "psllq  $48, %%mm2\n\t"
177         "psllq  $32, %%mm3\n\t"
178         "pand   %4, %%mm2\n\t"
179         "pand   %5, %%mm3\n\t"
180         "por    %%mm2, %%mm0\n\t"
181         "psrlq  $16, %%mm1\n\t"
182         "psrlq  $32, %%mm4\n\t"
183         "psllq  $16, %%mm5\n\t"
184         "por    %%mm3, %%mm1\n\t"
185         "pand   %6, %%mm5\n\t"
186         "por    %%mm5, %%mm4\n\t"
187
188         MOVNTQ" %%mm0, %0\n\t"
189         MOVNTQ" %%mm1, 8%0\n\t"
190         MOVNTQ" %%mm4, 16%0"
191         :"=m"(*dest)
192         :"m"(*s),"m"(mask24l),
193          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
194         :"memory");
195     dest += 24;
196     s += 32;
197   }
198   __asm __volatile(SFENCE:::"memory");
199   __asm __volatile(EMMS:::"memory");
200 #endif
201   while(s < end)
202   {
203 #ifdef WORDS_BIGENDIAN
204     /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
205     s++;
206     dest[2] = *s++;
207     dest[1] = *s++;
208     dest[0] = *s++;
209     dest += 3;
210 #else
211     *dest++ = *s++;
212     *dest++ = *s++;
213     *dest++ = *s++;
214     s++;
215 #endif
216   }
217 }
218
219 /*
220  Original by Strepto/Astral
221  Ported to GCC & bugfixed by A'rpi
222  MMX2, 3DNow! optimization by Nick Kurshev
223  32-bit C version and the and+add trick by Michael Niedermayer
224 */
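/* Worked example of the and+add trick on one 15-bit pixel (illustration only,
   not part of the original code): 0RRRRRGGGGGBBBBB -> RRRRRGGGGG0BBBBB.
   Adding the masked R/G field to the pixel doubles those bits, i.e. shifts
   red and green left by one while blue stays put:
       x = 0x7FFF;                          // 15-bit white
       y = (x & 0x7FFF) + (x & 0x7FE0);     // 0xFFDF: R/G moved up, low G bit 0
   The scalar tail loop below does exactly this, 32 bits (two pixels) at a time. */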
225 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
226 {
227   register const uint8_t* s=src;
228   register uint8_t* d=dst;
229   register const uint8_t *end;
230   const uint8_t *mm_end;
231   end = s + src_size;
232 #ifdef HAVE_MMX
233   __asm __volatile(PREFETCH"    %0"::"m"(*s));
234   __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
235   mm_end = end - 15;
236   while(s<mm_end)
237   {
238         __asm __volatile(
239                 PREFETCH"       32%1\n\t"
240                 "movq   %1, %%mm0\n\t"
241                 "movq   8%1, %%mm2\n\t"
242                 "movq   %%mm0, %%mm1\n\t"
243                 "movq   %%mm2, %%mm3\n\t"
244                 "pand   %%mm4, %%mm0\n\t"
245                 "pand   %%mm4, %%mm2\n\t"
246                 "paddw  %%mm1, %%mm0\n\t"
247                 "paddw  %%mm3, %%mm2\n\t"
248                 MOVNTQ" %%mm0, %0\n\t"
249                 MOVNTQ" %%mm2, 8%0"
250                 :"=m"(*d)
251                 :"m"(*s)
252                 );
253         d+=16;
254         s+=16;
255   }
256   __asm __volatile(SFENCE:::"memory");
257   __asm __volatile(EMMS:::"memory");
258 #endif
259     mm_end = end - 3;
260     while(s < mm_end)
261     {
262         register unsigned x= *((uint32_t *)s);
263         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
264         d+=4;
265         s+=4;
266     }
267     if(s < end)
268     {
269         register unsigned short x= *((uint16_t *)s);
270         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
271     }
272 }
273
274 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
275 {
276   register const uint8_t* s=src;
277   register uint8_t* d=dst;
278   register const uint8_t *end;
279   const uint8_t *mm_end;
280   end = s + src_size;
281 #ifdef HAVE_MMX
282   __asm __volatile(PREFETCH"    %0"::"m"(*s));
283   __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
284   __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
285   mm_end = end - 15;
286   while(s<mm_end)
287   {
288         __asm __volatile(
289                 PREFETCH"       32%1\n\t"
290                 "movq   %1, %%mm0\n\t"
291                 "movq   8%1, %%mm2\n\t"
292                 "movq   %%mm0, %%mm1\n\t"
293                 "movq   %%mm2, %%mm3\n\t"
294                 "psrlq  $1, %%mm0\n\t"
295                 "psrlq  $1, %%mm2\n\t"
296                 "pand   %%mm7, %%mm0\n\t"
297                 "pand   %%mm7, %%mm2\n\t"
298                 "pand   %%mm6, %%mm1\n\t"
299                 "pand   %%mm6, %%mm3\n\t"
300                 "por    %%mm1, %%mm0\n\t"
301                 "por    %%mm3, %%mm2\n\t"
302                 MOVNTQ" %%mm0, %0\n\t"
303                 MOVNTQ" %%mm2, 8%0"
304                 :"=m"(*d)
305                 :"m"(*s)
306                 );
307         d+=16;
308         s+=16;
309   }
310   __asm __volatile(SFENCE:::"memory");
311   __asm __volatile(EMMS:::"memory");
312 #endif
313     mm_end = end - 3;
314     while(s < mm_end)
315     {
316         register uint32_t x= *((uint32_t *)s);
317         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
318         s+=4;
319         d+=4;
320     }
321     if(s < end)
322     {
323         register uint16_t x= *((uint16_t *)s);
324         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
325         s+=2;
326         d+=2;
327     }
328 }
329
330 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
331 {
332         const uint8_t *s = src;
333         const uint8_t *end;
334 #ifdef HAVE_MMX
335         const uint8_t *mm_end;
336 #endif
337         uint16_t *d = (uint16_t *)dst;
338         end = s + src_size;
339 #ifdef HAVE_MMX
340         mm_end = end - 15;
341 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
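        /* Packing trick: after masking with mask3216br, a single pmaddwd with the
           mul3216 constants moves the masked blue and red fields of two pixels into
           their packed positions at once, replacing several shift/or pairs with one
           multiply-add (hence the multiply-speed caveat above). */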
342         asm volatile(
343                 "movq %3, %%mm5                 \n\t"
344                 "movq %4, %%mm6                 \n\t"
345                 "movq %5, %%mm7                 \n\t"
346                 ASMALIGN16
347                 "1:                             \n\t"
348                 PREFETCH" 32(%1)                \n\t"
349                 "movd   (%1), %%mm0             \n\t"
350                 "movd   4(%1), %%mm3            \n\t"
351                 "punpckldq 8(%1), %%mm0         \n\t"
352                 "punpckldq 12(%1), %%mm3        \n\t"
353                 "movq %%mm0, %%mm1              \n\t"
354                 "movq %%mm3, %%mm4              \n\t"
355                 "pand %%mm6, %%mm0              \n\t"
356                 "pand %%mm6, %%mm3              \n\t"
357                 "pmaddwd %%mm7, %%mm0           \n\t"
358                 "pmaddwd %%mm7, %%mm3           \n\t"
359                 "pand %%mm5, %%mm1              \n\t"
360                 "pand %%mm5, %%mm4              \n\t"
361                 "por %%mm1, %%mm0               \n\t"   
362                 "por %%mm4, %%mm3               \n\t"
363                 "psrld $5, %%mm0                \n\t"
364                 "pslld $11, %%mm3               \n\t"
365                 "por %%mm3, %%mm0               \n\t"
366                 MOVNTQ" %%mm0, (%0)             \n\t"
367                 "add $16, %1                    \n\t"
368                 "add $8, %0                     \n\t"
369                 "cmp %2, %1                     \n\t"
370                 " jb 1b                         \n\t"
371                 : "+r" (d), "+r"(s)
372                 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
373         );
374 #else
375         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
376         __asm __volatile(
377             "movq       %0, %%mm7\n\t"
378             "movq       %1, %%mm6\n\t"
379             ::"m"(red_16mask),"m"(green_16mask));
380         while(s < mm_end)
381         {
382             __asm __volatile(
383                 PREFETCH" 32%1\n\t"
384                 "movd   %1, %%mm0\n\t"
385                 "movd   4%1, %%mm3\n\t"
386                 "punpckldq 8%1, %%mm0\n\t"
387                 "punpckldq 12%1, %%mm3\n\t"
388                 "movq   %%mm0, %%mm1\n\t"
389                 "movq   %%mm0, %%mm2\n\t"
390                 "movq   %%mm3, %%mm4\n\t"
391                 "movq   %%mm3, %%mm5\n\t"
392                 "psrlq  $3, %%mm0\n\t"
393                 "psrlq  $3, %%mm3\n\t"
394                 "pand   %2, %%mm0\n\t"
395                 "pand   %2, %%mm3\n\t"
396                 "psrlq  $5, %%mm1\n\t"
397                 "psrlq  $5, %%mm4\n\t"
398                 "pand   %%mm6, %%mm1\n\t"
399                 "pand   %%mm6, %%mm4\n\t"
400                 "psrlq  $8, %%mm2\n\t"
401                 "psrlq  $8, %%mm5\n\t"
402                 "pand   %%mm7, %%mm2\n\t"
403                 "pand   %%mm7, %%mm5\n\t"
404                 "por    %%mm1, %%mm0\n\t"
405                 "por    %%mm4, %%mm3\n\t"
406                 "por    %%mm2, %%mm0\n\t"
407                 "por    %%mm5, %%mm3\n\t"
408                 "psllq  $16, %%mm3\n\t"
409                 "por    %%mm3, %%mm0\n\t"
410                 MOVNTQ" %%mm0, %0\n\t"
411                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
412                 d += 4;
413                 s += 16;
414         }
415 #endif
416         __asm __volatile(SFENCE:::"memory");
417         __asm __volatile(EMMS:::"memory");
418 #endif
419         while(s < end)
420         {
421                 register int rgb = *(uint32_t*)s; s += 4;
422                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
423         }
424 }
425
426 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
427 {
428         const uint8_t *s = src;
429         const uint8_t *end;
430 #ifdef HAVE_MMX
431         const uint8_t *mm_end;
432 #endif
433         uint16_t *d = (uint16_t *)dst;
434         end = s + src_size;
435 #ifdef HAVE_MMX
436         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
437         __asm __volatile(
438             "movq       %0, %%mm7\n\t"
439             "movq       %1, %%mm6\n\t"
440             ::"m"(red_16mask),"m"(green_16mask));
441         mm_end = end - 15;
442         while(s < mm_end)
443         {
444             __asm __volatile(
445                 PREFETCH" 32%1\n\t"
446                 "movd   %1, %%mm0\n\t"
447                 "movd   4%1, %%mm3\n\t"
448                 "punpckldq 8%1, %%mm0\n\t"
449                 "punpckldq 12%1, %%mm3\n\t"
450                 "movq   %%mm0, %%mm1\n\t"
451                 "movq   %%mm0, %%mm2\n\t"
452                 "movq   %%mm3, %%mm4\n\t"
453                 "movq   %%mm3, %%mm5\n\t"
454                 "psllq  $8, %%mm0\n\t"
455                 "psllq  $8, %%mm3\n\t"
456                 "pand   %%mm7, %%mm0\n\t"
457                 "pand   %%mm7, %%mm3\n\t"
458                 "psrlq  $5, %%mm1\n\t"
459                 "psrlq  $5, %%mm4\n\t"
460                 "pand   %%mm6, %%mm1\n\t"
461                 "pand   %%mm6, %%mm4\n\t"
462                 "psrlq  $19, %%mm2\n\t"
463                 "psrlq  $19, %%mm5\n\t"
464                 "pand   %2, %%mm2\n\t"
465                 "pand   %2, %%mm5\n\t"
466                 "por    %%mm1, %%mm0\n\t"
467                 "por    %%mm4, %%mm3\n\t"
468                 "por    %%mm2, %%mm0\n\t"
469                 "por    %%mm5, %%mm3\n\t"
470                 "psllq  $16, %%mm3\n\t"
471                 "por    %%mm3, %%mm0\n\t"
472                 MOVNTQ" %%mm0, %0\n\t"
473                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
474                 d += 4;
475                 s += 16;
476         }
477         __asm __volatile(SFENCE:::"memory");
478         __asm __volatile(EMMS:::"memory");
479 #endif
480         while(s < end)
481         {
482                 register int rgb = *(uint32_t*)s; s += 4;
483                 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
484         }
485 }
486
487 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
488 {
489         const uint8_t *s = src;
490         const uint8_t *end;
491 #ifdef HAVE_MMX
492         const uint8_t *mm_end;
493 #endif
494         uint16_t *d = (uint16_t *)dst;
495         end = s + src_size;
496 #ifdef HAVE_MMX
497         mm_end = end - 15;
498 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
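        /* Same pmaddwd packing idea as in rgb32to16 above, but with the 15-bit green
           mask and multipliers (mask3215g, mul3215) and the psrld $6 / pslld $10 shifts. */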
499         asm volatile(
500                 "movq %3, %%mm5                 \n\t"
501                 "movq %4, %%mm6                 \n\t"
502                 "movq %5, %%mm7                 \n\t"
503                 ASMALIGN16
504                 "1:                             \n\t"
505                 PREFETCH" 32(%1)                \n\t"
506                 "movd   (%1), %%mm0             \n\t"
507                 "movd   4(%1), %%mm3            \n\t"
508                 "punpckldq 8(%1), %%mm0         \n\t"
509                 "punpckldq 12(%1), %%mm3        \n\t"
510                 "movq %%mm0, %%mm1              \n\t"
511                 "movq %%mm3, %%mm4              \n\t"
512                 "pand %%mm6, %%mm0              \n\t"
513                 "pand %%mm6, %%mm3              \n\t"
514                 "pmaddwd %%mm7, %%mm0           \n\t"
515                 "pmaddwd %%mm7, %%mm3           \n\t"
516                 "pand %%mm5, %%mm1              \n\t"
517                 "pand %%mm5, %%mm4              \n\t"
518                 "por %%mm1, %%mm0               \n\t"   
519                 "por %%mm4, %%mm3               \n\t"
520                 "psrld $6, %%mm0                \n\t"
521                 "pslld $10, %%mm3               \n\t"
522                 "por %%mm3, %%mm0               \n\t"
523                 MOVNTQ" %%mm0, (%0)             \n\t"
524                 "add $16, %1                    \n\t"
525                 "add $8, %0                     \n\t"
526                 "cmp %2, %1                     \n\t"
527                 " jb 1b                         \n\t"
528                 : "+r" (d), "+r"(s)
529                 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
530         );
531 #else
532         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
533         __asm __volatile(
534             "movq       %0, %%mm7\n\t"
535             "movq       %1, %%mm6\n\t"
536             ::"m"(red_15mask),"m"(green_15mask));
537         while(s < mm_end)
538         {
539             __asm __volatile(
540                 PREFETCH" 32%1\n\t"
541                 "movd   %1, %%mm0\n\t"
542                 "movd   4%1, %%mm3\n\t"
543                 "punpckldq 8%1, %%mm0\n\t"
544                 "punpckldq 12%1, %%mm3\n\t"
545                 "movq   %%mm0, %%mm1\n\t"
546                 "movq   %%mm0, %%mm2\n\t"
547                 "movq   %%mm3, %%mm4\n\t"
548                 "movq   %%mm3, %%mm5\n\t"
549                 "psrlq  $3, %%mm0\n\t"
550                 "psrlq  $3, %%mm3\n\t"
551                 "pand   %2, %%mm0\n\t"
552                 "pand   %2, %%mm3\n\t"
553                 "psrlq  $6, %%mm1\n\t"
554                 "psrlq  $6, %%mm4\n\t"
555                 "pand   %%mm6, %%mm1\n\t"
556                 "pand   %%mm6, %%mm4\n\t"
557                 "psrlq  $9, %%mm2\n\t"
558                 "psrlq  $9, %%mm5\n\t"
559                 "pand   %%mm7, %%mm2\n\t"
560                 "pand   %%mm7, %%mm5\n\t"
561                 "por    %%mm1, %%mm0\n\t"
562                 "por    %%mm4, %%mm3\n\t"
563                 "por    %%mm2, %%mm0\n\t"
564                 "por    %%mm5, %%mm3\n\t"
565                 "psllq  $16, %%mm3\n\t"
566                 "por    %%mm3, %%mm0\n\t"
567                 MOVNTQ" %%mm0, %0\n\t"
568                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
569                 d += 4;
570                 s += 16;
571         }
572 #endif
573         __asm __volatile(SFENCE:::"memory");
574         __asm __volatile(EMMS:::"memory");
575 #endif
576         while(s < end)
577         {
578                 register int rgb = *(uint32_t*)s; s += 4;
579                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
580         }
581 }
582
583 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
584 {
585         const uint8_t *s = src;
586         const uint8_t *end;
587 #ifdef HAVE_MMX
588         const uint8_t *mm_end;
589 #endif
590         uint16_t *d = (uint16_t *)dst;
591         end = s + src_size;
592 #ifdef HAVE_MMX
593         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
594         __asm __volatile(
595             "movq       %0, %%mm7\n\t"
596             "movq       %1, %%mm6\n\t"
597             ::"m"(red_15mask),"m"(green_15mask));
598         mm_end = end - 15;
599         while(s < mm_end)
600         {
601             __asm __volatile(
602                 PREFETCH" 32%1\n\t"
603                 "movd   %1, %%mm0\n\t"
604                 "movd   4%1, %%mm3\n\t"
605                 "punpckldq 8%1, %%mm0\n\t"
606                 "punpckldq 12%1, %%mm3\n\t"
607                 "movq   %%mm0, %%mm1\n\t"
608                 "movq   %%mm0, %%mm2\n\t"
609                 "movq   %%mm3, %%mm4\n\t"
610                 "movq   %%mm3, %%mm5\n\t"
611                 "psllq  $7, %%mm0\n\t"
612                 "psllq  $7, %%mm3\n\t"
613                 "pand   %%mm7, %%mm0\n\t"
614                 "pand   %%mm7, %%mm3\n\t"
615                 "psrlq  $6, %%mm1\n\t"
616                 "psrlq  $6, %%mm4\n\t"
617                 "pand   %%mm6, %%mm1\n\t"
618                 "pand   %%mm6, %%mm4\n\t"
619                 "psrlq  $19, %%mm2\n\t"
620                 "psrlq  $19, %%mm5\n\t"
621                 "pand   %2, %%mm2\n\t"
622                 "pand   %2, %%mm5\n\t"
623                 "por    %%mm1, %%mm0\n\t"
624                 "por    %%mm4, %%mm3\n\t"
625                 "por    %%mm2, %%mm0\n\t"
626                 "por    %%mm5, %%mm3\n\t"
627                 "psllq  $16, %%mm3\n\t"
628                 "por    %%mm3, %%mm0\n\t"
629                 MOVNTQ" %%mm0, %0\n\t"
630                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
631                 d += 4;
632                 s += 16;
633         }
634         __asm __volatile(SFENCE:::"memory");
635         __asm __volatile(EMMS:::"memory");
636 #endif
637         while(s < end)
638         {
639                 register int rgb = *(uint32_t*)s; s += 4;
640                 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
641         }
642 }
643
644 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
645 {
646         const uint8_t *s = src;
647         const uint8_t *end;
648 #ifdef HAVE_MMX
649         const uint8_t *mm_end;
650 #endif
651         uint16_t *d = (uint16_t *)dst;
652         end = s + src_size;
653 #ifdef HAVE_MMX
654         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
655         __asm __volatile(
656             "movq       %0, %%mm7\n\t"
657             "movq       %1, %%mm6\n\t"
658             ::"m"(red_16mask),"m"(green_16mask));
659         mm_end = end - 11;
660         while(s < mm_end)
661         {
662             __asm __volatile(
663                 PREFETCH" 32%1\n\t"
664                 "movd   %1, %%mm0\n\t"
665                 "movd   3%1, %%mm3\n\t"
666                 "punpckldq 6%1, %%mm0\n\t"
667                 "punpckldq 9%1, %%mm3\n\t"
668                 "movq   %%mm0, %%mm1\n\t"
669                 "movq   %%mm0, %%mm2\n\t"
670                 "movq   %%mm3, %%mm4\n\t"
671                 "movq   %%mm3, %%mm5\n\t"
672                 "psrlq  $3, %%mm0\n\t"
673                 "psrlq  $3, %%mm3\n\t"
674                 "pand   %2, %%mm0\n\t"
675                 "pand   %2, %%mm3\n\t"
676                 "psrlq  $5, %%mm1\n\t"
677                 "psrlq  $5, %%mm4\n\t"
678                 "pand   %%mm6, %%mm1\n\t"
679                 "pand   %%mm6, %%mm4\n\t"
680                 "psrlq  $8, %%mm2\n\t"
681                 "psrlq  $8, %%mm5\n\t"
682                 "pand   %%mm7, %%mm2\n\t"
683                 "pand   %%mm7, %%mm5\n\t"
684                 "por    %%mm1, %%mm0\n\t"
685                 "por    %%mm4, %%mm3\n\t"
686                 "por    %%mm2, %%mm0\n\t"
687                 "por    %%mm5, %%mm3\n\t"
688                 "psllq  $16, %%mm3\n\t"
689                 "por    %%mm3, %%mm0\n\t"
690                 MOVNTQ" %%mm0, %0\n\t"
691                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
692                 d += 4;
693                 s += 12;
694         }
695         __asm __volatile(SFENCE:::"memory");
696         __asm __volatile(EMMS:::"memory");
697 #endif
698         while(s < end)
699         {
700                 const int b= *s++;
701                 const int g= *s++;
702                 const int r= *s++;
703                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
704         }
705 }
706
707 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
708 {
709         const uint8_t *s = src;
710         const uint8_t *end;
711 #ifdef HAVE_MMX
712         const uint8_t *mm_end;
713 #endif
714         uint16_t *d = (uint16_t *)dst;
715         end = s + src_size;
716 #ifdef HAVE_MMX
717         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
718         __asm __volatile(
719             "movq       %0, %%mm7\n\t"
720             "movq       %1, %%mm6\n\t"
721             ::"m"(red_16mask),"m"(green_16mask));
722         mm_end = end - 15;
723         while(s < mm_end)
724         {
725             __asm __volatile(
726                 PREFETCH" 32%1\n\t"
727                 "movd   %1, %%mm0\n\t"
728                 "movd   3%1, %%mm3\n\t"
729                 "punpckldq 6%1, %%mm0\n\t"
730                 "punpckldq 9%1, %%mm3\n\t"
731                 "movq   %%mm0, %%mm1\n\t"
732                 "movq   %%mm0, %%mm2\n\t"
733                 "movq   %%mm3, %%mm4\n\t"
734                 "movq   %%mm3, %%mm5\n\t"
735                 "psllq  $8, %%mm0\n\t"
736                 "psllq  $8, %%mm3\n\t"
737                 "pand   %%mm7, %%mm0\n\t"
738                 "pand   %%mm7, %%mm3\n\t"
739                 "psrlq  $5, %%mm1\n\t"
740                 "psrlq  $5, %%mm4\n\t"
741                 "pand   %%mm6, %%mm1\n\t"
742                 "pand   %%mm6, %%mm4\n\t"
743                 "psrlq  $19, %%mm2\n\t"
744                 "psrlq  $19, %%mm5\n\t"
745                 "pand   %2, %%mm2\n\t"
746                 "pand   %2, %%mm5\n\t"
747                 "por    %%mm1, %%mm0\n\t"
748                 "por    %%mm4, %%mm3\n\t"
749                 "por    %%mm2, %%mm0\n\t"
750                 "por    %%mm5, %%mm3\n\t"
751                 "psllq  $16, %%mm3\n\t"
752                 "por    %%mm3, %%mm0\n\t"
753                 MOVNTQ" %%mm0, %0\n\t"
754                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
755                 d += 4;
756                 s += 12;
757         }
758         __asm __volatile(SFENCE:::"memory");
759         __asm __volatile(EMMS:::"memory");
760 #endif
761         while(s < end)
762         {
763                 const int r= *s++;
764                 const int g= *s++;
765                 const int b= *s++;
766                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
767         }
768 }
769
770 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
771 {
772         const uint8_t *s = src;
773         const uint8_t *end;
774 #ifdef HAVE_MMX
775         const uint8_t *mm_end;
776 #endif
777         uint16_t *d = (uint16_t *)dst;
778         end = s + src_size;
779 #ifdef HAVE_MMX
780         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
781         __asm __volatile(
782             "movq       %0, %%mm7\n\t"
783             "movq       %1, %%mm6\n\t"
784             ::"m"(red_15mask),"m"(green_15mask));
785         mm_end = end - 11;
786         while(s < mm_end)
787         {
788             __asm __volatile(
789                 PREFETCH" 32%1\n\t"
790                 "movd   %1, %%mm0\n\t"
791                 "movd   3%1, %%mm3\n\t"
792                 "punpckldq 6%1, %%mm0\n\t"
793                 "punpckldq 9%1, %%mm3\n\t"
794                 "movq   %%mm0, %%mm1\n\t"
795                 "movq   %%mm0, %%mm2\n\t"
796                 "movq   %%mm3, %%mm4\n\t"
797                 "movq   %%mm3, %%mm5\n\t"
798                 "psrlq  $3, %%mm0\n\t"
799                 "psrlq  $3, %%mm3\n\t"
800                 "pand   %2, %%mm0\n\t"
801                 "pand   %2, %%mm3\n\t"
802                 "psrlq  $6, %%mm1\n\t"
803                 "psrlq  $6, %%mm4\n\t"
804                 "pand   %%mm6, %%mm1\n\t"
805                 "pand   %%mm6, %%mm4\n\t"
806                 "psrlq  $9, %%mm2\n\t"
807                 "psrlq  $9, %%mm5\n\t"
808                 "pand   %%mm7, %%mm2\n\t"
809                 "pand   %%mm7, %%mm5\n\t"
810                 "por    %%mm1, %%mm0\n\t"
811                 "por    %%mm4, %%mm3\n\t"
812                 "por    %%mm2, %%mm0\n\t"
813                 "por    %%mm5, %%mm3\n\t"
814                 "psllq  $16, %%mm3\n\t"
815                 "por    %%mm3, %%mm0\n\t"
816                 MOVNTQ" %%mm0, %0\n\t"
817                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
818                 d += 4;
819                 s += 12;
820         }
821         __asm __volatile(SFENCE:::"memory");
822         __asm __volatile(EMMS:::"memory");
823 #endif
824         while(s < end)
825         {
826                 const int b= *s++;
827                 const int g= *s++;
828                 const int r= *s++;
829                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
830         }
831 }
832
833 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
834 {
835         const uint8_t *s = src;
836         const uint8_t *end;
837 #ifdef HAVE_MMX
838         const uint8_t *mm_end;
839 #endif
840         uint16_t *d = (uint16_t *)dst;
841         end = s + src_size;
842 #ifdef HAVE_MMX
843         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
844         __asm __volatile(
845             "movq       %0, %%mm7\n\t"
846             "movq       %1, %%mm6\n\t"
847             ::"m"(red_15mask),"m"(green_15mask));
848         mm_end = end - 15;
849         while(s < mm_end)
850         {
851             __asm __volatile(
852                 PREFETCH" 32%1\n\t"
853                 "movd   %1, %%mm0\n\t"
854                 "movd   3%1, %%mm3\n\t"
855                 "punpckldq 6%1, %%mm0\n\t"
856                 "punpckldq 9%1, %%mm3\n\t"
857                 "movq   %%mm0, %%mm1\n\t"
858                 "movq   %%mm0, %%mm2\n\t"
859                 "movq   %%mm3, %%mm4\n\t"
860                 "movq   %%mm3, %%mm5\n\t"
861                 "psllq  $7, %%mm0\n\t"
862                 "psllq  $7, %%mm3\n\t"
863                 "pand   %%mm7, %%mm0\n\t"
864                 "pand   %%mm7, %%mm3\n\t"
865                 "psrlq  $6, %%mm1\n\t"
866                 "psrlq  $6, %%mm4\n\t"
867                 "pand   %%mm6, %%mm1\n\t"
868                 "pand   %%mm6, %%mm4\n\t"
869                 "psrlq  $19, %%mm2\n\t"
870                 "psrlq  $19, %%mm5\n\t"
871                 "pand   %2, %%mm2\n\t"
872                 "pand   %2, %%mm5\n\t"
873                 "por    %%mm1, %%mm0\n\t"
874                 "por    %%mm4, %%mm3\n\t"
875                 "por    %%mm2, %%mm0\n\t"
876                 "por    %%mm5, %%mm3\n\t"
877                 "psllq  $16, %%mm3\n\t"
878                 "por    %%mm3, %%mm0\n\t"
879                 MOVNTQ" %%mm0, %0\n\t"
880                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
881                 d += 4;
882                 s += 12;
883         }
884         __asm __volatile(SFENCE:::"memory");
885         __asm __volatile(EMMS:::"memory");
886 #endif
887         while(s < end)
888         {
889                 const int r= *s++;
890                 const int g= *s++;
891                 const int b= *s++;
892                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
893         }
894 }
895
896 /*
897   I use a less accurate approximation here, by simply
898   left-shifting the input
899   value and filling the low-order bits with
900   zeroes. This method improves PNG
901   compression, but this scheme cannot reproduce white exactly, since it does not
902   generate an all-ones maximum value; the net effect is to darken the
903   image slightly.
904
905   The better method would be "left bit replication":
906
907    4 3 2 1 0
908    ---------
909    1 1 0 1 1
910
911    7 6 5 4 3  2 1 0
912    ----------------
913    1 1 0 1 1  1 1 0
914    |=======|  |===|
915        |      Leftmost Bits Repeated to Fill Open Bits
916        |
917    Original Bits
918 */
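/* Illustration only (not used by the code below): left bit replication for a
   5-bit value x is (x<<3) | (x>>2), so 0b11011 becomes 0b11011110 as in the
   diagram above, and full-scale 0x1F maps to 0xFF, i.e. white stays white. */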
919 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
920 {
921         const uint16_t *end;
922 #ifdef HAVE_MMX
923         const uint16_t *mm_end;
924 #endif
925         uint8_t *d = (uint8_t *)dst;
926         const uint16_t *s = (uint16_t *)src;
927         end = s + src_size/2;
928 #ifdef HAVE_MMX
929         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
930         mm_end = end - 7;
931         while(s < mm_end)
932         {
933             __asm __volatile(
934                 PREFETCH" 32%1\n\t"
935                 "movq   %1, %%mm0\n\t"
936                 "movq   %1, %%mm1\n\t"
937                 "movq   %1, %%mm2\n\t"
938                 "pand   %2, %%mm0\n\t"
939                 "pand   %3, %%mm1\n\t"
940                 "pand   %4, %%mm2\n\t"
941                 "psllq  $3, %%mm0\n\t"
942                 "psrlq  $2, %%mm1\n\t"
943                 "psrlq  $7, %%mm2\n\t"
944                 "movq   %%mm0, %%mm3\n\t"
945                 "movq   %%mm1, %%mm4\n\t"
946                 "movq   %%mm2, %%mm5\n\t"
947                 "punpcklwd %5, %%mm0\n\t"
948                 "punpcklwd %5, %%mm1\n\t"
949                 "punpcklwd %5, %%mm2\n\t"
950                 "punpckhwd %5, %%mm3\n\t"
951                 "punpckhwd %5, %%mm4\n\t"
952                 "punpckhwd %5, %%mm5\n\t"
953                 "psllq  $8, %%mm1\n\t"
954                 "psllq  $16, %%mm2\n\t"
955                 "por    %%mm1, %%mm0\n\t"
956                 "por    %%mm2, %%mm0\n\t"
957                 "psllq  $8, %%mm4\n\t"
958                 "psllq  $16, %%mm5\n\t"
959                 "por    %%mm4, %%mm3\n\t"
960                 "por    %%mm5, %%mm3\n\t"
961
962                 "movq   %%mm0, %%mm6\n\t"
963                 "movq   %%mm3, %%mm7\n\t"
964                 
965                 "movq   8%1, %%mm0\n\t"
966                 "movq   8%1, %%mm1\n\t"
967                 "movq   8%1, %%mm2\n\t"
968                 "pand   %2, %%mm0\n\t"
969                 "pand   %3, %%mm1\n\t"
970                 "pand   %4, %%mm2\n\t"
971                 "psllq  $3, %%mm0\n\t"
972                 "psrlq  $2, %%mm1\n\t"
973                 "psrlq  $7, %%mm2\n\t"
974                 "movq   %%mm0, %%mm3\n\t"
975                 "movq   %%mm1, %%mm4\n\t"
976                 "movq   %%mm2, %%mm5\n\t"
977                 "punpcklwd %5, %%mm0\n\t"
978                 "punpcklwd %5, %%mm1\n\t"
979                 "punpcklwd %5, %%mm2\n\t"
980                 "punpckhwd %5, %%mm3\n\t"
981                 "punpckhwd %5, %%mm4\n\t"
982                 "punpckhwd %5, %%mm5\n\t"
983                 "psllq  $8, %%mm1\n\t"
984                 "psllq  $16, %%mm2\n\t"
985                 "por    %%mm1, %%mm0\n\t"
986                 "por    %%mm2, %%mm0\n\t"
987                 "psllq  $8, %%mm4\n\t"
988                 "psllq  $16, %%mm5\n\t"
989                 "por    %%mm4, %%mm3\n\t"
990                 "por    %%mm5, %%mm3\n\t"
991
992                 :"=m"(*d)
993                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
994                 :"memory");
995             /* borrowed from the 32-to-24 pack in rgb32to24 above */
996             __asm __volatile(
997                 "movq   %%mm0, %%mm4\n\t"
998                 "movq   %%mm3, %%mm5\n\t"
999                 "movq   %%mm6, %%mm0\n\t"
1000                 "movq   %%mm7, %%mm1\n\t"
1001                 
1002                 "movq   %%mm4, %%mm6\n\t"
1003                 "movq   %%mm5, %%mm7\n\t"
1004                 "movq   %%mm0, %%mm2\n\t"
1005                 "movq   %%mm1, %%mm3\n\t"
1006
1007                 "psrlq  $8, %%mm2\n\t"
1008                 "psrlq  $8, %%mm3\n\t"
1009                 "psrlq  $8, %%mm6\n\t"
1010                 "psrlq  $8, %%mm7\n\t"
1011                 "pand   %2, %%mm0\n\t"
1012                 "pand   %2, %%mm1\n\t"
1013                 "pand   %2, %%mm4\n\t"
1014                 "pand   %2, %%mm5\n\t"
1015                 "pand   %3, %%mm2\n\t"
1016                 "pand   %3, %%mm3\n\t"
1017                 "pand   %3, %%mm6\n\t"
1018                 "pand   %3, %%mm7\n\t"
1019                 "por    %%mm2, %%mm0\n\t"
1020                 "por    %%mm3, %%mm1\n\t"
1021                 "por    %%mm6, %%mm4\n\t"
1022                 "por    %%mm7, %%mm5\n\t"
1023
1024                 "movq   %%mm1, %%mm2\n\t"
1025                 "movq   %%mm4, %%mm3\n\t"
1026                 "psllq  $48, %%mm2\n\t"
1027                 "psllq  $32, %%mm3\n\t"
1028                 "pand   %4, %%mm2\n\t"
1029                 "pand   %5, %%mm3\n\t"
1030                 "por    %%mm2, %%mm0\n\t"
1031                 "psrlq  $16, %%mm1\n\t"
1032                 "psrlq  $32, %%mm4\n\t"
1033                 "psllq  $16, %%mm5\n\t"
1034                 "por    %%mm3, %%mm1\n\t"
1035                 "pand   %6, %%mm5\n\t"
1036                 "por    %%mm5, %%mm4\n\t"
1037
1038                 MOVNTQ" %%mm0, %0\n\t"
1039                 MOVNTQ" %%mm1, 8%0\n\t"
1040                 MOVNTQ" %%mm4, 16%0"
1041
1042                 :"=m"(*d)
1043                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1044                 :"memory");
1045                 d += 24;
1046                 s += 8;
1047         }
1048         __asm __volatile(SFENCE:::"memory");
1049         __asm __volatile(EMMS:::"memory");
1050 #endif
1051         while(s < end)
1052         {
1053                 register uint16_t bgr;
1054                 bgr = *s++;
1055                 *d++ = (bgr&0x1F)<<3;
1056                 *d++ = (bgr&0x3E0)>>2;
1057                 *d++ = (bgr&0x7C00)>>7;
1058         }
1059 }
1060
1061 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1062 {
1063         const uint16_t *end;
1064 #ifdef HAVE_MMX
1065         const uint16_t *mm_end;
1066 #endif
1067         uint8_t *d = (uint8_t *)dst;
1068         const uint16_t *s = (const uint16_t *)src;
1069         end = s + src_size/2;
1070 #ifdef HAVE_MMX
1071         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1072         mm_end = end - 7;
1073         while(s < mm_end)
1074         {
1075             __asm __volatile(
1076                 PREFETCH" 32%1\n\t"
1077                 "movq   %1, %%mm0\n\t"
1078                 "movq   %1, %%mm1\n\t"
1079                 "movq   %1, %%mm2\n\t"
1080                 "pand   %2, %%mm0\n\t"
1081                 "pand   %3, %%mm1\n\t"
1082                 "pand   %4, %%mm2\n\t"
1083                 "psllq  $3, %%mm0\n\t"
1084                 "psrlq  $3, %%mm1\n\t"
1085                 "psrlq  $8, %%mm2\n\t"
1086                 "movq   %%mm0, %%mm3\n\t"
1087                 "movq   %%mm1, %%mm4\n\t"
1088                 "movq   %%mm2, %%mm5\n\t"
1089                 "punpcklwd %5, %%mm0\n\t"
1090                 "punpcklwd %5, %%mm1\n\t"
1091                 "punpcklwd %5, %%mm2\n\t"
1092                 "punpckhwd %5, %%mm3\n\t"
1093                 "punpckhwd %5, %%mm4\n\t"
1094                 "punpckhwd %5, %%mm5\n\t"
1095                 "psllq  $8, %%mm1\n\t"
1096                 "psllq  $16, %%mm2\n\t"
1097                 "por    %%mm1, %%mm0\n\t"
1098                 "por    %%mm2, %%mm0\n\t"
1099                 "psllq  $8, %%mm4\n\t"
1100                 "psllq  $16, %%mm5\n\t"
1101                 "por    %%mm4, %%mm3\n\t"
1102                 "por    %%mm5, %%mm3\n\t"
1103                 
1104                 "movq   %%mm0, %%mm6\n\t"
1105                 "movq   %%mm3, %%mm7\n\t"
1106
1107                 "movq   8%1, %%mm0\n\t"
1108                 "movq   8%1, %%mm1\n\t"
1109                 "movq   8%1, %%mm2\n\t"
1110                 "pand   %2, %%mm0\n\t"
1111                 "pand   %3, %%mm1\n\t"
1112                 "pand   %4, %%mm2\n\t"
1113                 "psllq  $3, %%mm0\n\t"
1114                 "psrlq  $3, %%mm1\n\t"
1115                 "psrlq  $8, %%mm2\n\t"
1116                 "movq   %%mm0, %%mm3\n\t"
1117                 "movq   %%mm1, %%mm4\n\t"
1118                 "movq   %%mm2, %%mm5\n\t"
1119                 "punpcklwd %5, %%mm0\n\t"
1120                 "punpcklwd %5, %%mm1\n\t"
1121                 "punpcklwd %5, %%mm2\n\t"
1122                 "punpckhwd %5, %%mm3\n\t"
1123                 "punpckhwd %5, %%mm4\n\t"
1124                 "punpckhwd %5, %%mm5\n\t"
1125                 "psllq  $8, %%mm1\n\t"
1126                 "psllq  $16, %%mm2\n\t"
1127                 "por    %%mm1, %%mm0\n\t"
1128                 "por    %%mm2, %%mm0\n\t"
1129                 "psllq  $8, %%mm4\n\t"
1130                 "psllq  $16, %%mm5\n\t"
1131                 "por    %%mm4, %%mm3\n\t"
1132                 "por    %%mm5, %%mm3\n\t"
1133                 :"=m"(*d)
1134                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1135                 :"memory");
1136             /* borrowed from the 32-to-24 pack in rgb32to24 above */
1137             __asm __volatile(
1138                 "movq   %%mm0, %%mm4\n\t"
1139                 "movq   %%mm3, %%mm5\n\t"
1140                 "movq   %%mm6, %%mm0\n\t"
1141                 "movq   %%mm7, %%mm1\n\t"
1142                 
1143                 "movq   %%mm4, %%mm6\n\t"
1144                 "movq   %%mm5, %%mm7\n\t"
1145                 "movq   %%mm0, %%mm2\n\t"
1146                 "movq   %%mm1, %%mm3\n\t"
1147
1148                 "psrlq  $8, %%mm2\n\t"
1149                 "psrlq  $8, %%mm3\n\t"
1150                 "psrlq  $8, %%mm6\n\t"
1151                 "psrlq  $8, %%mm7\n\t"
1152                 "pand   %2, %%mm0\n\t"
1153                 "pand   %2, %%mm1\n\t"
1154                 "pand   %2, %%mm4\n\t"
1155                 "pand   %2, %%mm5\n\t"
1156                 "pand   %3, %%mm2\n\t"
1157                 "pand   %3, %%mm3\n\t"
1158                 "pand   %3, %%mm6\n\t"
1159                 "pand   %3, %%mm7\n\t"
1160                 "por    %%mm2, %%mm0\n\t"
1161                 "por    %%mm3, %%mm1\n\t"
1162                 "por    %%mm6, %%mm4\n\t"
1163                 "por    %%mm7, %%mm5\n\t"
1164
1165                 "movq   %%mm1, %%mm2\n\t"
1166                 "movq   %%mm4, %%mm3\n\t"
1167                 "psllq  $48, %%mm2\n\t"
1168                 "psllq  $32, %%mm3\n\t"
1169                 "pand   %4, %%mm2\n\t"
1170                 "pand   %5, %%mm3\n\t"
1171                 "por    %%mm2, %%mm0\n\t"
1172                 "psrlq  $16, %%mm1\n\t"
1173                 "psrlq  $32, %%mm4\n\t"
1174                 "psllq  $16, %%mm5\n\t"
1175                 "por    %%mm3, %%mm1\n\t"
1176                 "pand   %6, %%mm5\n\t"
1177                 "por    %%mm5, %%mm4\n\t"
1178
1179                 MOVNTQ" %%mm0, %0\n\t"
1180                 MOVNTQ" %%mm1, 8%0\n\t"
1181                 MOVNTQ" %%mm4, 16%0"
1182
1183                 :"=m"(*d)
1184                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1185                 :"memory");
1186                 d += 24;
1187                 s += 8;
1188         }
1189         __asm __volatile(SFENCE:::"memory");
1190         __asm __volatile(EMMS:::"memory");
1191 #endif
1192         while(s < end)
1193         {
1194                 register uint16_t bgr;
1195                 bgr = *s++;
1196                 *d++ = (bgr&0x1F)<<3;
1197                 *d++ = (bgr&0x7E0)>>3;
1198                 *d++ = (bgr&0xF800)>>8;
1199         }
1200 }
1201
1202 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1203 {
1204         const uint16_t *end;
1205 #ifdef HAVE_MMX
1206         const uint16_t *mm_end;
1207 #endif
1208         uint8_t *d = (uint8_t *)dst;
1209         const uint16_t *s = (const uint16_t *)src;
1210         end = s + src_size/2;
1211 #ifdef HAVE_MMX
1212         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1213         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1214         mm_end = end - 3;
1215         while(s < mm_end)
1216         {
1217             __asm __volatile(
1218                 PREFETCH" 32%1\n\t"
1219                 "movq   %1, %%mm0\n\t"
1220                 "movq   %1, %%mm1\n\t"
1221                 "movq   %1, %%mm2\n\t"
1222                 "pand   %2, %%mm0\n\t"
1223                 "pand   %3, %%mm1\n\t"
1224                 "pand   %4, %%mm2\n\t"
1225                 "psllq  $3, %%mm0\n\t"
1226                 "psrlq  $2, %%mm1\n\t"
1227                 "psrlq  $7, %%mm2\n\t"
1228                 "movq   %%mm0, %%mm3\n\t"
1229                 "movq   %%mm1, %%mm4\n\t"
1230                 "movq   %%mm2, %%mm5\n\t"
1231                 "punpcklwd %%mm7, %%mm0\n\t"
1232                 "punpcklwd %%mm7, %%mm1\n\t"
1233                 "punpcklwd %%mm7, %%mm2\n\t"
1234                 "punpckhwd %%mm7, %%mm3\n\t"
1235                 "punpckhwd %%mm7, %%mm4\n\t"
1236                 "punpckhwd %%mm7, %%mm5\n\t"
1237                 "psllq  $8, %%mm1\n\t"
1238                 "psllq  $16, %%mm2\n\t"
1239                 "por    %%mm1, %%mm0\n\t"
1240                 "por    %%mm2, %%mm0\n\t"
1241                 "psllq  $8, %%mm4\n\t"
1242                 "psllq  $16, %%mm5\n\t"
1243                 "por    %%mm4, %%mm3\n\t"
1244                 "por    %%mm5, %%mm3\n\t"
1245                 MOVNTQ" %%mm0, %0\n\t"
1246                 MOVNTQ" %%mm3, 8%0\n\t"
1247                 :"=m"(*d)
1248                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1249                 :"memory");
1250                 d += 16;
1251                 s += 4;
1252         }
1253         __asm __volatile(SFENCE:::"memory");
1254         __asm __volatile(EMMS:::"memory");
1255 #endif
1256         while(s < end)
1257         {
1258 #if 0 //slightly slower on athlon
1259                 int bgr= *s++;
1260                 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1261 #else
1262                 register uint16_t bgr;
1263                 bgr = *s++;
1264 #ifdef WORDS_BIGENDIAN
1265                 *d++ = 0;
1266                 *d++ = (bgr&0x7C00)>>7;
1267                 *d++ = (bgr&0x3E0)>>2;
1268                 *d++ = (bgr&0x1F)<<3;
1269 #else
1270                 *d++ = (bgr&0x1F)<<3;
1271                 *d++ = (bgr&0x3E0)>>2;
1272                 *d++ = (bgr&0x7C00)>>7;
1273                 *d++ = 0;
1274 #endif
1275
1276 #endif
1277         }
1278 }
1279
1280 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1281 {
1282         const uint16_t *end;
1283 #ifdef HAVE_MMX
1284         const uint16_t *mm_end;
1285 #endif
1286         uint8_t *d = (uint8_t *)dst;
1287         const uint16_t *s = (uint16_t *)src;
1288         end = s + src_size/2;
1289 #ifdef HAVE_MMX
1290         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1291         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1292         mm_end = end - 3;
1293         while(s < mm_end)
1294         {
1295             __asm __volatile(
1296                 PREFETCH" 32%1\n\t"
1297                 "movq   %1, %%mm0\n\t"
1298                 "movq   %1, %%mm1\n\t"
1299                 "movq   %1, %%mm2\n\t"
1300                 "pand   %2, %%mm0\n\t"
1301                 "pand   %3, %%mm1\n\t"
1302                 "pand   %4, %%mm2\n\t"
1303                 "psllq  $3, %%mm0\n\t"
1304                 "psrlq  $3, %%mm1\n\t"
1305                 "psrlq  $8, %%mm2\n\t"
1306                 "movq   %%mm0, %%mm3\n\t"
1307                 "movq   %%mm1, %%mm4\n\t"
1308                 "movq   %%mm2, %%mm5\n\t"
1309                 "punpcklwd %%mm7, %%mm0\n\t"
1310                 "punpcklwd %%mm7, %%mm1\n\t"
1311                 "punpcklwd %%mm7, %%mm2\n\t"
1312                 "punpckhwd %%mm7, %%mm3\n\t"
1313                 "punpckhwd %%mm7, %%mm4\n\t"
1314                 "punpckhwd %%mm7, %%mm5\n\t"
1315                 "psllq  $8, %%mm1\n\t"
1316                 "psllq  $16, %%mm2\n\t"
1317                 "por    %%mm1, %%mm0\n\t"
1318                 "por    %%mm2, %%mm0\n\t"
1319                 "psllq  $8, %%mm4\n\t"
1320                 "psllq  $16, %%mm5\n\t"
1321                 "por    %%mm4, %%mm3\n\t"
1322                 "por    %%mm5, %%mm3\n\t"
1323                 MOVNTQ" %%mm0, %0\n\t"
1324                 MOVNTQ" %%mm3, 8%0\n\t"
1325                 :"=m"(*d)
1326                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1327                 :"memory");
1328                 d += 16;
1329                 s += 4;
1330         }
1331         __asm __volatile(SFENCE:::"memory");
1332         __asm __volatile(EMMS:::"memory");
1333 #endif
1334         while(s < end)
1335         {
1336                 register uint16_t bgr;
1337                 bgr = *s++;
1338 #ifdef WORDS_BIGENDIAN
1339                 *d++ = 0;
1340                 *d++ = (bgr&0xF800)>>8;
1341                 *d++ = (bgr&0x7E0)>>3;
1342                 *d++ = (bgr&0x1F)<<3;
1343 #else
1344                 *d++ = (bgr&0x1F)<<3;
1345                 *d++ = (bgr&0x7E0)>>3;
1346                 *d++ = (bgr&0xF800)>>8;
1347                 *d++ = 0;
1348 #endif
1349         }
1350 }
1351
1352 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1353 {
1354 #ifdef HAVE_MMX
1355 /* TODO: unroll this loop */
1356         asm volatile (
1357                 "xor %%"REG_a", %%"REG_a"       \n\t"
1358                 ASMALIGN16
1359                 "1:                             \n\t"
1360                 PREFETCH" 32(%0, %%"REG_a")     \n\t"
1361                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
1362                 "movq %%mm0, %%mm1              \n\t"
1363                 "movq %%mm0, %%mm2              \n\t"
1364                 "pslld $16, %%mm0               \n\t"
1365                 "psrld $16, %%mm1               \n\t"
1366                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
1367                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
1368                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
1369                 "por %%mm0, %%mm2               \n\t"
1370                 "por %%mm1, %%mm2               \n\t"
1371                 MOVNTQ" %%mm2, (%1, %%"REG_a")  \n\t"
1372                 "add $8, %%"REG_a"              \n\t"
1373                 "cmp %2, %%"REG_a"              \n\t"
1374                 " jb 1b                         \n\t"
1375                 :: "r" (src), "r"(dst), "r" (src_size-7)
1376                 : "%"REG_a
1377         );
1378
1379         __asm __volatile(SFENCE:::"memory");
1380         __asm __volatile(EMMS:::"memory");
1381 #else
1382         unsigned i;
1383         unsigned num_pixels = src_size >> 2;
1384         for(i=0; i<num_pixels; i++)
1385         {
1386 #ifdef WORDS_BIGENDIAN  
1387           dst[4*i + 1] = src[4*i + 3];
1388           dst[4*i + 2] = src[4*i + 2];
1389           dst[4*i + 3] = src[4*i + 1];
1390 #else
1391           dst[4*i + 0] = src[4*i + 2];
1392           dst[4*i + 1] = src[4*i + 1];
1393           dst[4*i + 2] = src[4*i + 0];
1394 #endif
1395         }
1396 #endif
1397 }
1398
1399 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1400 {
1401         unsigned i;
1402 #ifdef HAVE_MMX
1403         long mmx_size= 23 - src_size;
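        /* The asm below runs the counter register from this (negative) value up
           toward zero in 24-byte steps, with the src/dst operands pre-biased by
           -mmx_size, so the single "js 1b" both counts and addresses; afterwards
           23 - mmx_size is the number of tail bytes left for the scalar loop. */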
1404         asm volatile (
1405                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1406                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1407                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1408                 ASMALIGN16
1409                 "1:                             \n\t"
1410                 PREFETCH" 32(%1, %%"REG_a")     \n\t"
1411                 "movq   (%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1412                 "movq   (%1, %%"REG_a"), %%mm1  \n\t" // BGR BGR BG
1413                 "movq  2(%1, %%"REG_a"), %%mm2  \n\t" // R BGR BGR B
1414                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1415                 "pand %%mm5, %%mm0              \n\t"
1416                 "pand %%mm6, %%mm1              \n\t"
1417                 "pand %%mm7, %%mm2              \n\t"
1418                 "por %%mm0, %%mm1               \n\t"
1419                 "por %%mm2, %%mm1               \n\t"                
1420                 "movq  6(%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1421                 MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
1422                 "movq  8(%1, %%"REG_a"), %%mm1  \n\t" // R BGR BGR B
1423                 "movq 10(%1, %%"REG_a"), %%mm2  \n\t" // GR BGR BGR
1424                 "pand %%mm7, %%mm0              \n\t"
1425                 "pand %%mm5, %%mm1              \n\t"
1426                 "pand %%mm6, %%mm2              \n\t"
1427                 "por %%mm0, %%mm1               \n\t"
1428                 "por %%mm2, %%mm1               \n\t"                
1429                 "movq 14(%1, %%"REG_a"), %%mm0  \n\t" // R BGR BGR B
1430                 MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
1431                 "movq 16(%1, %%"REG_a"), %%mm1  \n\t" // GR BGR BGR
1432                 "movq 18(%1, %%"REG_a"), %%mm2  \n\t" // BGR BGR BG
1433                 "pand %%mm6, %%mm0              \n\t"
1434                 "pand %%mm7, %%mm1              \n\t"
1435                 "pand %%mm5, %%mm2              \n\t"
1436                 "por %%mm0, %%mm1               \n\t"
1437                 "por %%mm2, %%mm1               \n\t"                
1438                 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1439                 "add $24, %%"REG_a"             \n\t"
1440                 " js 1b                         \n\t"
1441                 : "+a" (mmx_size)
1442                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1443         );
1444
1445         __asm __volatile(SFENCE:::"memory");
1446         __asm __volatile(EMMS:::"memory");
1447
1448         if(mmx_size==23) return; // finished, input was a multiple of 8 pixels (24 bytes)
1449
1450         src+= src_size;
1451         dst+= src_size;
1452         src_size= 23-mmx_size;
1453         src-= src_size;
1454         dst-= src_size;
1455 #endif
1456         for(i=0; i<src_size; i+=3)
1457         {
1458                 register uint8_t x;
1459                 x          = src[i + 2];
1460                 dst[i + 1] = src[i + 1];
1461                 dst[i + 2] = src[i + 0];
1462                 dst[i + 0] = x;
1463         }
1464 }
1465
1466 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1467         long width, long height,
1468         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1469 {
1470         long y;
1471         const long chromWidth= width>>1;
1472         for(y=0; y<height; y++)
1473         {
1474 #ifdef HAVE_MMX
1475 //FIXME: handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-bound anyway)
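                /* Each iteration of the loop below reads 16 Y, 8 U and 8 V bytes and
                   interleaves them into 32 output bytes of YUYV (Y0 U0 Y1 V0 ...). */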
1476                 asm volatile(
1477                         "xor %%"REG_a", %%"REG_a"       \n\t"
1478                         ASMALIGN16
1479                         "1:                             \n\t"
1480                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1481                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1482                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1483                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1484                         "movq %%mm0, %%mm2              \n\t" // U(0)
1485                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1486                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1487                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1488
1489                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1490                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1491                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1492                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1493                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1494                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1495                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1496                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1497
1498                         MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1499                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1500                         MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1501                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1502
1503                         "add $8, %%"REG_a"              \n\t"
1504                         "cmp %4, %%"REG_a"              \n\t"
1505                         " jb 1b                         \n\t"
1506                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1507                         : "%"REG_a
1508                 );
1509 #else
1510
1511 #if defined ARCH_ALPHA && defined HAVE_MVI
1512 #define pl2yuy2(n)                                      \
1513         y1 = yc[n];                                     \
1514         y2 = yc2[n];                                    \
1515         u = uc[n];                                      \
1516         v = vc[n];                                      \
1517         asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
1518         asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
1519         asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1520         asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1521         yuv1 = (u << 8) + (v << 24);                    \
1522         yuv2 = yuv1 + y2;                               \
1523         yuv1 += y1;                                     \
1524         qdst[n] = yuv1;                                 \
1525         qdst2[n] = yuv2;
1526
1527                 int i;
1528                 uint64_t *qdst = (uint64_t *) dst;
1529                 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1530                 const uint32_t *yc = (uint32_t *) ysrc;
1531                 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1532                 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1533                 for(i = 0; i < chromWidth; i += 8){
1534                         uint64_t y1, y2, yuv1, yuv2;
1535                         uint64_t u, v;
1536                         /* Prefetch */
1537                         asm("ldq $31,64(%0)" :: "r"(yc));
1538                         asm("ldq $31,64(%0)" :: "r"(yc2));
1539                         asm("ldq $31,64(%0)" :: "r"(uc));
1540                         asm("ldq $31,64(%0)" :: "r"(vc));
1541
1542                         pl2yuy2(0);
1543                         pl2yuy2(1);
1544                         pl2yuy2(2);
1545                         pl2yuy2(3);
1546
1547                         yc += 4;
1548                         yc2 += 4;
1549                         uc += 4;
1550                         vc += 4;
1551                         qdst += 4;
1552                         qdst2 += 4;
1553                 }
1554                 y++;
1555                 ysrc += lumStride;
1556                 dst += dstStride;
1557
1558 #elif __WORDSIZE >= 64
1559                 int i;
1560                 uint64_t *ldst = (uint64_t *) dst;
1561                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1562                 for(i = 0; i < chromWidth; i += 2){
1563                         uint64_t k, l;
1564                         k = yc[0] + (uc[0] << 8) +
1565                             (yc[1] << 16) + (vc[0] << 24);
1566                         l = yc[2] + (uc[1] << 8) +
1567                             (yc[3] << 16) + (vc[1] << 24);
1568                         *ldst++ = k + (l << 32);
1569                         yc += 4;
1570                         uc += 2;
1571                         vc += 2;
1572                 }
1573
1574 #else
1575                 int i, *idst = (int32_t *) dst;
1576                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1577                 for(i = 0; i < chromWidth; i++){
1578 #ifdef WORDS_BIGENDIAN
1579                         *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1580                             (yc[1] << 8) + (vc[0] << 0);
1581 #else
1582                         *idst++ = yc[0] + (uc[0] << 8) +
1583                             (yc[1] << 16) + (vc[0] << 24);
1584 #endif
1585                         yc += 2;
1586                         uc++;
1587                         vc++;
1588                 }
1589 #endif
1590 #endif
1591                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1592                 {
1593                         usrc += chromStride;
1594                         vsrc += chromStride;
1595                 }
1596                 ysrc += lumStride;
1597                 dst += dstStride;
1598         }
1599 #ifdef HAVE_MMX
1600 asm(    EMMS" \n\t"
1601         SFENCE" \n\t"
1602         :::"memory");
1603 #endif
1604 }
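/*
 * vertLumPerChroma is the number of luma lines per chroma line (2 for
 * YV12 input, 1 for YUV422P input). The test
 * (y&(vertLumPerChroma-1))==(vertLumPerChroma-1) above advances
 * usrc/vsrc only after that many luma lines, so vertLumPerChroma is
 * assumed to be a power of two.
 */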
1605
1606 /**
1607  *
1608  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1609  * problem for anyone then tell me, and I'll fix it)
1610  */
1611 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1612         long width, long height,
1613         long lumStride, long chromStride, long dstStride)
1614 {
1615         //FIXME interpolate chroma
1616         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1617 }
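/*
 * Usage sketch (hypothetical buffer names, assuming tightly packed
 * planes; real callers pass the actual strides): packing a width x
 * height YV12 frame with planes y, u and v into a YUY2 buffer could
 * look like
 *
 *   RENAME(yv12toyuy2)(y, u, v, yuy2, width, height,
 *                      width,      // lumStride:   Y-plane stride in bytes
 *                      width/2,    // chromStride: U/V-plane stride in bytes
 *                      width*2);   // dstStride:   packed YUY2 stride in bytes
 */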
1618
1619 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1620         long width, long height,
1621         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1622 {
1623         long y;
1624         const long chromWidth= width>>1;
1625         for(y=0; y<height; y++)
1626         {
1627 #ifdef HAVE_MMX
1628 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1629                 asm volatile(
1630                         "xor %%"REG_a", %%"REG_a"       \n\t"
1631                         ASMALIGN16
1632                         "1:                             \n\t"
1633                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1634                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1635                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1636                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1637                         "movq %%mm0, %%mm2              \n\t" // U(0)
1638                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1639                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1640                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1641
1642                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1643                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1644                         "movq %%mm0, %%mm4              \n\t" // UVUV UVUV(0)
1645                         "movq %%mm2, %%mm6              \n\t" // UVUV UVUV(8)
1646                         "punpcklbw %%mm3, %%mm0         \n\t" // UYVY UYVY(0)
1647                         "punpckhbw %%mm3, %%mm4         \n\t" // UYVY UYVY(4)
1648                         "punpcklbw %%mm5, %%mm2         \n\t" // UYVY UYVY(8)
1649                         "punpckhbw %%mm5, %%mm6         \n\t" // UYVY UYVY(12)
1650
1651                         MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1652                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1653                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1654                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1655
1656                         "add $8, %%"REG_a"              \n\t"
1657                         "cmp %4, %%"REG_a"              \n\t"
1658                         " jb 1b                         \n\t"
1659                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1660                         : "%"REG_a
1661                 );
1662 #else
1663 //FIXME adapt the alpha asm code from yv12->yuy2
1664
1665 #if __WORDSIZE >= 64
1666                 int i;
1667                 uint64_t *ldst = (uint64_t *) dst;
1668                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1669                 for(i = 0; i < chromWidth; i += 2){
1670                         uint64_t k, l;
1671                         k = uc[0] + (yc[0] << 8) +
1672                             (vc[0] << 16) + (yc[1] << 24);
1673                         l = uc[1] + (yc[2] << 8) +
1674                             (vc[1] << 16) + (yc[3] << 24);
1675                         *ldst++ = k + (l << 32);
1676                         yc += 4;
1677                         uc += 2;
1678                         vc += 2;
1679                 }
1680
1681 #else
1682                 int i, *idst = (int32_t *) dst;
1683                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1684                 for(i = 0; i < chromWidth; i++){
1685 #ifdef WORDS_BIGENDIAN
1686                         *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1687                             (vc[0] << 8) + (yc[1] << 0);
1688 #else
1689                         *idst++ = uc[0] + (yc[0] << 8) +
1690                             (vc[0] << 16) + (yc[1] << 24);
1691 #endif
1692                         yc += 2;
1693                         uc++;
1694                         vc++;
1695                 }
1696 #endif
1697 #endif
1698                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1699                 {
1700                         usrc += chromStride;
1701                         vsrc += chromStride;
1702                 }
1703                 ysrc += lumStride;
1704                 dst += dstStride;
1705         }
1706 #ifdef HAVE_MMX
1707 asm(    EMMS" \n\t"
1708         SFENCE" \n\t"
1709         :::"memory");
1710 #endif
1711 }
1712
1713 /**
1714  *
1715  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1716  * problem for anyone then tell me, and I'll fix it)
1717  */
1718 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1719         long width, long height,
1720         long lumStride, long chromStride, long dstStride)
1721 {
1722         //FIXME interpolate chroma
1723         RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1724 }
1725
1726 /**
1727  *
1728  * width should be a multiple of 16
1729  */
1730 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1731         long width, long height,
1732         long lumStride, long chromStride, long dstStride)
1733 {
1734         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1735 }
1736
1737 /**
1738  *
1739  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1740  * problem for anyone then tell me, and I'll fix it)
1741  */
1742 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1743         long width, long height,
1744         long lumStride, long chromStride, long srcStride)
1745 {
1746         long y;
1747         const long chromWidth= width>>1;
1748         for(y=0; y<height; y+=2)
1749         {
1750 #ifdef HAVE_MMX
1751                 asm volatile(
1752                         "xor %%"REG_a", %%"REG_a"       \n\t"
1753                         "pcmpeqw %%mm7, %%mm7           \n\t"
1754                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1755                         ASMALIGN16
1756                         "1:                             \n\t"
1757                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1758                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1759                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1760                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1761                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1762                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1763                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1764                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1765                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1766                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1767                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1768
1769                         MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1770
1771                         "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1772                         "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1773                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1774                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1775                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1776                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1777                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1778                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1779                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1780                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1781
1782                         MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1783
1784                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1785                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1786                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1787                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1788                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1789                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1790                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1791                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1792
1793                         MOVNTQ" %%mm0, (%3, %%"REG_a")  \n\t"
1794                         MOVNTQ" %%mm2, (%2, %%"REG_a")  \n\t"
1795
1796                         "add $8, %%"REG_a"              \n\t"
1797                         "cmp %4, %%"REG_a"              \n\t"
1798                         " jb 1b                         \n\t"
1799                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1800                         : "memory", "%"REG_a
1801                 );
1802
1803                 ydst += lumStride;
1804                 src  += srcStride;
1805
1806                 asm volatile(
1807                         "xor %%"REG_a", %%"REG_a"       \n\t"
1808                         ASMALIGN16
1809                         "1:                             \n\t"
1810                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1811                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1812                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1813                         "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1814                         "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1815                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1816                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1817                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1818                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1819                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1820                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1821
1822                         MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1823                         MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1824
1825                         "add $8, %%"REG_a"              \n\t"
1826                         "cmp %4, %%"REG_a"              \n\t"
1827                         " jb 1b                         \n\t"
1828
1829                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1830                         : "memory", "%"REG_a
1831                 );
1832 #else
1833                 long i;
1834                 for(i=0; i<chromWidth; i++)
1835                 {
1836                         ydst[2*i+0]     = src[4*i+0];
1837                         udst[i]         = src[4*i+1];
1838                         ydst[2*i+1]     = src[4*i+2];
1839                         vdst[i]         = src[4*i+3];
1840                 }
1841                 ydst += lumStride;
1842                 src  += srcStride;
1843
1844                 for(i=0; i<chromWidth; i++)
1845                 {
1846                         ydst[2*i+0]     = src[4*i+0];
1847                         ydst[2*i+1]     = src[4*i+2];
1848                 }
1849 #endif
1850                 udst += chromStride;
1851                 vdst += chromStride;
1852                 ydst += lumStride;
1853                 src  += srcStride;
1854         }
1855 #ifdef HAVE_MMX
1856 asm volatile(   EMMS" \n\t"
1857                 SFENCE" \n\t"
1858                 :::"memory");
1859 #endif
1860 }
1861
1862 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1863         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1864         long width, long height, long lumStride, long chromStride)
1865 {
1866         /* Y Plane */
1867         memcpy(ydst, ysrc, width*height);
1868
1869         /* XXX: implement upscaling for U,V */
1870 }
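/*
 * A minimal sketch of the missing U/V upscaling (nearest-neighbour
 * doubling from the 4x4-subsampled chroma of YVU9 to the 2x2
 * subsampling of YV12). It assumes chromStride is the destination
 * chroma stride and that the source chroma planes are tightly packed
 * at width/4 bytes per line; illustration only, not the compiled path:
 *
 *   long x, y;
 *   for(y=0; y<height/2; y++)
 *       for(x=0; x<width/2; x++)
 *       {
 *           udst[y*chromStride + x]= usrc[(y/2)*(width/4) + x/2];
 *           vdst[y*chromStride + x]= vsrc[(y/2)*(width/4) + x/2];
 *       }
 */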
1871
1872 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1873 {
1874         long x,y;
1875         
1876         dst[0]= src[0];
1877         
1878         // first line
1879         for(x=0; x<srcWidth-1; x++){
1880                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1881                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1882         }
1883         dst[2*srcWidth-1]= src[srcWidth-1];
1884         
1885         dst+= dstStride;
1886
1887         for(y=1; y<srcHeight; y++){
1888 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1889                 const long mmxSize= srcWidth&~15;
1890                 asm volatile(
1891                         "mov %4, %%"REG_a"              \n\t"
1892                         "1:                             \n\t"
1893                         "movq (%0, %%"REG_a"), %%mm0    \n\t"
1894                         "movq (%1, %%"REG_a"), %%mm1    \n\t"
1895                         "movq 1(%0, %%"REG_a"), %%mm2   \n\t"
1896                         "movq 1(%1, %%"REG_a"), %%mm3   \n\t"
1897                         "movq -1(%0, %%"REG_a"), %%mm4  \n\t"
1898                         "movq -1(%1, %%"REG_a"), %%mm5  \n\t"
1899                         PAVGB" %%mm0, %%mm5             \n\t"
1900                         PAVGB" %%mm0, %%mm3             \n\t"
1901                         PAVGB" %%mm0, %%mm5             \n\t"
1902                         PAVGB" %%mm0, %%mm3             \n\t"
1903                         PAVGB" %%mm1, %%mm4             \n\t"
1904                         PAVGB" %%mm1, %%mm2             \n\t"
1905                         PAVGB" %%mm1, %%mm4             \n\t"
1906                         PAVGB" %%mm1, %%mm2             \n\t"
1907                         "movq %%mm5, %%mm7              \n\t"
1908                         "movq %%mm4, %%mm6              \n\t"
1909                         "punpcklbw %%mm3, %%mm5         \n\t"
1910                         "punpckhbw %%mm3, %%mm7         \n\t"
1911                         "punpcklbw %%mm2, %%mm4         \n\t"
1912                         "punpckhbw %%mm2, %%mm6         \n\t"
1913 #if 1
1914                         MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1915                         MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1916                         MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1917                         MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1918 #else
1919                         "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1920                         "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1921                         "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1922                         "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1923 #endif
1924                         "add $8, %%"REG_a"              \n\t"
1925                         " js 1b                         \n\t"
1926                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1927                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1928                            "g" (-mmxSize)
1929                         : "%"REG_a
1930
1931                 );
1932 #else
1933                 const long mmxSize=1;
1934 #endif
1935                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1936                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1937
1938                 for(x=mmxSize-1; x<srcWidth-1; x++){
1939                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1940                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1941                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1942                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1943                 }
1944                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1945                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1946
1947                 dst+=dstStride*2;
1948                 src+=srcStride;
1949         }
1950         
1951         // last line
1952 #if 1
1953         dst[0]= src[0];
1954         
1955         for(x=0; x<srcWidth-1; x++){
1956                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1957                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1958         }
1959         dst[2*srcWidth-1]= src[srcWidth-1];
1960 #else
1961         for(x=0; x<srcWidth; x++){
1962                 dst[2*x+0]=
1963                 dst[2*x+1]= src[x];
1964         }
1965 #endif
1966
1967 #ifdef HAVE_MMX
1968 asm volatile(   EMMS" \n\t"
1969                 SFENCE" \n\t"
1970                 :::"memory");
1971 #endif
1972 }
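/*
 * The 2x upscaling above uses 3:1 bilinear weights: for neighbouring
 * source samples a and b, the two output samples between them are
 * (3*a + b)>>2 and (a + 3*b)>>2, i.e. the new sample sites sit at the
 * quarter positions, both horizontally and vertically. The MMX2/3DNow
 * path approximates the same weights with two chained PAVGB averages,
 * which introduces a small rounding difference versus the C code.
 */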
1973
1974 /**
1975  *
1976  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1977  * problem for anyone then tell me, and I'll fix it)
1978  * chrominance data is only taken from every second line, others are ignored; FIXME: write HQ version
1979  */
1980 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1981         long width, long height,
1982         long lumStride, long chromStride, long srcStride)
1983 {
1984         long y;
1985         const long chromWidth= width>>1;
1986         for(y=0; y<height; y+=2)
1987         {
1988 #ifdef HAVE_MMX
1989                 asm volatile(
1990                         "xorl %%eax, %%eax              \n\t"
1991                         "pcmpeqw %%mm7, %%mm7           \n\t"
1992                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1993                         ASMALIGN16
1994                         "1:                             \n\t"
1995                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1996                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1997                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1998                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
1999                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
2000                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
2001                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
2002                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
2003                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
2004                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
2005                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
2006
2007                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
2008
2009                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
2010                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
2011                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
2012                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
2013                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
2014                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
2015                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
2016                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
2017                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
2018                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
2019
2020                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
2021
2022                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
2023                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
2024                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
2025                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
2026                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
2027                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
2028                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
2029                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
2030
2031                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
2032                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
2033
2034                         "addl $8, %%eax                 \n\t"
2035                         "cmpl %4, %%eax                 \n\t"
2036                         " jb 1b                         \n\t"
2037                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2038                         : "memory", "%eax"
2039                 );
2040
2041                 ydst += lumStride;
2042                 src  += srcStride;
2043
2044                 asm volatile(
2045                         "xorl %%eax, %%eax              \n\t"
2046                         ASMALIGN16
2047                         "1:                             \n\t"
2048                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
2049                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
2050                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
2051                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(8)
2052                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // UYVY UYVY(12)
2053                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2054                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2055                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2056                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2057                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
2058                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
2059
2060                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
2061                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
2062
2063                         "addl $8, %%eax                 \n\t"
2064                         "cmpl %4, %%eax                 \n\t"
2065                         " jb 1b                         \n\t"
2066
2067                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2068                         : "memory", "%eax"
2069                 );
2070 #else
2071                 long i;
2072                 for(i=0; i<chromWidth; i++)
2073                 {
2074                         udst[i]         = src[4*i+0];
2075                         ydst[2*i+0]     = src[4*i+1];
2076                         vdst[i]         = src[4*i+2];
2077                         ydst[2*i+1]     = src[4*i+3];
2078                 }
2079                 ydst += lumStride;
2080                 src  += srcStride;
2081
2082                 for(i=0; i<chromWidth; i++)
2083                 {
2084                         ydst[2*i+0]     = src[4*i+1];
2085                         ydst[2*i+1]     = src[4*i+3];
2086                 }
2087 #endif
2088                 udst += chromStride;
2089                 vdst += chromStride;
2090                 ydst += lumStride;
2091                 src  += srcStride;
2092         }
2093 #ifdef HAVE_MMX
2094 asm volatile(   EMMS" \n\t"
2095                 SFENCE" \n\t"
2096                 :::"memory");
2097 #endif
2098 }
2099
2100 /**
2101  *
2102  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2103  * problem for anyone then tell me, and I'll fix it)
2104  * chrominance data is only taken from every second line, others are ignored in the C version; FIXME: write HQ version
2105  */
2106 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2107         long width, long height,
2108         long lumStride, long chromStride, long srcStride)
2109 {
2110         long y;
2111         const long chromWidth= width>>1;
2112 #ifdef HAVE_MMX
2113         for(y=0; y<height-2; y+=2)
2114         {
2115                 long i;
2116                 for(i=0; i<2; i++)
2117                 {
2118                         asm volatile(
2119                                 "mov %2, %%"REG_a"              \n\t"
2120                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
2121                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
2122                                 "pxor %%mm7, %%mm7              \n\t"
2123                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2124                                 ASMALIGN16
2125                                 "1:                             \n\t"
2126                                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
2127                                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
2128                                 "movd 3(%0, %%"REG_b"), %%mm1   \n\t"
2129                                 "punpcklbw %%mm7, %%mm0         \n\t"
2130                                 "punpcklbw %%mm7, %%mm1         \n\t"
2131                                 "movd 6(%0, %%"REG_b"), %%mm2   \n\t"
2132                                 "movd 9(%0, %%"REG_b"), %%mm3   \n\t"
2133                                 "punpcklbw %%mm7, %%mm2         \n\t"
2134                                 "punpcklbw %%mm7, %%mm3         \n\t"
2135                                 "pmaddwd %%mm6, %%mm0           \n\t"
2136                                 "pmaddwd %%mm6, %%mm1           \n\t"
2137                                 "pmaddwd %%mm6, %%mm2           \n\t"
2138                                 "pmaddwd %%mm6, %%mm3           \n\t"
2139 #ifndef FAST_BGR2YV12
2140                                 "psrad $8, %%mm0                \n\t"
2141                                 "psrad $8, %%mm1                \n\t"
2142                                 "psrad $8, %%mm2                \n\t"
2143                                 "psrad $8, %%mm3                \n\t"
2144 #endif
2145                                 "packssdw %%mm1, %%mm0          \n\t"
2146                                 "packssdw %%mm3, %%mm2          \n\t"
2147                                 "pmaddwd %%mm5, %%mm0           \n\t"
2148                                 "pmaddwd %%mm5, %%mm2           \n\t"
2149                                 "packssdw %%mm2, %%mm0          \n\t"
2150                                 "psraw $7, %%mm0                \n\t"
2151
2152                                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
2153                                 "movd 15(%0, %%"REG_b"), %%mm1  \n\t"
2154                                 "punpcklbw %%mm7, %%mm4         \n\t"
2155                                 "punpcklbw %%mm7, %%mm1         \n\t"
2156                                 "movd 18(%0, %%"REG_b"), %%mm2  \n\t"
2157                                 "movd 21(%0, %%"REG_b"), %%mm3  \n\t"
2158                                 "punpcklbw %%mm7, %%mm2         \n\t"
2159                                 "punpcklbw %%mm7, %%mm3         \n\t"
2160                                 "pmaddwd %%mm6, %%mm4           \n\t"
2161                                 "pmaddwd %%mm6, %%mm1           \n\t"
2162                                 "pmaddwd %%mm6, %%mm2           \n\t"
2163                                 "pmaddwd %%mm6, %%mm3           \n\t"
2164 #ifndef FAST_BGR2YV12
2165                                 "psrad $8, %%mm4                \n\t"
2166                                 "psrad $8, %%mm1                \n\t"
2167                                 "psrad $8, %%mm2                \n\t"
2168                                 "psrad $8, %%mm3                \n\t"
2169 #endif
2170                                 "packssdw %%mm1, %%mm4          \n\t"
2171                                 "packssdw %%mm3, %%mm2          \n\t"
2172                                 "pmaddwd %%mm5, %%mm4           \n\t"
2173                                 "pmaddwd %%mm5, %%mm2           \n\t"
2174                                 "add $24, %%"REG_b"             \n\t"
2175                                 "packssdw %%mm2, %%mm4          \n\t"
2176                                 "psraw $7, %%mm4                \n\t"
2177
2178                                 "packuswb %%mm4, %%mm0          \n\t"
2179                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
2180
2181                                 MOVNTQ" %%mm0, (%1, %%"REG_a")  \n\t"
2182                                 "add $8, %%"REG_a"              \n\t"
2183                                 " js 1b                         \n\t"
2184                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2185                                 : "%"REG_a, "%"REG_b
2186                         );
2187                         ydst += lumStride;
2188                         src  += srcStride;
2189                 }
2190                 src -= srcStride*2;
2191                 asm volatile(
2192                         "mov %4, %%"REG_a"              \n\t"
2193                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2194                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
2195                         "pxor %%mm7, %%mm7              \n\t"
2196                         "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2197                         "add %%"REG_b", %%"REG_b"       \n\t"
2198                         ASMALIGN16
2199                         "1:                             \n\t"
2200                         PREFETCH" 64(%0, %%"REG_b")     \n\t"
2201                         PREFETCH" 64(%1, %%"REG_b")     \n\t"
2202 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2203                         "movq (%0, %%"REG_b"), %%mm0    \n\t"
2204                         "movq (%1, %%"REG_b"), %%mm1    \n\t"
2205                         "movq 6(%0, %%"REG_b"), %%mm2   \n\t"
2206                         "movq 6(%1, %%"REG_b"), %%mm3   \n\t"
2207                         PAVGB" %%mm1, %%mm0             \n\t"
2208                         PAVGB" %%mm3, %%mm2             \n\t"
2209                         "movq %%mm0, %%mm1              \n\t"
2210                         "movq %%mm2, %%mm3              \n\t"
2211                         "psrlq $24, %%mm0               \n\t"
2212                         "psrlq $24, %%mm2               \n\t"
2213                         PAVGB" %%mm1, %%mm0             \n\t"
2214                         PAVGB" %%mm3, %%mm2             \n\t"
2215                         "punpcklbw %%mm7, %%mm0         \n\t"
2216                         "punpcklbw %%mm7, %%mm2         \n\t"
2217 #else
2218                         "movd (%0, %%"REG_b"), %%mm0    \n\t"
2219                         "movd (%1, %%"REG_b"), %%mm1    \n\t"
2220                         "movd 3(%0, %%"REG_b"), %%mm2   \n\t"
2221                         "movd 3(%1, %%"REG_b"), %%mm3   \n\t"
2222                         "punpcklbw %%mm7, %%mm0         \n\t"
2223                         "punpcklbw %%mm7, %%mm1         \n\t"
2224                         "punpcklbw %%mm7, %%mm2         \n\t"
2225                         "punpcklbw %%mm7, %%mm3         \n\t"
2226                         "paddw %%mm1, %%mm0             \n\t"
2227                         "paddw %%mm3, %%mm2             \n\t"
2228                         "paddw %%mm2, %%mm0             \n\t"
2229                         "movd 6(%0, %%"REG_b"), %%mm4   \n\t"
2230                         "movd 6(%1, %%"REG_b"), %%mm1   \n\t"
2231                         "movd 9(%0, %%"REG_b"), %%mm2   \n\t"
2232                         "movd 9(%1, %%"REG_b"), %%mm3   \n\t"
2233                         "punpcklbw %%mm7, %%mm4         \n\t"
2234                         "punpcklbw %%mm7, %%mm1         \n\t"
2235                         "punpcklbw %%mm7, %%mm2         \n\t"
2236                         "punpcklbw %%mm7, %%mm3         \n\t"
2237                         "paddw %%mm1, %%mm4             \n\t"
2238                         "paddw %%mm3, %%mm2             \n\t"
2239                         "paddw %%mm4, %%mm2             \n\t"
2240                         "psrlw $2, %%mm0                \n\t"
2241                         "psrlw $2, %%mm2                \n\t"
2242 #endif
2243                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2244                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2245
2246                         "pmaddwd %%mm0, %%mm1           \n\t"
2247                         "pmaddwd %%mm2, %%mm3           \n\t"
2248                         "pmaddwd %%mm6, %%mm0           \n\t"
2249                         "pmaddwd %%mm6, %%mm2           \n\t"
2250 #ifndef FAST_BGR2YV12
2251                         "psrad $8, %%mm0                \n\t"
2252                         "psrad $8, %%mm1                \n\t"
2253                         "psrad $8, %%mm2                \n\t"
2254                         "psrad $8, %%mm3                \n\t"
2255 #endif
2256                         "packssdw %%mm2, %%mm0          \n\t"
2257                         "packssdw %%mm3, %%mm1          \n\t"
2258                         "pmaddwd %%mm5, %%mm0           \n\t"
2259                         "pmaddwd %%mm5, %%mm1           \n\t"
2260                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2261                         "psraw $7, %%mm0                \n\t"
2262
2263 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2264                         "movq 12(%0, %%"REG_b"), %%mm4  \n\t"
2265                         "movq 12(%1, %%"REG_b"), %%mm1  \n\t"
2266                         "movq 18(%0, %%"REG_b"), %%mm2  \n\t"
2267                         "movq 18(%1, %%"REG_b"), %%mm3  \n\t"
2268                         PAVGB" %%mm1, %%mm4             \n\t"
2269                         PAVGB" %%mm3, %%mm2             \n\t"
2270                         "movq %%mm4, %%mm1              \n\t"
2271                         "movq %%mm2, %%mm3              \n\t"
2272                         "psrlq $24, %%mm4               \n\t"
2273                         "psrlq $24, %%mm2               \n\t"
2274                         PAVGB" %%mm1, %%mm4             \n\t"
2275                         PAVGB" %%mm3, %%mm2             \n\t"
2276                         "punpcklbw %%mm7, %%mm4         \n\t"
2277                         "punpcklbw %%mm7, %%mm2         \n\t"
2278 #else
2279                         "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
2280                         "movd 12(%1, %%"REG_b"), %%mm1  \n\t"
2281                         "movd 15(%0, %%"REG_b"), %%mm2  \n\t"
2282                         "movd 15(%1, %%"REG_b"), %%mm3  \n\t"
2283                         "punpcklbw %%mm7, %%mm4         \n\t"
2284                         "punpcklbw %%mm7, %%mm1         \n\t"
2285                         "punpcklbw %%mm7, %%mm2         \n\t"
2286                         "punpcklbw %%mm7, %%mm3         \n\t"
2287                         "paddw %%mm1, %%mm4             \n\t"
2288                         "paddw %%mm3, %%mm2             \n\t"
2289                         "paddw %%mm2, %%mm4             \n\t"
2290                         "movd 18(%0, %%"REG_b"), %%mm5  \n\t"
2291                         "movd 18(%1, %%"REG_b"), %%mm1  \n\t"
2292                         "movd 21(%0, %%"REG_b"), %%mm2  \n\t"
2293                         "movd 21(%1, %%"REG_b"), %%mm3  \n\t"
2294                         "punpcklbw %%mm7, %%mm5         \n\t"
2295                         "punpcklbw %%mm7, %%mm1         \n\t"
2296                         "punpcklbw %%mm7, %%mm2         \n\t"
2297                         "punpcklbw %%mm7, %%mm3         \n\t"
2298                         "paddw %%mm1, %%mm5             \n\t"
2299                         "paddw %%mm3, %%mm2             \n\t"
2300                         "paddw %%mm5, %%mm2             \n\t"
2301                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2302                         "psrlw $2, %%mm4                \n\t"
2303                         "psrlw $2, %%mm2                \n\t"
2304 #endif
2305                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2306                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2307
2308                         "pmaddwd %%mm4, %%mm1           \n\t"
2309                         "pmaddwd %%mm2, %%mm3           \n\t"
2310                         "pmaddwd %%mm6, %%mm4           \n\t"
2311                         "pmaddwd %%mm6, %%mm2           \n\t"
2312 #ifndef FAST_BGR2YV12
2313                         "psrad $8, %%mm4                \n\t"
2314                         "psrad $8, %%mm1                \n\t"
2315                         "psrad $8, %%mm2                \n\t"
2316                         "psrad $8, %%mm3                \n\t"
2317 #endif
2318                         "packssdw %%mm2, %%mm4          \n\t"
2319                         "packssdw %%mm3, %%mm1          \n\t"
2320                         "pmaddwd %%mm5, %%mm4           \n\t"
2321                         "pmaddwd %%mm5, %%mm1           \n\t"
2322                         "add $24, %%"REG_b"             \n\t"
2323                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2324                         "psraw $7, %%mm4                \n\t"
2325
2326                         "movq %%mm0, %%mm1              \n\t"
2327                         "punpckldq %%mm4, %%mm0         \n\t"
2328                         "punpckhdq %%mm4, %%mm1         \n\t"
2329                         "packsswb %%mm1, %%mm0          \n\t"
2330                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2331                         "movd %%mm0, (%2, %%"REG_a")    \n\t"
2332                         "punpckhdq %%mm0, %%mm0         \n\t"
2333                         "movd %%mm0, (%3, %%"REG_a")    \n\t"
2334                         "add $4, %%"REG_a"              \n\t"
2335                         " js 1b                         \n\t"
2336                         : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2337                         : "%"REG_a, "%"REG_b
2338                 );
2339
2340                 udst += chromStride;
2341                 vdst += chromStride;
2342                 src  += srcStride*2;
2343         }
2344
2345         asm volatile(   EMMS" \n\t"
2346                         SFENCE" \n\t"
2347                         :::"memory");
2348 #else
2349         y=0;
2350 #endif
2351         for(; y<height; y+=2)
2352         {
2353                 long i;
2354                 for(i=0; i<chromWidth; i++)
2355                 {
2356                         unsigned int b= src[6*i+0];
2357                         unsigned int g= src[6*i+1];
2358                         unsigned int r= src[6*i+2];
2359
2360                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2361                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2362                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2363
2364                         udst[i]         = U;
2365                         vdst[i]         = V;
2366                         ydst[2*i]       = Y;
2367
2368                         b= src[6*i+3];
2369                         g= src[6*i+4];
2370                         r= src[6*i+5];
2371
2372                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2373                         ydst[2*i+1]     = Y;
2374                 }
2375                 ydst += lumStride;
2376                 src  += srcStride;
2377
2378                 for(i=0; i<chromWidth; i++)
2379                 {
2380                         unsigned int b= src[6*i+0];
2381                         unsigned int g= src[6*i+1];
2382                         unsigned int r= src[6*i+2];
2383
2384                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2385
2386                         ydst[2*i]       = Y;
2387
2388                         b= src[6*i+3];
2389                         g= src[6*i+4];
2390                         r= src[6*i+5];
2391
2392                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2393                         ydst[2*i+1]     = Y;
2394                 }
2395                 udst += chromStride;
2396                 vdst += chromStride;
2397                 ydst += lumStride;
2398                 src  += srcStride;
2399         }
2400 }
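/*
 * Worked example for the scalar path above, assuming the usual BT.601
 * limited-range coefficients behind RY/GY/BY, RU/GU/BU, RV/GV/BV and
 * RGB2YUV_SHIFT: a full-white pixel r=g=b=255 maps to roughly Y=235,
 * U=V=128, and a full-black pixel maps to Y=16, U=V=128, i.e. the
 * output uses the limited 16..235 luma / 16..240 chroma range.
 */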
2401
2402 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2403                             long width, long height, long src1Stride,
2404                             long src2Stride, long dstStride){
2405         long h;
2406
2407         for(h=0; h < height; h++)
2408         {
2409                 long w;
2410
2411 #ifdef HAVE_MMX
2412 #ifdef HAVE_SSE2
2413                 asm(
2414                         "xor %%"REG_a", %%"REG_a"       \n\t"
2415                         "1:                             \n\t"
2416                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2417                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2418                         "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2419                         "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2420                         "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2421                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2422                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2423                         "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2424                         "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2425                         "add $16, %%"REG_a"             \n\t"
2426                         "cmp %3, %%"REG_a"              \n\t"
2427                         " jb 1b                         \n\t"
2428                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2429                         : "memory", "%"REG_a""
2430                 );
2431 #else
2432                 asm(
2433                         "xor %%"REG_a", %%"REG_a"       \n\t"
2434                         "1:                             \n\t"
2435                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2436                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2437                         "movq (%1, %%"REG_a"), %%mm0    \n\t"
2438                         "movq 8(%1, %%"REG_a"), %%mm2   \n\t"
2439                         "movq %%mm0, %%mm1              \n\t"
2440                         "movq %%mm2, %%mm3              \n\t"
2441                         "movq (%2, %%"REG_a"), %%mm4    \n\t"
2442                         "movq 8(%2, %%"REG_a"), %%mm5   \n\t"
2443                         "punpcklbw %%mm4, %%mm0         \n\t"
2444                         "punpckhbw %%mm4, %%mm1         \n\t"
2445                         "punpcklbw %%mm5, %%mm2         \n\t"
2446                         "punpckhbw %%mm5, %%mm3         \n\t"
2447                         MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2448                         MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2449                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2450                         MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2451                         "add $16, %%"REG_a"             \n\t"
2452                         "cmp %3, %%"REG_a"              \n\t"
2453                         " jb 1b                         \n\t"
2454                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2455                         : "memory", "%"REG_a
2456                 );
2457 #endif
2458                 for(w= (width&(~15)); w < width; w++)
2459                 {
2460                         dest[2*w+0] = src1[w];
2461                         dest[2*w+1] = src2[w];
2462                 }
2463 #else
2464                 for(w=0; w < width; w++)
2465                 {
2466                         dest[2*w+0] = src1[w];
2467                         dest[2*w+1] = src2[w];
2468                 }
2469 #endif
2470                 dest += dstStride;
2471                 src1 += src1Stride;
2472                 src2 += src2Stride;
2473         }
2474 #ifdef HAVE_MMX
2475         asm(
2476                 EMMS" \n\t"
2477                 SFENCE" \n\t"
2478                 ::: "memory"
2479                 );
2480 #endif
2481 }
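/*
 * Note: the SSE2 path above uses movdqa loads and movntdq stores, which
 * require 16-byte aligned addresses, so src1, src2, dest and the
 * corresponding strides are assumed to keep every row 16-byte aligned.
 */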
2482
2483 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2484                         uint8_t *dst1, uint8_t *dst2,
2485                         long width, long height,
2486                         long srcStride1, long srcStride2,
2487                         long dstStride1, long dstStride2)
2488 {
2489     long y,x,w,h;
2490     w=width/2; h=height/2;
2491 #ifdef HAVE_MMX
2492     asm volatile(
2493         PREFETCH" %0\n\t"
2494         PREFETCH" %1\n\t"
2495         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2496 #endif
2497     for(y=0;y<h;y++){
2498         const uint8_t* s1=src1+srcStride1*(y>>1);
2499         uint8_t* d=dst1+dstStride1*y;
2500         x=0;
2501 #ifdef HAVE_MMX
2502         for(;x<w-31;x+=32)
2503         {
2504             asm volatile(
2505                 PREFETCH" 32%1\n\t"
2506                 "movq   %1, %%mm0\n\t"
2507                 "movq   8%1, %%mm2\n\t"
2508                 "movq   16%1, %%mm4\n\t"
2509                 "movq   24%1, %%mm6\n\t"
2510                 "movq   %%mm0, %%mm1\n\t"
2511                 "movq   %%mm2, %%mm3\n\t"
2512                 "movq   %%mm4, %%mm5\n\t"
2513                 "movq   %%mm6, %%mm7\n\t"
2514                 "punpcklbw %%mm0, %%mm0\n\t"
2515                 "punpckhbw %%mm1, %%mm1\n\t"
2516                 "punpcklbw %%mm2, %%mm2\n\t"
2517                 "punpckhbw %%mm3, %%mm3\n\t"
2518                 "punpcklbw %%mm4, %%mm4\n\t"
2519                 "punpckhbw %%mm5, %%mm5\n\t"
2520                 "punpcklbw %%mm6, %%mm6\n\t"
2521                 "punpckhbw %%mm7, %%mm7\n\t"
2522                 MOVNTQ" %%mm0, %0\n\t"
2523                 MOVNTQ" %%mm1, 8%0\n\t"
2524                 MOVNTQ" %%mm2, 16%0\n\t"
2525                 MOVNTQ" %%mm3, 24%0\n\t"
2526                 MOVNTQ" %%mm4, 32%0\n\t"
2527                 MOVNTQ" %%mm5, 40%0\n\t"
2528                 MOVNTQ" %%mm6, 48%0\n\t"
2529                 MOVNTQ" %%mm7, 56%0"
2530                 :"=m"(d[2*x])
2531                 :"m"(s1[x])
2532                 :"memory");
2533         }
2534 #endif
2535         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2536     }
2537     for(y=0;y<h;y++){
2538         const uint8_t* s2=src2+srcStride2*(y>>1);
2539         uint8_t* d=dst2+dstStride2*y;
2540         x=0;
2541 #ifdef HAVE_MMX
2542         for(;x<w-31;x+=32)
2543         {
2544             asm volatile(
2545                 PREFETCH" 32%1\n\t"
2546                 "movq   %1, %%mm0\n\t"
2547                 "movq   8%1, %%mm2\n\t"
2548                 "movq   16%1, %%mm4\n\t"
2549                 "movq   24%1, %%mm6\n\t"
2550                 "movq   %%mm0, %%mm1\n\t"
2551                 "movq   %%mm2, %%mm3\n\t"
2552                 "movq   %%mm4, %%mm5\n\t"
2553                 "movq   %%mm6, %%mm7\n\t"
2554                 "punpcklbw %%mm0, %%mm0\n\t"
2555                 "punpckhbw %%mm1, %%mm1\n\t"
2556                 "punpcklbw %%mm2, %%mm2\n\t"
2557                 "punpckhbw %%mm3, %%mm3\n\t"
2558                 "punpcklbw %%mm4, %%mm4\n\t"
2559                 "punpckhbw %%mm5, %%mm5\n\t"
2560                 "punpcklbw %%mm6, %%mm6\n\t"
2561                 "punpckhbw %%mm7, %%mm7\n\t"
2562                 MOVNTQ" %%mm0, %0\n\t"
2563                 MOVNTQ" %%mm1, 8%0\n\t"
2564                 MOVNTQ" %%mm2, 16%0\n\t"
2565                 MOVNTQ" %%mm3, 24%0\n\t"
2566                 MOVNTQ" %%mm4, 32%0\n\t"
2567                 MOVNTQ" %%mm5, 40%0\n\t"
2568                 MOVNTQ" %%mm6, 48%0\n\t"
2569                 MOVNTQ" %%mm7, 56%0"
2570                 :"=m"(d[2*x])
2571                 :"m"(s2[x])
2572                 :"memory");
2573         }
2574 #endif
2575         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2576     }
2577 #ifdef HAVE_MMX
2578         asm(
2579                 EMMS" \n\t"
2580                 SFENCE" \n\t"
2581                 ::: "memory"
2582                 );
2583 #endif
2584 }
2585
2586 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2587                         uint8_t *dst,
2588                         long width, long height,
2589                         long srcStride1, long srcStride2,
2590                         long srcStride3, long dstStride)
2591 {
2592     long y,x,w,h;
2593     w=width/2; h=height;
2594     for(y=0;y<h;y++){
2595         const uint8_t* yp=src1+srcStride1*y;
2596         const uint8_t* up=src2+srcStride2*(y>>2);
2597         const uint8_t* vp=src3+srcStride3*(y>>2);
2598         uint8_t* d=dst+dstStride*y;
2599         x=0;
2600 #ifdef HAVE_MMX
2601         for(;x<w-7;x+=8)
2602         {
2603             asm volatile(
2604                 PREFETCH" 32(%1, %0)\n\t"
2605                 PREFETCH" 32(%2, %0)\n\t"
2606                 PREFETCH" 32(%3, %0)\n\t"
2607                 "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2608                 "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2609                 "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
2610                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2611                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2612                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2613                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2614                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2615                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2616                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2617
2618                 "movq   %%mm1, %%mm6\n\t"
2619                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2620                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2621                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2622                 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2623                 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2624                 
2625                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2626                 "movq   8(%1, %0, 4), %%mm0\n\t"
2627                 "movq   %%mm0, %%mm3\n\t"
2628                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2629                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2630                 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2631                 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2632
2633                 "movq   %%mm4, %%mm6\n\t"
2634                 "movq   16(%1, %0, 4), %%mm0\n\t"
2635                 "movq   %%mm0, %%mm3\n\t"
2636                 "punpcklbw %%mm5, %%mm4\n\t"
2637                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2638                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2639                 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2640                 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2641                 
2642                 "punpckhbw %%mm5, %%mm6\n\t"
2643                 "movq   24(%1, %0, 4), %%mm0\n\t"
2644                 "movq   %%mm0, %%mm3\n\t"
2645                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2646                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2647                 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2648                 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2649
2650                 : "+r" (x)
2651                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2652                 :"memory");
2653         }
2654 #endif
2655         for(; x<w; x++)
2656         {
2657             const long x2= x<<2;
2658             d[8*x+0]=yp[x2];
2659             d[8*x+1]=up[x];
2660             d[8*x+2]=yp[x2+1];
2661             d[8*x+3]=vp[x];
2662             d[8*x+4]=yp[x2+2];
2663             d[8*x+5]=up[x];
2664             d[8*x+6]=yp[x2+3];
2665             d[8*x+7]=vp[x];
2666         }
2667     }
2668 #ifdef HAVE_MMX
2669         asm(
2670                 EMMS" \n\t"
2671                 SFENCE" \n\t"
2672                 ::: "memory"
2673                 );
2674 #endif
2675 }