1 /*
2  *
3  *  rgb2rgb.c, software RGB to RGB converter
4  *             plus software PAL8 to RGB converter,
5  *                  software YUV to YUV converter,
6  *                  software YUV to RGB converter
7  *  Written by Nick Kurshev.
8  *  Palette & YUV & runtime CPU detection by Michael (michaelni@gmx.at) (under GPL)
9  *  Lots of big-endian byte-order fixes by Alex Beregszaszi
10  */
11
12 #include <stddef.h>
13 #include <inttypes.h> /* for __WORDSIZE */
14
15 #ifndef __WORDSIZE
16 // #warning You have a misconfigured system and will probably lose performance!
17 #define __WORDSIZE MP_WORDSIZE
18 #endif
19
20 #undef PREFETCH
21 #undef MOVNTQ
22 #undef EMMS
23 #undef SFENCE
24 #undef MMREG_SIZE
25 #undef PREFETCHW
26 #undef PAVGB
27
28 #ifdef HAVE_SSE2
29 #define MMREG_SIZE 16
30 #else
31 #define MMREG_SIZE 8
32 #endif
33
34 #ifdef HAVE_3DNOW
35 #define PREFETCH  "prefetch"
36 #define PREFETCHW "prefetchw"
37 #define PAVGB     "pavgusb"
38 #elif defined ( HAVE_MMX2 )
39 #define PREFETCH "prefetchnta"
40 #define PREFETCHW "prefetcht0"
41 #define PAVGB     "pavgb"
42 #else
43 #ifdef __APPLE__
44 #define PREFETCH "#"
45 #define PREFETCHW "#"
46 #else
47 #define PREFETCH "/nop"
48 #define PREFETCHW "/nop"
49 #endif
50 #endif
51
52 #ifdef HAVE_3DNOW
53 /* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */
54 #define EMMS     "femms"
55 #else
56 #define EMMS     "emms"
57 #endif
58
59 #ifdef HAVE_MMX2
60 #define MOVNTQ "movntq"
61 #define SFENCE "sfence"
62 #else
63 #define MOVNTQ "movq"
64 #ifdef __APPLE__
65 #define SFENCE "#"
66 #else
67 #define SFENCE "/nop"
68 #endif
69 #endif
70
71 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
72 {
73   uint8_t *dest = dst;
74   const uint8_t *s = src;
75   const uint8_t *end;
76 #ifdef HAVE_MMX
77   const uint8_t *mm_end;
78 #endif
79   end = s + src_size;
80 #ifdef HAVE_MMX
81   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
82   mm_end = end - 23;
83   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
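  /*
   * Annotation: each iteration below loads eight 3-byte source pixels with
   * movd/punpckldq (so every doubleword also picks up the first byte of the
   * following pixel), masks that stray byte off with mm7, and writes four
   * quadwords, i.e. eight 4-byte destination pixels.
   */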
84   while(s < mm_end)
85   {
86     __asm __volatile(
87         PREFETCH"       32%1\n\t"
88         "movd   %1, %%mm0\n\t"
89         "punpckldq 3%1, %%mm0\n\t"
90         "movd   6%1, %%mm1\n\t"
91         "punpckldq 9%1, %%mm1\n\t"
92         "movd   12%1, %%mm2\n\t"
93         "punpckldq 15%1, %%mm2\n\t"
94         "movd   18%1, %%mm3\n\t"
95         "punpckldq 21%1, %%mm3\n\t"
96         "pand   %%mm7, %%mm0\n\t"
97         "pand   %%mm7, %%mm1\n\t"
98         "pand   %%mm7, %%mm2\n\t"
99         "pand   %%mm7, %%mm3\n\t"
100         MOVNTQ" %%mm0, %0\n\t"
101         MOVNTQ" %%mm1, 8%0\n\t"
102         MOVNTQ" %%mm2, 16%0\n\t"
103         MOVNTQ" %%mm3, 24%0"
104         :"=m"(*dest)
105         :"m"(*s)
106         :"memory");
107     dest += 32;
108     s += 24;
109   }
110   __asm __volatile(SFENCE:::"memory");
111   __asm __volatile(EMMS:::"memory");
112 #endif
113   while(s < end)
114   {
115 #ifdef WORDS_BIGENDIAN
116     /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
117     *dest++ = 0;
118     *dest++ = s[2];
119     *dest++ = s[1];
120     *dest++ = s[0];
121     s+=3;
122 #else
123     *dest++ = *s++;
124     *dest++ = *s++;
125     *dest++ = *s++;
126     *dest++ = 0;
127 #endif
128   }
129 }
130
131 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
132 {
133   uint8_t *dest = dst;
134   const uint8_t *s = src;
135   const uint8_t *end;
136 #ifdef HAVE_MMX
137   const uint8_t *mm_end;
138 #endif
139   end = s + src_size;
140 #ifdef HAVE_MMX
141   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
142   mm_end = end - 31;
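  /*
   * Annotation: the loop below drops the unused fourth byte of each source
   * pixel, packs two 3-byte pixels into the low 48 bits of every quadword,
   * and then merges those groups across registers so that eight 4-byte
   * input pixels come out as 24 contiguous output bytes.
   */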
143   while(s < mm_end)
144   {
145     __asm __volatile(
146         PREFETCH"       32%1\n\t"
147         "movq   %1, %%mm0\n\t"
148         "movq   8%1, %%mm1\n\t"
149         "movq   16%1, %%mm4\n\t"
150         "movq   24%1, %%mm5\n\t"
151         "movq   %%mm0, %%mm2\n\t"
152         "movq   %%mm1, %%mm3\n\t"
153         "movq   %%mm4, %%mm6\n\t"
154         "movq   %%mm5, %%mm7\n\t"
155         "psrlq  $8, %%mm2\n\t"
156         "psrlq  $8, %%mm3\n\t"
157         "psrlq  $8, %%mm6\n\t"
158         "psrlq  $8, %%mm7\n\t"
159         "pand   %2, %%mm0\n\t"
160         "pand   %2, %%mm1\n\t"
161         "pand   %2, %%mm4\n\t"
162         "pand   %2, %%mm5\n\t"
163         "pand   %3, %%mm2\n\t"
164         "pand   %3, %%mm3\n\t"
165         "pand   %3, %%mm6\n\t"
166         "pand   %3, %%mm7\n\t"
167         "por    %%mm2, %%mm0\n\t"
168         "por    %%mm3, %%mm1\n\t"
169         "por    %%mm6, %%mm4\n\t"
170         "por    %%mm7, %%mm5\n\t"
171
172         "movq   %%mm1, %%mm2\n\t"
173         "movq   %%mm4, %%mm3\n\t"
174         "psllq  $48, %%mm2\n\t"
175         "psllq  $32, %%mm3\n\t"
176         "pand   %4, %%mm2\n\t"
177         "pand   %5, %%mm3\n\t"
178         "por    %%mm2, %%mm0\n\t"
179         "psrlq  $16, %%mm1\n\t"
180         "psrlq  $32, %%mm4\n\t"
181         "psllq  $16, %%mm5\n\t"
182         "por    %%mm3, %%mm1\n\t"
183         "pand   %6, %%mm5\n\t"
184         "por    %%mm5, %%mm4\n\t"
185
186         MOVNTQ" %%mm0, %0\n\t"
187         MOVNTQ" %%mm1, 8%0\n\t"
188         MOVNTQ" %%mm4, 16%0"
189         :"=m"(*dest)
190         :"m"(*s),"m"(mask24l),
191          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
192         :"memory");
193     dest += 24;
194     s += 32;
195   }
196   __asm __volatile(SFENCE:::"memory");
197   __asm __volatile(EMMS:::"memory");
198 #endif
199   while(s < end)
200   {
201 #ifdef WORDS_BIGENDIAN
202     /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
203     s++;
204     dest[2] = *s++;
205     dest[1] = *s++;
206     dest[0] = *s++;
207     dest += 3;
208 #else
209     *dest++ = *s++;
210     *dest++ = *s++;
211     *dest++ = *s++;
212     s++;
213 #endif
214   }
215 }
216
217 /*
218  Original by Strepto/Astral
219  ported to gcc & bugfixed: A'rpi
220  MMX2, 3DNOW optimization by Nick Kurshev
221  32-bit C version and the and&add trick by Michael Niedermayer
222 */
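/*
 * A note on the and&add trick used in the C fallback of rgb15to16 below
 * (my annotation): a 15-bit pixel stores three 5-bit fields below an unused
 * top bit, while a 16-bit pixel gives the middle (green) field six bits, so
 * the two upper fields must move up by one bit while the lowest field stays
 * in place.  Adding the masked fields to themselves does exactly that, two
 * pixels per 32-bit load:
 *
 *     (x & 0x7FFF7FFF) + (x & 0x7FE07FE0)
 *
 * doubles bits 5..14 of each 16-bit half, i.e. shifts the upper two fields
 * left by one and leaves a zero in the new low bit of the middle field.
 */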
223 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
224 {
225   register const uint8_t* s=src;
226   register uint8_t* d=dst;
227   register const uint8_t *end;
228   const uint8_t *mm_end;
229   end = s + src_size;
230 #ifdef HAVE_MMX
231   __asm __volatile(PREFETCH"    %0"::"m"(*s));
232   __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
233   mm_end = end - 15;
234   while(s<mm_end)
235   {
236         __asm __volatile(
237                 PREFETCH"       32%1\n\t"
238                 "movq   %1, %%mm0\n\t"
239                 "movq   8%1, %%mm2\n\t"
240                 "movq   %%mm0, %%mm1\n\t"
241                 "movq   %%mm2, %%mm3\n\t"
242                 "pand   %%mm4, %%mm0\n\t"
243                 "pand   %%mm4, %%mm2\n\t"
244                 "paddw  %%mm1, %%mm0\n\t"
245                 "paddw  %%mm3, %%mm2\n\t"
246                 MOVNTQ" %%mm0, %0\n\t"
247                 MOVNTQ" %%mm2, 8%0"
248                 :"=m"(*d)
249                 :"m"(*s)
250                 );
251         d+=16;
252         s+=16;
253   }
254   __asm __volatile(SFENCE:::"memory");
255   __asm __volatile(EMMS:::"memory");
256 #endif
257     mm_end = end - 3;
258     while(s < mm_end)
259     {
260         register unsigned x= *((uint32_t *)s);
261         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
262         d+=4;
263         s+=4;
264     }
265     if(s < end)
266     {
267         register unsigned short x= *((uint16_t *)s);
268         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
269     }
270 }
271
272 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
273 {
274   register const uint8_t* s=src;
275   register uint8_t* d=dst;
276   register const uint8_t *end;
277   const uint8_t *mm_end;
278   end = s + src_size;
279 #ifdef HAVE_MMX
280   __asm __volatile(PREFETCH"    %0"::"m"(*s));
281   __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
282   __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
283   mm_end = end - 15;
284   while(s<mm_end)
285   {
286         __asm __volatile(
287                 PREFETCH"       32%1\n\t"
288                 "movq   %1, %%mm0\n\t"
289                 "movq   8%1, %%mm2\n\t"
290                 "movq   %%mm0, %%mm1\n\t"
291                 "movq   %%mm2, %%mm3\n\t"
292                 "psrlq  $1, %%mm0\n\t"
293                 "psrlq  $1, %%mm2\n\t"
294                 "pand   %%mm7, %%mm0\n\t"
295                 "pand   %%mm7, %%mm2\n\t"
296                 "pand   %%mm6, %%mm1\n\t"
297                 "pand   %%mm6, %%mm3\n\t"
298                 "por    %%mm1, %%mm0\n\t"
299                 "por    %%mm3, %%mm2\n\t"
300                 MOVNTQ" %%mm0, %0\n\t"
301                 MOVNTQ" %%mm2, 8%0"
302                 :"=m"(*d)
303                 :"m"(*s)
304                 );
305         d+=16;
306         s+=16;
307   }
308   __asm __volatile(SFENCE:::"memory");
309   __asm __volatile(EMMS:::"memory");
310 #endif
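    /* C fallback (annotation): the reverse of the trick used in rgb15to16 --
       shift the two upper fields down by one bit, dropping the green LSB, and
       keep the five low bits in place; each 32-bit load handles two pixels. */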
311     mm_end = end - 3;
312     while(s < mm_end)
313     {
314         register uint32_t x= *((uint32_t *)s);
315         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
316         s+=4;
317         d+=4;
318     }
319     if(s < end)
320     {
321         register uint16_t x= *((uint16_t *)s);
322         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
323         s+=2;
324         d+=2;
325     }
326 }
327
328 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
329 {
330         const uint8_t *s = src;
331         const uint8_t *end;
332 #ifdef HAVE_MMX
333         const uint8_t *mm_end;
334 #endif
335         uint16_t *d = (uint16_t *)dst;
336         end = s + src_size;
337 #ifdef HAVE_MMX
338         mm_end = end - 15;
339 #if 1 // is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
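        /*
         * Annotation on the pmaddwd variant: mm6 keeps the top five bits of the
         * two channels that end up in the low and high 5-bit fields, and mm7
         * holds one power-of-two multiplier per 16-bit word, so pmaddwd applies
         * two different left shifts at once and sums the results into a single
         * dword per pixel.  The six green bits are masked with mm5 and or'ed in,
         * and the final psrld $5 / pslld $11 place the even and odd pixels'
         * 5-6-5 values in the low and high halves of each dword before the
         * packed store.
         */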
340         asm volatile(
341                 "movq %3, %%mm5                 \n\t"
342                 "movq %4, %%mm6                 \n\t"
343                 "movq %5, %%mm7                 \n\t"
344                 ASMALIGN(4)
345                 "1:                             \n\t"
346                 PREFETCH" 32(%1)                \n\t"
347                 "movd   (%1), %%mm0             \n\t"
348                 "movd   4(%1), %%mm3            \n\t"
349                 "punpckldq 8(%1), %%mm0         \n\t"
350                 "punpckldq 12(%1), %%mm3        \n\t"
351                 "movq %%mm0, %%mm1              \n\t"
352                 "movq %%mm3, %%mm4              \n\t"
353                 "pand %%mm6, %%mm0              \n\t"
354                 "pand %%mm6, %%mm3              \n\t"
355                 "pmaddwd %%mm7, %%mm0           \n\t"
356                 "pmaddwd %%mm7, %%mm3           \n\t"
357                 "pand %%mm5, %%mm1              \n\t"
358                 "pand %%mm5, %%mm4              \n\t"
359                 "por %%mm1, %%mm0               \n\t"   
360                 "por %%mm4, %%mm3               \n\t"
361                 "psrld $5, %%mm0                \n\t"
362                 "pslld $11, %%mm3               \n\t"
363                 "por %%mm3, %%mm0               \n\t"
364                 MOVNTQ" %%mm0, (%0)             \n\t"
365                 "add $16, %1                    \n\t"
366                 "add $8, %0                     \n\t"
367                 "cmp %2, %1                     \n\t"
368                 " jb 1b                         \n\t"
369                 : "+r" (d), "+r"(s)
370                 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
371         );
372 #else
373         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
374         __asm __volatile(
375             "movq       %0, %%mm7\n\t"
376             "movq       %1, %%mm6\n\t"
377             ::"m"(red_16mask),"m"(green_16mask));
378         while(s < mm_end)
379         {
380             __asm __volatile(
381                 PREFETCH" 32%1\n\t"
382                 "movd   %1, %%mm0\n\t"
383                 "movd   4%1, %%mm3\n\t"
384                 "punpckldq 8%1, %%mm0\n\t"
385                 "punpckldq 12%1, %%mm3\n\t"
386                 "movq   %%mm0, %%mm1\n\t"
387                 "movq   %%mm0, %%mm2\n\t"
388                 "movq   %%mm3, %%mm4\n\t"
389                 "movq   %%mm3, %%mm5\n\t"
390                 "psrlq  $3, %%mm0\n\t"
391                 "psrlq  $3, %%mm3\n\t"
392                 "pand   %2, %%mm0\n\t"
393                 "pand   %2, %%mm3\n\t"
394                 "psrlq  $5, %%mm1\n\t"
395                 "psrlq  $5, %%mm4\n\t"
396                 "pand   %%mm6, %%mm1\n\t"
397                 "pand   %%mm6, %%mm4\n\t"
398                 "psrlq  $8, %%mm2\n\t"
399                 "psrlq  $8, %%mm5\n\t"
400                 "pand   %%mm7, %%mm2\n\t"
401                 "pand   %%mm7, %%mm5\n\t"
402                 "por    %%mm1, %%mm0\n\t"
403                 "por    %%mm4, %%mm3\n\t"
404                 "por    %%mm2, %%mm0\n\t"
405                 "por    %%mm5, %%mm3\n\t"
406                 "psllq  $16, %%mm3\n\t"
407                 "por    %%mm3, %%mm0\n\t"
408                 MOVNTQ" %%mm0, %0\n\t"
409                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
410                 d += 4;
411                 s += 16;
412         }
413 #endif
414         __asm __volatile(SFENCE:::"memory");
415         __asm __volatile(EMMS:::"memory");
416 #endif
417         while(s < end)
418         {
419                 register int rgb = *(uint32_t*)s; s += 4;
420                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
421         }
422 }
423
424 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
425 {
426         const uint8_t *s = src;
427         const uint8_t *end;
428 #ifdef HAVE_MMX
429         const uint8_t *mm_end;
430 #endif
431         uint16_t *d = (uint16_t *)dst;
432         end = s + src_size;
433 #ifdef HAVE_MMX
434         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
435         __asm __volatile(
436             "movq       %0, %%mm7\n\t"
437             "movq       %1, %%mm6\n\t"
438             ::"m"(red_16mask),"m"(green_16mask));
439         mm_end = end - 15;
440         while(s < mm_end)
441         {
442             __asm __volatile(
443                 PREFETCH" 32%1\n\t"
444                 "movd   %1, %%mm0\n\t"
445                 "movd   4%1, %%mm3\n\t"
446                 "punpckldq 8%1, %%mm0\n\t"
447                 "punpckldq 12%1, %%mm3\n\t"
448                 "movq   %%mm0, %%mm1\n\t"
449                 "movq   %%mm0, %%mm2\n\t"
450                 "movq   %%mm3, %%mm4\n\t"
451                 "movq   %%mm3, %%mm5\n\t"
452                 "psllq  $8, %%mm0\n\t"
453                 "psllq  $8, %%mm3\n\t"
454                 "pand   %%mm7, %%mm0\n\t"
455                 "pand   %%mm7, %%mm3\n\t"
456                 "psrlq  $5, %%mm1\n\t"
457                 "psrlq  $5, %%mm4\n\t"
458                 "pand   %%mm6, %%mm1\n\t"
459                 "pand   %%mm6, %%mm4\n\t"
460                 "psrlq  $19, %%mm2\n\t"
461                 "psrlq  $19, %%mm5\n\t"
462                 "pand   %2, %%mm2\n\t"
463                 "pand   %2, %%mm5\n\t"
464                 "por    %%mm1, %%mm0\n\t"
465                 "por    %%mm4, %%mm3\n\t"
466                 "por    %%mm2, %%mm0\n\t"
467                 "por    %%mm5, %%mm3\n\t"
468                 "psllq  $16, %%mm3\n\t"
469                 "por    %%mm3, %%mm0\n\t"
470                 MOVNTQ" %%mm0, %0\n\t"
471                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
472                 d += 4;
473                 s += 16;
474         }
475         __asm __volatile(SFENCE:::"memory");
476         __asm __volatile(EMMS:::"memory");
477 #endif
478         while(s < end)
479         {
480                 register int rgb = *(uint32_t*)s; s += 4;
481                 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
482         }
483 }
484
485 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
486 {
487         const uint8_t *s = src;
488         const uint8_t *end;
489 #ifdef HAVE_MMX
490         const uint8_t *mm_end;
491 #endif
492         uint16_t *d = (uint16_t *)dst;
493         end = s + src_size;
494 #ifdef HAVE_MMX
495         mm_end = end - 15;
496 #if 1 // is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
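        /*
         * Annotation: same pmaddwd approach as in rgb32to16 above, only with
         * 1-5-5-5 masks and shift amounts (psrld $6 / pslld $10) so that each
         * channel gets five bits.
         */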
497         asm volatile(
498                 "movq %3, %%mm5                 \n\t"
499                 "movq %4, %%mm6                 \n\t"
500                 "movq %5, %%mm7                 \n\t"
501                 ASMALIGN(4)
502                 "1:                             \n\t"
503                 PREFETCH" 32(%1)                \n\t"
504                 "movd   (%1), %%mm0             \n\t"
505                 "movd   4(%1), %%mm3            \n\t"
506                 "punpckldq 8(%1), %%mm0         \n\t"
507                 "punpckldq 12(%1), %%mm3        \n\t"
508                 "movq %%mm0, %%mm1              \n\t"
509                 "movq %%mm3, %%mm4              \n\t"
510                 "pand %%mm6, %%mm0              \n\t"
511                 "pand %%mm6, %%mm3              \n\t"
512                 "pmaddwd %%mm7, %%mm0           \n\t"
513                 "pmaddwd %%mm7, %%mm3           \n\t"
514                 "pand %%mm5, %%mm1              \n\t"
515                 "pand %%mm5, %%mm4              \n\t"
516                 "por %%mm1, %%mm0               \n\t"   
517                 "por %%mm4, %%mm3               \n\t"
518                 "psrld $6, %%mm0                \n\t"
519                 "pslld $10, %%mm3               \n\t"
520                 "por %%mm3, %%mm0               \n\t"
521                 MOVNTQ" %%mm0, (%0)             \n\t"
522                 "add $16, %1                    \n\t"
523                 "add $8, %0                     \n\t"
524                 "cmp %2, %1                     \n\t"
525                 " jb 1b                         \n\t"
526                 : "+r" (d), "+r"(s)
527                 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
528         );
529 #else
530         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
531         __asm __volatile(
532             "movq       %0, %%mm7\n\t"
533             "movq       %1, %%mm6\n\t"
534             ::"m"(red_15mask),"m"(green_15mask));
535         while(s < mm_end)
536         {
537             __asm __volatile(
538                 PREFETCH" 32%1\n\t"
539                 "movd   %1, %%mm0\n\t"
540                 "movd   4%1, %%mm3\n\t"
541                 "punpckldq 8%1, %%mm0\n\t"
542                 "punpckldq 12%1, %%mm3\n\t"
543                 "movq   %%mm0, %%mm1\n\t"
544                 "movq   %%mm0, %%mm2\n\t"
545                 "movq   %%mm3, %%mm4\n\t"
546                 "movq   %%mm3, %%mm5\n\t"
547                 "psrlq  $3, %%mm0\n\t"
548                 "psrlq  $3, %%mm3\n\t"
549                 "pand   %2, %%mm0\n\t"
550                 "pand   %2, %%mm3\n\t"
551                 "psrlq  $6, %%mm1\n\t"
552                 "psrlq  $6, %%mm4\n\t"
553                 "pand   %%mm6, %%mm1\n\t"
554                 "pand   %%mm6, %%mm4\n\t"
555                 "psrlq  $9, %%mm2\n\t"
556                 "psrlq  $9, %%mm5\n\t"
557                 "pand   %%mm7, %%mm2\n\t"
558                 "pand   %%mm7, %%mm5\n\t"
559                 "por    %%mm1, %%mm0\n\t"
560                 "por    %%mm4, %%mm3\n\t"
561                 "por    %%mm2, %%mm0\n\t"
562                 "por    %%mm5, %%mm3\n\t"
563                 "psllq  $16, %%mm3\n\t"
564                 "por    %%mm3, %%mm0\n\t"
565                 MOVNTQ" %%mm0, %0\n\t"
566                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
567                 d += 4;
568                 s += 16;
569         }
570 #endif
571         __asm __volatile(SFENCE:::"memory");
572         __asm __volatile(EMMS:::"memory");
573 #endif
574         while(s < end)
575         {
576                 register int rgb = *(uint32_t*)s; s += 4;
577                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
578         }
579 }
580
581 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
582 {
583         const uint8_t *s = src;
584         const uint8_t *end;
585 #ifdef HAVE_MMX
586         const uint8_t *mm_end;
587 #endif
588         uint16_t *d = (uint16_t *)dst;
589         end = s + src_size;
590 #ifdef HAVE_MMX
591         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
592         __asm __volatile(
593             "movq       %0, %%mm7\n\t"
594             "movq       %1, %%mm6\n\t"
595             ::"m"(red_15mask),"m"(green_15mask));
596         mm_end = end - 15;
597         while(s < mm_end)
598         {
599             __asm __volatile(
600                 PREFETCH" 32%1\n\t"
601                 "movd   %1, %%mm0\n\t"
602                 "movd   4%1, %%mm3\n\t"
603                 "punpckldq 8%1, %%mm0\n\t"
604                 "punpckldq 12%1, %%mm3\n\t"
605                 "movq   %%mm0, %%mm1\n\t"
606                 "movq   %%mm0, %%mm2\n\t"
607                 "movq   %%mm3, %%mm4\n\t"
608                 "movq   %%mm3, %%mm5\n\t"
609                 "psllq  $7, %%mm0\n\t"
610                 "psllq  $7, %%mm3\n\t"
611                 "pand   %%mm7, %%mm0\n\t"
612                 "pand   %%mm7, %%mm3\n\t"
613                 "psrlq  $6, %%mm1\n\t"
614                 "psrlq  $6, %%mm4\n\t"
615                 "pand   %%mm6, %%mm1\n\t"
616                 "pand   %%mm6, %%mm4\n\t"
617                 "psrlq  $19, %%mm2\n\t"
618                 "psrlq  $19, %%mm5\n\t"
619                 "pand   %2, %%mm2\n\t"
620                 "pand   %2, %%mm5\n\t"
621                 "por    %%mm1, %%mm0\n\t"
622                 "por    %%mm4, %%mm3\n\t"
623                 "por    %%mm2, %%mm0\n\t"
624                 "por    %%mm5, %%mm3\n\t"
625                 "psllq  $16, %%mm3\n\t"
626                 "por    %%mm3, %%mm0\n\t"
627                 MOVNTQ" %%mm0, %0\n\t"
628                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
629                 d += 4;
630                 s += 16;
631         }
632         __asm __volatile(SFENCE:::"memory");
633         __asm __volatile(EMMS:::"memory");
634 #endif
635         while(s < end)
636         {
637                 register int rgb = *(uint32_t*)s; s += 4;
638                 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
639         }
640 }
641
642 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
643 {
644         const uint8_t *s = src;
645         const uint8_t *end;
646 #ifdef HAVE_MMX
647         const uint8_t *mm_end;
648 #endif
649         uint16_t *d = (uint16_t *)dst;
650         end = s + src_size;
651 #ifdef HAVE_MMX
652         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
653         __asm __volatile(
654             "movq       %0, %%mm7\n\t"
655             "movq       %1, %%mm6\n\t"
656             ::"m"(red_16mask),"m"(green_16mask));
657         mm_end = end - 11;
658         while(s < mm_end)
659         {
660             __asm __volatile(
661                 PREFETCH" 32%1\n\t"
662                 "movd   %1, %%mm0\n\t"
663                 "movd   3%1, %%mm3\n\t"
664                 "punpckldq 6%1, %%mm0\n\t"
665                 "punpckldq 9%1, %%mm3\n\t"
666                 "movq   %%mm0, %%mm1\n\t"
667                 "movq   %%mm0, %%mm2\n\t"
668                 "movq   %%mm3, %%mm4\n\t"
669                 "movq   %%mm3, %%mm5\n\t"
670                 "psrlq  $3, %%mm0\n\t"
671                 "psrlq  $3, %%mm3\n\t"
672                 "pand   %2, %%mm0\n\t"
673                 "pand   %2, %%mm3\n\t"
674                 "psrlq  $5, %%mm1\n\t"
675                 "psrlq  $5, %%mm4\n\t"
676                 "pand   %%mm6, %%mm1\n\t"
677                 "pand   %%mm6, %%mm4\n\t"
678                 "psrlq  $8, %%mm2\n\t"
679                 "psrlq  $8, %%mm5\n\t"
680                 "pand   %%mm7, %%mm2\n\t"
681                 "pand   %%mm7, %%mm5\n\t"
682                 "por    %%mm1, %%mm0\n\t"
683                 "por    %%mm4, %%mm3\n\t"
684                 "por    %%mm2, %%mm0\n\t"
685                 "por    %%mm5, %%mm3\n\t"
686                 "psllq  $16, %%mm3\n\t"
687                 "por    %%mm3, %%mm0\n\t"
688                 MOVNTQ" %%mm0, %0\n\t"
689                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
690                 d += 4;
691                 s += 12;
692         }
693         __asm __volatile(SFENCE:::"memory");
694         __asm __volatile(EMMS:::"memory");
695 #endif
696         while(s < end)
697         {
698                 const int b= *s++;
699                 const int g= *s++;
700                 const int r= *s++;
701                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
702         }
703 }
704
705 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
706 {
707         const uint8_t *s = src;
708         const uint8_t *end;
709 #ifdef HAVE_MMX
710         const uint8_t *mm_end;
711 #endif
712         uint16_t *d = (uint16_t *)dst;
713         end = s + src_size;
714 #ifdef HAVE_MMX
715         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
716         __asm __volatile(
717             "movq       %0, %%mm7\n\t"
718             "movq       %1, %%mm6\n\t"
719             ::"m"(red_16mask),"m"(green_16mask));
720         mm_end = end - 15;
721         while(s < mm_end)
722         {
723             __asm __volatile(
724                 PREFETCH" 32%1\n\t"
725                 "movd   %1, %%mm0\n\t"
726                 "movd   3%1, %%mm3\n\t"
727                 "punpckldq 6%1, %%mm0\n\t"
728                 "punpckldq 9%1, %%mm3\n\t"
729                 "movq   %%mm0, %%mm1\n\t"
730                 "movq   %%mm0, %%mm2\n\t"
731                 "movq   %%mm3, %%mm4\n\t"
732                 "movq   %%mm3, %%mm5\n\t"
733                 "psllq  $8, %%mm0\n\t"
734                 "psllq  $8, %%mm3\n\t"
735                 "pand   %%mm7, %%mm0\n\t"
736                 "pand   %%mm7, %%mm3\n\t"
737                 "psrlq  $5, %%mm1\n\t"
738                 "psrlq  $5, %%mm4\n\t"
739                 "pand   %%mm6, %%mm1\n\t"
740                 "pand   %%mm6, %%mm4\n\t"
741                 "psrlq  $19, %%mm2\n\t"
742                 "psrlq  $19, %%mm5\n\t"
743                 "pand   %2, %%mm2\n\t"
744                 "pand   %2, %%mm5\n\t"
745                 "por    %%mm1, %%mm0\n\t"
746                 "por    %%mm4, %%mm3\n\t"
747                 "por    %%mm2, %%mm0\n\t"
748                 "por    %%mm5, %%mm3\n\t"
749                 "psllq  $16, %%mm3\n\t"
750                 "por    %%mm3, %%mm0\n\t"
751                 MOVNTQ" %%mm0, %0\n\t"
752                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
753                 d += 4;
754                 s += 12;
755         }
756         __asm __volatile(SFENCE:::"memory");
757         __asm __volatile(EMMS:::"memory");
758 #endif
759         while(s < end)
760         {
761                 const int r= *s++;
762                 const int g= *s++;
763                 const int b= *s++;
764                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
765         }
766 }
767
768 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
769 {
770         const uint8_t *s = src;
771         const uint8_t *end;
772 #ifdef HAVE_MMX
773         const uint8_t *mm_end;
774 #endif
775         uint16_t *d = (uint16_t *)dst;
776         end = s + src_size;
777 #ifdef HAVE_MMX
778         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
779         __asm __volatile(
780             "movq       %0, %%mm7\n\t"
781             "movq       %1, %%mm6\n\t"
782             ::"m"(red_15mask),"m"(green_15mask));
783         mm_end = end - 11;
784         while(s < mm_end)
785         {
786             __asm __volatile(
787                 PREFETCH" 32%1\n\t"
788                 "movd   %1, %%mm0\n\t"
789                 "movd   3%1, %%mm3\n\t"
790                 "punpckldq 6%1, %%mm0\n\t"
791                 "punpckldq 9%1, %%mm3\n\t"
792                 "movq   %%mm0, %%mm1\n\t"
793                 "movq   %%mm0, %%mm2\n\t"
794                 "movq   %%mm3, %%mm4\n\t"
795                 "movq   %%mm3, %%mm5\n\t"
796                 "psrlq  $3, %%mm0\n\t"
797                 "psrlq  $3, %%mm3\n\t"
798                 "pand   %2, %%mm0\n\t"
799                 "pand   %2, %%mm3\n\t"
800                 "psrlq  $6, %%mm1\n\t"
801                 "psrlq  $6, %%mm4\n\t"
802                 "pand   %%mm6, %%mm1\n\t"
803                 "pand   %%mm6, %%mm4\n\t"
804                 "psrlq  $9, %%mm2\n\t"
805                 "psrlq  $9, %%mm5\n\t"
806                 "pand   %%mm7, %%mm2\n\t"
807                 "pand   %%mm7, %%mm5\n\t"
808                 "por    %%mm1, %%mm0\n\t"
809                 "por    %%mm4, %%mm3\n\t"
810                 "por    %%mm2, %%mm0\n\t"
811                 "por    %%mm5, %%mm3\n\t"
812                 "psllq  $16, %%mm3\n\t"
813                 "por    %%mm3, %%mm0\n\t"
814                 MOVNTQ" %%mm0, %0\n\t"
815                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
816                 d += 4;
817                 s += 12;
818         }
819         __asm __volatile(SFENCE:::"memory");
820         __asm __volatile(EMMS:::"memory");
821 #endif
822         while(s < end)
823         {
824                 const int b= *s++;
825                 const int g= *s++;
826                 const int r= *s++;
827                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
828         }
829 }
830
831 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
832 {
833         const uint8_t *s = src;
834         const uint8_t *end;
835 #ifdef HAVE_MMX
836         const uint8_t *mm_end;
837 #endif
838         uint16_t *d = (uint16_t *)dst;
839         end = s + src_size;
840 #ifdef HAVE_MMX
841         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
842         __asm __volatile(
843             "movq       %0, %%mm7\n\t"
844             "movq       %1, %%mm6\n\t"
845             ::"m"(red_15mask),"m"(green_15mask));
846         mm_end = end - 15;
847         while(s < mm_end)
848         {
849             __asm __volatile(
850                 PREFETCH" 32%1\n\t"
851                 "movd   %1, %%mm0\n\t"
852                 "movd   3%1, %%mm3\n\t"
853                 "punpckldq 6%1, %%mm0\n\t"
854                 "punpckldq 9%1, %%mm3\n\t"
855                 "movq   %%mm0, %%mm1\n\t"
856                 "movq   %%mm0, %%mm2\n\t"
857                 "movq   %%mm3, %%mm4\n\t"
858                 "movq   %%mm3, %%mm5\n\t"
859                 "psllq  $7, %%mm0\n\t"
860                 "psllq  $7, %%mm3\n\t"
861                 "pand   %%mm7, %%mm0\n\t"
862                 "pand   %%mm7, %%mm3\n\t"
863                 "psrlq  $6, %%mm1\n\t"
864                 "psrlq  $6, %%mm4\n\t"
865                 "pand   %%mm6, %%mm1\n\t"
866                 "pand   %%mm6, %%mm4\n\t"
867                 "psrlq  $19, %%mm2\n\t"
868                 "psrlq  $19, %%mm5\n\t"
869                 "pand   %2, %%mm2\n\t"
870                 "pand   %2, %%mm5\n\t"
871                 "por    %%mm1, %%mm0\n\t"
872                 "por    %%mm4, %%mm3\n\t"
873                 "por    %%mm2, %%mm0\n\t"
874                 "por    %%mm5, %%mm3\n\t"
875                 "psllq  $16, %%mm3\n\t"
876                 "por    %%mm3, %%mm0\n\t"
877                 MOVNTQ" %%mm0, %0\n\t"
878                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
879                 d += 4;
880                 s += 12;
881         }
882         __asm __volatile(SFENCE:::"memory");
883         __asm __volatile(EMMS:::"memory");
884 #endif
885         while(s < end)
886         {
887                 const int r= *s++;
888                 const int g= *s++;
889                 const int b= *s++;
890                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
891         }
892 }
893
894 /*
895   Here I use a less accurate approximation: the input value is simply
896   left-shifted and the low-order bits are filled with zeroes.
897   This method improves PNG compression, but the scheme cannot reproduce
898   white exactly, since it never generates an all-ones maximum value;
899   the net effect is to darken the image slightly.
900
901   A better method would be "left bit replication", which repeats the
902   most significant bits of the source value in the freed low-order
903   bits:
904
905    4 3 2 1 0
906    ---------
907    1 1 0 1 1
908
909    7 6 5 4 3  2 1 0
910    ----------------
911    1 1 0 1 1  1 1 0
912    |=======|  |===|
913        |      Leftmost Bits Repeated to Fill Open Bits
914        |
915    Original Bits
916 */
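/*
 * For reference, a scalar sketch of that left bit replication (illustration
 * only, deliberately compiled out -- the conversions below keep the faster
 * zero-fill approach): a channel is widened by repeating its top bits in the
 * freed low bits, so a full-scale input maps to a full-scale output.
 */
#if 0
static inline uint8_t expand5to8(uint8_t v5) /* 000abcde -> abcdeabc */
{
        return (uint8_t)((v5 << 3) | (v5 >> 2));
}
static inline uint8_t expand6to8(uint8_t v6) /* 00abcdef -> abcdefab */
{
        return (uint8_t)((v6 << 2) | (v6 >> 4));
}
#endif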
917 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
918 {
919         const uint16_t *end;
920 #ifdef HAVE_MMX
921         const uint16_t *mm_end;
922 #endif
923         uint8_t *d = (uint8_t *)dst;
924         const uint16_t *s = (uint16_t *)src;
925         end = s + src_size/2;
926 #ifdef HAVE_MMX
927         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
928         mm_end = end - 7;
929         while(s < mm_end)
930         {
931             __asm __volatile(
932                 PREFETCH" 32%1\n\t"
933                 "movq   %1, %%mm0\n\t"
934                 "movq   %1, %%mm1\n\t"
935                 "movq   %1, %%mm2\n\t"
936                 "pand   %2, %%mm0\n\t"
937                 "pand   %3, %%mm1\n\t"
938                 "pand   %4, %%mm2\n\t"
939                 "psllq  $3, %%mm0\n\t"
940                 "psrlq  $2, %%mm1\n\t"
941                 "psrlq  $7, %%mm2\n\t"
942                 "movq   %%mm0, %%mm3\n\t"
943                 "movq   %%mm1, %%mm4\n\t"
944                 "movq   %%mm2, %%mm5\n\t"
945                 "punpcklwd %5, %%mm0\n\t"
946                 "punpcklwd %5, %%mm1\n\t"
947                 "punpcklwd %5, %%mm2\n\t"
948                 "punpckhwd %5, %%mm3\n\t"
949                 "punpckhwd %5, %%mm4\n\t"
950                 "punpckhwd %5, %%mm5\n\t"
951                 "psllq  $8, %%mm1\n\t"
952                 "psllq  $16, %%mm2\n\t"
953                 "por    %%mm1, %%mm0\n\t"
954                 "por    %%mm2, %%mm0\n\t"
955                 "psllq  $8, %%mm4\n\t"
956                 "psllq  $16, %%mm5\n\t"
957                 "por    %%mm4, %%mm3\n\t"
958                 "por    %%mm5, %%mm3\n\t"
959
960                 "movq   %%mm0, %%mm6\n\t"
961                 "movq   %%mm3, %%mm7\n\t"
962                 
963                 "movq   8%1, %%mm0\n\t"
964                 "movq   8%1, %%mm1\n\t"
965                 "movq   8%1, %%mm2\n\t"
966                 "pand   %2, %%mm0\n\t"
967                 "pand   %3, %%mm1\n\t"
968                 "pand   %4, %%mm2\n\t"
969                 "psllq  $3, %%mm0\n\t"
970                 "psrlq  $2, %%mm1\n\t"
971                 "psrlq  $7, %%mm2\n\t"
972                 "movq   %%mm0, %%mm3\n\t"
973                 "movq   %%mm1, %%mm4\n\t"
974                 "movq   %%mm2, %%mm5\n\t"
975                 "punpcklwd %5, %%mm0\n\t"
976                 "punpcklwd %5, %%mm1\n\t"
977                 "punpcklwd %5, %%mm2\n\t"
978                 "punpckhwd %5, %%mm3\n\t"
979                 "punpckhwd %5, %%mm4\n\t"
980                 "punpckhwd %5, %%mm5\n\t"
981                 "psllq  $8, %%mm1\n\t"
982                 "psllq  $16, %%mm2\n\t"
983                 "por    %%mm1, %%mm0\n\t"
984                 "por    %%mm2, %%mm0\n\t"
985                 "psllq  $8, %%mm4\n\t"
986                 "psllq  $16, %%mm5\n\t"
987                 "por    %%mm4, %%mm3\n\t"
988                 "por    %%mm5, %%mm3\n\t"
989
990                 :"=m"(*d)
991                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
992                 :"memory");
993             /* borrowed from the 32-to-24 bit packing above (see rgb32to24) */
994             __asm __volatile(
995                 "movq   %%mm0, %%mm4\n\t"
996                 "movq   %%mm3, %%mm5\n\t"
997                 "movq   %%mm6, %%mm0\n\t"
998                 "movq   %%mm7, %%mm1\n\t"
999                 
1000                 "movq   %%mm4, %%mm6\n\t"
1001                 "movq   %%mm5, %%mm7\n\t"
1002                 "movq   %%mm0, %%mm2\n\t"
1003                 "movq   %%mm1, %%mm3\n\t"
1004
1005                 "psrlq  $8, %%mm2\n\t"
1006                 "psrlq  $8, %%mm3\n\t"
1007                 "psrlq  $8, %%mm6\n\t"
1008                 "psrlq  $8, %%mm7\n\t"
1009                 "pand   %2, %%mm0\n\t"
1010                 "pand   %2, %%mm1\n\t"
1011                 "pand   %2, %%mm4\n\t"
1012                 "pand   %2, %%mm5\n\t"
1013                 "pand   %3, %%mm2\n\t"
1014                 "pand   %3, %%mm3\n\t"
1015                 "pand   %3, %%mm6\n\t"
1016                 "pand   %3, %%mm7\n\t"
1017                 "por    %%mm2, %%mm0\n\t"
1018                 "por    %%mm3, %%mm1\n\t"
1019                 "por    %%mm6, %%mm4\n\t"
1020                 "por    %%mm7, %%mm5\n\t"
1021
1022                 "movq   %%mm1, %%mm2\n\t"
1023                 "movq   %%mm4, %%mm3\n\t"
1024                 "psllq  $48, %%mm2\n\t"
1025                 "psllq  $32, %%mm3\n\t"
1026                 "pand   %4, %%mm2\n\t"
1027                 "pand   %5, %%mm3\n\t"
1028                 "por    %%mm2, %%mm0\n\t"
1029                 "psrlq  $16, %%mm1\n\t"
1030                 "psrlq  $32, %%mm4\n\t"
1031                 "psllq  $16, %%mm5\n\t"
1032                 "por    %%mm3, %%mm1\n\t"
1033                 "pand   %6, %%mm5\n\t"
1034                 "por    %%mm5, %%mm4\n\t"
1035
1036                 MOVNTQ" %%mm0, %0\n\t"
1037                 MOVNTQ" %%mm1, 8%0\n\t"
1038                 MOVNTQ" %%mm4, 16%0"
1039
1040                 :"=m"(*d)
1041                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1042                 :"memory");
1043                 d += 24;
1044                 s += 8;
1045         }
1046         __asm __volatile(SFENCE:::"memory");
1047         __asm __volatile(EMMS:::"memory");
1048 #endif
1049         while(s < end)
1050         {
1051                 register uint16_t bgr;
1052                 bgr = *s++;
1053                 *d++ = (bgr&0x1F)<<3;
1054                 *d++ = (bgr&0x3E0)>>2;
1055                 *d++ = (bgr&0x7C00)>>7;
1056         }
1057 }
1058
1059 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1060 {
1061         const uint16_t *end;
1062 #ifdef HAVE_MMX
1063         const uint16_t *mm_end;
1064 #endif
1065         uint8_t *d = (uint8_t *)dst;
1066         const uint16_t *s = (const uint16_t *)src;
1067         end = s + src_size/2;
1068 #ifdef HAVE_MMX
1069         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1070         mm_end = end - 7;
1071         while(s < mm_end)
1072         {
1073             __asm __volatile(
1074                 PREFETCH" 32%1\n\t"
1075                 "movq   %1, %%mm0\n\t"
1076                 "movq   %1, %%mm1\n\t"
1077                 "movq   %1, %%mm2\n\t"
1078                 "pand   %2, %%mm0\n\t"
1079                 "pand   %3, %%mm1\n\t"
1080                 "pand   %4, %%mm2\n\t"
1081                 "psllq  $3, %%mm0\n\t"
1082                 "psrlq  $3, %%mm1\n\t"
1083                 "psrlq  $8, %%mm2\n\t"
1084                 "movq   %%mm0, %%mm3\n\t"
1085                 "movq   %%mm1, %%mm4\n\t"
1086                 "movq   %%mm2, %%mm5\n\t"
1087                 "punpcklwd %5, %%mm0\n\t"
1088                 "punpcklwd %5, %%mm1\n\t"
1089                 "punpcklwd %5, %%mm2\n\t"
1090                 "punpckhwd %5, %%mm3\n\t"
1091                 "punpckhwd %5, %%mm4\n\t"
1092                 "punpckhwd %5, %%mm5\n\t"
1093                 "psllq  $8, %%mm1\n\t"
1094                 "psllq  $16, %%mm2\n\t"
1095                 "por    %%mm1, %%mm0\n\t"
1096                 "por    %%mm2, %%mm0\n\t"
1097                 "psllq  $8, %%mm4\n\t"
1098                 "psllq  $16, %%mm5\n\t"
1099                 "por    %%mm4, %%mm3\n\t"
1100                 "por    %%mm5, %%mm3\n\t"
1101                 
1102                 "movq   %%mm0, %%mm6\n\t"
1103                 "movq   %%mm3, %%mm7\n\t"
1104
1105                 "movq   8%1, %%mm0\n\t"
1106                 "movq   8%1, %%mm1\n\t"
1107                 "movq   8%1, %%mm2\n\t"
1108                 "pand   %2, %%mm0\n\t"
1109                 "pand   %3, %%mm1\n\t"
1110                 "pand   %4, %%mm2\n\t"
1111                 "psllq  $3, %%mm0\n\t"
1112                 "psrlq  $3, %%mm1\n\t"
1113                 "psrlq  $8, %%mm2\n\t"
1114                 "movq   %%mm0, %%mm3\n\t"
1115                 "movq   %%mm1, %%mm4\n\t"
1116                 "movq   %%mm2, %%mm5\n\t"
1117                 "punpcklwd %5, %%mm0\n\t"
1118                 "punpcklwd %5, %%mm1\n\t"
1119                 "punpcklwd %5, %%mm2\n\t"
1120                 "punpckhwd %5, %%mm3\n\t"
1121                 "punpckhwd %5, %%mm4\n\t"
1122                 "punpckhwd %5, %%mm5\n\t"
1123                 "psllq  $8, %%mm1\n\t"
1124                 "psllq  $16, %%mm2\n\t"
1125                 "por    %%mm1, %%mm0\n\t"
1126                 "por    %%mm2, %%mm0\n\t"
1127                 "psllq  $8, %%mm4\n\t"
1128                 "psllq  $16, %%mm5\n\t"
1129                 "por    %%mm4, %%mm3\n\t"
1130                 "por    %%mm5, %%mm3\n\t"
1131                 :"=m"(*d)
1132                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1133                 :"memory");
1134             /* borrowed from the 32-to-24 bit packing above (see rgb32to24) */
1135             __asm __volatile(
1136                 "movq   %%mm0, %%mm4\n\t"
1137                 "movq   %%mm3, %%mm5\n\t"
1138                 "movq   %%mm6, %%mm0\n\t"
1139                 "movq   %%mm7, %%mm1\n\t"
1140                 
1141                 "movq   %%mm4, %%mm6\n\t"
1142                 "movq   %%mm5, %%mm7\n\t"
1143                 "movq   %%mm0, %%mm2\n\t"
1144                 "movq   %%mm1, %%mm3\n\t"
1145
1146                 "psrlq  $8, %%mm2\n\t"
1147                 "psrlq  $8, %%mm3\n\t"
1148                 "psrlq  $8, %%mm6\n\t"
1149                 "psrlq  $8, %%mm7\n\t"
1150                 "pand   %2, %%mm0\n\t"
1151                 "pand   %2, %%mm1\n\t"
1152                 "pand   %2, %%mm4\n\t"
1153                 "pand   %2, %%mm5\n\t"
1154                 "pand   %3, %%mm2\n\t"
1155                 "pand   %3, %%mm3\n\t"
1156                 "pand   %3, %%mm6\n\t"
1157                 "pand   %3, %%mm7\n\t"
1158                 "por    %%mm2, %%mm0\n\t"
1159                 "por    %%mm3, %%mm1\n\t"
1160                 "por    %%mm6, %%mm4\n\t"
1161                 "por    %%mm7, %%mm5\n\t"
1162
1163                 "movq   %%mm1, %%mm2\n\t"
1164                 "movq   %%mm4, %%mm3\n\t"
1165                 "psllq  $48, %%mm2\n\t"
1166                 "psllq  $32, %%mm3\n\t"
1167                 "pand   %4, %%mm2\n\t"
1168                 "pand   %5, %%mm3\n\t"
1169                 "por    %%mm2, %%mm0\n\t"
1170                 "psrlq  $16, %%mm1\n\t"
1171                 "psrlq  $32, %%mm4\n\t"
1172                 "psllq  $16, %%mm5\n\t"
1173                 "por    %%mm3, %%mm1\n\t"
1174                 "pand   %6, %%mm5\n\t"
1175                 "por    %%mm5, %%mm4\n\t"
1176
1177                 MOVNTQ" %%mm0, %0\n\t"
1178                 MOVNTQ" %%mm1, 8%0\n\t"
1179                 MOVNTQ" %%mm4, 16%0"
1180
1181                 :"=m"(*d)
1182                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1183                 :"memory");
1184                 d += 24;
1185                 s += 8;
1186         }
1187         __asm __volatile(SFENCE:::"memory");
1188         __asm __volatile(EMMS:::"memory");
1189 #endif
1190         while(s < end)
1191         {
1192                 register uint16_t bgr;
1193                 bgr = *s++;
1194                 *d++ = (bgr&0x1F)<<3;
1195                 *d++ = (bgr&0x7E0)>>3;
1196                 *d++ = (bgr&0xF800)>>8;
1197         }
1198 }
1199
1200 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1201 {
1202         const uint16_t *end;
1203 #ifdef HAVE_MMX
1204         const uint16_t *mm_end;
1205 #endif
1206         uint8_t *d = (uint8_t *)dst;
1207         const uint16_t *s = (const uint16_t *)src;
1208         end = s + src_size/2;
1209 #ifdef HAVE_MMX
1210         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1211         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1212         mm_end = end - 3;
1213         while(s < mm_end)
1214         {
1215             __asm __volatile(
1216                 PREFETCH" 32%1\n\t"
1217                 "movq   %1, %%mm0\n\t"
1218                 "movq   %1, %%mm1\n\t"
1219                 "movq   %1, %%mm2\n\t"
1220                 "pand   %2, %%mm0\n\t"
1221                 "pand   %3, %%mm1\n\t"
1222                 "pand   %4, %%mm2\n\t"
1223                 "psllq  $3, %%mm0\n\t"
1224                 "psrlq  $2, %%mm1\n\t"
1225                 "psrlq  $7, %%mm2\n\t"
1226                 "movq   %%mm0, %%mm3\n\t"
1227                 "movq   %%mm1, %%mm4\n\t"
1228                 "movq   %%mm2, %%mm5\n\t"
1229                 "punpcklwd %%mm7, %%mm0\n\t"
1230                 "punpcklwd %%mm7, %%mm1\n\t"
1231                 "punpcklwd %%mm7, %%mm2\n\t"
1232                 "punpckhwd %%mm7, %%mm3\n\t"
1233                 "punpckhwd %%mm7, %%mm4\n\t"
1234                 "punpckhwd %%mm7, %%mm5\n\t"
1235                 "psllq  $8, %%mm1\n\t"
1236                 "psllq  $16, %%mm2\n\t"
1237                 "por    %%mm1, %%mm0\n\t"
1238                 "por    %%mm2, %%mm0\n\t"
1239                 "psllq  $8, %%mm4\n\t"
1240                 "psllq  $16, %%mm5\n\t"
1241                 "por    %%mm4, %%mm3\n\t"
1242                 "por    %%mm5, %%mm3\n\t"
1243                 MOVNTQ" %%mm0, %0\n\t"
1244                 MOVNTQ" %%mm3, 8%0\n\t"
1245                 :"=m"(*d)
1246                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1247                 :"memory");
1248                 d += 16;
1249                 s += 4;
1250         }
1251         __asm __volatile(SFENCE:::"memory");
1252         __asm __volatile(EMMS:::"memory");
1253 #endif
1254         while(s < end)
1255         {
1256 #if 0 // slightly slower on Athlon
1257                 int bgr= *s++;
1258                 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1259 #else
1260                 register uint16_t bgr;
1261                 bgr = *s++;
1262 #ifdef WORDS_BIGENDIAN
1263                 *d++ = 0;
1264                 *d++ = (bgr&0x7C00)>>7;
1265                 *d++ = (bgr&0x3E0)>>2;
1266                 *d++ = (bgr&0x1F)<<3;
1267 #else
1268                 *d++ = (bgr&0x1F)<<3;
1269                 *d++ = (bgr&0x3E0)>>2;
1270                 *d++ = (bgr&0x7C00)>>7;
1271                 *d++ = 0;
1272 #endif
1273
1274 #endif
1275         }
1276 }
1277
1278 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1279 {
1280         const uint16_t *end;
1281 #ifdef HAVE_MMX
1282         const uint16_t *mm_end;
1283 #endif
1284         uint8_t *d = (uint8_t *)dst;
1285         const uint16_t *s = (uint16_t *)src;
1286         end = s + src_size/2;
1287 #ifdef HAVE_MMX
1288         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1289         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1290         mm_end = end - 3;
1291         while(s < mm_end)
1292         {
1293             __asm __volatile(
1294                 PREFETCH" 32%1\n\t"
1295                 "movq   %1, %%mm0\n\t"
1296                 "movq   %1, %%mm1\n\t"
1297                 "movq   %1, %%mm2\n\t"
1298                 "pand   %2, %%mm0\n\t"
1299                 "pand   %3, %%mm1\n\t"
1300                 "pand   %4, %%mm2\n\t"
1301                 "psllq  $3, %%mm0\n\t"
1302                 "psrlq  $3, %%mm1\n\t"
1303                 "psrlq  $8, %%mm2\n\t"
1304                 "movq   %%mm0, %%mm3\n\t"
1305                 "movq   %%mm1, %%mm4\n\t"
1306                 "movq   %%mm2, %%mm5\n\t"
1307                 "punpcklwd %%mm7, %%mm0\n\t"
1308                 "punpcklwd %%mm7, %%mm1\n\t"
1309                 "punpcklwd %%mm7, %%mm2\n\t"
1310                 "punpckhwd %%mm7, %%mm3\n\t"
1311                 "punpckhwd %%mm7, %%mm4\n\t"
1312                 "punpckhwd %%mm7, %%mm5\n\t"
1313                 "psllq  $8, %%mm1\n\t"
1314                 "psllq  $16, %%mm2\n\t"
1315                 "por    %%mm1, %%mm0\n\t"
1316                 "por    %%mm2, %%mm0\n\t"
1317                 "psllq  $8, %%mm4\n\t"
1318                 "psllq  $16, %%mm5\n\t"
1319                 "por    %%mm4, %%mm3\n\t"
1320                 "por    %%mm5, %%mm3\n\t"
1321                 MOVNTQ" %%mm0, %0\n\t"
1322                 MOVNTQ" %%mm3, 8%0\n\t"
1323                 :"=m"(*d)
1324                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1325                 :"memory");
1326                 d += 16;
1327                 s += 4;
1328         }
1329         __asm __volatile(SFENCE:::"memory");
1330         __asm __volatile(EMMS:::"memory");
1331 #endif
1332         while(s < end)
1333         {
1334                 register uint16_t bgr;
1335                 bgr = *s++;
1336 #ifdef WORDS_BIGENDIAN
1337                 *d++ = 0;
1338                 *d++ = (bgr&0xF800)>>8;
1339                 *d++ = (bgr&0x7E0)>>3;
1340                 *d++ = (bgr&0x1F)<<3;
1341 #else
1342                 *d++ = (bgr&0x1F)<<3;
1343                 *d++ = (bgr&0x7E0)>>3;
1344                 *d++ = (bgr&0xF800)>>8;
1345                 *d++ = 0;
1346 #endif
1347         }
1348 }
1349
1350 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1351 {
1352 #ifdef HAVE_MMX
1353 /* TODO: unroll this loop */
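/*
 * Annotation: each 32-bit pixel gets its first and third bytes swapped:
 * pslld $16 moves byte 0 into the byte-2 slot, psrld $16 moves byte 2 into
 * the byte-0 slot, and the three masks select the two shifted channels plus
 * the untouched middle byte before everything is or'ed back together.
 */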
1354         asm volatile (
1355                 "xor %%"REG_a", %%"REG_a"       \n\t"
1356                 ASMALIGN(4)
1357                 "1:                             \n\t"
1358                 PREFETCH" 32(%0, %%"REG_a")     \n\t"
1359                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
1360                 "movq %%mm0, %%mm1              \n\t"
1361                 "movq %%mm0, %%mm2              \n\t"
1362                 "pslld $16, %%mm0               \n\t"
1363                 "psrld $16, %%mm1               \n\t"
1364                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
1365                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
1366                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
1367                 "por %%mm0, %%mm2               \n\t"
1368                 "por %%mm1, %%mm2               \n\t"
1369                 MOVNTQ" %%mm2, (%1, %%"REG_a")  \n\t"
1370                 "add $8, %%"REG_a"              \n\t"
1371                 "cmp %2, %%"REG_a"              \n\t"
1372                 " jb 1b                         \n\t"
1373                 :: "r" (src), "r"(dst), "r" (src_size-7)
1374                 : "%"REG_a
1375         );
1376
1377         __asm __volatile(SFENCE:::"memory");
1378         __asm __volatile(EMMS:::"memory");
1379 #else
1380         unsigned i;
1381         unsigned num_pixels = src_size >> 2;
1382         for(i=0; i<num_pixels; i++)
1383         {
1384 #ifdef WORDS_BIGENDIAN  
1385           dst[4*i + 1] = src[4*i + 3];
1386           dst[4*i + 2] = src[4*i + 2];
1387           dst[4*i + 3] = src[4*i + 1];
1388 #else
1389           dst[4*i + 0] = src[4*i + 2];
1390           dst[4*i + 1] = src[4*i + 1];
1391           dst[4*i + 2] = src[4*i + 0];
1392 #endif
1393         }
1394 #endif
1395 }
1396
1397 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1398 {
1399         unsigned i;
1400 #ifdef HAVE_MMX
1401         long mmx_size= 23 - src_size;
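        /*
         * Annotation: mmx_size starts out negative; the index register is biased
         * by it so that (src-mmx_size, index) begins at src, advances 24 bytes
         * per iteration, and "js 1b" exits once the index turns non-negative.
         * The tail of at most 23 bytes is finished by the byte-swapping C loop
         * at the end of the function.
         */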
1402         asm volatile (
1403                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1404                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1405                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1406                 ASMALIGN(4)
1407                 "1:                             \n\t"
1408                 PREFETCH" 32(%1, %%"REG_a")     \n\t"
1409                 "movq   (%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1410                 "movq   (%1, %%"REG_a"), %%mm1  \n\t" // BGR BGR BG
1411                 "movq  2(%1, %%"REG_a"), %%mm2  \n\t" // R BGR BGR B
1412                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1413                 "pand %%mm5, %%mm0              \n\t"
1414                 "pand %%mm6, %%mm1              \n\t"
1415                 "pand %%mm7, %%mm2              \n\t"
1416                 "por %%mm0, %%mm1               \n\t"
1417                 "por %%mm2, %%mm1               \n\t"                
1418                 "movq  6(%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1419                 MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
1420                 "movq  8(%1, %%"REG_a"), %%mm1  \n\t" // R BGR BGR B
1421                 "movq 10(%1, %%"REG_a"), %%mm2  \n\t" // GR BGR BGR
1422                 "pand %%mm7, %%mm0              \n\t"
1423                 "pand %%mm5, %%mm1              \n\t"
1424                 "pand %%mm6, %%mm2              \n\t"
1425                 "por %%mm0, %%mm1               \n\t"
1426                 "por %%mm2, %%mm1               \n\t"                
1427                 "movq 14(%1, %%"REG_a"), %%mm0  \n\t" // R BGR BGR B
1428                 MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
1429                 "movq 16(%1, %%"REG_a"), %%mm1  \n\t" // GR BGR BGR
1430                 "movq 18(%1, %%"REG_a"), %%mm2  \n\t" // BGR BGR BG
1431                 "pand %%mm6, %%mm0              \n\t"
1432                 "pand %%mm7, %%mm1              \n\t"
1433                 "pand %%mm5, %%mm2              \n\t"
1434                 "por %%mm0, %%mm1               \n\t"
1435                 "por %%mm2, %%mm1               \n\t"                
1436                 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1437                 "add $24, %%"REG_a"             \n\t"
1438                 " js 1b                         \n\t"
1439                 : "+a" (mmx_size)
1440                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1441         );
1442
1443         __asm __volatile(SFENCE:::"memory");
1444         __asm __volatile(EMMS:::"memory");
1445
1446         if(mmx_size==23) return; // finished, size was a multiple of 8
1447
1448         src+= src_size;
1449         dst+= src_size;
1450         src_size= 23-mmx_size;
1451         src-= src_size;
1452         dst-= src_size;
1453 #endif
1454         for(i=0; i<src_size; i+=3)
1455         {
1456                 register uint8_t x;
1457                 x          = src[i + 2];
1458                 dst[i + 1] = src[i + 1];
1459                 dst[i + 2] = src[i + 0];
1460                 dst[i + 0] = x;
1461         }
1462 }
1463
1464 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1465         long width, long height,
1466         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1467 {
1468         long y;
1469         const long chromWidth= width>>1;
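        /*
         * Annotation: a YUY2 line stores each pair of luma samples as the four
         * bytes Y0 U Y1 V, i.e. for chroma sample i of the current line:
         *
         *     dst[4*i+0] = ysrc[2*i];
         *     dst[4*i+1] = usrc[i];
         *     dst[4*i+2] = ysrc[2*i+1];
         *     dst[4*i+3] = vsrc[i];
         *
         * The MMX loop below builds the same layout 16 luma samples at a time
         * with punpcklbw/punpckhbw.
         */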
1470         for(y=0; y<height; y++)
1471         {
1472 #ifdef HAVE_MMX
1473 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1474                 asm volatile(
1475                         "xor %%"REG_a", %%"REG_a"       \n\t"
1476                         ASMALIGN(4)
1477                         "1:                             \n\t"
1478                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1479                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1480                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1481                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1482                         "movq %%mm0, %%mm2              \n\t" // U(0)
1483                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1484                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1485                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1486
1487                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1488                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1489                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1490                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1491                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1492                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1493                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1494                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1495
1496                         MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1497                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1498                         MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1499                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1500
1501                         "add $8, %%"REG_a"              \n\t"
1502                         "cmp %4, %%"REG_a"              \n\t"
1503                         " jb 1b                         \n\t"
1504                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1505                         : "%"REG_a
1506                 );
1507 #else
1508
1509 #if defined ARCH_ALPHA && defined HAVE_MVI
1510 #define pl2yuy2(n)                                      \
1511         y1 = yc[n];                                     \
1512         y2 = yc2[n];                                    \
1513         u = uc[n];                                      \
1514         v = vc[n];                                      \
1515         asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
1516         asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
1517         asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1518         asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1519         yuv1 = (u << 8) + (v << 24);                    \
1520         yuv2 = yuv1 + y2;                               \
1521         yuv1 += y1;                                     \
1522         qdst[n] = yuv1;                                 \
1523         qdst2[n] = yuv2;
1524
1525                 int i;
1526                 uint64_t *qdst = (uint64_t *) dst;
1527                 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1528                 const uint32_t *yc = (uint32_t *) ysrc;
1529                 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1530                 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1531                 for(i = 0; i < chromWidth; i += 8){
1532                         uint64_t y1, y2, yuv1, yuv2;
1533                         uint64_t u, v;
1534                         /* Prefetch */
1535                         asm("ldq $31,64(%0)" :: "r"(yc));
1536                         asm("ldq $31,64(%0)" :: "r"(yc2));
1537                         asm("ldq $31,64(%0)" :: "r"(uc));
1538                         asm("ldq $31,64(%0)" :: "r"(vc));
1539
1540                         pl2yuy2(0);
1541                         pl2yuy2(1);
1542                         pl2yuy2(2);
1543                         pl2yuy2(3);
1544
1545                         yc += 4;
1546                         yc2 += 4;
1547                         uc += 4;
1548                         vc += 4;
1549                         qdst += 4;
1550                         qdst2 += 4;
1551                 }
1552                 y++;
1553                 ysrc += lumStride;
1554                 dst += dstStride;
1555
1556 #elif __WORDSIZE >= 64
1557                 int i;
1558                 uint64_t *ldst = (uint64_t *) dst;
1559                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1560                 for(i = 0; i < chromWidth; i += 2){
1561                         uint64_t k, l;
1562                         k = yc[0] + (uc[0] << 8) +
1563                             (yc[1] << 16) + (vc[0] << 24);
1564                         l = yc[2] + (uc[1] << 8) +
1565                             (yc[3] << 16) + (vc[1] << 24);
1566                         *ldst++ = k + (l << 32);
1567                         yc += 4;
1568                         uc += 2;
1569                         vc += 2;
1570                 }
1571
1572 #else
1573                 int i, *idst = (int32_t *) dst;
1574                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1575                 for(i = 0; i < chromWidth; i++){
1576 #ifdef WORDS_BIGENDIAN
1577                         *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1578                             (yc[1] << 8) + (vc[0] << 0);
1579 #else
1580                         *idst++ = yc[0] + (uc[0] << 8) +
1581                             (yc[1] << 16) + (vc[0] << 24);
1582 #endif
1583                         yc += 2;
1584                         uc++;
1585                         vc++;
1586                 }
1587 #endif
1588 #endif
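                /* advance the chroma pointers only once every vertLumPerChroma
                   luma lines: for vertLumPerChroma==2 (4:2:0 input) this happens
                   after lines 1,3,5,..., for vertLumPerChroma==1 (4:2:2 input)
                   after every line */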
1589                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1590                 {
1591                         usrc += chromStride;
1592                         vsrc += chromStride;
1593                 }
1594                 ysrc += lumStride;
1595                 dst += dstStride;
1596         }
1597 #ifdef HAVE_MMX
1598 asm(    EMMS" \n\t"
1599         SFENCE" \n\t"
1600         :::"memory");
1601 #endif
1602 }
1603
1604 /**
1605  *
1606  * height should be a multiple of 2 and width should be a multiple of 16 (if this
1607  * is a problem for anyone then tell me, and I'll fix it); a usage sketch follows the function body
1608  */
1609 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1610         long width, long height,
1611         long lumStride, long chromStride, long dstStride)
1612 {
1613         //FIXME interpolate chroma
1614         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1615 }
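
/*
 * Illustrative call sketch for the converter above (not part of the library;
 * buffer names are hypothetical and it is assumed the yv12toyuy2 entry point
 * from rgb2rgb.h ends up dispatching to this implementation): converting a
 * 640x480 YV12 frame with tightly packed planes to packed YUY2.
 *
 *     static uint8_t y[640*480], u[320*240], v[320*240], out[640*480*2];
 *     yv12toyuy2(y, u, v, out, 640, 480,
 *                640,      // lumStride:   one luma row in bytes
 *                320,      // chromStride: one chroma row (width/2)
 *                640*2);   // dstStride:   4 output bytes per 2 pixels
 */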
1616
1617 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1618         long width, long height,
1619         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1620 {
1621         long y;
1622         const long chromWidth= width>>1;
1623         for(y=0; y<height; y++)
1624         {
1625 #ifdef HAVE_MMX
1626 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1627                 asm volatile(
1628                         "xor %%"REG_a", %%"REG_a"       \n\t"
1629                         ASMALIGN(4)
1630                         "1:                             \n\t"
1631                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1632                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1633                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1634                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1635                         "movq %%mm0, %%mm2              \n\t" // U(0)
1636                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1637                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1638                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1639
1640                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1641                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1642                         "movq %%mm0, %%mm4              \n\t" // UVUV UVUV(0)
1643                         "movq %%mm2, %%mm6              \n\t" // UVUV UVUV(8)
1644                         "punpcklbw %%mm3, %%mm0         \n\t" // UYVY UYVY(0)
1645                         "punpckhbw %%mm3, %%mm4         \n\t" // UYVY UYVY(4)
1646                         "punpcklbw %%mm5, %%mm2         \n\t" // UYVY UYVY(8)
1647                         "punpckhbw %%mm5, %%mm6         \n\t" // UYVY UYVY(12)
1648
1649                         MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1650                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1651                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1652                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1653
1654                         "add $8, %%"REG_a"              \n\t"
1655                         "cmp %4, %%"REG_a"              \n\t"
1656                         " jb 1b                         \n\t"
1657                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1658                         : "%"REG_a
1659                 );
1660 #else
1661 //FIXME adapt the alpha asm code from yv12->yuy2
1662
1663 #if __WORDSIZE >= 64
1664                 int i;
1665                 uint64_t *ldst = (uint64_t *) dst;
1666                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1667                 for(i = 0; i < chromWidth; i += 2){
1668                         uint64_t k, l;
1669                         k = uc[0] + (yc[0] << 8) +
1670                             (vc[0] << 16) + (yc[1] << 24);
1671                         l = uc[1] + (yc[2] << 8) +
1672                             (vc[1] << 16) + (yc[3] << 24);
1673                         *ldst++ = k + (l << 32);
1674                         yc += 4;
1675                         uc += 2;
1676                         vc += 2;
1677                 }
1678
1679 #else
1680                 int i, *idst = (int32_t *) dst;
1681                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1682                 for(i = 0; i < chromWidth; i++){
1683 #ifdef WORDS_BIGENDIAN
1684                         *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1685                             (vc[0] << 8) + (yc[1] << 0);
1686 #else
1687                         *idst++ = uc[0] + (yc[0] << 8) +
1688                             (vc[0] << 16) + (yc[1] << 24);
1689 #endif
1690                         yc += 2;
1691                         uc++;
1692                         vc++;
1693                 }
1694 #endif
1695 #endif
1696                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1697                 {
1698                         usrc += chromStride;
1699                         vsrc += chromStride;
1700                 }
1701                 ysrc += lumStride;
1702                 dst += dstStride;
1703         }
1704 #ifdef HAVE_MMX
1705 asm(    EMMS" \n\t"
1706         SFENCE" \n\t"
1707         :::"memory");
1708 #endif
1709 }
1710
1711 /**
1712  *
1713  * height should be a multiple of 2 and width should be a multiple of 16 (if this
1714  * is a problem for anyone then tell me, and I'll fix it)
1715  */
1716 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1717         long width, long height,
1718         long lumStride, long chromStride, long dstStride)
1719 {
1720         //FIXME interpolate chroma
1721         RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1722 }
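
/* Same call pattern as yv12toyuy2 above; only the output packing differs:
 * each pixel pair is emitted as U0 Y0 V0 Y1 (UYVY) instead of Y0 U0 Y1 V0. */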
1723
1724 /**
1725  *
1726  * width should be a multiple of 16
1727  */
1728 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1729         long width, long height,
1730         long lumStride, long chromStride, long dstStride)
1731 {
1732         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1733 }
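
/* vertLumPerChroma == 1 here, so the chroma planes advance after every luma
 * line, i.e. the input is treated as 4:2:2 planar rather than 4:2:0. */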
1734
1735 /**
1736  *
1737  * height should be a multiple of 2 and width should be a multiple of 16 (if this
1738  * is a problem for anyone then tell me, and I'll fix it); a usage sketch follows the function body
1739  */
1740 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1741         long width, long height,
1742         long lumStride, long chromStride, long srcStride)
1743 {
1744         long y;
1745         const long chromWidth= width>>1;
1746         for(y=0; y<height; y+=2)
1747         {
1748 #ifdef HAVE_MMX
1749                 asm volatile(
1750                         "xor %%"REG_a", %%"REG_a"       \n\t"
1751                         "pcmpeqw %%mm7, %%mm7           \n\t"
1752                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1753                         ASMALIGN(4)
1754                         "1:                             \n\t"
1755                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1756                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1757                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1758                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1759                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1760                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1761                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1762                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1763                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1764                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1765                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1766
1767                         MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1768
1769                         "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1770                         "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1771                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1772                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1773                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1774                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1775                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1776                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1777                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1778                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1779
1780                         MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1781
1782                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1783                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1784                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1785                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1786                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1787                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1788                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1789                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1790
1791                         MOVNTQ" %%mm0, (%3, %%"REG_a")  \n\t"
1792                         MOVNTQ" %%mm2, (%2, %%"REG_a")  \n\t"
1793
1794                         "add $8, %%"REG_a"              \n\t"
1795                         "cmp %4, %%"REG_a"              \n\t"
1796                         " jb 1b                         \n\t"
1797                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1798                         : "memory", "%"REG_a
1799                 );
1800
1801                 ydst += lumStride;
1802                 src  += srcStride;
1803
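                /* second (luma-only) line of the pair: relies on %%mm7 still
                   holding the 0x00FF word mask set up by the asm block above */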
1804                 asm volatile(
1805                         "xor %%"REG_a", %%"REG_a"       \n\t"
1806                         ASMALIGN(4)
1807                         "1:                             \n\t"
1808                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1809                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1810                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1811                         "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1812                         "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1813                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1814                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1815                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1816                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1817                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1818                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1819
1820                         MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1821                         MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1822
1823                         "add $8, %%"REG_a"              \n\t"
1824                         "cmp %4, %%"REG_a"              \n\t"
1825                         " jb 1b                         \n\t"
1826
1827                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1828                         : "memory", "%"REG_a
1829                 );
1830 #else
1831                 long i;
1832                 for(i=0; i<chromWidth; i++)
1833                 {
1834                         ydst[2*i+0]     = src[4*i+0];
1835                         udst[i]         = src[4*i+1];
1836                         ydst[2*i+1]     = src[4*i+2];
1837                         vdst[i]         = src[4*i+3];
1838                 }
1839                 ydst += lumStride;
1840                 src  += srcStride;
1841
1842                 for(i=0; i<chromWidth; i++)
1843                 {
1844                         ydst[2*i+0]     = src[4*i+0];
1845                         ydst[2*i+1]     = src[4*i+2];
1846                 }
1847 #endif
1848                 udst += chromStride;
1849                 vdst += chromStride;
1850                 ydst += lumStride;
1851                 src  += srcStride;
1852         }
1853 #ifdef HAVE_MMX
1854 asm volatile(   EMMS" \n\t"
1855                 SFENCE" \n\t"
1856                 :::"memory");
1857 #endif
1858 }
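
/*
 * Illustrative call sketch for yuy2toyv12 (hypothetical buffers, assuming the
 * corresponding rgb2rgb.h entry point selects this version): unpacking a
 * 640x480 YUY2 image into tightly packed YV12-style planes.
 *
 *     static uint8_t in[640*480*2], y[640*480], u[320*240], v[320*240];
 *     yuy2toyv12(in, y, u, v, 640, 480,
 *                640,      // lumStride
 *                320,      // chromStride (width/2)
 *                640*2);   // srcStride: 4 input bytes per 2 pixels
 */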
1859
1860 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1861         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1862         long width, long height, long lumStride, long chromStride)
1863 {
1864         /* Y Plane */
1865         memcpy(ydst, ysrc, width*height);
1866
1867         /* XXX: implement upscaling for U,V */
1868 }
1869
1870 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1871 {
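        /* Doubles a single plane in both directions. Interior output samples are
           3:1 / 1:3 blends of the two nearest source samples, i.e. bilinear
           interpolation at quarter-sample offsets; border samples are copied. */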
1872         long x,y;
1873         
1874         dst[0]= src[0];
1875         
1876         // first line
1877         for(x=0; x<srcWidth-1; x++){
1878                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1879                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1880         }
1881         dst[2*srcWidth-1]= src[srcWidth-1];
1882         
1883         dst+= dstStride;
1884
1885         for(y=1; y<srcHeight; y++){
1886 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1887                 const long mmxSize= srcWidth&~15;
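                /* each repeated PAVGB pair below approximates the 3:1 weighting
                   of the scalar code, i.e. roughly (3*a + b)>>2, with slightly
                   different rounding than the C fallback */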
1888                 asm volatile(
1889                         "mov %4, %%"REG_a"              \n\t"
1890                         "1:                             \n\t"
1891                         "movq (%0, %%"REG_a"), %%mm0    \n\t"
1892                         "movq (%1, %%"REG_a"), %%mm1    \n\t"
1893                         "movq 1(%0, %%"REG_a"), %%mm2   \n\t"
1894                         "movq 1(%1, %%"REG_a"), %%mm3   \n\t"
1895                         "movq -1(%0, %%"REG_a"), %%mm4  \n\t"
1896                         "movq -1(%1, %%"REG_a"), %%mm5  \n\t"
1897                         PAVGB" %%mm0, %%mm5             \n\t"
1898                         PAVGB" %%mm0, %%mm3             \n\t"
1899                         PAVGB" %%mm0, %%mm5             \n\t"
1900                         PAVGB" %%mm0, %%mm3             \n\t"
1901                         PAVGB" %%mm1, %%mm4             \n\t"
1902                         PAVGB" %%mm1, %%mm2             \n\t"
1903                         PAVGB" %%mm1, %%mm4             \n\t"
1904                         PAVGB" %%mm1, %%mm2             \n\t"
1905                         "movq %%mm5, %%mm7              \n\t"
1906                         "movq %%mm4, %%mm6              \n\t"
1907                         "punpcklbw %%mm3, %%mm5         \n\t"
1908                         "punpckhbw %%mm3, %%mm7         \n\t"
1909                         "punpcklbw %%mm2, %%mm4         \n\t"
1910                         "punpckhbw %%mm2, %%mm6         \n\t"
1911 #if 1
1912                         MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1913                         MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1914                         MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1915                         MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1916 #else
1917                         "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1918                         "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1919                         "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1920                         "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1921 #endif
1922                         "add $8, %%"REG_a"              \n\t"
1923                         " js 1b                         \n\t"
1924                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1925                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1926                            "g" (-mmxSize)
1927                         : "%"REG_a
1928
1929                 );
1930 #else
1931                 const long mmxSize=1;
1932 #endif
1933                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1934                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1935
1936                 for(x=mmxSize-1; x<srcWidth-1; x++){
1937                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1938                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1939                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1940                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1941                 }
1942                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1943                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1944
1945                 dst+=dstStride*2;
1946                 src+=srcStride;
1947         }
1948         
1949         // last line
1950 #if 1
1951         dst[0]= src[0];
1952         
1953         for(x=0; x<srcWidth-1; x++){
1954                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1955                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1956         }
1957         dst[2*srcWidth-1]= src[srcWidth-1];
1958 #else
1959         for(x=0; x<srcWidth; x++){
1960                 dst[2*x+0]=
1961                 dst[2*x+1]= src[x];
1962         }
1963 #endif
1964
1965 #ifdef HAVE_MMX
1966 asm volatile(   EMMS" \n\t"
1967                 SFENCE" \n\t"
1968                 :::"memory");
1969 #endif
1970 }
1971
1972 /**
1973  *
1974  * height should be a multiple of 2 and width should be a multiple of 16 (if this
1975  * is a problem for anyone then tell me, and I'll fix it)
1976  * chrominance data is only taken from every second line, the others are ignored; FIXME: write HQ version
1977  */
1978 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1979         long width, long height,
1980         long lumStride, long chromStride, long srcStride)
1981 {
1982         long y;
1983         const long chromWidth= width>>1;
1984         for(y=0; y<height; y+=2)
1985         {
1986 #ifdef HAVE_MMX
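                /* FIXME: unlike the other converters this block still indexes with
                   %%eax/addl/cmpl instead of REG_a; that is fine on 32-bit x86 but
                   not portable to x86-64, where base and index registers must have
                   the same width */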
1987                 asm volatile(
1988                         "xorl %%eax, %%eax              \n\t"
1989                         "pcmpeqw %%mm7, %%mm7           \n\t"
1990                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1991                         ASMALIGN(4)
1992                         "1:                             \n\t"
1993                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1994                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1995                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1996                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
1997                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
1998                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
1999                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
2000                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
2001                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
2002                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
2003                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
2004
2005                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
2006
2007                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
2008                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
2009                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
2010                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
2011                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
2012                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
2013                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
2014                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
2015                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
2016                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
2017
2018                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
2019
2020                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
2021                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
2022                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
2023                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
2024                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
2025                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
2026                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
2027                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
2028
2029                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
2030                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
2031
2032                         "addl $8, %%eax                 \n\t"
2033                         "cmpl %4, %%eax                 \n\t"
2034                         " jb 1b                         \n\t"
2035                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2036                         : "memory", "%eax"
2037                 );
2038
2039                 ydst += lumStride;
2040                 src  += srcStride;
2041
2042                 asm volatile(
2043                         "xorl %%eax, %%eax              \n\t"
2044                         ASMALIGN(4)
2045                         "1:                             \n\t"
2046                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
2047                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
2048                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
2049                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(8)
2050                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // UYVY UYVY(12)
2051                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2052                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2053                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2054                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2055                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
2056                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
2057
2058                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
2059                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
2060
2061                         "addl $8, %%eax                 \n\t"
2062                         "cmpl %4, %%eax                 \n\t"
2063                         " jb 1b                         \n\t"
2064
2065                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2066                         : "memory", "%eax"
2067                 );
2068 #else
2069                 long i;
2070                 for(i=0; i<chromWidth; i++)
2071                 {
2072                         udst[i]         = src[4*i+0];
2073                         ydst[2*i+0]     = src[4*i+1];
2074                         vdst[i]         = src[4*i+2];
2075                         ydst[2*i+1]     = src[4*i+3];
2076                 }
2077                 ydst += lumStride;
2078                 src  += srcStride;
2079
2080                 for(i=0; i<chromWidth; i++)
2081                 {
2082                         ydst[2*i+0]     = src[4*i+1];
2083                         ydst[2*i+1]     = src[4*i+3];
2084                 }
2085 #endif
2086                 udst += chromStride;
2087                 vdst += chromStride;
2088                 ydst += lumStride;
2089                 src  += srcStride;
2090         }
2091 #ifdef HAVE_MMX
2092 asm volatile(   EMMS" \n\t"
2093                 SFENCE" \n\t"
2094                 :::"memory");
2095 #endif
2096 }
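
/* UYVY counterpart of yuy2toyv12 above; parameters and stride conventions are
 * identical, only the input byte order (U0 Y0 V0 Y1 per pixel pair) differs. */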
2097
2098 /**
2099  *
2100  * height should be a multiple of 2 and width should be a multiple of 2 (if this
2101  * is a problem for anyone then tell me, and I'll fix it); a usage sketch follows the function body
2102  * chrominance data is only taken from every second line, the others are ignored in the C version; FIXME: write HQ version
2103  */
2104 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2105         long width, long height,
2106         long lumStride, long chromStride, long srcStride)
2107 {
2108         long y;
2109         const long chromWidth= width>>1;
2110 #ifdef HAVE_MMX
2111         for(y=0; y<height-2; y+=2)
2112         {
2113                 long i;
2114                 for(i=0; i<2; i++)
2115                 {
2116                         asm volatile(
2117                                 "mov %2, %%"REG_a"              \n\t"
2118                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
2119                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
2120                                 "pxor %%mm7, %%mm7              \n\t"
2121                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2122                                 ASMALIGN(4)
2123                                 "1:                             \n\t"
2124                                 PREFETCH" 64(%0, %%"REG_d")     \n\t"
2125                                 "movd (%0, %%"REG_d"), %%mm0    \n\t"
2126                                 "movd 3(%0, %%"REG_d"), %%mm1   \n\t"
2127                                 "punpcklbw %%mm7, %%mm0         \n\t"
2128                                 "punpcklbw %%mm7, %%mm1         \n\t"
2129                                 "movd 6(%0, %%"REG_d"), %%mm2   \n\t"
2130                                 "movd 9(%0, %%"REG_d"), %%mm3   \n\t"
2131                                 "punpcklbw %%mm7, %%mm2         \n\t"
2132                                 "punpcklbw %%mm7, %%mm3         \n\t"
2133                                 "pmaddwd %%mm6, %%mm0           \n\t"
2134                                 "pmaddwd %%mm6, %%mm1           \n\t"
2135                                 "pmaddwd %%mm6, %%mm2           \n\t"
2136                                 "pmaddwd %%mm6, %%mm3           \n\t"
2137 #ifndef FAST_BGR2YV12
2138                                 "psrad $8, %%mm0                \n\t"
2139                                 "psrad $8, %%mm1                \n\t"
2140                                 "psrad $8, %%mm2                \n\t"
2141                                 "psrad $8, %%mm3                \n\t"
2142 #endif
2143                                 "packssdw %%mm1, %%mm0          \n\t"
2144                                 "packssdw %%mm3, %%mm2          \n\t"
2145                                 "pmaddwd %%mm5, %%mm0           \n\t"
2146                                 "pmaddwd %%mm5, %%mm2           \n\t"
2147                                 "packssdw %%mm2, %%mm0          \n\t"
2148                                 "psraw $7, %%mm0                \n\t"
2149
2150                                 "movd 12(%0, %%"REG_d"), %%mm4  \n\t"
2151                                 "movd 15(%0, %%"REG_d"), %%mm1  \n\t"
2152                                 "punpcklbw %%mm7, %%mm4         \n\t"
2153                                 "punpcklbw %%mm7, %%mm1         \n\t"
2154                                 "movd 18(%0, %%"REG_d"), %%mm2  \n\t"
2155                                 "movd 21(%0, %%"REG_d"), %%mm3  \n\t"
2156                                 "punpcklbw %%mm7, %%mm2         \n\t"
2157                                 "punpcklbw %%mm7, %%mm3         \n\t"
2158                                 "pmaddwd %%mm6, %%mm4           \n\t"
2159                                 "pmaddwd %%mm6, %%mm1           \n\t"
2160                                 "pmaddwd %%mm6, %%mm2           \n\t"
2161                                 "pmaddwd %%mm6, %%mm3           \n\t"
2162 #ifndef FAST_BGR2YV12
2163                                 "psrad $8, %%mm4                \n\t"
2164                                 "psrad $8, %%mm1                \n\t"
2165                                 "psrad $8, %%mm2                \n\t"
2166                                 "psrad $8, %%mm3                \n\t"
2167 #endif
2168                                 "packssdw %%mm1, %%mm4          \n\t"
2169                                 "packssdw %%mm3, %%mm2          \n\t"
2170                                 "pmaddwd %%mm5, %%mm4           \n\t"
2171                                 "pmaddwd %%mm5, %%mm2           \n\t"
2172                                 "add $24, %%"REG_d"             \n\t"
2173                                 "packssdw %%mm2, %%mm4          \n\t"
2174                                 "psraw $7, %%mm4                \n\t"
2175
2176                                 "packuswb %%mm4, %%mm0          \n\t"
2177                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
2178
2179                                 MOVNTQ" %%mm0, (%1, %%"REG_a")  \n\t"
2180                                 "add $8, %%"REG_a"              \n\t"
2181                                 " js 1b                         \n\t"
2182                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2183                                 : "%"REG_a, "%"REG_d
2184                         );
2185                         ydst += lumStride;
2186                         src  += srcStride;
2187                 }
2188                 src -= srcStride*2;
2189                 asm volatile(
2190                         "mov %4, %%"REG_a"              \n\t"
2191                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2192                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
2193                         "pxor %%mm7, %%mm7              \n\t"
2194                         "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2195                         "add %%"REG_d", %%"REG_d"       \n\t"
2196                         ASMALIGN(4)
2197                         "1:                             \n\t"
2198                         PREFETCH" 64(%0, %%"REG_d")     \n\t"
2199                         PREFETCH" 64(%1, %%"REG_d")     \n\t"
2200 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2201                         "movq (%0, %%"REG_d"), %%mm0    \n\t"
2202                         "movq (%1, %%"REG_d"), %%mm1    \n\t"
2203                         "movq 6(%0, %%"REG_d"), %%mm2   \n\t"
2204                         "movq 6(%1, %%"REG_d"), %%mm3   \n\t"
2205                         PAVGB" %%mm1, %%mm0             \n\t"
2206                         PAVGB" %%mm3, %%mm2             \n\t"
2207                         "movq %%mm0, %%mm1              \n\t"
2208                         "movq %%mm2, %%mm3              \n\t"
2209                         "psrlq $24, %%mm0               \n\t"
2210                         "psrlq $24, %%mm2               \n\t"
2211                         PAVGB" %%mm1, %%mm0             \n\t"
2212                         PAVGB" %%mm3, %%mm2             \n\t"
2213                         "punpcklbw %%mm7, %%mm0         \n\t"
2214                         "punpcklbw %%mm7, %%mm2         \n\t"
2215 #else
2216                         "movd (%0, %%"REG_d"), %%mm0    \n\t"
2217                         "movd (%1, %%"REG_d"), %%mm1    \n\t"
2218                         "movd 3(%0, %%"REG_d"), %%mm2   \n\t"
2219                         "movd 3(%1, %%"REG_d"), %%mm3   \n\t"
2220                         "punpcklbw %%mm7, %%mm0         \n\t"
2221                         "punpcklbw %%mm7, %%mm1         \n\t"
2222                         "punpcklbw %%mm7, %%mm2         \n\t"
2223                         "punpcklbw %%mm7, %%mm3         \n\t"
2224                         "paddw %%mm1, %%mm0             \n\t"
2225                         "paddw %%mm3, %%mm2             \n\t"
2226                         "paddw %%mm2, %%mm0             \n\t"
2227                         "movd 6(%0, %%"REG_d"), %%mm4   \n\t"
2228                         "movd 6(%1, %%"REG_d"), %%mm1   \n\t"
2229                         "movd 9(%0, %%"REG_d"), %%mm2   \n\t"
2230                         "movd 9(%1, %%"REG_d"), %%mm3   \n\t"
2231                         "punpcklbw %%mm7, %%mm4         \n\t"
2232                         "punpcklbw %%mm7, %%mm1         \n\t"
2233                         "punpcklbw %%mm7, %%mm2         \n\t"
2234                         "punpcklbw %%mm7, %%mm3         \n\t"
2235                         "paddw %%mm1, %%mm4             \n\t"
2236                         "paddw %%mm3, %%mm2             \n\t"
2237                         "paddw %%mm4, %%mm2             \n\t"
2238                         "psrlw $2, %%mm0                \n\t"
2239                         "psrlw $2, %%mm2                \n\t"
2240 #endif
2241                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2242                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2243
2244                         "pmaddwd %%mm0, %%mm1           \n\t"
2245                         "pmaddwd %%mm2, %%mm3           \n\t"
2246                         "pmaddwd %%mm6, %%mm0           \n\t"
2247                         "pmaddwd %%mm6, %%mm2           \n\t"
2248 #ifndef FAST_BGR2YV12
2249                         "psrad $8, %%mm0                \n\t"
2250                         "psrad $8, %%mm1                \n\t"
2251                         "psrad $8, %%mm2                \n\t"
2252                         "psrad $8, %%mm3                \n\t"
2253 #endif
2254                         "packssdw %%mm2, %%mm0          \n\t"
2255                         "packssdw %%mm3, %%mm1          \n\t"
2256                         "pmaddwd %%mm5, %%mm0           \n\t"
2257                         "pmaddwd %%mm5, %%mm1           \n\t"
2258                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2259                         "psraw $7, %%mm0                \n\t"
2260
2261 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2262                         "movq 12(%0, %%"REG_d"), %%mm4  \n\t"
2263                         "movq 12(%1, %%"REG_d"), %%mm1  \n\t"
2264                         "movq 18(%0, %%"REG_d"), %%mm2  \n\t"
2265                         "movq 18(%1, %%"REG_d"), %%mm3  \n\t"
2266                         PAVGB" %%mm1, %%mm4             \n\t"
2267                         PAVGB" %%mm3, %%mm2             \n\t"
2268                         "movq %%mm4, %%mm1              \n\t"
2269                         "movq %%mm2, %%mm3              \n\t"
2270                         "psrlq $24, %%mm4               \n\t"
2271                         "psrlq $24, %%mm2               \n\t"
2272                         PAVGB" %%mm1, %%mm4             \n\t"
2273                         PAVGB" %%mm3, %%mm2             \n\t"
2274                         "punpcklbw %%mm7, %%mm4         \n\t"
2275                         "punpcklbw %%mm7, %%mm2         \n\t"
2276 #else
2277                         "movd 12(%0, %%"REG_d"), %%mm4  \n\t"
2278                         "movd 12(%1, %%"REG_d"), %%mm1  \n\t"
2279                         "movd 15(%0, %%"REG_d"), %%mm2  \n\t"
2280                         "movd 15(%1, %%"REG_d"), %%mm3  \n\t"
2281                         "punpcklbw %%mm7, %%mm4         \n\t"
2282                         "punpcklbw %%mm7, %%mm1         \n\t"
2283                         "punpcklbw %%mm7, %%mm2         \n\t"
2284                         "punpcklbw %%mm7, %%mm3         \n\t"
2285                         "paddw %%mm1, %%mm4             \n\t"
2286                         "paddw %%mm3, %%mm2             \n\t"
2287                         "paddw %%mm2, %%mm4             \n\t"
2288                         "movd 18(%0, %%"REG_d"), %%mm5  \n\t"
2289                         "movd 18(%1, %%"REG_d"), %%mm1  \n\t"
2290                         "movd 21(%0, %%"REG_d"), %%mm2  \n\t"
2291                         "movd 21(%1, %%"REG_d"), %%mm3  \n\t"
2292                         "punpcklbw %%mm7, %%mm5         \n\t"
2293                         "punpcklbw %%mm7, %%mm1         \n\t"
2294                         "punpcklbw %%mm7, %%mm2         \n\t"
2295                         "punpcklbw %%mm7, %%mm3         \n\t"
2296                         "paddw %%mm1, %%mm5             \n\t"
2297                         "paddw %%mm3, %%mm2             \n\t"
2298                         "paddw %%mm5, %%mm2             \n\t"
2299                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2300                         "psrlw $2, %%mm4                \n\t"
2301                         "psrlw $2, %%mm2                \n\t"
2302 #endif
2303                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2304                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2305
2306                         "pmaddwd %%mm4, %%mm1           \n\t"
2307                         "pmaddwd %%mm2, %%mm3           \n\t"
2308                         "pmaddwd %%mm6, %%mm4           \n\t"
2309                         "pmaddwd %%mm6, %%mm2           \n\t"
2310 #ifndef FAST_BGR2YV12
2311                         "psrad $8, %%mm4                \n\t"
2312                         "psrad $8, %%mm1                \n\t"
2313                         "psrad $8, %%mm2                \n\t"
2314                         "psrad $8, %%mm3                \n\t"
2315 #endif
2316                         "packssdw %%mm2, %%mm4          \n\t"
2317                         "packssdw %%mm3, %%mm1          \n\t"
2318                         "pmaddwd %%mm5, %%mm4           \n\t"
2319                         "pmaddwd %%mm5, %%mm1           \n\t"
2320                         "add $24, %%"REG_d"             \n\t"
2321                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2322                         "psraw $7, %%mm4                \n\t"
2323
2324                         "movq %%mm0, %%mm1              \n\t"
2325                         "punpckldq %%mm4, %%mm0         \n\t"
2326                         "punpckhdq %%mm4, %%mm1         \n\t"
2327                         "packsswb %%mm1, %%mm0          \n\t"
2328                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2329                         "movd %%mm0, (%2, %%"REG_a")    \n\t"
2330                         "punpckhdq %%mm0, %%mm0         \n\t"
2331                         "movd %%mm0, (%3, %%"REG_a")    \n\t"
2332                         "add $4, %%"REG_a"              \n\t"
2333                         " js 1b                         \n\t"
2334                         : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2335                         : "%"REG_a, "%"REG_d
2336                 );
2337
2338                 udst += chromStride;
2339                 vdst += chromStride;
2340                 src  += srcStride*2;
2341         }
2342
2343         asm volatile(   EMMS" \n\t"
2344                         SFENCE" \n\t"
2345                         :::"memory");
2346 #else
2347         y=0;
2348 #endif
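        /* C fallback (also the tail of the MMX path): BT.601 studio-range style
           RGB->YCbCr using the RY/GY/BY... coefficients defined alongside this
           template, so e.g. a white input pixel lands near Y=235 rather than 255
           (assuming the usual limited-range coefficients). Chroma is taken from
           the first line of each pair only; the second line contributes luma only. */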
2349         for(; y<height; y+=2)
2350         {
2351                 long i;
2352                 for(i=0; i<chromWidth; i++)
2353                 {
2354                         unsigned int b= src[6*i+0];
2355                         unsigned int g= src[6*i+1];
2356                         unsigned int r= src[6*i+2];
2357
2358                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2359                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2360                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2361
2362                         udst[i]         = U;
2363                         vdst[i]         = V;
2364                         ydst[2*i]       = Y;
2365
2366                         b= src[6*i+3];
2367                         g= src[6*i+4];
2368                         r= src[6*i+5];
2369
2370                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2371                         ydst[2*i+1]     = Y;
2372                 }
2373                 ydst += lumStride;
2374                 src  += srcStride;
2375
2376                 for(i=0; i<chromWidth; i++)
2377                 {
2378                         unsigned int b= src[6*i+0];
2379                         unsigned int g= src[6*i+1];
2380                         unsigned int r= src[6*i+2];
2381
2382                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2383
2384                         ydst[2*i]       = Y;
2385
2386                         b= src[6*i+3];
2387                         g= src[6*i+4];
2388                         r= src[6*i+5];
2389
2390                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2391                         ydst[2*i+1]     = Y;
2392                 }
2393                 udst += chromStride;
2394                 vdst += chromStride;
2395                 ydst += lumStride;
2396                 src  += srcStride;
2397         }
2398 }
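
/*
 * Illustrative call sketch for rgb24toyv12 (hypothetical buffers; note that the
 * scalar fallback above reads each pixel as B,G,R despite the function name):
 * downsampling a packed 24bpp 640x480 image to 4:2:0 planes.
 *
 *     static uint8_t rgb[640*480*3], y[640*480], u[320*240], v[320*240];
 *     rgb24toyv12(rgb, y, u, v, 640, 480,
 *                 640,      // lumStride
 *                 320,      // chromStride
 *                 640*3);   // srcStride: 3 bytes per source pixel
 */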
2399
2400 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2401                             long width, long height, long src1Stride,
2402                             long src2Stride, long dstStride){
2403         long h;
2404
2405         for(h=0; h < height; h++)
2406         {
2407                 long w;
2408
2409 #ifdef HAVE_MMX
2410 #ifdef HAVE_SSE2
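                /* movdqa/movntdq require 16-byte aligned addresses, so this path
                   effectively assumes src1, src2, dest and the three strides are
                   16-byte aligned; otherwise it will fault */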
2411                 asm(
2412                         "xor %%"REG_a", %%"REG_a"       \n\t"
2413                         "1:                             \n\t"
2414                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2415                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2416                         "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2417                         "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2418                         "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2419                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2420                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2421                         "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2422                         "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2423                         "add $16, %%"REG_a"             \n\t"
2424                         "cmp %3, %%"REG_a"              \n\t"
2425                         " jb 1b                         \n\t"
2426                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2427                         : "memory", "%"REG_a""
2428                 );
2429 #else
2430                 asm(
2431                         "xor %%"REG_a", %%"REG_a"       \n\t"
2432                         "1:                             \n\t"
2433                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2434                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2435                         "movq (%1, %%"REG_a"), %%mm0    \n\t"
2436                         "movq 8(%1, %%"REG_a"), %%mm2   \n\t"
2437                         "movq %%mm0, %%mm1              \n\t"
2438                         "movq %%mm2, %%mm3              \n\t"
2439                         "movq (%2, %%"REG_a"), %%mm4    \n\t"
2440                         "movq 8(%2, %%"REG_a"), %%mm5   \n\t"
2441                         "punpcklbw %%mm4, %%mm0         \n\t"
2442                         "punpckhbw %%mm4, %%mm1         \n\t"
2443                         "punpcklbw %%mm5, %%mm2         \n\t"
2444                         "punpckhbw %%mm5, %%mm3         \n\t"
2445                         MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2446                         MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2447                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2448                         MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2449                         "add $16, %%"REG_a"             \n\t"
2450                         "cmp %3, %%"REG_a"              \n\t"
2451                         " jb 1b                         \n\t"
2452                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2453                         : "memory", "%"REG_a
2454                 );
2455 #endif
2456                 for(w= (width&(~15)); w < width; w++)
2457                 {
2458                         dest[2*w+0] = src1[w];
2459                         dest[2*w+1] = src2[w];
2460                 }
2461 #else
2462                 for(w=0; w < width; w++)
2463                 {
2464                         dest[2*w+0] = src1[w];
2465                         dest[2*w+1] = src2[w];
2466                 }
2467 #endif
2468                 dest += dstStride;
2469                 src1 += src1Stride;
2470                 src2 += src2Stride;
2471         }
2472 #ifdef HAVE_MMX
2473         asm(
2474                 EMMS" \n\t"
2475                 SFENCE" \n\t"
2476                 ::: "memory"
2477                 );
2478 #endif
2479 }
2480
2481 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2482                         uint8_t *dst1, uint8_t *dst2,
2483                         long width, long height,
2484                         long srcStride1, long srcStride2,
2485                         long dstStride1, long dstStride2)
2486 {
2487     long y,x,w,h;
2488     w=width/2; h=height/2;
2489 #ifdef HAVE_MMX
2490     asm volatile(
2491         PREFETCH" %0\n\t"
2492         PREFETCH" %1\n\t"
2493         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2494 #endif
2495     for(y=0;y<h;y++){
2496         const uint8_t* s1=src1+srcStride1*(y>>1);
2497         uint8_t* d=dst1+dstStride1*y;
2498         x=0;
2499 #ifdef HAVE_MMX
2500         for(;x<w-31;x+=32)
2501         {
2502             asm volatile(
2503                 PREFETCH" 32%1\n\t"
2504                 "movq   %1, %%mm0\n\t"
2505                 "movq   8%1, %%mm2\n\t"
2506                 "movq   16%1, %%mm4\n\t"
2507                 "movq   24%1, %%mm6\n\t"
2508                 "movq   %%mm0, %%mm1\n\t"
2509                 "movq   %%mm2, %%mm3\n\t"
2510                 "movq   %%mm4, %%mm5\n\t"
2511                 "movq   %%mm6, %%mm7\n\t"
2512                 "punpcklbw %%mm0, %%mm0\n\t"
2513                 "punpckhbw %%mm1, %%mm1\n\t"
2514                 "punpcklbw %%mm2, %%mm2\n\t"
2515                 "punpckhbw %%mm3, %%mm3\n\t"
2516                 "punpcklbw %%mm4, %%mm4\n\t"
2517                 "punpckhbw %%mm5, %%mm5\n\t"
2518                 "punpcklbw %%mm6, %%mm6\n\t"
2519                 "punpckhbw %%mm7, %%mm7\n\t"
2520                 MOVNTQ" %%mm0, %0\n\t"
2521                 MOVNTQ" %%mm1, 8%0\n\t"
2522                 MOVNTQ" %%mm2, 16%0\n\t"
2523                 MOVNTQ" %%mm3, 24%0\n\t"
2524                 MOVNTQ" %%mm4, 32%0\n\t"
2525                 MOVNTQ" %%mm5, 40%0\n\t"
2526                 MOVNTQ" %%mm6, 48%0\n\t"
2527                 MOVNTQ" %%mm7, 56%0"
2528                 :"=m"(d[2*x])
2529                 :"m"(s1[x])
2530                 :"memory");
2531         }
2532 #endif
2533         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2534     }
2535     for(y=0;y<h;y++){
2536         const uint8_t* s2=src2+srcStride2*(y>>1);
2537         uint8_t* d=dst2+dstStride2*y;
2538         x=0;
2539 #ifdef HAVE_MMX
2540         for(;x<w-31;x+=32)
2541         {
2542             asm volatile(
2543                 PREFETCH" 32%1\n\t"
2544                 "movq   %1, %%mm0\n\t"
2545                 "movq   8%1, %%mm2\n\t"
2546                 "movq   16%1, %%mm4\n\t"
2547                 "movq   24%1, %%mm6\n\t"
2548                 "movq   %%mm0, %%mm1\n\t"
2549                 "movq   %%mm2, %%mm3\n\t"
2550                 "movq   %%mm4, %%mm5\n\t"
2551                 "movq   %%mm6, %%mm7\n\t"
2552                 "punpcklbw %%mm0, %%mm0\n\t"
2553                 "punpckhbw %%mm1, %%mm1\n\t"
2554                 "punpcklbw %%mm2, %%mm2\n\t"
2555                 "punpckhbw %%mm3, %%mm3\n\t"
2556                 "punpcklbw %%mm4, %%mm4\n\t"
2557                 "punpckhbw %%mm5, %%mm5\n\t"
2558                 "punpcklbw %%mm6, %%mm6\n\t"
2559                 "punpckhbw %%mm7, %%mm7\n\t"
2560                 MOVNTQ" %%mm0, %0\n\t"
2561                 MOVNTQ" %%mm1, 8%0\n\t"
2562                 MOVNTQ" %%mm2, 16%0\n\t"
2563                 MOVNTQ" %%mm3, 24%0\n\t"
2564                 MOVNTQ" %%mm4, 32%0\n\t"
2565                 MOVNTQ" %%mm5, 40%0\n\t"
2566                 MOVNTQ" %%mm6, 48%0\n\t"
2567                 MOVNTQ" %%mm7, 56%0"
2568                 :"=m"(d[2*x])
2569                 :"m"(s2[x])
2570                 :"memory");
2571         }
2572 #endif
2573         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2574     }
2575 #ifdef HAVE_MMX
2576         asm(
2577                 EMMS" \n\t"
2578                 SFENCE" \n\t"
2579                 ::: "memory"
2580                 );
2581 #endif
2582 }
2583
2584 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2585                         uint8_t *dst,
2586                         long width, long height,
2587                         long srcStride1, long srcStride2,
2588                         long srcStride3, long dstStride)
2589 {
2590     long y,x,w,h;
2591     w=width/2; h=height;
2592     for(y=0;y<h;y++){
2593         const uint8_t* yp=src1+srcStride1*y;
2594         const uint8_t* up=src2+srcStride2*(y>>2);
2595         const uint8_t* vp=src3+srcStride3*(y>>2);
2596         uint8_t* d=dst+dstStride*y;
2597         x=0;
2598 #ifdef HAVE_MMX
2599         for(;x<w-7;x+=8)
2600         {
2601             asm volatile(
2602                 PREFETCH" 32(%1, %0)\n\t"
2603                 PREFETCH" 32(%2, %0)\n\t"
2604                 PREFETCH" 32(%3, %0)\n\t"
2605                 "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2606                 "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2607                 "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
2608                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2609                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2610                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2611                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2612                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2613                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2614                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2615
2616                 "movq   %%mm1, %%mm6\n\t"
2617                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2618                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2619                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2620                 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2621                 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2622                 
2623                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2624                 "movq   8(%1, %0, 4), %%mm0\n\t"
2625                 "movq   %%mm0, %%mm3\n\t"
2626                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2627                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2628                 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2629                 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2630
2631                 "movq   %%mm4, %%mm6\n\t"
2632                 "movq   16(%1, %0, 4), %%mm0\n\t"
2633                 "movq   %%mm0, %%mm3\n\t"
2634                 "punpcklbw %%mm5, %%mm4\n\t"
2635                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2636                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2637                 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2638                 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2639                 
2640                 "punpckhbw %%mm5, %%mm6\n\t"
2641                 "movq   24(%1, %0, 4), %%mm0\n\t"
2642                 "movq   %%mm0, %%mm3\n\t"
2643                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2644                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2645                 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2646                 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2647
2648                 : "+r" (x)
2649                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2650                 :"memory");
2651         }
2652 #endif
2653         for(; x<w; x++)
2654         {
2655             const long x2= x<<2;
2656             d[8*x+0]=yp[x2];
2657             d[8*x+1]=up[x];
2658             d[8*x+2]=yp[x2+1];
2659             d[8*x+3]=vp[x];
2660             d[8*x+4]=yp[x2+2];
2661             d[8*x+5]=up[x];
2662             d[8*x+6]=yp[x2+3];
2663             d[8*x+7]=vp[x];
2664         }
2665     }
2666 #ifdef HAVE_MMX
2667         asm(
2668                 EMMS" \n\t"
2669                 SFENCE" \n\t"
2670                 ::: "memory"
2671                 );
2672 #endif
2673 }