]> git.sesse.net Git - vlc/blob - modules/video_filter/swscale/rgb2rgb_template.c
Compile fix for C++ files
[vlc] / modules / video_filter / swscale / rgb2rgb_template.c
1 /*
2  *
 *  rgb2rgb.c, Software RGB to RGB converter
 *  pluralize by Software PAL8 to RGB converter
 *               Software YUV to YUV converter
 *               Software YUV to RGB converter
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  */
10
11 #include <stddef.h>
12 #include <inttypes.h> /* for __WORDSIZE */
13
14 #ifndef __WORDSIZE
15 // #warning You have misconfigured system and probably will lose performance!
16 #define __WORDSIZE MP_WORDSIZE
17 #endif
18
19 #undef PREFETCH
20 #undef MOVNTQ
21 #undef EMMS
22 #undef SFENCE
23 #undef MMREG_SIZE
24 #undef PREFETCHW
25 #undef PAVGB
26
27 #ifdef HAVE_SSE2
28 #define MMREG_SIZE 16
29 #else
30 #define MMREG_SIZE 8
31 #endif
32
33 #ifdef HAVE_3DNOW
34 #define PREFETCH  "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB     "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #define PAVGB     "pavgb"
41 #else
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
44 #endif
45
46 #ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
48 #define EMMS     "femms"
49 #else
50 #define EMMS     "emms"
51 #endif
52
53 #ifdef HAVE_MMX2
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
56 #else
57 #define MOVNTQ "movq"
58 #define SFENCE "/nop"
59 #endif
60
/*
 * RENAME(rgb24to32): expand packed 24bpp pixels to 32bpp by appending a
 * zero filler byte after every 3-byte pixel.
 * src_size is the source size in bytes; dst must hold (src_size/3)*4
 * bytes — the caller is responsible for sizing it.
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  const uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  /* Each MMX iteration consumes 24 source bytes (8 pixels); stop the
     vector loop 23 bytes early and let the scalar loop finish up. */
  mm_end = end - 23;
  /* mm7 = mask32: clears the stray 4th byte of each expanded pixel. */
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  while(s < mm_end)
  {
    /* Load two 3-byte pixels per mmreg (movd + punpckldq), mask off the
       garbage high byte of each, then stream out 32 bytes.
       NOTE(review): punpckldq 21%1 reads 4 bytes up to s+24, i.e. one
       byte past the 24 consumed — possible 1-byte overread at the very
       end of the buffer; confirm against the bound above. */
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movd   %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd   6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd   12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd   18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand   %%mm7, %%mm0\n\t"
        "pand   %%mm7, %%mm1\n\t"
        "pand   %%mm7, %%mm2\n\t"
        "pand   %%mm7, %%mm3\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm2, 16%0\n\t"
        MOVNTQ" %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");  /* flush non-temporal stores */
  __asm __volatile(EMMS:::"memory");    /* leave MMX state for FPU code */
#endif
  /* Scalar tail (and the whole conversion without MMX). */
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = 0;
  }
}
111
/*
 * RENAME(rgb32to24): drop the filler byte of every 32bpp pixel, packing
 * the remaining 3 bytes tightly.
 * src_size is in bytes; dst needs (src_size/4)*3 bytes.
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  const uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  /* 32 source bytes (8 pixels) -> 24 output bytes per iteration. */
  mm_end = end - 31;
  while(s < mm_end)
  {
    /* Stage 1: within each qword, close the 1-byte gap of the second
       pixel (psrlq $8 + mask24l/mask24h merge), giving 6 packed bytes
       per mmreg.  Stage 2: realign those four 6-byte groups into three
       contiguous output qwords via the mask24hh.. masks. */
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movq   %1, %%mm0\n\t"
        "movq   8%1, %%mm1\n\t"
        "movq   16%1, %%mm4\n\t"
        "movq   24%1, %%mm5\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "movq   %%mm4, %%mm6\n\t"
        "movq   %%mm5, %%mm7\n\t"
        "psrlq  $8, %%mm2\n\t"
        "psrlq  $8, %%mm3\n\t"
        "psrlq  $8, %%mm6\n\t"
        "psrlq  $8, %%mm7\n\t"
        "pand   %2, %%mm0\n\t"
        "pand   %2, %%mm1\n\t"
        "pand   %2, %%mm4\n\t"
        "pand   %2, %%mm5\n\t"
        "pand   %3, %%mm2\n\t"
        "pand   %3, %%mm3\n\t"
        "pand   %3, %%mm6\n\t"
        "pand   %3, %%mm7\n\t"
        "por    %%mm2, %%mm0\n\t"
        "por    %%mm3, %%mm1\n\t"
        "por    %%mm6, %%mm4\n\t"
        "por    %%mm7, %%mm5\n\t"

        "movq   %%mm1, %%mm2\n\t"
        "movq   %%mm4, %%mm3\n\t"
        "psllq  $48, %%mm2\n\t"
        "psllq  $32, %%mm3\n\t"
        "pand   %4, %%mm2\n\t"
        "pand   %5, %%mm3\n\t"
        "por    %%mm2, %%mm0\n\t"
        "psrlq  $16, %%mm1\n\t"
        "psrlq  $32, %%mm4\n\t"
        "psllq  $16, %%mm5\n\t"
        "por    %%mm3, %%mm1\n\t"
        "pand   %6, %%mm5\n\t"
        "por    %%mm5, %%mm4\n\t"

        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
    dest += 24;
    s += 32;
  }
  __asm __volatile(SFENCE:::"memory");  /* flush non-temporal stores */
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar tail: copy 3 payload bytes, skip the filler byte. */
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    s++;
  }
}
188
189 /*
190  Original by Strepto/Astral
191  ported to gcc & bugfixed : A'rpi
192  MMX2, 3DNOW optimization by Nick Kurshev
193  32bit c version, and and&add trick by Michael Niedermayer
194 */
/*
 * RENAME(rgb15to16): convert 15bpp RGB555 to 16bpp RGB565.
 * Blue (low 5 bits) stays put; the red/green fields move up one bit via
 * the "and&add" trick: x + (x & mask_of_red_green) doubles (= shifts
 * left by one) exactly those fields.  The new low green bit is 0.
 * src_size is in bytes; processes src_size/2 pixels.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  register const uint8_t* s=src;
  register uint8_t* d=dst;
  register const uint8_t *end;
  const uint8_t *mm_end;
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s));
  /* mm4 = mask15s: selects the red+green fields for the add trick. */
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
  mm_end = end - 15;  /* 16 bytes (8 pixels) per MMX iteration */
  while(s<mm_end)
  {
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "pand   %%mm4, %%mm0\n\t"
                "pand   %%mm4, %%mm2\n\t"
                "paddw  %%mm1, %%mm0\n\t"
                "paddw  %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                );
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: two pixels per 32-bit word using the same trick. */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register unsigned x= *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one 16-bit pixel can remain. */
    if(s < end)
    {
        register unsigned short x= *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
243
/*
 * RENAME(rgb16to15): convert 16bpp RGB565 to 15bpp RGB555.
 * Blue (low 5 bits) is kept; red and green are shifted down one bit,
 * discarding the least significant green bit of the 565 source.
 * src_size is in bytes; processes src_size/2 pixels.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  register const uint8_t* s=src;
  register uint8_t* d=dst;
  register const uint8_t *end;
  const uint8_t *mm_end;
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s));
  /* mm7 = mask15rg (red+green after >>1), mm6 = mask15b (blue). */
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
  mm_end = end - 15;  /* 16 bytes (8 pixels) per MMX iteration */
  while(s<mm_end)
  {
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "psrlq  $1, %%mm0\n\t"
                "psrlq  $1, %%mm2\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm3\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                );
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: two pixels per 32-bit word. */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register uint32_t x= *((uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* At most one 16-bit pixel can remain. */
    if(s < end)
    {
        register uint16_t x= *((uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}
299
300 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
301 {
302         const uint8_t *s = src;
303         const uint8_t *end;
304 #ifdef HAVE_MMX
305         const uint8_t *mm_end;
306 #endif
307         uint16_t *d = (uint16_t *)dst;
308         end = s + src_size;
309 #ifdef HAVE_MMX
310         mm_end = end - 15;
311 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
312         asm volatile(
313                 "movq %3, %%mm5                 \n\t"
314                 "movq %4, %%mm6                 \n\t"
315                 "movq %5, %%mm7                 \n\t"
316                 ".balign 16                     \n\t"
317                 "1:                             \n\t"
318                 PREFETCH" 32(%1)                \n\t"
319                 "movd   (%1), %%mm0             \n\t"
320                 "movd   4(%1), %%mm3            \n\t"
321                 "punpckldq 8(%1), %%mm0         \n\t"
322                 "punpckldq 12(%1), %%mm3        \n\t"
323                 "movq %%mm0, %%mm1              \n\t"
324                 "movq %%mm3, %%mm4              \n\t"
325                 "pand %%mm6, %%mm0              \n\t"
326                 "pand %%mm6, %%mm3              \n\t"
327                 "pmaddwd %%mm7, %%mm0           \n\t"
328                 "pmaddwd %%mm7, %%mm3           \n\t"
329                 "pand %%mm5, %%mm1              \n\t"
330                 "pand %%mm5, %%mm4              \n\t"
331                 "por %%mm1, %%mm0               \n\t"   
332                 "por %%mm4, %%mm3               \n\t"
333                 "psrld $5, %%mm0                \n\t"
334                 "pslld $11, %%mm3               \n\t"
335                 "por %%mm3, %%mm0               \n\t"
336                 MOVNTQ" %%mm0, (%0)             \n\t"
337                 "addl $16, %1                   \n\t"
338                 "addl $8, %0                    \n\t"
339                 "cmpl %2, %1                    \n\t"
340                 " jb 1b                         \n\t"
341                 : "+r" (d), "+r"(s)
342                 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
343         );
344 #else
345         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
346         __asm __volatile(
347             "movq       %0, %%mm7\n\t"
348             "movq       %1, %%mm6\n\t"
349             ::"m"(red_16mask),"m"(green_16mask));
350         while(s < mm_end)
351         {
352             __asm __volatile(
353                 PREFETCH" 32%1\n\t"
354                 "movd   %1, %%mm0\n\t"
355                 "movd   4%1, %%mm3\n\t"
356                 "punpckldq 8%1, %%mm0\n\t"
357                 "punpckldq 12%1, %%mm3\n\t"
358                 "movq   %%mm0, %%mm1\n\t"
359                 "movq   %%mm0, %%mm2\n\t"
360                 "movq   %%mm3, %%mm4\n\t"
361                 "movq   %%mm3, %%mm5\n\t"
362                 "psrlq  $3, %%mm0\n\t"
363                 "psrlq  $3, %%mm3\n\t"
364                 "pand   %2, %%mm0\n\t"
365                 "pand   %2, %%mm3\n\t"
366                 "psrlq  $5, %%mm1\n\t"
367                 "psrlq  $5, %%mm4\n\t"
368                 "pand   %%mm6, %%mm1\n\t"
369                 "pand   %%mm6, %%mm4\n\t"
370                 "psrlq  $8, %%mm2\n\t"
371                 "psrlq  $8, %%mm5\n\t"
372                 "pand   %%mm7, %%mm2\n\t"
373                 "pand   %%mm7, %%mm5\n\t"
374                 "por    %%mm1, %%mm0\n\t"
375                 "por    %%mm4, %%mm3\n\t"
376                 "por    %%mm2, %%mm0\n\t"
377                 "por    %%mm5, %%mm3\n\t"
378                 "psllq  $16, %%mm3\n\t"
379                 "por    %%mm3, %%mm0\n\t"
380                 MOVNTQ" %%mm0, %0\n\t"
381                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
382                 d += 4;
383                 s += 16;
384         }
385 #endif
386         __asm __volatile(SFENCE:::"memory");
387         __asm __volatile(EMMS:::"memory");
388 #endif
389         while(s < end)
390         {
391                 const int src= *s; s += 4;
392                 *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393 //              *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
394         }
395 }
396
397 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
398 {
399         const uint8_t *s = src;
400         const uint8_t *end;
401 #ifdef HAVE_MMX
402         const uint8_t *mm_end;
403 #endif
404         uint16_t *d = (uint16_t *)dst;
405         end = s + src_size;
406 #ifdef HAVE_MMX
407         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
408         __asm __volatile(
409             "movq       %0, %%mm7\n\t"
410             "movq       %1, %%mm6\n\t"
411             ::"m"(red_16mask),"m"(green_16mask));
412         mm_end = end - 15;
413         while(s < mm_end)
414         {
415             __asm __volatile(
416                 PREFETCH" 32%1\n\t"
417                 "movd   %1, %%mm0\n\t"
418                 "movd   4%1, %%mm3\n\t"
419                 "punpckldq 8%1, %%mm0\n\t"
420                 "punpckldq 12%1, %%mm3\n\t"
421                 "movq   %%mm0, %%mm1\n\t"
422                 "movq   %%mm0, %%mm2\n\t"
423                 "movq   %%mm3, %%mm4\n\t"
424                 "movq   %%mm3, %%mm5\n\t"
425                 "psllq  $8, %%mm0\n\t"
426                 "psllq  $8, %%mm3\n\t"
427                 "pand   %%mm7, %%mm0\n\t"
428                 "pand   %%mm7, %%mm3\n\t"
429                 "psrlq  $5, %%mm1\n\t"
430                 "psrlq  $5, %%mm4\n\t"
431                 "pand   %%mm6, %%mm1\n\t"
432                 "pand   %%mm6, %%mm4\n\t"
433                 "psrlq  $19, %%mm2\n\t"
434                 "psrlq  $19, %%mm5\n\t"
435                 "pand   %2, %%mm2\n\t"
436                 "pand   %2, %%mm5\n\t"
437                 "por    %%mm1, %%mm0\n\t"
438                 "por    %%mm4, %%mm3\n\t"
439                 "por    %%mm2, %%mm0\n\t"
440                 "por    %%mm5, %%mm3\n\t"
441                 "psllq  $16, %%mm3\n\t"
442                 "por    %%mm3, %%mm0\n\t"
443                 MOVNTQ" %%mm0, %0\n\t"
444                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
445                 d += 4;
446                 s += 16;
447         }
448         __asm __volatile(SFENCE:::"memory");
449         __asm __volatile(EMMS:::"memory");
450 #endif
451         while(s < end)
452         {
453                 const int src= *s; s += 4;
454                 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
455         }
456 }
457
458 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
459 {
460         const uint8_t *s = src;
461         const uint8_t *end;
462 #ifdef HAVE_MMX
463         const uint8_t *mm_end;
464 #endif
465         uint16_t *d = (uint16_t *)dst;
466         end = s + src_size;
467 #ifdef HAVE_MMX
468         mm_end = end - 15;
469 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
470         asm volatile(
471                 "movq %3, %%mm5                 \n\t"
472                 "movq %4, %%mm6                 \n\t"
473                 "movq %5, %%mm7                 \n\t"
474                 ".balign 16                     \n\t"
475                 "1:                             \n\t"
476                 PREFETCH" 32(%1)                \n\t"
477                 "movd   (%1), %%mm0             \n\t"
478                 "movd   4(%1), %%mm3            \n\t"
479                 "punpckldq 8(%1), %%mm0         \n\t"
480                 "punpckldq 12(%1), %%mm3        \n\t"
481                 "movq %%mm0, %%mm1              \n\t"
482                 "movq %%mm3, %%mm4              \n\t"
483                 "pand %%mm6, %%mm0              \n\t"
484                 "pand %%mm6, %%mm3              \n\t"
485                 "pmaddwd %%mm7, %%mm0           \n\t"
486                 "pmaddwd %%mm7, %%mm3           \n\t"
487                 "pand %%mm5, %%mm1              \n\t"
488                 "pand %%mm5, %%mm4              \n\t"
489                 "por %%mm1, %%mm0               \n\t"   
490                 "por %%mm4, %%mm3               \n\t"
491                 "psrld $6, %%mm0                \n\t"
492                 "pslld $10, %%mm3               \n\t"
493                 "por %%mm3, %%mm0               \n\t"
494                 MOVNTQ" %%mm0, (%0)             \n\t"
495                 "addl $16, %1                   \n\t"
496                 "addl $8, %0                    \n\t"
497                 "cmpl %2, %1                    \n\t"
498                 " jb 1b                         \n\t"
499                 : "+r" (d), "+r"(s)
500                 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
501         );
502 #else
503         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
504         __asm __volatile(
505             "movq       %0, %%mm7\n\t"
506             "movq       %1, %%mm6\n\t"
507             ::"m"(red_15mask),"m"(green_15mask));
508         while(s < mm_end)
509         {
510             __asm __volatile(
511                 PREFETCH" 32%1\n\t"
512                 "movd   %1, %%mm0\n\t"
513                 "movd   4%1, %%mm3\n\t"
514                 "punpckldq 8%1, %%mm0\n\t"
515                 "punpckldq 12%1, %%mm3\n\t"
516                 "movq   %%mm0, %%mm1\n\t"
517                 "movq   %%mm0, %%mm2\n\t"
518                 "movq   %%mm3, %%mm4\n\t"
519                 "movq   %%mm3, %%mm5\n\t"
520                 "psrlq  $3, %%mm0\n\t"
521                 "psrlq  $3, %%mm3\n\t"
522                 "pand   %2, %%mm0\n\t"
523                 "pand   %2, %%mm3\n\t"
524                 "psrlq  $6, %%mm1\n\t"
525                 "psrlq  $6, %%mm4\n\t"
526                 "pand   %%mm6, %%mm1\n\t"
527                 "pand   %%mm6, %%mm4\n\t"
528                 "psrlq  $9, %%mm2\n\t"
529                 "psrlq  $9, %%mm5\n\t"
530                 "pand   %%mm7, %%mm2\n\t"
531                 "pand   %%mm7, %%mm5\n\t"
532                 "por    %%mm1, %%mm0\n\t"
533                 "por    %%mm4, %%mm3\n\t"
534                 "por    %%mm2, %%mm0\n\t"
535                 "por    %%mm5, %%mm3\n\t"
536                 "psllq  $16, %%mm3\n\t"
537                 "por    %%mm3, %%mm0\n\t"
538                 MOVNTQ" %%mm0, %0\n\t"
539                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
540                 d += 4;
541                 s += 16;
542         }
543 #endif
544         __asm __volatile(SFENCE:::"memory");
545         __asm __volatile(EMMS:::"memory");
546 #endif
547         while(s < end)
548         {
549                 const int src= *s; s += 4;
550                 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
551         }
552 }
553
554 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
555 {
556         const uint8_t *s = src;
557         const uint8_t *end;
558 #ifdef HAVE_MMX
559         const uint8_t *mm_end;
560 #endif
561         uint16_t *d = (uint16_t *)dst;
562         end = s + src_size;
563 #ifdef HAVE_MMX
564         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
565         __asm __volatile(
566             "movq       %0, %%mm7\n\t"
567             "movq       %1, %%mm6\n\t"
568             ::"m"(red_15mask),"m"(green_15mask));
569         mm_end = end - 15;
570         while(s < mm_end)
571         {
572             __asm __volatile(
573                 PREFETCH" 32%1\n\t"
574                 "movd   %1, %%mm0\n\t"
575                 "movd   4%1, %%mm3\n\t"
576                 "punpckldq 8%1, %%mm0\n\t"
577                 "punpckldq 12%1, %%mm3\n\t"
578                 "movq   %%mm0, %%mm1\n\t"
579                 "movq   %%mm0, %%mm2\n\t"
580                 "movq   %%mm3, %%mm4\n\t"
581                 "movq   %%mm3, %%mm5\n\t"
582                 "psllq  $7, %%mm0\n\t"
583                 "psllq  $7, %%mm3\n\t"
584                 "pand   %%mm7, %%mm0\n\t"
585                 "pand   %%mm7, %%mm3\n\t"
586                 "psrlq  $6, %%mm1\n\t"
587                 "psrlq  $6, %%mm4\n\t"
588                 "pand   %%mm6, %%mm1\n\t"
589                 "pand   %%mm6, %%mm4\n\t"
590                 "psrlq  $19, %%mm2\n\t"
591                 "psrlq  $19, %%mm5\n\t"
592                 "pand   %2, %%mm2\n\t"
593                 "pand   %2, %%mm5\n\t"
594                 "por    %%mm1, %%mm0\n\t"
595                 "por    %%mm4, %%mm3\n\t"
596                 "por    %%mm2, %%mm0\n\t"
597                 "por    %%mm5, %%mm3\n\t"
598                 "psllq  $16, %%mm3\n\t"
599                 "por    %%mm3, %%mm0\n\t"
600                 MOVNTQ" %%mm0, %0\n\t"
601                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
602                 d += 4;
603                 s += 16;
604         }
605         __asm __volatile(SFENCE:::"memory");
606         __asm __volatile(EMMS:::"memory");
607 #endif
608         while(s < end)
609         {
610                 const int src= *s; s += 4;
611                 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
612         }
613 }
614
/*
 * RENAME(rgb24to16): convert packed 24bpp (byte order B,G,R, as the
 * scalar loop below shows) to 16bpp RGB565.
 * src_size is in bytes; processes src_size/3 pixels.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 = red mask, mm6 = green mask for the 565 packing below. */
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /* 12 source bytes (4 pixels) per iteration.
           NOTE(review): the sibling rgb24tobgr16 uses end - 15; with
           end - 11 the punpckldq at offset 9 (4-byte read up to s+12)
           may read one byte past the buffer — confirm and align. */
        mm_end = end - 11;
        while(s < mm_end)
        {
            /* Two 3-byte pixels per mmreg; shift+mask each channel into
               its 565 slot, then merge the two pixel pairs. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: pack one pixel at a time. */
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
677
/*
 * RENAME(rgb24tobgr16): convert packed 24bpp (byte order R,G,B, per the
 * scalar loop) to 16bpp with red/blue swapped relative to rgb24to16.
 * src_size is in bytes; processes src_size/3 pixels.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 = red mask, mm6 = green mask for the 565 packing below. */
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 15;  /* 12 source bytes (4 pixels) per iteration */
        while(s < mm_end)
        {
            /* Like rgb24to16 but with swapped shifts (psllq $8 /
               psrlq $19) so red and blue land in each other's fields. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psllq  $8, %%mm0\n\t"
                "psllq  $8, %%mm3\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $19, %%mm2\n\t"
                "psrlq  $19, %%mm5\n\t"
                "pand   %2, %%mm2\n\t"
                "pand   %2, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: note the r,g,b read order (swapped vs rgb24to16). */
        while(s < end)
        {
                const int r= *s++;
                const int g= *s++;
                const int b= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
740
/*
 * RENAME(rgb24to15): convert packed 24bpp (byte order B,G,R, as the
 * scalar loop below shows) to 15bpp RGB555.
 * src_size is in bytes; processes src_size/3 pixels.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 = red mask, mm6 = green mask for the 555 packing below. */
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* 12 source bytes (4 pixels) per iteration.
           NOTE(review): the sibling rgb24tobgr15 uses end - 15; with
           end - 11 the punpckldq at offset 9 (4-byte read up to s+12)
           may read one byte past the buffer — confirm and align. */
        mm_end = end - 11;
        while(s < mm_end)
        {
            /* Two 3-byte pixels per mmreg; shift+mask each channel into
               its 555 slot, then merge the two pixel pairs. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: pack one pixel at a time. */
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
803
/*
 * RENAME(rgb24tobgr15): convert packed 24bpp (byte order R,G,B, per the
 * scalar loop) to 15bpp with red/blue swapped relative to rgb24to15.
 * src_size is in bytes; processes src_size/3 pixels.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        /* mm7 = red mask, mm6 = green mask for the 555 packing below. */
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        mm_end = end - 15;  /* 12 source bytes (4 pixels) per iteration */
        while(s < mm_end)
        {
            /* Like rgb24to15 but with swapped shifts (psllq $7 /
               psrlq $19) so red and blue land in each other's fields. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psllq  $7, %%mm0\n\t"
                "psllq  $7, %%mm3\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $19, %%mm2\n\t"
                "psrlq  $19, %%mm5\n\t"
                "pand   %2, %%mm2\n\t"
                "pand   %2, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: note the r,g,b read order (swapped vs rgb24to15). */
        while(s < end)
        {
                const int r= *s++;
                const int g= *s++;
                const int b= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
866
867 /*
  Here we use a less accurate approximation: the input value is simply
  left-shifted and the low-order bits are filled with zeroes. This method
  improves PNG compression, but it cannot reproduce white exactly, since it
  never generates an all-ones maximum value; the net effect is to darken the
  image slightly.
875
876   The better method should be "left bit replication":
877
878    4 3 2 1 0
879    ---------
880    1 1 0 1 1
881
882    7 6 5 4 3  2 1 0
883    ----------------
884    1 1 0 1 1  1 1 0
885    |=======|  |===|
886        |      Leftmost Bits Repeated to Fill Open Bits
887        |
888    Original Bits
889 */
/*
 * Convert 15bit BGR (xBBBBBGGGGGRRRRR, 2 bytes/pixel) to packed 24bit BGR
 * (3 bytes/pixel).  src_size is the SOURCE size in bytes.  Each 5bit field
 * is expanded to 8 bits by left-shifting and zero-filling the low bits
 * (see the approximation note above: pure white is slightly darkened).
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;                     /* one past the last source pixel */
#ifdef HAVE_MMX
        const uint16_t *mm_end;                  /* limit for the 8-pixel MMX loop */
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
        mm_end = end - 7;                        /* leftover pixels go to the scalar tail */
        while(s < mm_end)
        {
            /* Unpack 8 pixels: isolate each colour field with mask15b/g/r,
               shift it to its 8bit position, then zero-extend the 16bit
               lanes to 32bit, yielding one 00RGB dword per pixel.
               Pixels 0-3 end up parked in mm6/mm7, pixels 4-7 stay in
               mm0/mm3 for the repacking asm below. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   %1, %%mm1\n\t"
                "movq   %1, %%mm2\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %3, %%mm1\n\t"
                "pand   %4, %%mm2\n\t"
                "psllq  $3, %%mm0\n\t"
                "psrlq  $2, %%mm1\n\t"
                "psrlq  $7, %%mm2\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "movq   %%mm1, %%mm4\n\t"
                "movq   %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq  $8, %%mm1\n\t"
                "psllq  $16, %%mm2\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psllq  $8, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm5, %%mm3\n\t"

                "movq   %%mm0, %%mm6\n\t"
                "movq   %%mm3, %%mm7\n\t"

                "movq   8%1, %%mm0\n\t"
                "movq   8%1, %%mm1\n\t"
                "movq   8%1, %%mm2\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %3, %%mm1\n\t"
                "pand   %4, %%mm2\n\t"
                "psllq  $3, %%mm0\n\t"
                "psrlq  $2, %%mm1\n\t"
                "psrlq  $7, %%mm2\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "movq   %%mm1, %%mm4\n\t"
                "movq   %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq  $8, %%mm1\n\t"
                "psllq  $16, %%mm2\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psllq  $8, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm5, %%mm3\n\t"

                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24 */
            /* Repack the 8 dword pixels (mm0,mm3 plus mm6,mm7 saved above)
               into 24 contiguous bytes, dropping the zero filler byte of
               each pixel, and store with three (possibly non-temporal)
               movq's.  NOTE(review): this relies on the MMX registers
               surviving unchanged between the two asm statements -- the
               compiler is given no such guarantee; fragile but deliberate
               in this code base. */
            __asm __volatile(
                "movq   %%mm0, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "movq   %%mm6, %%mm0\n\t"
                "movq   %%mm7, %%mm1\n\t"

                "movq   %%mm4, %%mm6\n\t"
                "movq   %%mm5, %%mm7\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm1, %%mm3\n\t"

                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm3\n\t"
                "psrlq  $8, %%mm6\n\t"
                "psrlq  $8, %%mm7\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm1\n\t"
                "pand   %2, %%mm4\n\t"
                "pand   %2, %%mm5\n\t"
                "pand   %3, %%mm2\n\t"
                "pand   %3, %%mm3\n\t"
                "pand   %3, %%mm6\n\t"
                "pand   %3, %%mm7\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm3, %%mm1\n\t"
                "por    %%mm6, %%mm4\n\t"
                "por    %%mm7, %%mm5\n\t"

                "movq   %%mm1, %%mm2\n\t"
                "movq   %%mm4, %%mm3\n\t"
                "psllq  $48, %%mm2\n\t"
                "psllq  $32, %%mm3\n\t"
                "pand   %4, %%mm2\n\t"
                "pand   %5, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psrlq  $16, %%mm1\n\t"
                "psrlq  $32, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm3, %%mm1\n\t"
                "pand   %6, %%mm5\n\t"
                "por    %%mm5, %%mm4\n\t"

                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm1, 8%0\n\t"
                MOVNTQ" %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
                d += 24;                         /* 8 pixels * 3 bytes */
                s += 8;                          /* 8 pixels * 2 bytes */
        }
        __asm __volatile(SFENCE:::"memory");     /* flush non-temporal stores */
        __asm __volatile(EMMS:::"memory");       /* leave MMX state for FPU code */
#endif
        /* Scalar tail (and the whole conversion when MMX is unavailable). */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;            /* blue:  5 -> 8 bits */
                *d++ = (bgr&0x3E0)>>2;           /* green: 5 -> 8 bits */
                *d++ = (bgr&0x7C00)>>7;          /* red:   5 -> 8 bits */
        }
}
1031
/*
 * Convert 16bit BGR (BBBBBGGGGGGRRRRR, 2 bytes/pixel) to packed 24bit BGR
 * (3 bytes/pixel).  src_size is the SOURCE size in bytes.  Fields are
 * expanded to 8 bits by shifting with zero fill (5bit B/R, 6bit G).
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;                     /* one past the last source pixel */
#ifdef HAVE_MMX
        const uint16_t *mm_end;                  /* limit for the 8-pixel MMX loop */
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
        mm_end = end - 7;                        /* leftover pixels go to the scalar tail */
        while(s < mm_end)
        {
            /* Unpack 8 pixels: isolate each field with mask16b/g/r, shift
               to its 8bit position (note $3/$8 here vs $2/$7 in the 15bit
               variant, because green is 6 bits), and zero-extend to dword
               pixels.  Pixels 0-3 are parked in mm6/mm7, pixels 4-7 stay
               in mm0/mm3 for the repacking asm below. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   %1, %%mm1\n\t"
                "movq   %1, %%mm2\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %3, %%mm1\n\t"
                "pand   %4, %%mm2\n\t"
                "psllq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm1\n\t"
                "psrlq  $8, %%mm2\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "movq   %%mm1, %%mm4\n\t"
                "movq   %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq  $8, %%mm1\n\t"
                "psllq  $16, %%mm2\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psllq  $8, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm5, %%mm3\n\t"

                "movq   %%mm0, %%mm6\n\t"
                "movq   %%mm3, %%mm7\n\t"

                "movq   8%1, %%mm0\n\t"
                "movq   8%1, %%mm1\n\t"
                "movq   8%1, %%mm2\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %3, %%mm1\n\t"
                "pand   %4, %%mm2\n\t"
                "psllq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm1\n\t"
                "psrlq  $8, %%mm2\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "movq   %%mm1, %%mm4\n\t"
                "movq   %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq  $8, %%mm1\n\t"
                "psllq  $16, %%mm2\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psllq  $8, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm5, %%mm3\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24 */
            /* Repack the 8 dword pixels (mm0,mm3 plus mm6,mm7 saved above)
               into 24 contiguous bytes, dropping the filler bytes.
               NOTE(review): depends on MMX registers surviving between the
               two asm statements -- fragile but deliberate here. */
            __asm __volatile(
                "movq   %%mm0, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "movq   %%mm6, %%mm0\n\t"
                "movq   %%mm7, %%mm1\n\t"

                "movq   %%mm4, %%mm6\n\t"
                "movq   %%mm5, %%mm7\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm1, %%mm3\n\t"

                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm3\n\t"
                "psrlq  $8, %%mm6\n\t"
                "psrlq  $8, %%mm7\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm1\n\t"
                "pand   %2, %%mm4\n\t"
                "pand   %2, %%mm5\n\t"
                "pand   %3, %%mm2\n\t"
                "pand   %3, %%mm3\n\t"
                "pand   %3, %%mm6\n\t"
                "pand   %3, %%mm7\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm3, %%mm1\n\t"
                "por    %%mm6, %%mm4\n\t"
                "por    %%mm7, %%mm5\n\t"

                "movq   %%mm1, %%mm2\n\t"
                "movq   %%mm4, %%mm3\n\t"
                "psllq  $48, %%mm2\n\t"
                "psllq  $32, %%mm3\n\t"
                "pand   %4, %%mm2\n\t"
                "pand   %5, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psrlq  $16, %%mm1\n\t"
                "psrlq  $32, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm3, %%mm1\n\t"
                "pand   %6, %%mm5\n\t"
                "por    %%mm5, %%mm4\n\t"

                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm1, 8%0\n\t"
                MOVNTQ" %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
                d += 24;                         /* 8 pixels * 3 bytes */
                s += 8;                          /* 8 pixels * 2 bytes */
        }
        __asm __volatile(SFENCE:::"memory");     /* flush non-temporal stores */
        __asm __volatile(EMMS:::"memory");       /* leave MMX state for FPU code */
#endif
        /* Scalar tail (and the whole conversion when MMX is unavailable). */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;            /* blue:  5 -> 8 bits */
                *d++ = (bgr&0x7E0)>>3;           /* green: 6 -> 8 bits */
                *d++ = (bgr&0xF800)>>8;          /* red:   5 -> 8 bits */
        }
}
1172
/*
 * Convert 15bit BGR (xBBBBBGGGGGRRRRR) to 32bit BGR (B,G,R,0 per pixel).
 * src_size is the SOURCE size in bytes.  The 4th byte of every output
 * pixel is written as 0 (no alpha information in the source).
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;                     /* one past the last source pixel */
#ifdef HAVE_MMX
        const uint16_t *mm_end;                  /* limit for the 4-pixel MMX loop */
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
        /* mm7 = 0, used as the zero source for all punpck*wd below; it must
           stay untouched across the loop's asm statements. */
        __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
        mm_end = end - 3;                        /* leftover pixels go to the scalar tail */
        while(s < mm_end)
        {
            /* 4 pixels/iteration: mask out b/g/r, shift to 8bit positions,
               zero-extend words to dwords and OR into 0RGB dwords; store
               16 output bytes with two (possibly non-temporal) movq's. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   %1, %%mm1\n\t"
                "movq   %1, %%mm2\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %3, %%mm1\n\t"
                "pand   %4, %%mm2\n\t"
                "psllq  $3, %%mm0\n\t"
                "psrlq  $2, %%mm1\n\t"
                "psrlq  $7, %%mm2\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "movq   %%mm1, %%mm4\n\t"
                "movq   %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq  $8, %%mm1\n\t"
                "psllq  $16, %%mm2\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psllq  $8, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm5, %%mm3\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
                :"memory");
                d += 16;                         /* 4 pixels * 4 bytes */
                s += 4;                          /* 4 pixels * 2 bytes */
        }
        __asm __volatile(SFENCE:::"memory");     /* flush non-temporal stores */
        __asm __volatile(EMMS:::"memory");       /* leave MMX state for FPU code */
#endif
        /* Scalar tail (and the whole conversion when MMX is unavailable). */
        while(s < end)
        {
#if 0 //slightly slower on athlon
                int bgr= *s++;
                *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
//FIXME this is very likely wrong for bigendian (and the following converters too)
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;            /* blue:  5 -> 8 bits */
                *d++ = (bgr&0x3E0)>>2;           /* green: 5 -> 8 bits */
                *d++ = (bgr&0x7C00)>>7;          /* red:   5 -> 8 bits */
                *d++ = 0;                        /* filler/alpha byte */
#endif
        }
}
1243
/*
 * Convert 16bit BGR (BBBBBGGGGGGRRRRR) to 32bit BGR (B,G,R,0 per pixel).
 * src_size is the SOURCE size in bytes.  Same structure as rgb15to32,
 * but with 16bit masks and green handled as a 6bit field.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;                     /* one past the last source pixel */
#ifdef HAVE_MMX
        const uint16_t *mm_end;                  /* limit for the 4-pixel MMX loop */
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
        /* mm7 = 0, the zero source for all punpck*wd below; must stay
           untouched across the loop's asm statements. */
        __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
        mm_end = end - 3;                        /* leftover pixels go to the scalar tail */
        while(s < mm_end)
        {
            /* 4 pixels/iteration: mask, shift to 8bit positions ($3/$8 for
               the 6bit green), zero-extend to dwords, OR into 0RGB dwords
               and store 16 bytes. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   %1, %%mm1\n\t"
                "movq   %1, %%mm2\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %3, %%mm1\n\t"
                "pand   %4, %%mm2\n\t"
                "psllq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm1\n\t"
                "psrlq  $8, %%mm2\n\t"
                "movq   %%mm0, %%mm3\n\t"
                "movq   %%mm1, %%mm4\n\t"
                "movq   %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq  $8, %%mm1\n\t"
                "psllq  $16, %%mm2\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm2, %%mm0\n\t"
                "psllq  $8, %%mm4\n\t"
                "psllq  $16, %%mm5\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm5, %%mm3\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
                :"memory");
                d += 16;                         /* 4 pixels * 4 bytes */
                s += 4;                          /* 4 pixels * 2 bytes */
        }
        __asm __volatile(SFENCE:::"memory");     /* flush non-temporal stores */
        __asm __volatile(EMMS:::"memory");       /* leave MMX state for FPU code */
#endif
        /* Scalar tail (and the whole conversion when MMX is unavailable). */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;            /* blue:  5 -> 8 bits */
                *d++ = (bgr&0x7E0)>>3;           /* green: 6 -> 8 bits */
                *d++ = (bgr&0xF800)>>8;          /* red:   5 -> 8 bits */
                *d++ = 0;                        /* filler/alpha byte */
        }
}
1308
/*
 * Swap the R and B channels of packed 32bit pixels (RGB32 <-> BGR32).
 * src_size is in bytes.
 * NOTE(review): the MMX path loops while eax < src_size-7, so it assumes
 * src_size >= 8; a trailing non-multiple-of-8 remainder (one odd pixel)
 * is not converted by it -- presumably callers pass a multiple of 8.
 * NOTE(review): the C fallback copies only 3 of the 4 bytes per pixel;
 * the alpha/filler byte of dst is left unwritten -- confirm callers
 * pre-initialize or ignore it.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
        /* Per 8 bytes (2 pixels): shift R into B's position and B into R's,
           mask each channel (mask32r/g/b), and OR them back together. */
        asm volatile (
                "xorl %%eax, %%eax              \n\t"
                ".balign 16                     \n\t"
                "1:                             \n\t"
                PREFETCH" 32(%0, %%eax)         \n\t"
                "movq (%0, %%eax), %%mm0        \n\t"
                "movq %%mm0, %%mm1              \n\t"
                "movq %%mm0, %%mm2              \n\t"
                "pslld $16, %%mm0               \n\t"
                "psrld $16, %%mm1               \n\t"
                "pand "MANGLE(mask32r)", %%mm0  \n\t"
                "pand "MANGLE(mask32g)", %%mm2  \n\t"
                "pand "MANGLE(mask32b)", %%mm1  \n\t"
                "por %%mm0, %%mm2               \n\t"
                "por %%mm1, %%mm2               \n\t"
                MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
                "addl $8, %%eax                 \n\t"
                "cmpl %2, %%eax                 \n\t"
                " jb 1b                         \n\t"
                :: "r" (src), "r"(dst), "r" (src_size-7)
                : "%eax"
        );

        __asm __volatile(SFENCE:::"memory");    /* flush non-temporal stores */
        __asm __volatile(EMMS:::"memory");      /* leave MMX state for FPU code */
#else
        unsigned i;
        unsigned num_pixels = src_size >> 2;    /* 4 bytes per pixel */
        for(i=0; i<num_pixels; i++)
        {
#ifdef WORDS_BIGENDIAN
          /* byte 0 (alpha/filler) untouched; swap bytes 1 and 3 */
          dst[4*i + 1] = src[4*i + 3];
          dst[4*i + 2] = src[4*i + 2];
          dst[4*i + 3] = src[4*i + 1];
#else
          /* byte 3 (alpha/filler) untouched; swap bytes 0 and 2 */
          dst[4*i + 0] = src[4*i + 2];
          dst[4*i + 1] = src[4*i + 1];
          dst[4*i + 2] = src[4*i + 0];
#endif
        }
#endif
}
1355
/*
 * Swap R and B in packed 24bit pixels (RGB24 <-> BGR24) in place order.
 * src_size is in bytes.
 * The MMX loop counts eax up from the negative value 23-src_size towards
 * zero ("js 1b"), with src/dst pre-biased by -mmx_size, so it converts
 * everything except the final (up to 23) bytes; the scalar loop at the
 * bottom then finishes the remainder.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        unsigned i;
#ifdef HAVE_MMX
        int mmx_size= 23 - src_size;            /* negative loop counter, see above */
        asm volatile (
                "movq "MANGLE(mask24r)", %%mm5  \n\t"
                "movq "MANGLE(mask24g)", %%mm6  \n\t"
                "movq "MANGLE(mask24b)", %%mm7  \n\t"
                ".balign 16                     \n\t"
                "1:                             \n\t"
                PREFETCH" 32(%1, %%eax)         \n\t"
                "movq   (%1, %%eax), %%mm0      \n\t" // BGR BGR BG
                "movq   (%1, %%eax), %%mm1      \n\t" // BGR BGR BG
                "movq  2(%1, %%eax), %%mm2      \n\t" // R BGR BGR B
                "psllq $16, %%mm0               \n\t" // 00 BGR BGR
                "pand %%mm5, %%mm0              \n\t"
                "pand %%mm6, %%mm1              \n\t"
                "pand %%mm7, %%mm2              \n\t"
                "por %%mm0, %%mm1               \n\t"
                "por %%mm2, %%mm1               \n\t"
                "movq  6(%1, %%eax), %%mm0      \n\t" // BGR BGR BG
                MOVNTQ" %%mm1,   (%2, %%eax)    \n\t" // RGB RGB RG
                "movq  8(%1, %%eax), %%mm1      \n\t" // R BGR BGR B
                "movq 10(%1, %%eax), %%mm2      \n\t" // GR BGR BGR
                "pand %%mm7, %%mm0              \n\t"
                "pand %%mm5, %%mm1              \n\t"
                "pand %%mm6, %%mm2              \n\t"
                "por %%mm0, %%mm1               \n\t"
                "por %%mm2, %%mm1               \n\t"
                "movq 14(%1, %%eax), %%mm0      \n\t" // R BGR BGR B
                MOVNTQ" %%mm1,  8(%2, %%eax)    \n\t" // B RGB RGB R
                "movq 16(%1, %%eax), %%mm1      \n\t" // GR BGR BGR
                "movq 18(%1, %%eax), %%mm2      \n\t" // BGR BGR BG
                "pand %%mm6, %%mm0              \n\t"
                "pand %%mm7, %%mm1              \n\t"
                "pand %%mm5, %%mm2              \n\t"
                "por %%mm0, %%mm1               \n\t"
                "por %%mm2, %%mm1               \n\t"
                MOVNTQ" %%mm1, 16(%2, %%eax)    \n\t"
                "addl $24, %%eax                \n\t"
                " js 1b                         \n\t"
                : "+a" (mmx_size)
                : "r" (src-mmx_size), "r"(dst-mmx_size)
        );

        __asm __volatile(SFENCE:::"memory");    /* flush non-temporal stores */
        __asm __volatile(EMMS:::"memory");      /* leave MMX state for FPU code */

        if(mmx_size==23) return; //finished, was multiple of 8

        /* Rewind src/dst/src_size to cover only the unconverted tail. */
        src+= src_size;
        dst+= src_size;
        src_size= 23-mmx_size;
        src-= src_size;
        dst-= src_size;
#endif
        /* Scalar remainder (and the whole conversion when MMX is
           unavailable): swap bytes 0 and 2 of every 3-byte pixel. */
        for(i=0; i<src_size; i+=3)
        {
                register uint8_t x;
                x          = src[i + 2];
                dst[i + 1] = src[i + 1];
                dst[i + 2] = src[i + 0];
                dst[i + 0] = x;
        }
}
1422
/*
 * Interleave planar Y, U, V into packed YUY2 (Y0 U0 Y1 V0 ...).
 * width/height are luma-plane dimensions; chroma planes are horizontally
 * subsampled by 2 (chromWidth = width/2).  vertLumPerChroma is the number
 * of luma lines sharing one chroma line (2 for YV12, 1 for 4:2:2 planar)
 * and must be a power of two, since the code tests y & (vertLumPerChroma-1).
 * All strides are in bytes.
 * NOTE(review): the MMX path processes 16 luma samples per iteration, so
 * width is presumably expected to be a multiple of 16 (see yv12toyuy2's
 * doc comment) -- confirm against callers.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
                /* Build UVUV words from the U and V planes, then punpck
                   them against 16 luma bytes to emit 32 YUYV bytes per
                   iteration. */
                asm volatile(
                        "xorl %%eax, %%eax              \n\t"
                        ".balign 16                     \n\t"
                        "1:                             \n\t"
                        PREFETCH" 32(%1, %%eax, 2)      \n\t"
                        PREFETCH" 32(%2, %%eax)         \n\t"
                        PREFETCH" 32(%3, %%eax)         \n\t"
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2              \n\t" // U(0)
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)

                        "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
                        "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
                        "movq %%mm3, %%mm4              \n\t" // Y(0)
                        "movq %%mm5, %%mm6              \n\t" // Y(8)
                        "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
                        "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
                        "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
                        "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)

                        MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
                        MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

                        "addl $8, %%eax                 \n\t"
                        "cmpl %4, %%eax                 \n\t"
                        " jb 1b                         \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
                        : "%eax"
                );
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
        /* Alpha/MVI path: emits TWO output lines per loop iteration
           (qdst and qdst2), hence the extra y++/ysrc/dst advance below.
           unpkbw/unpkbl spread bytes to their packed positions. */
#define pl2yuy2(n)                                      \
        y1 = yc[n];                                     \
        y2 = yc2[n];                                    \
        u = uc[n];                                      \
        v = vc[n];                                      \
        asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
        asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
        asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
        asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
        yuv1 = (u << 8) + (v << 24);                    \
        yuv2 = yuv1 + y2;                               \
        yuv1 += y1;                                     \
        qdst[n] = yuv1;                                 \
        qdst2[n] = yuv2;

                int i;
                uint64_t *qdst = (uint64_t *) dst;
                uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
                const uint32_t *yc = (uint32_t *) ysrc;
                const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
                const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
                for(i = 0; i < chromWidth; i += 8){
                        uint64_t y1, y2, yuv1, yuv2;
                        uint64_t u, v;
                        /* Prefetch */
                        asm("ldq $31,64(%0)" :: "r"(yc));
                        asm("ldq $31,64(%0)" :: "r"(yc2));
                        asm("ldq $31,64(%0)" :: "r"(uc));
                        asm("ldq $31,64(%0)" :: "r"(vc));

                        pl2yuy2(0);
                        pl2yuy2(1);
                        pl2yuy2(2);
                        pl2yuy2(3);

                        yc += 4;
                        yc2 += 4;
                        uc += 4;
                        vc += 4;
                        qdst += 4;
                        qdst2 += 4;
                }
                /* second line of the pair was emitted above -- skip it */
                y++;
                ysrc += lumStride;
                dst += dstStride;

#elif __WORDSIZE >= 64
                /* 64bit scalar path: pack two YUYV pixel pairs per store. */
                int i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        uint64_t k, l;
                        k = yc[0] + (uc[0] << 8) +
                            (yc[1] << 16) + (vc[0] << 24);
                        l = yc[2] + (uc[1] << 8) +
                            (yc[3] << 16) + (vc[1] << 24);
                        *ldst++ = k + (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                /* Generic 32bit scalar path: one YUYV group per store. */
                int i, *idst = (int32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
                        *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                            (yc[1] << 8) + (vc[0] << 0);
#else
                        *idst++ = yc[0] + (uc[0] << 8) +
                            (yc[1] << 16) + (vc[0] << 24);
#endif
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                /* Advance chroma only every vertLumPerChroma luma lines. */
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1562
/**
 * Converts planar YV12 into packed YUY2.
 * height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 */
1568 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1569         unsigned int width, unsigned int height,
1570         int lumStride, int chromStride, int dstStride)
1571 {
1572         //FIXME interpolate chroma
1573         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1574 }
1575
1576 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1577         unsigned int width, unsigned int height,
1578         int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1579 {
1580         unsigned y;
1581         const unsigned chromWidth= width>>1;
1582         for(y=0; y<height; y++)
1583         {
1584 #ifdef HAVE_MMX
1585 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1586                 asm volatile(
1587                         "xorl %%eax, %%eax              \n\t"
1588                         ".balign 16                     \n\t"
1589                         "1:                             \n\t"
1590                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
1591                         PREFETCH" 32(%2, %%eax)         \n\t"
1592                         PREFETCH" 32(%3, %%eax)         \n\t"
1593                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
1594                         "movq %%mm0, %%mm2              \n\t" // U(0)
1595                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
1596                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1597                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1598
1599                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
1600                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
1601                         "movq %%mm0, %%mm4              \n\t" // Y(0)
1602                         "movq %%mm2, %%mm6              \n\t" // Y(8)
1603                         "punpcklbw %%mm3, %%mm0         \n\t" // YUYV YUYV(0)
1604                         "punpckhbw %%mm3, %%mm4         \n\t" // YUYV YUYV(4)
1605                         "punpcklbw %%mm5, %%mm2         \n\t" // YUYV YUYV(8)
1606                         "punpckhbw %%mm5, %%mm6         \n\t" // YUYV YUYV(12)
1607
1608                         MOVNTQ" %%mm0, (%0, %%eax, 4)   \n\t"
1609                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
1610                         MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
1611                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1612
1613                         "addl $8, %%eax                 \n\t"
1614                         "cmpl %4, %%eax                 \n\t"
1615                         " jb 1b                         \n\t"
1616                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1617                         : "%eax"
1618                 );
1619 #else
1620 //FIXME adapt the alpha asm code from yv12->yuy2
1621
1622 #if __WORDSIZE >= 64
1623                 int i;
1624                 uint64_t *ldst = (uint64_t *) dst;
1625                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1626                 for(i = 0; i < chromWidth; i += 2){
1627                         uint64_t k, l;
1628                         k = uc[0] + (yc[0] << 8) +
1629                             (vc[0] << 16) + (yc[1] << 24);
1630                         l = uc[1] + (yc[2] << 8) +
1631                             (vc[1] << 16) + (yc[3] << 24);
1632                         *ldst++ = k + (l << 32);
1633                         yc += 4;
1634                         uc += 2;
1635                         vc += 2;
1636                 }
1637
1638 #else
1639                 int i, *idst = (int32_t *) dst;
1640                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1641                 for(i = 0; i < chromWidth; i++){
1642 #ifdef WORDS_BIGENDIAN
1643                         *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1644                             (vc[0] << 8) + (yc[1] << 0);
1645 #else
1646                         *idst++ = uc[0] + (yc[0] << 8) +
1647                             (vc[0] << 16) + (yc[1] << 24);
1648 #endif
1649                         yc += 2;
1650                         uc++;
1651                         vc++;
1652                 }
1653 #endif
1654 #endif
1655                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1656                 {
1657                         usrc += chromStride;
1658                         vsrc += chromStride;
1659                 }
1660                 ysrc += lumStride;
1661                 dst += dstStride;
1662         }
1663 #ifdef HAVE_MMX
1664 asm(    EMMS" \n\t"
1665         SFENCE" \n\t"
1666         :::"memory");
1667 #endif
1668 }
1669
/**
 * Convert planar YV12 (4:2:0) to packed UYVY.
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int dstStride)
{
	//FIXME interpolate chroma
	/* 4:2:0 input: one chroma line per two luma lines */
	RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1682
/**
 * Convert planar 4:2:2 YUV to packed YUY2.
 *
 * width should be a multiple of 16
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int dstStride)
{
	/* 4:2:2 input: every luma line has its own chroma line */
	RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1693
/**
 * Convert packed YUY2 (Y U Y V byte order) to planar YV12.
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * Chroma is taken from the even lines only; chroma samples on the odd
 * lines are discarded.
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int srcStride)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	/* two source lines per iteration: the first yields luma + chroma,
	 * the second luma only */
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		asm volatile(
			"xorl %%eax, %%eax              \n\t"
			"pcmpeqw %%mm7, %%mm7           \n\t"
			"psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
			".balign 16                     \n\t"
			"1:                             \n\t"
			PREFETCH" 64(%0, %%eax, 4)      \n\t"
			"movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
			"movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
			"movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
			"psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
			"psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
			"pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"

			"movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
			"movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
			"movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
			"psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
			"psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
			"pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"

			"movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
			MOVNTQ" %%mm2, (%2, %%eax)      \n\t"

			"addl $8, %%eax                 \n\t"
			"cmpl %4, %%eax                 \n\t"
			" jb 1b                         \n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
			: "memory", "%eax"
		);

		ydst += lumStride;
		src  += srcStride;

		/* second (odd) line: extract luma only.
		 * NOTE(review): this block reuses %%mm7 (the 00FF word mask)
		 * set up by the asm statement above without re-initializing
		 * it; it relies on nothing clobbering the MMX registers in
		 * between -- keep the two statements adjacent. */
		asm volatile(
			"xorl %%eax, %%eax              \n\t"
			".balign 16                     \n\t"
			"1:                             \n\t"
			PREFETCH" 64(%0, %%eax, 4)      \n\t"
			"movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
			"movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
			"pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
			"pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"

			"addl $8, %%eax                 \n\t"
			"cmpl %4, %%eax                 \n\t"
			" jb 1b                         \n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
			: "memory", "%eax"
		);
#else
		unsigned i;
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0]     = src[4*i+0];
			udst[i]         = src[4*i+1];
			ydst[2*i+1]     = src[4*i+2];
			vdst[i]         = src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		/* second (odd) line: luma only */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0]     = src[4*i+0];
			ydst[2*i+1]     = src[4*i+2];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
	/* leave MMX state clean and flush the write-combining buffers */
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
1818
1819 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1820         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1821         unsigned int width, unsigned int height, int lumStride, int chromStride)
1822 {
1823         /* Y Plane */
1824         memcpy(ydst, ysrc, width*height);
1825
1826         /* XXX: implement upscaling for U,V */
1827 }
1828
/**
 * Upscale one plane to twice the width and twice the height.
 * Interior output samples are interpolated with 3/4-1/4 weights from
 * the two nearest source samples; border samples are replicated.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
	int x,y;
	
	dst[0]= src[0];
	
	// first line
	for(x=0; x<srcWidth-1; x++){
		dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
		dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
	}
	dst[2*srcWidth-1]= src[srcWidth-1];
	
	dst+= dstStride;

	for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		const int mmxSize= srcWidth&~15;
		/* NOTE(review): the loop counter starts at -mmxSize, so the
		 * -1(...) loads read one byte before each row on the first
		 * iteration.  The affected output columns are overwritten by
		 * the C border code below, but the read itself is out of
		 * bounds -- confirm the buffers are padded. */
		asm volatile(
			"movl %4, %%eax                 \n\t"
			"1:                             \n\t"
			"movq (%0, %%eax), %%mm0        \n\t"
			"movq (%1, %%eax), %%mm1        \n\t"
			"movq 1(%0, %%eax), %%mm2       \n\t"
			"movq 1(%1, %%eax), %%mm3       \n\t"
			"movq -1(%0, %%eax), %%mm4      \n\t"
			"movq -1(%1, %%eax), %%mm5      \n\t"
			/* two chained PAVGBs against the same register
			 * approximate the 3:1 weighting of the scalar code
			 * (rounding presumably differs slightly -- confirm if
			 * bit-exactness matters) */
			PAVGB" %%mm0, %%mm5             \n\t"
			PAVGB" %%mm0, %%mm3             \n\t"
			PAVGB" %%mm0, %%mm5             \n\t"
			PAVGB" %%mm0, %%mm3             \n\t"
			PAVGB" %%mm1, %%mm4             \n\t"
			PAVGB" %%mm1, %%mm2             \n\t"
			PAVGB" %%mm1, %%mm4             \n\t"
			PAVGB" %%mm1, %%mm2             \n\t"
			"movq %%mm5, %%mm7              \n\t"
			"movq %%mm4, %%mm6              \n\t"
			"punpcklbw %%mm3, %%mm5         \n\t"
			"punpckhbw %%mm3, %%mm7         \n\t"
			"punpcklbw %%mm2, %%mm4         \n\t"
			"punpckhbw %%mm2, %%mm6         \n\t"
#if 1
			MOVNTQ" %%mm5, (%2, %%eax, 2)   \n\t"
			MOVNTQ" %%mm7, 8(%2, %%eax, 2)  \n\t"
			MOVNTQ" %%mm4, (%3, %%eax, 2)   \n\t"
			MOVNTQ" %%mm6, 8(%3, %%eax, 2)  \n\t"
#else
			"movq %%mm5, (%2, %%eax, 2)     \n\t"
			"movq %%mm7, 8(%2, %%eax, 2)    \n\t"
			"movq %%mm4, (%3, %%eax, 2)     \n\t"
			"movq %%mm6, 8(%3, %%eax, 2)    \n\t"
#endif
			"addl $8, %%eax                 \n\t"
			" js 1b                         \n\t"
			:: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
			   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
			   "g" (-mmxSize)
			: "%eax"

		);
#else
		const int mmxSize=1;
#endif
		/* left border: vertical interpolation only */
		dst[0        ]= (3*src[0] +   src[srcStride])>>2;
		dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

		/* scalar tail (whole row when no MMX2/3DNow) */
		for(x=mmxSize-1; x<srcWidth-1; x++){
			dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
			dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
			dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
			dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
		}
		/* right border */
		dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
		dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

		dst+=dstStride*2;
		src+=srcStride;
	}
	
	// last line
#if 1
	dst[0]= src[0];
	
	for(x=0; x<srcWidth-1; x++){
		dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
		dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
	}
	dst[2*srcWidth-1]= src[srcWidth-1];
#else
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
#endif

#ifdef HAVE_MMX
	/* leave MMX state clean and flush the write-combining buffers */
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
1930
/**
 * Convert packed UYVY (U Y V Y byte order) to planar YV12.
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * Chrominance data is only taken from every second line; the others are
 * ignored. FIXME write HQ version
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int srcStride)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	/* two source lines per iteration: the first yields luma + chroma,
	 * the second luma only */
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		asm volatile(
			"xorl %%eax, %%eax              \n\t"
			"pcmpeqw %%mm7, %%mm7           \n\t"
			"psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
			".balign 16                     \n\t"
			"1:                             \n\t"
			PREFETCH" 64(%0, %%eax, 4)      \n\t"
			"movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
			"movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
			"movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
			"movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
			"pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
			"pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
			"psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"

			"movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
			"movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
			"movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
			"movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
			"pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
			"pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
			"psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"

			"movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
			MOVNTQ" %%mm2, (%2, %%eax)      \n\t"

			"addl $8, %%eax                 \n\t"
			"cmpl %4, %%eax                 \n\t"
			" jb 1b                         \n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
			: "memory", "%eax"
		);

		ydst += lumStride;
		src  += srcStride;

		/* second (odd) line: extract luma only (psrlw $8 picks the Y
		 * bytes out of each UYVY word; %%mm7 is not needed here) */
		asm volatile(
			"xorl %%eax, %%eax              \n\t"
			".balign 16                     \n\t"
			"1:                             \n\t"
			PREFETCH" 64(%0, %%eax, 4)      \n\t"
			"movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
			"movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
			"movq 16(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(8)
			"movq 24(%0, %%eax, 4), %%mm3   \n\t" // UYVY UYVY(12)
			"psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
			"psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"

			"addl $8, %%eax                 \n\t"
			"cmpl %4, %%eax                 \n\t"
			" jb 1b                         \n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
			: "memory", "%eax"
		);
#else
		unsigned i;
		for(i=0; i<chromWidth; i++)
		{
			udst[i]         = src[4*i+0];
			ydst[2*i+0]     = src[4*i+1];
			vdst[i]         = src[4*i+2];
			ydst[2*i+1]     = src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		/* second (odd) line: luma only */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0]     = src[4*i+1];
			ydst[2*i+1]     = src[4*i+3];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
	/* leave MMX state clean and flush the write-combining buffers */
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
2056
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * Chrominance data is only taken from every second line; the others are ignored in the C version. FIXME write HQ version
 */
2063 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2064         unsigned int width, unsigned int height,
2065         int lumStride, int chromStride, int srcStride)
2066 {
2067         unsigned y;
2068         const unsigned chromWidth= width>>1;
2069 #ifdef HAVE_MMX
2070         for(y=0; y<height-2; y+=2)
2071         {
2072                 unsigned i;
2073                 for(i=0; i<2; i++)
2074                 {
2075                         asm volatile(
2076                                 "movl %2, %%eax                 \n\t"
2077                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
2078                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
2079                                 "pxor %%mm7, %%mm7              \n\t"
2080                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
2081                                 ".balign 16                     \n\t"
2082                                 "1:                             \n\t"
2083                                 PREFETCH" 64(%0, %%ebx)         \n\t"
2084                                 "movd (%0, %%ebx), %%mm0        \n\t"
2085                                 "movd 3(%0, %%ebx), %%mm1       \n\t"
2086                                 "punpcklbw %%mm7, %%mm0         \n\t"
2087                                 "punpcklbw %%mm7, %%mm1         \n\t"
2088                                 "movd 6(%0, %%ebx), %%mm2       \n\t"
2089                                 "movd 9(%0, %%ebx), %%mm3       \n\t"
2090                                 "punpcklbw %%mm7, %%mm2         \n\t"
2091                                 "punpcklbw %%mm7, %%mm3         \n\t"
2092                                 "pmaddwd %%mm6, %%mm0           \n\t"
2093                                 "pmaddwd %%mm6, %%mm1           \n\t"
2094                                 "pmaddwd %%mm6, %%mm2           \n\t"
2095                                 "pmaddwd %%mm6, %%mm3           \n\t"
2096 #ifndef FAST_BGR2YV12
2097                                 "psrad $8, %%mm0                \n\t"
2098                                 "psrad $8, %%mm1                \n\t"
2099                                 "psrad $8, %%mm2                \n\t"
2100                                 "psrad $8, %%mm3                \n\t"
2101 #endif
2102                                 "packssdw %%mm1, %%mm0          \n\t"
2103                                 "packssdw %%mm3, %%mm2          \n\t"
2104                                 "pmaddwd %%mm5, %%mm0           \n\t"
2105                                 "pmaddwd %%mm5, %%mm2           \n\t"
2106                                 "packssdw %%mm2, %%mm0          \n\t"
2107                                 "psraw $7, %%mm0                \n\t"
2108
2109                                 "movd 12(%0, %%ebx), %%mm4      \n\t"
2110                                 "movd 15(%0, %%ebx), %%mm1      \n\t"
2111                                 "punpcklbw %%mm7, %%mm4         \n\t"
2112                                 "punpcklbw %%mm7, %%mm1         \n\t"
2113                                 "movd 18(%0, %%ebx), %%mm2      \n\t"
2114                                 "movd 21(%0, %%ebx), %%mm3      \n\t"
2115                                 "punpcklbw %%mm7, %%mm2         \n\t"
2116                                 "punpcklbw %%mm7, %%mm3         \n\t"
2117                                 "pmaddwd %%mm6, %%mm4           \n\t"
2118                                 "pmaddwd %%mm6, %%mm1           \n\t"
2119                                 "pmaddwd %%mm6, %%mm2           \n\t"
2120                                 "pmaddwd %%mm6, %%mm3           \n\t"
2121 #ifndef FAST_BGR2YV12
2122                                 "psrad $8, %%mm4                \n\t"
2123                                 "psrad $8, %%mm1                \n\t"
2124                                 "psrad $8, %%mm2                \n\t"
2125                                 "psrad $8, %%mm3                \n\t"
2126 #endif
2127                                 "packssdw %%mm1, %%mm4          \n\t"
2128                                 "packssdw %%mm3, %%mm2          \n\t"
2129                                 "pmaddwd %%mm5, %%mm4           \n\t"
2130                                 "pmaddwd %%mm5, %%mm2           \n\t"
2131                                 "addl $24, %%ebx                \n\t"
2132                                 "packssdw %%mm2, %%mm4          \n\t"
2133                                 "psraw $7, %%mm4                \n\t"
2134
2135                                 "packuswb %%mm4, %%mm0          \n\t"
2136                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
2137
2138                                 MOVNTQ" %%mm0, (%1, %%eax)      \n\t"
2139                                 "addl $8, %%eax                 \n\t"
2140                                 " js 1b                         \n\t"
2141                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2142                                 : "%eax", "%ebx"
2143                         );
2144                         ydst += lumStride;
2145                         src  += srcStride;
2146                 }
2147                 src -= srcStride*2;
2148                 asm volatile(
2149                         "movl %4, %%eax                 \n\t"
2150                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2151                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
2152                         "pxor %%mm7, %%mm7              \n\t"
2153                         "leal (%%eax, %%eax, 2), %%ebx  \n\t"
2154                         "addl %%ebx, %%ebx              \n\t"
2155                         ".balign 16                     \n\t"
2156                         "1:                             \n\t"
2157                         PREFETCH" 64(%0, %%ebx)         \n\t"
2158                         PREFETCH" 64(%1, %%ebx)         \n\t"
2159 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2160                         "movq (%0, %%ebx), %%mm0        \n\t"
2161                         "movq (%1, %%ebx), %%mm1        \n\t"
2162                         "movq 6(%0, %%ebx), %%mm2       \n\t"
2163                         "movq 6(%1, %%ebx), %%mm3       \n\t"
2164                         PAVGB" %%mm1, %%mm0             \n\t"
2165                         PAVGB" %%mm3, %%mm2             \n\t"
2166                         "movq %%mm0, %%mm1              \n\t"
2167                         "movq %%mm2, %%mm3              \n\t"
2168                         "psrlq $24, %%mm0               \n\t"
2169                         "psrlq $24, %%mm2               \n\t"
2170                         PAVGB" %%mm1, %%mm0             \n\t"
2171                         PAVGB" %%mm3, %%mm2             \n\t"
2172                         "punpcklbw %%mm7, %%mm0         \n\t"
2173                         "punpcklbw %%mm7, %%mm2         \n\t"
2174 #else
2175                         "movd (%0, %%ebx), %%mm0        \n\t"
2176                         "movd (%1, %%ebx), %%mm1        \n\t"
2177                         "movd 3(%0, %%ebx), %%mm2       \n\t"
2178                         "movd 3(%1, %%ebx), %%mm3       \n\t"
2179                         "punpcklbw %%mm7, %%mm0         \n\t"
2180                         "punpcklbw %%mm7, %%mm1         \n\t"
2181                         "punpcklbw %%mm7, %%mm2         \n\t"
2182                         "punpcklbw %%mm7, %%mm3         \n\t"
2183                         "paddw %%mm1, %%mm0             \n\t"
2184                         "paddw %%mm3, %%mm2             \n\t"
2185                         "paddw %%mm2, %%mm0             \n\t"
2186                         "movd 6(%0, %%ebx), %%mm4       \n\t"
2187                         "movd 6(%1, %%ebx), %%mm1       \n\t"
2188                         "movd 9(%0, %%ebx), %%mm2       \n\t"
2189                         "movd 9(%1, %%ebx), %%mm3       \n\t"
2190                         "punpcklbw %%mm7, %%mm4         \n\t"
2191                         "punpcklbw %%mm7, %%mm1         \n\t"
2192                         "punpcklbw %%mm7, %%mm2         \n\t"
2193                         "punpcklbw %%mm7, %%mm3         \n\t"
2194                         "paddw %%mm1, %%mm4             \n\t"
2195                         "paddw %%mm3, %%mm2             \n\t"
2196                         "paddw %%mm4, %%mm2             \n\t"
2197                         "psrlw $2, %%mm0                \n\t"
2198                         "psrlw $2, %%mm2                \n\t"
2199 #endif
2200                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2201                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2202
2203                         "pmaddwd %%mm0, %%mm1           \n\t"
2204                         "pmaddwd %%mm2, %%mm3           \n\t"
2205                         "pmaddwd %%mm6, %%mm0           \n\t"
2206                         "pmaddwd %%mm6, %%mm2           \n\t"
2207 #ifndef FAST_BGR2YV12
2208                         "psrad $8, %%mm0                \n\t"
2209                         "psrad $8, %%mm1                \n\t"
2210                         "psrad $8, %%mm2                \n\t"
2211                         "psrad $8, %%mm3                \n\t"
2212 #endif
2213                         "packssdw %%mm2, %%mm0          \n\t"
2214                         "packssdw %%mm3, %%mm1          \n\t"
2215                         "pmaddwd %%mm5, %%mm0           \n\t"
2216                         "pmaddwd %%mm5, %%mm1           \n\t"
2217                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2218                         "psraw $7, %%mm0                \n\t"
2219
2220 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2221                         "movq 12(%0, %%ebx), %%mm4      \n\t"
2222                         "movq 12(%1, %%ebx), %%mm1      \n\t"
2223                         "movq 18(%0, %%ebx), %%mm2      \n\t"
2224                         "movq 18(%1, %%ebx), %%mm3      \n\t"
2225                         PAVGB" %%mm1, %%mm4             \n\t"
2226                         PAVGB" %%mm3, %%mm2             \n\t"
2227                         "movq %%mm4, %%mm1              \n\t"
2228                         "movq %%mm2, %%mm3              \n\t"
2229                         "psrlq $24, %%mm4               \n\t"
2230                         "psrlq $24, %%mm2               \n\t"
2231                         PAVGB" %%mm1, %%mm4             \n\t"
2232                         PAVGB" %%mm3, %%mm2             \n\t"
2233                         "punpcklbw %%mm7, %%mm4         \n\t"
2234                         "punpcklbw %%mm7, %%mm2         \n\t"
2235 #else
2236                         "movd 12(%0, %%ebx), %%mm4      \n\t"
2237                         "movd 12(%1, %%ebx), %%mm1      \n\t"
2238                         "movd 15(%0, %%ebx), %%mm2      \n\t"
2239                         "movd 15(%1, %%ebx), %%mm3      \n\t"
2240                         "punpcklbw %%mm7, %%mm4         \n\t"
2241                         "punpcklbw %%mm7, %%mm1         \n\t"
2242                         "punpcklbw %%mm7, %%mm2         \n\t"
2243                         "punpcklbw %%mm7, %%mm3         \n\t"
2244                         "paddw %%mm1, %%mm4             \n\t"
2245                         "paddw %%mm3, %%mm2             \n\t"
2246                         "paddw %%mm2, %%mm4             \n\t"
2247                         "movd 18(%0, %%ebx), %%mm5      \n\t"
2248                         "movd 18(%1, %%ebx), %%mm1      \n\t"
2249                         "movd 21(%0, %%ebx), %%mm2      \n\t"
2250                         "movd 21(%1, %%ebx), %%mm3      \n\t"
2251                         "punpcklbw %%mm7, %%mm5         \n\t"
2252                         "punpcklbw %%mm7, %%mm1         \n\t"
2253                         "punpcklbw %%mm7, %%mm2         \n\t"
2254                         "punpcklbw %%mm7, %%mm3         \n\t"
2255                         "paddw %%mm1, %%mm5             \n\t"
2256                         "paddw %%mm3, %%mm2             \n\t"
2257                         "paddw %%mm5, %%mm2             \n\t"
2258                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2259                         "psrlw $2, %%mm4                \n\t"
2260                         "psrlw $2, %%mm2                \n\t"
2261 #endif
2262                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2263                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2264
2265                         "pmaddwd %%mm4, %%mm1           \n\t"
2266                         "pmaddwd %%mm2, %%mm3           \n\t"
2267                         "pmaddwd %%mm6, %%mm4           \n\t"
2268                         "pmaddwd %%mm6, %%mm2           \n\t"
2269 #ifndef FAST_BGR2YV12
2270                         "psrad $8, %%mm4                \n\t"
2271                         "psrad $8, %%mm1                \n\t"
2272                         "psrad $8, %%mm2                \n\t"
2273                         "psrad $8, %%mm3                \n\t"
2274 #endif
2275                         "packssdw %%mm2, %%mm4          \n\t"
2276                         "packssdw %%mm3, %%mm1          \n\t"
2277                         "pmaddwd %%mm5, %%mm4           \n\t"
2278                         "pmaddwd %%mm5, %%mm1           \n\t"
2279                         "addl $24, %%ebx                \n\t"
2280                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2281                         "psraw $7, %%mm4                \n\t"
2282
2283                         "movq %%mm0, %%mm1              \n\t"
2284                         "punpckldq %%mm4, %%mm0         \n\t"
2285                         "punpckhdq %%mm4, %%mm1         \n\t"
2286                         "packsswb %%mm1, %%mm0          \n\t"
2287                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2288
2289                         "movd %%mm0, (%2, %%eax)        \n\t"
2290                         "punpckhdq %%mm0, %%mm0         \n\t"
2291                         "movd %%mm0, (%3, %%eax)        \n\t"
2292                         "addl $4, %%eax                 \n\t"
2293                         " js 1b                         \n\t"
2294                         : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2295                         : "%eax", "%ebx"
2296                 );
2297
2298                 udst += chromStride;
2299                 vdst += chromStride;
2300                 src  += srcStride*2;
2301         }
2302
2303         asm volatile(   EMMS" \n\t"
2304                         SFENCE" \n\t"
2305                         :::"memory");
2306 #else
2307         y=0;
2308 #endif
2309         for(; y<height; y+=2)
2310         {
2311                 unsigned i;
2312                 for(i=0; i<chromWidth; i++)
2313                 {
2314                         unsigned int b= src[6*i+0];
2315                         unsigned int g= src[6*i+1];
2316                         unsigned int r= src[6*i+2];
2317
2318                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2319                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2320                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2321
2322                         udst[i]         = U;
2323                         vdst[i]         = V;
2324                         ydst[2*i]       = Y;
2325
2326                         b= src[6*i+3];
2327                         g= src[6*i+4];
2328                         r= src[6*i+5];
2329
2330                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2331                         ydst[2*i+1]     = Y;
2332                 }
2333                 ydst += lumStride;
2334                 src  += srcStride;
2335
2336                 for(i=0; i<chromWidth; i++)
2337                 {
2338                         unsigned int b= src[6*i+0];
2339                         unsigned int g= src[6*i+1];
2340                         unsigned int r= src[6*i+2];
2341
2342                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2343
2344                         ydst[2*i]       = Y;
2345
2346                         b= src[6*i+3];
2347                         g= src[6*i+4];
2348                         r= src[6*i+5];
2349
2350                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2351                         ydst[2*i+1]     = Y;
2352                 }
2353                 udst += chromStride;
2354                 vdst += chromStride;
2355                 ydst += lumStride;
2356                 src  += srcStride;
2357         }
2358 }
2359
2360 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2361                             unsigned width, unsigned height, int src1Stride,
2362                             int src2Stride, int dstStride){
2363         unsigned h;
2364
2365         for(h=0; h < height; h++)
2366         {
2367                 unsigned w;
2368
2369 #ifdef HAVE_MMX
2370 #ifdef HAVE_SSE2
2371                 asm(
2372                         "xorl %%eax, %%eax              \n\t"
2373                         "1:                             \n\t"
2374                         PREFETCH" 64(%1, %%eax)         \n\t"
2375                         PREFETCH" 64(%2, %%eax)         \n\t"
2376                         "movdqa (%1, %%eax), %%xmm0     \n\t"
2377                         "movdqa (%1, %%eax), %%xmm1     \n\t"
2378                         "movdqa (%2, %%eax), %%xmm2     \n\t"
2379                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2380                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2381                         "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2382                         "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2383                         "addl $16, %%eax                        \n\t"
2384                         "cmpl %3, %%eax                 \n\t"
2385                         " jb 1b                         \n\t"
2386                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2387                         : "memory", "%eax"
2388                 );
2389 #else
2390                 asm(
2391                         "xorl %%eax, %%eax              \n\t"
2392                         "1:                             \n\t"
2393                         PREFETCH" 64(%1, %%eax)         \n\t"
2394                         PREFETCH" 64(%2, %%eax)         \n\t"
2395                         "movq (%1, %%eax), %%mm0        \n\t"
2396                         "movq 8(%1, %%eax), %%mm2       \n\t"
2397                         "movq %%mm0, %%mm1              \n\t"
2398                         "movq %%mm2, %%mm3              \n\t"
2399                         "movq (%2, %%eax), %%mm4        \n\t"
2400                         "movq 8(%2, %%eax), %%mm5       \n\t"
2401                         "punpcklbw %%mm4, %%mm0         \n\t"
2402                         "punpckhbw %%mm4, %%mm1         \n\t"
2403                         "punpcklbw %%mm5, %%mm2         \n\t"
2404                         "punpckhbw %%mm5, %%mm3         \n\t"
2405                         MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
2406                         MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
2407                         MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2408                         MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2409                         "addl $16, %%eax                        \n\t"
2410                         "cmpl %3, %%eax                 \n\t"
2411                         " jb 1b                         \n\t"
2412                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2413                         : "memory", "%eax"
2414                 );
2415 #endif
2416                 for(w= (width&(~15)); w < width; w++)
2417                 {
2418                         dest[2*w+0] = src1[w];
2419                         dest[2*w+1] = src2[w];
2420                 }
2421 #else
2422                 for(w=0; w < width; w++)
2423                 {
2424                         dest[2*w+0] = src1[w];
2425                         dest[2*w+1] = src2[w];
2426                 }
2427 #endif
2428                 dest += dstStride;
2429                 src1 += src1Stride;
2430                 src2 += src2Stride;
2431         }
2432 #ifdef HAVE_MMX
2433         asm(
2434                 EMMS" \n\t"
2435                 SFENCE" \n\t"
2436                 ::: "memory"
2437                 );
2438 #endif
2439 }
2440
2441 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2442                         uint8_t *dst1, uint8_t *dst2,
2443                         unsigned width, unsigned height,
2444                         int srcStride1, int srcStride2,
2445                         int dstStride1, int dstStride2)
2446 {
2447     unsigned int y,x,h;
2448     int w;
2449     w=width/2; h=height/2;
2450 #ifdef HAVE_MMX
2451     asm volatile(
2452         PREFETCH" %0\n\t"
2453         PREFETCH" %1\n\t"
2454         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2455 #endif
2456     for(y=0;y<h;y++){
2457         const uint8_t* s1=src1+srcStride1*(y>>1);
2458         uint8_t* d=dst1+dstStride1*y;
2459         x=0;
2460 #ifdef HAVE_MMX
2461         for(;x<w-31;x+=32)
2462         {
2463             asm volatile(
2464                 PREFETCH" 32%1\n\t"
2465                 "movq   %1, %%mm0\n\t"
2466                 "movq   8%1, %%mm2\n\t"
2467                 "movq   16%1, %%mm4\n\t"
2468                 "movq   24%1, %%mm6\n\t"
2469                 "movq   %%mm0, %%mm1\n\t"
2470                 "movq   %%mm2, %%mm3\n\t"
2471                 "movq   %%mm4, %%mm5\n\t"
2472                 "movq   %%mm6, %%mm7\n\t"
2473                 "punpcklbw %%mm0, %%mm0\n\t"
2474                 "punpckhbw %%mm1, %%mm1\n\t"
2475                 "punpcklbw %%mm2, %%mm2\n\t"
2476                 "punpckhbw %%mm3, %%mm3\n\t"
2477                 "punpcklbw %%mm4, %%mm4\n\t"
2478                 "punpckhbw %%mm5, %%mm5\n\t"
2479                 "punpcklbw %%mm6, %%mm6\n\t"
2480                 "punpckhbw %%mm7, %%mm7\n\t"
2481                 MOVNTQ" %%mm0, %0\n\t"
2482                 MOVNTQ" %%mm1, 8%0\n\t"
2483                 MOVNTQ" %%mm2, 16%0\n\t"
2484                 MOVNTQ" %%mm3, 24%0\n\t"
2485                 MOVNTQ" %%mm4, 32%0\n\t"
2486                 MOVNTQ" %%mm5, 40%0\n\t"
2487                 MOVNTQ" %%mm6, 48%0\n\t"
2488                 MOVNTQ" %%mm7, 56%0"
2489                 :"=m"(d[2*x])
2490                 :"m"(s1[x])
2491                 :"memory");
2492         }
2493 #endif
2494         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2495     }
2496     for(y=0;y<h;y++){
2497         const uint8_t* s2=src2+srcStride2*(y>>1);
2498         uint8_t* d=dst2+dstStride2*y;
2499         x=0;
2500 #ifdef HAVE_MMX
2501         for(;x<w-31;x+=32)
2502         {
2503             asm volatile(
2504                 PREFETCH" 32%1\n\t"
2505                 "movq   %1, %%mm0\n\t"
2506                 "movq   8%1, %%mm2\n\t"
2507                 "movq   16%1, %%mm4\n\t"
2508                 "movq   24%1, %%mm6\n\t"
2509                 "movq   %%mm0, %%mm1\n\t"
2510                 "movq   %%mm2, %%mm3\n\t"
2511                 "movq   %%mm4, %%mm5\n\t"
2512                 "movq   %%mm6, %%mm7\n\t"
2513                 "punpcklbw %%mm0, %%mm0\n\t"
2514                 "punpckhbw %%mm1, %%mm1\n\t"
2515                 "punpcklbw %%mm2, %%mm2\n\t"
2516                 "punpckhbw %%mm3, %%mm3\n\t"
2517                 "punpcklbw %%mm4, %%mm4\n\t"
2518                 "punpckhbw %%mm5, %%mm5\n\t"
2519                 "punpcklbw %%mm6, %%mm6\n\t"
2520                 "punpckhbw %%mm7, %%mm7\n\t"
2521                 MOVNTQ" %%mm0, %0\n\t"
2522                 MOVNTQ" %%mm1, 8%0\n\t"
2523                 MOVNTQ" %%mm2, 16%0\n\t"
2524                 MOVNTQ" %%mm3, 24%0\n\t"
2525                 MOVNTQ" %%mm4, 32%0\n\t"
2526                 MOVNTQ" %%mm5, 40%0\n\t"
2527                 MOVNTQ" %%mm6, 48%0\n\t"
2528                 MOVNTQ" %%mm7, 56%0"
2529                 :"=m"(d[2*x])
2530                 :"m"(s2[x])
2531                 :"memory");
2532         }
2533 #endif
2534         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2535     }
2536 #ifdef HAVE_MMX
2537         asm(
2538                 EMMS" \n\t"
2539                 SFENCE" \n\t"
2540                 ::: "memory"
2541                 );
2542 #endif
2543 }
2544
2545 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2546                         uint8_t *dst,
2547                         unsigned width, unsigned height,
2548                         int srcStride1, int srcStride2,
2549                         int srcStride3, int dstStride)
2550 {
2551     unsigned y,x,w,h;
2552     w=width/2; h=height;
2553     for(y=0;y<h;y++){
2554         const uint8_t* yp=src1+srcStride1*y;
2555         const uint8_t* up=src2+srcStride2*(y>>2);
2556         const uint8_t* vp=src3+srcStride3*(y>>2);
2557         uint8_t* d=dst+dstStride*y;
2558         x=0;
2559 #ifdef HAVE_MMX
2560         for(;x<w-7;x+=8)
2561         {
2562             asm volatile(
2563                 PREFETCH" 32(%1, %0)\n\t"
2564                 PREFETCH" 32(%2, %0)\n\t"
2565                 PREFETCH" 32(%3, %0)\n\t"
2566                 "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2567                 "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2568                 "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
2569                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2570                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2571                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2572                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2573                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2574                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2575                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2576
2577                 "movq   %%mm1, %%mm6\n\t"
2578                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2579                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2580                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2581                 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2582                 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2583                 
2584                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2585                 "movq   8(%1, %0, 4), %%mm0\n\t"
2586                 "movq   %%mm0, %%mm3\n\t"
2587                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2588                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2589                 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2590                 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2591
2592                 "movq   %%mm4, %%mm6\n\t"
2593                 "movq   16(%1, %0, 4), %%mm0\n\t"
2594                 "movq   %%mm0, %%mm3\n\t"
2595                 "punpcklbw %%mm5, %%mm4\n\t"
2596                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2597                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2598                 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2599                 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2600                 
2601                 "punpckhbw %%mm5, %%mm6\n\t"
2602                 "movq   24(%1, %0, 4), %%mm0\n\t"
2603                 "movq   %%mm0, %%mm3\n\t"
2604                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2605                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2606                 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2607                 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2608
2609                 : "+r" (x)
2610                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2611                 :"memory");
2612         }
2613 #endif
2614         for(; x<w; x++)
2615         {
2616             const int x2= x<<2;
2617             d[8*x+0]=yp[x2];
2618             d[8*x+1]=up[x];
2619             d[8*x+2]=yp[x2+1];
2620             d[8*x+3]=vp[x];
2621             d[8*x+4]=yp[x2+2];
2622             d[8*x+5]=up[x];
2623             d[8*x+6]=yp[x2+3];
2624             d[8*x+7]=vp[x];
2625         }
2626     }
2627 #ifdef HAVE_MMX
2628         asm(
2629                 EMMS" \n\t"
2630                 SFENCE" \n\t"
2631                 ::: "memory"
2632                 );
2633 #endif
2634 }