1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB converter
4  *  pluralized to also handle: Software PAL8 to RGB converter
5  *                             Software YUV to YUV converter
6  *                             Software YUV to RGB converter
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  */
10
11 #include <stddef.h>
12 #include <inttypes.h> /* for __WORDSIZE */
13
14 #ifndef __WORDSIZE
15 // #warning You have a misconfigured system and will probably lose performance!
16 #define __WORDSIZE MP_WORDSIZE
17 #endif
18
19 #undef PREFETCH
20 #undef MOVNTQ
21 #undef EMMS
22 #undef SFENCE
23 #undef MMREG_SIZE
24 #undef PREFETCHW
25 #undef PAVGB
26
27 #ifdef HAVE_SSE2
28 #define MMREG_SIZE 16
29 #else
30 #define MMREG_SIZE 8
31 #endif
32
33 #ifdef HAVE_3DNOW
34 #define PREFETCH  "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB     "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #define PAVGB     "pavgb"
41 #else
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
44 #endif
45
46 #ifdef HAVE_3DNOW
47 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
48 #define EMMS     "femms"
49 #else
50 #define EMMS     "emms"
51 #endif
52
53 #ifdef HAVE_MMX2
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
56 #else
57 #define MOVNTQ "movq"
58 #define SFENCE "/nop"
59 #endif
60
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
62 {
63   uint8_t *dest = dst;
64   const uint8_t *s = src;
65   const uint8_t *end;
66 #ifdef HAVE_MMX
67   const uint8_t *mm_end;
68 #endif
69   end = s + src_size;
70 #ifdef HAVE_MMX
71   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
72   mm_end = end - 23;
73   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
74   while(s < mm_end)
75   {
76     __asm __volatile(
77         PREFETCH"       32%1\n\t"
78         "movd   %1, %%mm0\n\t"
79         "punpckldq 3%1, %%mm0\n\t"
80         "movd   6%1, %%mm1\n\t"
81         "punpckldq 9%1, %%mm1\n\t"
82         "movd   12%1, %%mm2\n\t"
83         "punpckldq 15%1, %%mm2\n\t"
84         "movd   18%1, %%mm3\n\t"
85         "punpckldq 21%1, %%mm3\n\t"
86         "pand   %%mm7, %%mm0\n\t"
87         "pand   %%mm7, %%mm1\n\t"
88         "pand   %%mm7, %%mm2\n\t"
89         "pand   %%mm7, %%mm3\n\t"
90         MOVNTQ" %%mm0, %0\n\t"
91         MOVNTQ" %%mm1, 8%0\n\t"
92         MOVNTQ" %%mm2, 16%0\n\t"
93         MOVNTQ" %%mm3, 24%0"
94         :"=m"(*dest)
95         :"m"(*s)
96         :"memory");
97     dest += 32;
98     s += 24;
99   }
100   __asm __volatile(SFENCE:::"memory");
101   __asm __volatile(EMMS:::"memory");
102 #endif
103   while(s < end)
104   {
105     *dest++ = *s++;
106     *dest++ = *s++;
107     *dest++ = *s++;
108     *dest++ = 0;
109   }
110 }
111
112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
113 {
114   uint8_t *dest = dst;
115   const uint8_t *s = src;
116   const uint8_t *end;
117 #ifdef HAVE_MMX
118   const uint8_t *mm_end;
119 #endif
120   end = s + src_size;
121 #ifdef HAVE_MMX
122   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
123   mm_end = end - 31;
124   while(s < mm_end)
125   {
126     __asm __volatile(
127         PREFETCH"       32%1\n\t"
128         "movq   %1, %%mm0\n\t"
129         "movq   8%1, %%mm1\n\t"
130         "movq   16%1, %%mm4\n\t"
131         "movq   24%1, %%mm5\n\t"
132         "movq   %%mm0, %%mm2\n\t"
133         "movq   %%mm1, %%mm3\n\t"
134         "movq   %%mm4, %%mm6\n\t"
135         "movq   %%mm5, %%mm7\n\t"
136         "psrlq  $8, %%mm2\n\t"
137         "psrlq  $8, %%mm3\n\t"
138         "psrlq  $8, %%mm6\n\t"
139         "psrlq  $8, %%mm7\n\t"
140         "pand   %2, %%mm0\n\t"
141         "pand   %2, %%mm1\n\t"
142         "pand   %2, %%mm4\n\t"
143         "pand   %2, %%mm5\n\t"
144         "pand   %3, %%mm2\n\t"
145         "pand   %3, %%mm3\n\t"
146         "pand   %3, %%mm6\n\t"
147         "pand   %3, %%mm7\n\t"
148         "por    %%mm2, %%mm0\n\t"
149         "por    %%mm3, %%mm1\n\t"
150         "por    %%mm6, %%mm4\n\t"
151         "por    %%mm7, %%mm5\n\t"
152
153         "movq   %%mm1, %%mm2\n\t"
154         "movq   %%mm4, %%mm3\n\t"
155         "psllq  $48, %%mm2\n\t"
156         "psllq  $32, %%mm3\n\t"
157         "pand   %4, %%mm2\n\t"
158         "pand   %5, %%mm3\n\t"
159         "por    %%mm2, %%mm0\n\t"
160         "psrlq  $16, %%mm1\n\t"
161         "psrlq  $32, %%mm4\n\t"
162         "psllq  $16, %%mm5\n\t"
163         "por    %%mm3, %%mm1\n\t"
164         "pand   %6, %%mm5\n\t"
165         "por    %%mm5, %%mm4\n\t"
166
167         MOVNTQ" %%mm0, %0\n\t"
168         MOVNTQ" %%mm1, 8%0\n\t"
169         MOVNTQ" %%mm4, 16%0"
170         :"=m"(*dest)
171         :"m"(*s),"m"(mask24l),
172          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
173         :"memory");
174     dest += 24;
175     s += 32;
176   }
177   __asm __volatile(SFENCE:::"memory");
178   __asm __volatile(EMMS:::"memory");
179 #endif
180   while(s < end)
181   {
182     *dest++ = *s++;
183     *dest++ = *s++;
184     *dest++ = *s++;
185     s++;
186   }
187 }
188
189 /*
190  Original by Strepto/Astral
191  ported to gcc & bugfixed : A'rpi
192  MMX2, 3DNOW optimization by Nick Kurshev
193  32-bit C version, and the and&add trick by Michael Niedermayer
194 */
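/*
 A short note on the and&add trick used in rgb15to16 below: a 15-bit pixel is
 0RRRRRGGGGGBBBBB and a 16-bit pixel is RRRRRGGGGGGBBBBB, so R and G must move
 up by one bit while B stays where it is.  Masking out the R/G field and adding
 it back onto the pixel doubles that field, i.e. shifts it left by one, and
 leaves a zero in the new low green bit:

     (x & 0x7FFF7FFF) + (x & 0x7FE07FE0)

 handles two packed pixels at a time this way.
*/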
195 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
196 {
197   register const uint8_t* s=src;
198   register uint8_t* d=dst;
199   register const uint8_t *end;
200   const uint8_t *mm_end;
201   end = s + src_size;
202 #ifdef HAVE_MMX
203   __asm __volatile(PREFETCH"    %0"::"m"(*s));
204   __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
205   mm_end = end - 15;
206   while(s<mm_end)
207   {
208         __asm __volatile(
209                 PREFETCH"       32%1\n\t"
210                 "movq   %1, %%mm0\n\t"
211                 "movq   8%1, %%mm2\n\t"
212                 "movq   %%mm0, %%mm1\n\t"
213                 "movq   %%mm2, %%mm3\n\t"
214                 "pand   %%mm4, %%mm0\n\t"
215                 "pand   %%mm4, %%mm2\n\t"
216                 "paddw  %%mm1, %%mm0\n\t"
217                 "paddw  %%mm3, %%mm2\n\t"
218                 MOVNTQ" %%mm0, %0\n\t"
219                 MOVNTQ" %%mm2, 8%0"
220                 :"=m"(*d)
221                 :"m"(*s)
222                 );
223         d+=16;
224         s+=16;
225   }
226   __asm __volatile(SFENCE:::"memory");
227   __asm __volatile(EMMS:::"memory");
228 #endif
229     mm_end = end - 3;
230     while(s < mm_end)
231     {
232         register unsigned x= *((uint32_t *)s);
233         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
234         d+=4;
235         s+=4;
236     }
237     if(s < end)
238     {
239         register unsigned short x= *((uint16_t *)s);
240         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
241     }
242 }
243
244 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
245 {
246   register const uint8_t* s=src;
247   register uint8_t* d=dst;
248   register const uint8_t *end;
249   const uint8_t *mm_end;
250   end = s + src_size;
251 #ifdef HAVE_MMX
252   __asm __volatile(PREFETCH"    %0"::"m"(*s));
253   __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
254   __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
255   mm_end = end - 15;
256   while(s<mm_end)
257   {
258         __asm __volatile(
259                 PREFETCH"       32%1\n\t"
260                 "movq   %1, %%mm0\n\t"
261                 "movq   8%1, %%mm2\n\t"
262                 "movq   %%mm0, %%mm1\n\t"
263                 "movq   %%mm2, %%mm3\n\t"
264                 "psrlq  $1, %%mm0\n\t"
265                 "psrlq  $1, %%mm2\n\t"
266                 "pand   %%mm7, %%mm0\n\t"
267                 "pand   %%mm7, %%mm2\n\t"
268                 "pand   %%mm6, %%mm1\n\t"
269                 "pand   %%mm6, %%mm3\n\t"
270                 "por    %%mm1, %%mm0\n\t"
271                 "por    %%mm3, %%mm2\n\t"
272                 MOVNTQ" %%mm0, %0\n\t"
273                 MOVNTQ" %%mm2, 8%0"
274                 :"=m"(*d)
275                 :"m"(*s)
276                 );
277         d+=16;
278         s+=16;
279   }
280   __asm __volatile(SFENCE:::"memory");
281   __asm __volatile(EMMS:::"memory");
282 #endif
283     mm_end = end - 3;
284     while(s < mm_end)
285     {
286         register uint32_t x= *((uint32_t *)s);
287         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
288         s+=4;
289         d+=4;
290     }
291     if(s < end)
292     {
293         register uint16_t x= *((uint16_t *)s);
294         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
295         s+=2;
296         d+=2;
297     }
298 }
299
300 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
301 {
302         const uint8_t *s = src;
303         const uint8_t *end;
304 #ifdef HAVE_MMX
305         const uint8_t *mm_end;
306 #endif
307         uint16_t *d = (uint16_t *)dst;
308         end = s + src_size;
309 #ifdef HAVE_MMX
310         mm_end = end - 15;
311 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
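        /* Sketch of why pmaddwd helps here (assuming the usual layout of the
           mask3216br/mul3216 constants): the B and R components are masked
           into separate 16-bit words, and the pmaddwd multiplies each word by
           a power of two, shifting B and R by different amounts in a single
           instruction; G is masked and OR'd in separately, and a final shift
           lines up the 5-6-5 result.  One multiply thus replaces several
           shift+mask steps, which is a win only when pmaddwd is cheap. */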
312         asm volatile(
313                 "movq %3, %%mm5                 \n\t"
314                 "movq %4, %%mm6                 \n\t"
315                 "movq %5, %%mm7                 \n\t"
316                 ".balign 16                     \n\t"
317                 "1:                             \n\t"
318                 PREFETCH" 32(%1)                \n\t"
319                 "movd   (%1), %%mm0             \n\t"
320                 "movd   4(%1), %%mm3            \n\t"
321                 "punpckldq 8(%1), %%mm0         \n\t"
322                 "punpckldq 12(%1), %%mm3        \n\t"
323                 "movq %%mm0, %%mm1              \n\t"
324                 "movq %%mm3, %%mm4              \n\t"
325                 "pand %%mm6, %%mm0              \n\t"
326                 "pand %%mm6, %%mm3              \n\t"
327                 "pmaddwd %%mm7, %%mm0           \n\t"
328                 "pmaddwd %%mm7, %%mm3           \n\t"
329                 "pand %%mm5, %%mm1              \n\t"
330                 "pand %%mm5, %%mm4              \n\t"
331                 "por %%mm1, %%mm0               \n\t"   
332                 "por %%mm4, %%mm3               \n\t"
333                 "psrld $5, %%mm0                \n\t"
334                 "pslld $11, %%mm3               \n\t"
335                 "por %%mm3, %%mm0               \n\t"
336                 MOVNTQ" %%mm0, (%0)             \n\t"
337                 "addl $16, %1                   \n\t"
338                 "addl $8, %0                    \n\t"
339                 "cmpl %2, %1                    \n\t"
340                 " jb 1b                         \n\t"
341                 : "+r" (d), "+r"(s)
342                 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
343         );
344 #else
345         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
346         __asm __volatile(
347             "movq       %0, %%mm7\n\t"
348             "movq       %1, %%mm6\n\t"
349             ::"m"(red_16mask),"m"(green_16mask));
350         while(s < mm_end)
351         {
352             __asm __volatile(
353                 PREFETCH" 32%1\n\t"
354                 "movd   %1, %%mm0\n\t"
355                 "movd   4%1, %%mm3\n\t"
356                 "punpckldq 8%1, %%mm0\n\t"
357                 "punpckldq 12%1, %%mm3\n\t"
358                 "movq   %%mm0, %%mm1\n\t"
359                 "movq   %%mm0, %%mm2\n\t"
360                 "movq   %%mm3, %%mm4\n\t"
361                 "movq   %%mm3, %%mm5\n\t"
362                 "psrlq  $3, %%mm0\n\t"
363                 "psrlq  $3, %%mm3\n\t"
364                 "pand   %2, %%mm0\n\t"
365                 "pand   %2, %%mm3\n\t"
366                 "psrlq  $5, %%mm1\n\t"
367                 "psrlq  $5, %%mm4\n\t"
368                 "pand   %%mm6, %%mm1\n\t"
369                 "pand   %%mm6, %%mm4\n\t"
370                 "psrlq  $8, %%mm2\n\t"
371                 "psrlq  $8, %%mm5\n\t"
372                 "pand   %%mm7, %%mm2\n\t"
373                 "pand   %%mm7, %%mm5\n\t"
374                 "por    %%mm1, %%mm0\n\t"
375                 "por    %%mm4, %%mm3\n\t"
376                 "por    %%mm2, %%mm0\n\t"
377                 "por    %%mm5, %%mm3\n\t"
378                 "psllq  $16, %%mm3\n\t"
379                 "por    %%mm3, %%mm0\n\t"
380                 MOVNTQ" %%mm0, %0\n\t"
381                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
382                 d += 4;
383                 s += 16;
384         }
385 #endif
386         __asm __volatile(SFENCE:::"memory");
387         __asm __volatile(EMMS:::"memory");
388 #endif
389         while(s < end)
390         {
391                 const int src= *((uint32_t*)s)++;
392                 *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393 //              *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
394         }
395 }
396
397 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
398 {
399         const uint8_t *s = src;
400         const uint8_t *end;
401 #ifdef HAVE_MMX
402         const uint8_t *mm_end;
403 #endif
404         uint16_t *d = (uint16_t *)dst;
405         end = s + src_size;
406 #ifdef HAVE_MMX
407         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
408         __asm __volatile(
409             "movq       %0, %%mm7\n\t"
410             "movq       %1, %%mm6\n\t"
411             ::"m"(red_16mask),"m"(green_16mask));
412         mm_end = end - 15;
413         while(s < mm_end)
414         {
415             __asm __volatile(
416                 PREFETCH" 32%1\n\t"
417                 "movd   %1, %%mm0\n\t"
418                 "movd   4%1, %%mm3\n\t"
419                 "punpckldq 8%1, %%mm0\n\t"
420                 "punpckldq 12%1, %%mm3\n\t"
421                 "movq   %%mm0, %%mm1\n\t"
422                 "movq   %%mm0, %%mm2\n\t"
423                 "movq   %%mm3, %%mm4\n\t"
424                 "movq   %%mm3, %%mm5\n\t"
425                 "psllq  $8, %%mm0\n\t"
426                 "psllq  $8, %%mm3\n\t"
427                 "pand   %%mm7, %%mm0\n\t"
428                 "pand   %%mm7, %%mm3\n\t"
429                 "psrlq  $5, %%mm1\n\t"
430                 "psrlq  $5, %%mm4\n\t"
431                 "pand   %%mm6, %%mm1\n\t"
432                 "pand   %%mm6, %%mm4\n\t"
433                 "psrlq  $19, %%mm2\n\t"
434                 "psrlq  $19, %%mm5\n\t"
435                 "pand   %2, %%mm2\n\t"
436                 "pand   %2, %%mm5\n\t"
437                 "por    %%mm1, %%mm0\n\t"
438                 "por    %%mm4, %%mm3\n\t"
439                 "por    %%mm2, %%mm0\n\t"
440                 "por    %%mm5, %%mm3\n\t"
441                 "psllq  $16, %%mm3\n\t"
442                 "por    %%mm3, %%mm0\n\t"
443                 MOVNTQ" %%mm0, %0\n\t"
444                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
445                 d += 4;
446                 s += 16;
447         }
448         __asm __volatile(SFENCE:::"memory");
449         __asm __volatile(EMMS:::"memory");
450 #endif
451         while(s < end)
452         {
453                 const int src= *((uint32_t*)s)++;
454                 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
455         }
456 }
457
458 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
459 {
460         const uint8_t *s = src;
461         const uint8_t *end;
462 #ifdef HAVE_MMX
463         const uint8_t *mm_end;
464 #endif
465         uint16_t *d = (uint16_t *)dst;
466         end = s + src_size;
467 #ifdef HAVE_MMX
468         mm_end = end - 15;
469 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
470         asm volatile(
471                 "movq %3, %%mm5                 \n\t"
472                 "movq %4, %%mm6                 \n\t"
473                 "movq %5, %%mm7                 \n\t"
474                 ".balign 16                     \n\t"
475                 "1:                             \n\t"
476                 PREFETCH" 32(%1)                \n\t"
477                 "movd   (%1), %%mm0             \n\t"
478                 "movd   4(%1), %%mm3            \n\t"
479                 "punpckldq 8(%1), %%mm0         \n\t"
480                 "punpckldq 12(%1), %%mm3        \n\t"
481                 "movq %%mm0, %%mm1              \n\t"
482                 "movq %%mm3, %%mm4              \n\t"
483                 "pand %%mm6, %%mm0              \n\t"
484                 "pand %%mm6, %%mm3              \n\t"
485                 "pmaddwd %%mm7, %%mm0           \n\t"
486                 "pmaddwd %%mm7, %%mm3           \n\t"
487                 "pand %%mm5, %%mm1              \n\t"
488                 "pand %%mm5, %%mm4              \n\t"
489                 "por %%mm1, %%mm0               \n\t"   
490                 "por %%mm4, %%mm3               \n\t"
491                 "psrld $6, %%mm0                \n\t"
492                 "pslld $10, %%mm3               \n\t"
493                 "por %%mm3, %%mm0               \n\t"
494                 MOVNTQ" %%mm0, (%0)             \n\t"
495                 "addl $16, %1                   \n\t"
496                 "addl $8, %0                    \n\t"
497                 "cmpl %2, %1                    \n\t"
498                 " jb 1b                         \n\t"
499                 : "+r" (d), "+r"(s)
500                 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
501         );
502 #else
503         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
504         __asm __volatile(
505             "movq       %0, %%mm7\n\t"
506             "movq       %1, %%mm6\n\t"
507             ::"m"(red_15mask),"m"(green_15mask));
508         while(s < mm_end)
509         {
510             __asm __volatile(
511                 PREFETCH" 32%1\n\t"
512                 "movd   %1, %%mm0\n\t"
513                 "movd   4%1, %%mm3\n\t"
514                 "punpckldq 8%1, %%mm0\n\t"
515                 "punpckldq 12%1, %%mm3\n\t"
516                 "movq   %%mm0, %%mm1\n\t"
517                 "movq   %%mm0, %%mm2\n\t"
518                 "movq   %%mm3, %%mm4\n\t"
519                 "movq   %%mm3, %%mm5\n\t"
520                 "psrlq  $3, %%mm0\n\t"
521                 "psrlq  $3, %%mm3\n\t"
522                 "pand   %2, %%mm0\n\t"
523                 "pand   %2, %%mm3\n\t"
524                 "psrlq  $6, %%mm1\n\t"
525                 "psrlq  $6, %%mm4\n\t"
526                 "pand   %%mm6, %%mm1\n\t"
527                 "pand   %%mm6, %%mm4\n\t"
528                 "psrlq  $9, %%mm2\n\t"
529                 "psrlq  $9, %%mm5\n\t"
530                 "pand   %%mm7, %%mm2\n\t"
531                 "pand   %%mm7, %%mm5\n\t"
532                 "por    %%mm1, %%mm0\n\t"
533                 "por    %%mm4, %%mm3\n\t"
534                 "por    %%mm2, %%mm0\n\t"
535                 "por    %%mm5, %%mm3\n\t"
536                 "psllq  $16, %%mm3\n\t"
537                 "por    %%mm3, %%mm0\n\t"
538                 MOVNTQ" %%mm0, %0\n\t"
539                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
540                 d += 4;
541                 s += 16;
542         }
543 #endif
544         __asm __volatile(SFENCE:::"memory");
545         __asm __volatile(EMMS:::"memory");
546 #endif
547         while(s < end)
548         {
549                 const int src= *((uint32_t*)s)++;
550                 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
551         }
552 }
553
554 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
555 {
556         const uint8_t *s = src;
557         const uint8_t *end;
558 #ifdef HAVE_MMX
559         const uint8_t *mm_end;
560 #endif
561         uint16_t *d = (uint16_t *)dst;
562         end = s + src_size;
563 #ifdef HAVE_MMX
564         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
565         __asm __volatile(
566             "movq       %0, %%mm7\n\t"
567             "movq       %1, %%mm6\n\t"
568             ::"m"(red_15mask),"m"(green_15mask));
569         mm_end = end - 15;
570         while(s < mm_end)
571         {
572             __asm __volatile(
573                 PREFETCH" 32%1\n\t"
574                 "movd   %1, %%mm0\n\t"
575                 "movd   4%1, %%mm3\n\t"
576                 "punpckldq 8%1, %%mm0\n\t"
577                 "punpckldq 12%1, %%mm3\n\t"
578                 "movq   %%mm0, %%mm1\n\t"
579                 "movq   %%mm0, %%mm2\n\t"
580                 "movq   %%mm3, %%mm4\n\t"
581                 "movq   %%mm3, %%mm5\n\t"
582                 "psllq  $7, %%mm0\n\t"
583                 "psllq  $7, %%mm3\n\t"
584                 "pand   %%mm7, %%mm0\n\t"
585                 "pand   %%mm7, %%mm3\n\t"
586                 "psrlq  $6, %%mm1\n\t"
587                 "psrlq  $6, %%mm4\n\t"
588                 "pand   %%mm6, %%mm1\n\t"
589                 "pand   %%mm6, %%mm4\n\t"
590                 "psrlq  $19, %%mm2\n\t"
591                 "psrlq  $19, %%mm5\n\t"
592                 "pand   %2, %%mm2\n\t"
593                 "pand   %2, %%mm5\n\t"
594                 "por    %%mm1, %%mm0\n\t"
595                 "por    %%mm4, %%mm3\n\t"
596                 "por    %%mm2, %%mm0\n\t"
597                 "por    %%mm5, %%mm3\n\t"
598                 "psllq  $16, %%mm3\n\t"
599                 "por    %%mm3, %%mm0\n\t"
600                 MOVNTQ" %%mm0, %0\n\t"
601                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
602                 d += 4;
603                 s += 16;
604         }
605         __asm __volatile(SFENCE:::"memory");
606         __asm __volatile(EMMS:::"memory");
607 #endif
608         while(s < end)
609         {
610                 const int src= *((uint32_t*)s)++;
611                 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
612         }
613 }
614
615 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
616 {
617         const uint8_t *s = src;
618         const uint8_t *end;
619 #ifdef HAVE_MMX
620         const uint8_t *mm_end;
621 #endif
622         uint16_t *d = (uint16_t *)dst;
623         end = s + src_size;
624 #ifdef HAVE_MMX
625         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
626         __asm __volatile(
627             "movq       %0, %%mm7\n\t"
628             "movq       %1, %%mm6\n\t"
629             ::"m"(red_16mask),"m"(green_16mask));
630         mm_end = end - 11;
631         while(s < mm_end)
632         {
633             __asm __volatile(
634                 PREFETCH" 32%1\n\t"
635                 "movd   %1, %%mm0\n\t"
636                 "movd   3%1, %%mm3\n\t"
637                 "punpckldq 6%1, %%mm0\n\t"
638                 "punpckldq 9%1, %%mm3\n\t"
639                 "movq   %%mm0, %%mm1\n\t"
640                 "movq   %%mm0, %%mm2\n\t"
641                 "movq   %%mm3, %%mm4\n\t"
642                 "movq   %%mm3, %%mm5\n\t"
643                 "psrlq  $3, %%mm0\n\t"
644                 "psrlq  $3, %%mm3\n\t"
645                 "pand   %2, %%mm0\n\t"
646                 "pand   %2, %%mm3\n\t"
647                 "psrlq  $5, %%mm1\n\t"
648                 "psrlq  $5, %%mm4\n\t"
649                 "pand   %%mm6, %%mm1\n\t"
650                 "pand   %%mm6, %%mm4\n\t"
651                 "psrlq  $8, %%mm2\n\t"
652                 "psrlq  $8, %%mm5\n\t"
653                 "pand   %%mm7, %%mm2\n\t"
654                 "pand   %%mm7, %%mm5\n\t"
655                 "por    %%mm1, %%mm0\n\t"
656                 "por    %%mm4, %%mm3\n\t"
657                 "por    %%mm2, %%mm0\n\t"
658                 "por    %%mm5, %%mm3\n\t"
659                 "psllq  $16, %%mm3\n\t"
660                 "por    %%mm3, %%mm0\n\t"
661                 MOVNTQ" %%mm0, %0\n\t"
662                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
663                 d += 4;
664                 s += 12;
665         }
666         __asm __volatile(SFENCE:::"memory");
667         __asm __volatile(EMMS:::"memory");
668 #endif
669         while(s < end)
670         {
671                 const int b= *s++;
672                 const int g= *s++;
673                 const int r= *s++;
674                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
675         }
676 }
677
678 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
679 {
680         const uint8_t *s = src;
681         const uint8_t *end;
682 #ifdef HAVE_MMX
683         const uint8_t *mm_end;
684 #endif
685         uint16_t *d = (uint16_t *)dst;
686         end = s + src_size;
687 #ifdef HAVE_MMX
688         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
689         __asm __volatile(
690             "movq       %0, %%mm7\n\t"
691             "movq       %1, %%mm6\n\t"
692             ::"m"(red_16mask),"m"(green_16mask));
693         mm_end = end - 15;
694         while(s < mm_end)
695         {
696             __asm __volatile(
697                 PREFETCH" 32%1\n\t"
698                 "movd   %1, %%mm0\n\t"
699                 "movd   3%1, %%mm3\n\t"
700                 "punpckldq 6%1, %%mm0\n\t"
701                 "punpckldq 9%1, %%mm3\n\t"
702                 "movq   %%mm0, %%mm1\n\t"
703                 "movq   %%mm0, %%mm2\n\t"
704                 "movq   %%mm3, %%mm4\n\t"
705                 "movq   %%mm3, %%mm5\n\t"
706                 "psllq  $8, %%mm0\n\t"
707                 "psllq  $8, %%mm3\n\t"
708                 "pand   %%mm7, %%mm0\n\t"
709                 "pand   %%mm7, %%mm3\n\t"
710                 "psrlq  $5, %%mm1\n\t"
711                 "psrlq  $5, %%mm4\n\t"
712                 "pand   %%mm6, %%mm1\n\t"
713                 "pand   %%mm6, %%mm4\n\t"
714                 "psrlq  $19, %%mm2\n\t"
715                 "psrlq  $19, %%mm5\n\t"
716                 "pand   %2, %%mm2\n\t"
717                 "pand   %2, %%mm5\n\t"
718                 "por    %%mm1, %%mm0\n\t"
719                 "por    %%mm4, %%mm3\n\t"
720                 "por    %%mm2, %%mm0\n\t"
721                 "por    %%mm5, %%mm3\n\t"
722                 "psllq  $16, %%mm3\n\t"
723                 "por    %%mm3, %%mm0\n\t"
724                 MOVNTQ" %%mm0, %0\n\t"
725                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
726                 d += 4;
727                 s += 12;
728         }
729         __asm __volatile(SFENCE:::"memory");
730         __asm __volatile(EMMS:::"memory");
731 #endif
732         while(s < end)
733         {
734                 const int r= *s++;
735                 const int g= *s++;
736                 const int b= *s++;
737                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
738         }
739 }
740
741 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
742 {
743         const uint8_t *s = src;
744         const uint8_t *end;
745 #ifdef HAVE_MMX
746         const uint8_t *mm_end;
747 #endif
748         uint16_t *d = (uint16_t *)dst;
749         end = s + src_size;
750 #ifdef HAVE_MMX
751         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
752         __asm __volatile(
753             "movq       %0, %%mm7\n\t"
754             "movq       %1, %%mm6\n\t"
755             ::"m"(red_15mask),"m"(green_15mask));
756         mm_end = end - 11;
757         while(s < mm_end)
758         {
759             __asm __volatile(
760                 PREFETCH" 32%1\n\t"
761                 "movd   %1, %%mm0\n\t"
762                 "movd   3%1, %%mm3\n\t"
763                 "punpckldq 6%1, %%mm0\n\t"
764                 "punpckldq 9%1, %%mm3\n\t"
765                 "movq   %%mm0, %%mm1\n\t"
766                 "movq   %%mm0, %%mm2\n\t"
767                 "movq   %%mm3, %%mm4\n\t"
768                 "movq   %%mm3, %%mm5\n\t"
769                 "psrlq  $3, %%mm0\n\t"
770                 "psrlq  $3, %%mm3\n\t"
771                 "pand   %2, %%mm0\n\t"
772                 "pand   %2, %%mm3\n\t"
773                 "psrlq  $6, %%mm1\n\t"
774                 "psrlq  $6, %%mm4\n\t"
775                 "pand   %%mm6, %%mm1\n\t"
776                 "pand   %%mm6, %%mm4\n\t"
777                 "psrlq  $9, %%mm2\n\t"
778                 "psrlq  $9, %%mm5\n\t"
779                 "pand   %%mm7, %%mm2\n\t"
780                 "pand   %%mm7, %%mm5\n\t"
781                 "por    %%mm1, %%mm0\n\t"
782                 "por    %%mm4, %%mm3\n\t"
783                 "por    %%mm2, %%mm0\n\t"
784                 "por    %%mm5, %%mm3\n\t"
785                 "psllq  $16, %%mm3\n\t"
786                 "por    %%mm3, %%mm0\n\t"
787                 MOVNTQ" %%mm0, %0\n\t"
788                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
789                 d += 4;
790                 s += 12;
791         }
792         __asm __volatile(SFENCE:::"memory");
793         __asm __volatile(EMMS:::"memory");
794 #endif
795         while(s < end)
796         {
797                 const int b= *s++;
798                 const int g= *s++;
799                 const int r= *s++;
800                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
801         }
802 }
803
804 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
805 {
806         const uint8_t *s = src;
807         const uint8_t *end;
808 #ifdef HAVE_MMX
809         const uint8_t *mm_end;
810 #endif
811         uint16_t *d = (uint16_t *)dst;
812         end = s + src_size;
813 #ifdef HAVE_MMX
814         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
815         __asm __volatile(
816             "movq       %0, %%mm7\n\t"
817             "movq       %1, %%mm6\n\t"
818             ::"m"(red_15mask),"m"(green_15mask));
819         mm_end = end - 15;
820         while(s < mm_end)
821         {
822             __asm __volatile(
823                 PREFETCH" 32%1\n\t"
824                 "movd   %1, %%mm0\n\t"
825                 "movd   3%1, %%mm3\n\t"
826                 "punpckldq 6%1, %%mm0\n\t"
827                 "punpckldq 9%1, %%mm3\n\t"
828                 "movq   %%mm0, %%mm1\n\t"
829                 "movq   %%mm0, %%mm2\n\t"
830                 "movq   %%mm3, %%mm4\n\t"
831                 "movq   %%mm3, %%mm5\n\t"
832                 "psllq  $7, %%mm0\n\t"
833                 "psllq  $7, %%mm3\n\t"
834                 "pand   %%mm7, %%mm0\n\t"
835                 "pand   %%mm7, %%mm3\n\t"
836                 "psrlq  $6, %%mm1\n\t"
837                 "psrlq  $6, %%mm4\n\t"
838                 "pand   %%mm6, %%mm1\n\t"
839                 "pand   %%mm6, %%mm4\n\t"
840                 "psrlq  $19, %%mm2\n\t"
841                 "psrlq  $19, %%mm5\n\t"
842                 "pand   %2, %%mm2\n\t"
843                 "pand   %2, %%mm5\n\t"
844                 "por    %%mm1, %%mm0\n\t"
845                 "por    %%mm4, %%mm3\n\t"
846                 "por    %%mm2, %%mm0\n\t"
847                 "por    %%mm5, %%mm3\n\t"
848                 "psllq  $16, %%mm3\n\t"
849                 "por    %%mm3, %%mm0\n\t"
850                 MOVNTQ" %%mm0, %0\n\t"
851                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
852                 d += 4;
853                 s += 12;
854         }
855         __asm __volatile(SFENCE:::"memory");
856         __asm __volatile(EMMS:::"memory");
857 #endif
858         while(s < end)
859         {
860                 const int r= *s++;
861                 const int g= *s++;
862                 const int b= *s++;
863                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
864         }
865 }
866
867 /*
868   Here I use a less accurate approximation: the input
869   value is simply left-shifted and the low-order bits are
870   filled with zeroes. This method improves PNG compression,
871   but the scheme cannot reproduce white exactly, since it
872   never generates an all-ones maximum value; the net effect
873   is to darken the image slightly.
874
875
876   The better method should be "left bit replication":
877
878    4 3 2 1 0
879    ---------
880    1 1 0 1 1
881
882    7 6 5 4 3  2 1 0
883    ----------------
884    1 1 0 1 1  1 1 0
885    |=======|  |===|
886        |      Leftmost Bits Repeated to Fill Open Bits
887        |
888    Original Bits
889 */
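/*
 For example, with left bit replication a 5-bit value v expands to 8 bits as
 (v << 3) | (v >> 2): 0x1F becomes 0xFF (true white), whereas the plain shift
 used below turns 0x1F into 0xF8.
*/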
890 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
891 {
892         const uint16_t *end;
893 #ifdef HAVE_MMX
894         const uint16_t *mm_end;
895 #endif
896         uint8_t *d = (uint8_t *)dst;
897         const uint16_t *s = (uint16_t *)src;
898         end = s + src_size/2;
899 #ifdef HAVE_MMX
900         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
901         mm_end = end - 7;
902         while(s < mm_end)
903         {
904             __asm __volatile(
905                 PREFETCH" 32%1\n\t"
906                 "movq   %1, %%mm0\n\t"
907                 "movq   %1, %%mm1\n\t"
908                 "movq   %1, %%mm2\n\t"
909                 "pand   %2, %%mm0\n\t"
910                 "pand   %3, %%mm1\n\t"
911                 "pand   %4, %%mm2\n\t"
912                 "psllq  $3, %%mm0\n\t"
913                 "psrlq  $2, %%mm1\n\t"
914                 "psrlq  $7, %%mm2\n\t"
915                 "movq   %%mm0, %%mm3\n\t"
916                 "movq   %%mm1, %%mm4\n\t"
917                 "movq   %%mm2, %%mm5\n\t"
918                 "punpcklwd %5, %%mm0\n\t"
919                 "punpcklwd %5, %%mm1\n\t"
920                 "punpcklwd %5, %%mm2\n\t"
921                 "punpckhwd %5, %%mm3\n\t"
922                 "punpckhwd %5, %%mm4\n\t"
923                 "punpckhwd %5, %%mm5\n\t"
924                 "psllq  $8, %%mm1\n\t"
925                 "psllq  $16, %%mm2\n\t"
926                 "por    %%mm1, %%mm0\n\t"
927                 "por    %%mm2, %%mm0\n\t"
928                 "psllq  $8, %%mm4\n\t"
929                 "psllq  $16, %%mm5\n\t"
930                 "por    %%mm4, %%mm3\n\t"
931                 "por    %%mm5, %%mm3\n\t"
932
933                 "movq   %%mm0, %%mm6\n\t"
934                 "movq   %%mm3, %%mm7\n\t"
935                 
936                 "movq   8%1, %%mm0\n\t"
937                 "movq   8%1, %%mm1\n\t"
938                 "movq   8%1, %%mm2\n\t"
939                 "pand   %2, %%mm0\n\t"
940                 "pand   %3, %%mm1\n\t"
941                 "pand   %4, %%mm2\n\t"
942                 "psllq  $3, %%mm0\n\t"
943                 "psrlq  $2, %%mm1\n\t"
944                 "psrlq  $7, %%mm2\n\t"
945                 "movq   %%mm0, %%mm3\n\t"
946                 "movq   %%mm1, %%mm4\n\t"
947                 "movq   %%mm2, %%mm5\n\t"
948                 "punpcklwd %5, %%mm0\n\t"
949                 "punpcklwd %5, %%mm1\n\t"
950                 "punpcklwd %5, %%mm2\n\t"
951                 "punpckhwd %5, %%mm3\n\t"
952                 "punpckhwd %5, %%mm4\n\t"
953                 "punpckhwd %5, %%mm5\n\t"
954                 "psllq  $8, %%mm1\n\t"
955                 "psllq  $16, %%mm2\n\t"
956                 "por    %%mm1, %%mm0\n\t"
957                 "por    %%mm2, %%mm0\n\t"
958                 "psllq  $8, %%mm4\n\t"
959                 "psllq  $16, %%mm5\n\t"
960                 "por    %%mm4, %%mm3\n\t"
961                 "por    %%mm5, %%mm3\n\t"
962
963                 :"=m"(*d)
964                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
965                 :"memory");
966             /* Borrowed from rgb32to24 above */
967             __asm __volatile(
968                 "movq   %%mm0, %%mm4\n\t"
969                 "movq   %%mm3, %%mm5\n\t"
970                 "movq   %%mm6, %%mm0\n\t"
971                 "movq   %%mm7, %%mm1\n\t"
972                 
973                 "movq   %%mm4, %%mm6\n\t"
974                 "movq   %%mm5, %%mm7\n\t"
975                 "movq   %%mm0, %%mm2\n\t"
976                 "movq   %%mm1, %%mm3\n\t"
977
978                 "psrlq  $8, %%mm2\n\t"
979                 "psrlq  $8, %%mm3\n\t"
980                 "psrlq  $8, %%mm6\n\t"
981                 "psrlq  $8, %%mm7\n\t"
982                 "pand   %2, %%mm0\n\t"
983                 "pand   %2, %%mm1\n\t"
984                 "pand   %2, %%mm4\n\t"
985                 "pand   %2, %%mm5\n\t"
986                 "pand   %3, %%mm2\n\t"
987                 "pand   %3, %%mm3\n\t"
988                 "pand   %3, %%mm6\n\t"
989                 "pand   %3, %%mm7\n\t"
990                 "por    %%mm2, %%mm0\n\t"
991                 "por    %%mm3, %%mm1\n\t"
992                 "por    %%mm6, %%mm4\n\t"
993                 "por    %%mm7, %%mm5\n\t"
994
995                 "movq   %%mm1, %%mm2\n\t"
996                 "movq   %%mm4, %%mm3\n\t"
997                 "psllq  $48, %%mm2\n\t"
998                 "psllq  $32, %%mm3\n\t"
999                 "pand   %4, %%mm2\n\t"
1000                 "pand   %5, %%mm3\n\t"
1001                 "por    %%mm2, %%mm0\n\t"
1002                 "psrlq  $16, %%mm1\n\t"
1003                 "psrlq  $32, %%mm4\n\t"
1004                 "psllq  $16, %%mm5\n\t"
1005                 "por    %%mm3, %%mm1\n\t"
1006                 "pand   %6, %%mm5\n\t"
1007                 "por    %%mm5, %%mm4\n\t"
1008
1009                 MOVNTQ" %%mm0, %0\n\t"
1010                 MOVNTQ" %%mm1, 8%0\n\t"
1011                 MOVNTQ" %%mm4, 16%0"
1012
1013                 :"=m"(*d)
1014                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1015                 :"memory");
1016                 d += 24;
1017                 s += 8;
1018         }
1019         __asm __volatile(SFENCE:::"memory");
1020         __asm __volatile(EMMS:::"memory");
1021 #endif
1022         while(s < end)
1023         {
1024                 register uint16_t bgr;
1025                 bgr = *s++;
1026                 *d++ = (bgr&0x1F)<<3;
1027                 *d++ = (bgr&0x3E0)>>2;
1028                 *d++ = (bgr&0x7C00)>>7;
1029         }
1030 }
1031
1032 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1033 {
1034         const uint16_t *end;
1035 #ifdef HAVE_MMX
1036         const uint16_t *mm_end;
1037 #endif
1038         uint8_t *d = (uint8_t *)dst;
1039         const uint16_t *s = (const uint16_t *)src;
1040         end = s + src_size/2;
1041 #ifdef HAVE_MMX
1042         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1043         mm_end = end - 7;
1044         while(s < mm_end)
1045         {
1046             __asm __volatile(
1047                 PREFETCH" 32%1\n\t"
1048                 "movq   %1, %%mm0\n\t"
1049                 "movq   %1, %%mm1\n\t"
1050                 "movq   %1, %%mm2\n\t"
1051                 "pand   %2, %%mm0\n\t"
1052                 "pand   %3, %%mm1\n\t"
1053                 "pand   %4, %%mm2\n\t"
1054                 "psllq  $3, %%mm0\n\t"
1055                 "psrlq  $3, %%mm1\n\t"
1056                 "psrlq  $8, %%mm2\n\t"
1057                 "movq   %%mm0, %%mm3\n\t"
1058                 "movq   %%mm1, %%mm4\n\t"
1059                 "movq   %%mm2, %%mm5\n\t"
1060                 "punpcklwd %5, %%mm0\n\t"
1061                 "punpcklwd %5, %%mm1\n\t"
1062                 "punpcklwd %5, %%mm2\n\t"
1063                 "punpckhwd %5, %%mm3\n\t"
1064                 "punpckhwd %5, %%mm4\n\t"
1065                 "punpckhwd %5, %%mm5\n\t"
1066                 "psllq  $8, %%mm1\n\t"
1067                 "psllq  $16, %%mm2\n\t"
1068                 "por    %%mm1, %%mm0\n\t"
1069                 "por    %%mm2, %%mm0\n\t"
1070                 "psllq  $8, %%mm4\n\t"
1071                 "psllq  $16, %%mm5\n\t"
1072                 "por    %%mm4, %%mm3\n\t"
1073                 "por    %%mm5, %%mm3\n\t"
1074                 
1075                 "movq   %%mm0, %%mm6\n\t"
1076                 "movq   %%mm3, %%mm7\n\t"
1077
1078                 "movq   8%1, %%mm0\n\t"
1079                 "movq   8%1, %%mm1\n\t"
1080                 "movq   8%1, %%mm2\n\t"
1081                 "pand   %2, %%mm0\n\t"
1082                 "pand   %3, %%mm1\n\t"
1083                 "pand   %4, %%mm2\n\t"
1084                 "psllq  $3, %%mm0\n\t"
1085                 "psrlq  $3, %%mm1\n\t"
1086                 "psrlq  $8, %%mm2\n\t"
1087                 "movq   %%mm0, %%mm3\n\t"
1088                 "movq   %%mm1, %%mm4\n\t"
1089                 "movq   %%mm2, %%mm5\n\t"
1090                 "punpcklwd %5, %%mm0\n\t"
1091                 "punpcklwd %5, %%mm1\n\t"
1092                 "punpcklwd %5, %%mm2\n\t"
1093                 "punpckhwd %5, %%mm3\n\t"
1094                 "punpckhwd %5, %%mm4\n\t"
1095                 "punpckhwd %5, %%mm5\n\t"
1096                 "psllq  $8, %%mm1\n\t"
1097                 "psllq  $16, %%mm2\n\t"
1098                 "por    %%mm1, %%mm0\n\t"
1099                 "por    %%mm2, %%mm0\n\t"
1100                 "psllq  $8, %%mm4\n\t"
1101                 "psllq  $16, %%mm5\n\t"
1102                 "por    %%mm4, %%mm3\n\t"
1103                 "por    %%mm5, %%mm3\n\t"
1104                 :"=m"(*d)
1105                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1106                 :"memory");
1107             /* Borrowed from rgb32to24 above */
1108             __asm __volatile(
1109                 "movq   %%mm0, %%mm4\n\t"
1110                 "movq   %%mm3, %%mm5\n\t"
1111                 "movq   %%mm6, %%mm0\n\t"
1112                 "movq   %%mm7, %%mm1\n\t"
1113                 
1114                 "movq   %%mm4, %%mm6\n\t"
1115                 "movq   %%mm5, %%mm7\n\t"
1116                 "movq   %%mm0, %%mm2\n\t"
1117                 "movq   %%mm1, %%mm3\n\t"
1118
1119                 "psrlq  $8, %%mm2\n\t"
1120                 "psrlq  $8, %%mm3\n\t"
1121                 "psrlq  $8, %%mm6\n\t"
1122                 "psrlq  $8, %%mm7\n\t"
1123                 "pand   %2, %%mm0\n\t"
1124                 "pand   %2, %%mm1\n\t"
1125                 "pand   %2, %%mm4\n\t"
1126                 "pand   %2, %%mm5\n\t"
1127                 "pand   %3, %%mm2\n\t"
1128                 "pand   %3, %%mm3\n\t"
1129                 "pand   %3, %%mm6\n\t"
1130                 "pand   %3, %%mm7\n\t"
1131                 "por    %%mm2, %%mm0\n\t"
1132                 "por    %%mm3, %%mm1\n\t"
1133                 "por    %%mm6, %%mm4\n\t"
1134                 "por    %%mm7, %%mm5\n\t"
1135
1136                 "movq   %%mm1, %%mm2\n\t"
1137                 "movq   %%mm4, %%mm3\n\t"
1138                 "psllq  $48, %%mm2\n\t"
1139                 "psllq  $32, %%mm3\n\t"
1140                 "pand   %4, %%mm2\n\t"
1141                 "pand   %5, %%mm3\n\t"
1142                 "por    %%mm2, %%mm0\n\t"
1143                 "psrlq  $16, %%mm1\n\t"
1144                 "psrlq  $32, %%mm4\n\t"
1145                 "psllq  $16, %%mm5\n\t"
1146                 "por    %%mm3, %%mm1\n\t"
1147                 "pand   %6, %%mm5\n\t"
1148                 "por    %%mm5, %%mm4\n\t"
1149
1150                 MOVNTQ" %%mm0, %0\n\t"
1151                 MOVNTQ" %%mm1, 8%0\n\t"
1152                 MOVNTQ" %%mm4, 16%0"
1153
1154                 :"=m"(*d)
1155                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1156                 :"memory");
1157                 d += 24;
1158                 s += 8;
1159         }
1160         __asm __volatile(SFENCE:::"memory");
1161         __asm __volatile(EMMS:::"memory");
1162 #endif
1163         while(s < end)
1164         {
1165                 register uint16_t bgr;
1166                 bgr = *s++;
1167                 *d++ = (bgr&0x1F)<<3;
1168                 *d++ = (bgr&0x7E0)>>3;
1169                 *d++ = (bgr&0xF800)>>8;
1170         }
1171 }
1172
1173 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1174 {
1175         const uint16_t *end;
1176 #ifdef HAVE_MMX
1177         const uint16_t *mm_end;
1178 #endif
1179         uint8_t *d = (uint8_t *)dst;
1180         const uint16_t *s = (const uint16_t *)src;
1181         end = s + src_size/2;
1182 #ifdef HAVE_MMX
1183         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1184         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1185         mm_end = end - 3;
1186         while(s < mm_end)
1187         {
1188             __asm __volatile(
1189                 PREFETCH" 32%1\n\t"
1190                 "movq   %1, %%mm0\n\t"
1191                 "movq   %1, %%mm1\n\t"
1192                 "movq   %1, %%mm2\n\t"
1193                 "pand   %2, %%mm0\n\t"
1194                 "pand   %3, %%mm1\n\t"
1195                 "pand   %4, %%mm2\n\t"
1196                 "psllq  $3, %%mm0\n\t"
1197                 "psrlq  $2, %%mm1\n\t"
1198                 "psrlq  $7, %%mm2\n\t"
1199                 "movq   %%mm0, %%mm3\n\t"
1200                 "movq   %%mm1, %%mm4\n\t"
1201                 "movq   %%mm2, %%mm5\n\t"
1202                 "punpcklwd %%mm7, %%mm0\n\t"
1203                 "punpcklwd %%mm7, %%mm1\n\t"
1204                 "punpcklwd %%mm7, %%mm2\n\t"
1205                 "punpckhwd %%mm7, %%mm3\n\t"
1206                 "punpckhwd %%mm7, %%mm4\n\t"
1207                 "punpckhwd %%mm7, %%mm5\n\t"
1208                 "psllq  $8, %%mm1\n\t"
1209                 "psllq  $16, %%mm2\n\t"
1210                 "por    %%mm1, %%mm0\n\t"
1211                 "por    %%mm2, %%mm0\n\t"
1212                 "psllq  $8, %%mm4\n\t"
1213                 "psllq  $16, %%mm5\n\t"
1214                 "por    %%mm4, %%mm3\n\t"
1215                 "por    %%mm5, %%mm3\n\t"
1216                 MOVNTQ" %%mm0, %0\n\t"
1217                 MOVNTQ" %%mm3, 8%0\n\t"
1218                 :"=m"(*d)
1219                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1220                 :"memory");
1221                 d += 16;
1222                 s += 4;
1223         }
1224         __asm __volatile(SFENCE:::"memory");
1225         __asm __volatile(EMMS:::"memory");
1226 #endif
1227         while(s < end)
1228         {
1229 #if 0 //slightly slower on athlon
1230                 int bgr= *s++;
1231                 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1232 #else
1233 //FIXME this is very likely wrong for big-endian (and the following converters too)
1234                 register uint16_t bgr;
1235                 bgr = *s++;
1236                 *d++ = (bgr&0x1F)<<3;
1237                 *d++ = (bgr&0x3E0)>>2;
1238                 *d++ = (bgr&0x7C00)>>7;
1239                 *d++ = 0;
1240 #endif
1241         }
1242 }
1243
1244 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1245 {
1246         const uint16_t *end;
1247 #ifdef HAVE_MMX
1248         const uint16_t *mm_end;
1249 #endif
1250         uint8_t *d = (uint8_t *)dst;
1251         const uint16_t *s = (uint16_t *)src;
1252         end = s + src_size/2;
1253 #ifdef HAVE_MMX
1254         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1255         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1256         mm_end = end - 3;
1257         while(s < mm_end)
1258         {
1259             __asm __volatile(
1260                 PREFETCH" 32%1\n\t"
1261                 "movq   %1, %%mm0\n\t"
1262                 "movq   %1, %%mm1\n\t"
1263                 "movq   %1, %%mm2\n\t"
1264                 "pand   %2, %%mm0\n\t"
1265                 "pand   %3, %%mm1\n\t"
1266                 "pand   %4, %%mm2\n\t"
1267                 "psllq  $3, %%mm0\n\t"
1268                 "psrlq  $3, %%mm1\n\t"
1269                 "psrlq  $8, %%mm2\n\t"
1270                 "movq   %%mm0, %%mm3\n\t"
1271                 "movq   %%mm1, %%mm4\n\t"
1272                 "movq   %%mm2, %%mm5\n\t"
1273                 "punpcklwd %%mm7, %%mm0\n\t"
1274                 "punpcklwd %%mm7, %%mm1\n\t"
1275                 "punpcklwd %%mm7, %%mm2\n\t"
1276                 "punpckhwd %%mm7, %%mm3\n\t"
1277                 "punpckhwd %%mm7, %%mm4\n\t"
1278                 "punpckhwd %%mm7, %%mm5\n\t"
1279                 "psllq  $8, %%mm1\n\t"
1280                 "psllq  $16, %%mm2\n\t"
1281                 "por    %%mm1, %%mm0\n\t"
1282                 "por    %%mm2, %%mm0\n\t"
1283                 "psllq  $8, %%mm4\n\t"
1284                 "psllq  $16, %%mm5\n\t"
1285                 "por    %%mm4, %%mm3\n\t"
1286                 "por    %%mm5, %%mm3\n\t"
1287                 MOVNTQ" %%mm0, %0\n\t"
1288                 MOVNTQ" %%mm3, 8%0\n\t"
1289                 :"=m"(*d)
1290                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1291                 :"memory");
1292                 d += 16;
1293                 s += 4;
1294         }
1295         __asm __volatile(SFENCE:::"memory");
1296         __asm __volatile(EMMS:::"memory");
1297 #endif
1298         while(s < end)
1299         {
1300                 register uint16_t bgr;
1301                 bgr = *s++;
1302                 *d++ = (bgr&0x1F)<<3;
1303                 *d++ = (bgr&0x7E0)>>3;
1304                 *d++ = (bgr&0xF800)>>8;
1305                 *d++ = 0;
1306         }
1307 }
1308
1309 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1310 {
1311 #ifdef HAVE_MMX
1312 /* TODO: unroll this loop */
1313         asm volatile (
1314                 "xorl %%eax, %%eax              \n\t"
1315                 ".balign 16                     \n\t"
1316                 "1:                             \n\t"
1317                 PREFETCH" 32(%0, %%eax)         \n\t"
1318                 "movq (%0, %%eax), %%mm0        \n\t"
1319                 "movq %%mm0, %%mm1              \n\t"
1320                 "movq %%mm0, %%mm2              \n\t"
1321                 "pslld $16, %%mm0               \n\t"
1322                 "psrld $16, %%mm1               \n\t"
1323                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
1324                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
1325                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
1326                 "por %%mm0, %%mm2               \n\t"
1327                 "por %%mm1, %%mm2               \n\t"
1328                 MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
1329                 "addl $8, %%eax                 \n\t"
1330                 "cmpl %2, %%eax                 \n\t"
1331                 " jb 1b                         \n\t"
1332                 :: "r" (src), "r"(dst), "r" (src_size-7)
1333                 : "%eax"
1334         );
1335
1336         __asm __volatile(SFENCE:::"memory");
1337         __asm __volatile(EMMS:::"memory");
1338 #else
1339         unsigned i;
1340         unsigned num_pixels = src_size >> 2;
1341         for(i=0; i<num_pixels; i++)
1342         {
1343 #ifdef WORDS_BIGENDIAN  
1344           dst[4*i + 1] = src[4*i + 3];
1345           dst[4*i + 2] = src[4*i + 2];
1346           dst[4*i + 3] = src[4*i + 1];
1347 #else
1348           dst[4*i + 0] = src[4*i + 2];
1349           dst[4*i + 1] = src[4*i + 1];
1350           dst[4*i + 2] = src[4*i + 0];
1351 #endif
1352         }
1353 #endif
1354 }
1355
1356 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1357 {
1358         unsigned i;
1359 #ifdef HAVE_MMX
1360         int mmx_size= 23 - src_size;
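        /* Counter trick: mmx_size starts out negative and the src/dst pointers
           passed to the asm below are biased by -mmx_size, so the loop just
           adds 24 to %%eax each iteration and exits via "js" once the counter
           stops being negative, saving a separate compare. */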
1361         asm volatile (
1362                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1363                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1364                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1365                 ".balign 16                     \n\t"
1366                 "1:                             \n\t"
1367                 PREFETCH" 32(%1, %%eax)         \n\t"
1368                 "movq   (%1, %%eax), %%mm0      \n\t" // BGR BGR BG
1369                 "movq   (%1, %%eax), %%mm1      \n\t" // BGR BGR BG
1370                 "movq  2(%1, %%eax), %%mm2      \n\t" // R BGR BGR B
1371                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1372                 "pand %%mm5, %%mm0              \n\t"
1373                 "pand %%mm6, %%mm1              \n\t"
1374                 "pand %%mm7, %%mm2              \n\t"
1375                 "por %%mm0, %%mm1               \n\t"
1376                 "por %%mm2, %%mm1               \n\t"                
1377                 "movq  6(%1, %%eax), %%mm0      \n\t" // BGR BGR BG
1378                 MOVNTQ" %%mm1,   (%2, %%eax)    \n\t" // RGB RGB RG
1379                 "movq  8(%1, %%eax), %%mm1      \n\t" // R BGR BGR B
1380                 "movq 10(%1, %%eax), %%mm2      \n\t" // GR BGR BGR
1381                 "pand %%mm7, %%mm0              \n\t"
1382                 "pand %%mm5, %%mm1              \n\t"
1383                 "pand %%mm6, %%mm2              \n\t"
1384                 "por %%mm0, %%mm1               \n\t"
1385                 "por %%mm2, %%mm1               \n\t"                
1386                 "movq 14(%1, %%eax), %%mm0      \n\t" // R BGR BGR B
1387                 MOVNTQ" %%mm1,  8(%2, %%eax)    \n\t" // B RGB RGB R
1388                 "movq 16(%1, %%eax), %%mm1      \n\t" // GR BGR BGR
1389                 "movq 18(%1, %%eax), %%mm2      \n\t" // BGR BGR BG
1390                 "pand %%mm6, %%mm0              \n\t"
1391                 "pand %%mm7, %%mm1              \n\t"
1392                 "pand %%mm5, %%mm2              \n\t"
1393                 "por %%mm0, %%mm1               \n\t"
1394                 "por %%mm2, %%mm1               \n\t"                
1395                 MOVNTQ" %%mm1, 16(%2, %%eax)    \n\t"
1396                 "addl $24, %%eax                \n\t"
1397                 " js 1b                         \n\t"
1398                 : "+a" (mmx_size)
1399                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1400         );
1401
1402         __asm __volatile(SFENCE:::"memory");
1403         __asm __volatile(EMMS:::"memory");
1404
1405         if(mmx_size==23) return; //finished, was a multiple of 8
1406
1407         src+= src_size;
1408         dst+= src_size;
1409         src_size= 23-mmx_size;
1410         src-= src_size;
1411         dst-= src_size;
1412 #endif
1413         for(i=0; i<src_size; i+=3)
1414         {
1415                 register uint8_t x;
1416                 x          = src[i + 2];
1417                 dst[i + 1] = src[i + 1];
1418                 dst[i + 2] = src[i + 0];
1419                 dst[i + 0] = x;
1420         }
1421 }
1422
1423 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1424         unsigned int width, unsigned int height,
1425         int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1426 {
1427         unsigned y;
1428         const unsigned chromWidth= width>>1;
1429         for(y=0; y<height; y++)
1430         {
1431 #ifdef HAVE_MMX
1432 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1433                 asm volatile(
1434                         "xorl %%eax, %%eax              \n\t"
1435                         ".balign 16                     \n\t"
1436                         "1:                             \n\t"
1437                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
1438                         PREFETCH" 32(%2, %%eax)         \n\t"
1439                         PREFETCH" 32(%3, %%eax)         \n\t"
1440                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
1441                         "movq %%mm0, %%mm2              \n\t" // U(0)
1442                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
1443                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1444                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1445
1446                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
1447                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
1448                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1449                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1450                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1451                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1452                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1453                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1454
1455                         MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
1456                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
1457                         MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1458                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1459
1460                         "addl $8, %%eax                 \n\t"
1461                         "cmpl %4, %%eax                 \n\t"
1462                         " jb 1b                         \n\t"
1463                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1464                         : "%eax"
1465                 );
1466 #else
1467
1468 #if defined ARCH_ALPHA && defined HAVE_MVI
1469 #define pl2yuy2(n)                                      \
1470         y1 = yc[n];                                     \
1471         y2 = yc2[n];                                    \
1472         u = uc[n];                                      \
1473         v = vc[n];                                      \
1474         asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
1475         asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
1476         asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1477         asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1478         yuv1 = (u << 8) + (v << 24);                    \
1479         yuv2 = yuv1 + y2;                               \
1480         yuv1 += y1;                                     \
1481         qdst[n] = yuv1;                                 \
1482         qdst2[n] = yuv2;
1483
1484                 int i;
1485                 uint64_t *qdst = (uint64_t *) dst;
1486                 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1487                 const uint32_t *yc = (uint32_t *) ysrc;
1488                 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1489                 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1490                 for(i = 0; i < chromWidth; i += 8){
1491                         uint64_t y1, y2, yuv1, yuv2;
1492                         uint64_t u, v;
1493                         /* Prefetch */
1494                         asm("ldq $31,64(%0)" :: "r"(yc));
1495                         asm("ldq $31,64(%0)" :: "r"(yc2));
1496                         asm("ldq $31,64(%0)" :: "r"(uc));
1497                         asm("ldq $31,64(%0)" :: "r"(vc));
1498
1499                         pl2yuy2(0);
1500                         pl2yuy2(1);
1501                         pl2yuy2(2);
1502                         pl2yuy2(3);
1503
1504                         yc += 4;
1505                         yc2 += 4;
1506                         uc += 4;
1507                         vc += 4;
1508                         qdst += 4;
1509                         qdst2 += 4;
1510                 }
1511                 y++;
1512                 ysrc += lumStride;
1513                 dst += dstStride;
1514
1515 #elif __WORDSIZE >= 64
1516                 int i;
1517                 uint64_t *ldst = (uint64_t *) dst;
1518                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1519                 for(i = 0; i < chromWidth; i += 2){
1520                         uint64_t k, l;
1521                         k = yc[0] + (uc[0] << 8) +
1522                             (yc[1] << 16) + (vc[0] << 24);
1523                         l = yc[2] + (uc[1] << 8) +
1524                             (yc[3] << 16) + (vc[1] << 24);
1525                         *ldst++ = k + (l << 32);
1526                         yc += 4;
1527                         uc += 2;
1528                         vc += 2;
1529                 }
1530
1531 #else
1532                 int i, *idst = (int32_t *) dst;
1533                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1534                 for(i = 0; i < chromWidth; i++){
1535                         *idst++ = yc[0] + (uc[0] << 8) +
1536                             (yc[1] << 16) + (vc[0] << 24);
1537                         yc += 2;
1538                         uc++;
1539                         vc++;
1540                 }
1541 #endif
1542 #endif
1543                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1544                 {
1545                         usrc += chromStride;
1546                         vsrc += chromStride;
1547                 }
1548                 ysrc += lumStride;
1549                 dst += dstStride;
1550         }
1551 #ifdef HAVE_MMX
1552 asm(    EMMS" \n\t"
1553         SFENCE" \n\t"
1554         :::"memory");
1555 #endif
1556 }
1557
1558 /**
1559  *
1560  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1561  * problem for anyone then tell me, and I'll fix it)
1562  */
1563 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1564         unsigned int width, unsigned int height,
1565         int lumStride, int chromStride, int dstStride)
1566 {
1567         //FIXME interpolate chroma
1568         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1569 }
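/* A minimal usage sketch (hypothetical; nothing in this file calls it): packing
 * one YV12 frame into YUY2 with the tightest possible strides.  YUY2 stores two
 * bytes per pixel, so dstStride is 2*width, and the chroma planes are width/2 wide. */
#if 0
static void example_yv12_to_yuy2_frame(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                                        uint8_t *out, unsigned width, unsigned height)
{
        RENAME(yv12toyuy2)(y, u, v, out, width, height,
                           width,     /* lumStride   */
                           width/2,   /* chromStride */
                           2*width);  /* dstStride   */
}
#endif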
1570
1571 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1572         unsigned int width, unsigned int height,
1573         int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1574 {
1575         unsigned y;
1576         const unsigned chromWidth= width>>1;
1577         for(y=0; y<height; y++)
1578         {
1579 #ifdef HAVE_MMX
1580 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1581                 asm volatile(
1582                         "xorl %%eax, %%eax              \n\t"
1583                         ".balign 16                     \n\t"
1584                         "1:                             \n\t"
1585                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
1586                         PREFETCH" 32(%2, %%eax)         \n\t"
1587                         PREFETCH" 32(%3, %%eax)         \n\t"
1588                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
1589                         "movq %%mm0, %%mm2              \n\t" // U(0)
1590                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
1591                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1592                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1593
1594                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
1595                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
1596                         "movq %%mm0, %%mm4              \n\t" // UVUV UVUV(0)
1597                         "movq %%mm2, %%mm6              \n\t" // UVUV UVUV(8)
1598                         "punpcklbw %%mm3, %%mm0         \n\t" // UYVY UYVY(0)
1599                         "punpckhbw %%mm3, %%mm4         \n\t" // UYVY UYVY(4)
1600                         "punpcklbw %%mm5, %%mm2         \n\t" // UYVY UYVY(8)
1601                         "punpckhbw %%mm5, %%mm6         \n\t" // UYVY UYVY(12)
1602
1603                         MOVNTQ" %%mm0, (%0, %%eax, 4)   \n\t"
1604                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
1605                         MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
1606                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1607
1608                         "addl $8, %%eax                 \n\t"
1609                         "cmpl %4, %%eax                 \n\t"
1610                         " jb 1b                         \n\t"
1611                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1612                         : "%eax"
1613                 );
1614 #else
1615 //FIXME adapt the alpha asm code from yv12->yuy2
1616
1617 #if __WORDSIZE >= 64
1618                 int i;
1619                 uint64_t *ldst = (uint64_t *) dst;
1620                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1621                 for(i = 0; i < chromWidth; i += 2){
1622                         uint64_t k, l;
1623                         k = uc[0] + (yc[0] << 8) +
1624                             (vc[0] << 16) + (yc[1] << 24);
1625                         l = uc[1] + (yc[2] << 8) +
1626                             (vc[1] << 16) + (yc[3] << 24);
1627                         *ldst++ = k + (l << 32);
1628                         yc += 4;
1629                         uc += 2;
1630                         vc += 2;
1631                 }
1632
1633 #else
1634                 int i, *idst = (int32_t *) dst;
1635                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1636                 for(i = 0; i < chromWidth; i++){
1637                         *idst++ = uc[0] + (yc[0] << 8) +
1638                             (vc[0] << 16) + (yc[1] << 24);
1639                         yc += 2;
1640                         uc++;
1641                         vc++;
1642                 }
1643 #endif
1644 #endif
1645                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1646                 {
1647                         usrc += chromStride;
1648                         vsrc += chromStride;
1649                 }
1650                 ysrc += lumStride;
1651                 dst += dstStride;
1652         }
1653 #ifdef HAVE_MMX
1654 asm(    EMMS" \n\t"
1655         SFENCE" \n\t"
1656         :::"memory");
1657 #endif
1658 }
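/* Note: this is the same inner loop as yuvPlanartoyuy2() above with the punpck
 * operand order swapped, so for each pixel pair the bytes come out as
 *   YUY2: Y0 U0 Y1 V0        UYVY: U0 Y0 V0 Y1
 * i.e. chroma lands in the low instead of the high byte of every word. */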
1659
1660 /**
1661  *
1662  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1663  * problem for anyone then tell me, and I'll fix it)
1664  */
1665 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1666         unsigned int width, unsigned int height,
1667         int lumStride, int chromStride, int dstStride)
1668 {
1669         //FIXME interpolate chroma
1670         RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1671 }
1672
1673 /**
1674  *
1675  * width should be a multiple of 16
1676  */
1677 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1678         unsigned int width, unsigned int height,
1679         int lumStride, int chromStride, int dstStride)
1680 {
1681         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1682 }
1683
1684 /**
1685  *
1686  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1687  * problem for anyone then tell me, and I'll fix it)
1688  */
1689 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1690         unsigned int width, unsigned int height,
1691         int lumStride, int chromStride, int srcStride)
1692 {
1693         unsigned y;
1694         const unsigned chromWidth= width>>1;
1695         for(y=0; y<height; y+=2)
1696         {
1697 #ifdef HAVE_MMX
1698                 asm volatile(
1699                         "xorl %%eax, %%eax              \n\t"
1700                         "pcmpeqw %%mm7, %%mm7           \n\t"
1701                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1702                         ".balign 16                     \n\t"
1703                         "1:                             \n\t"
1704                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1705                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1706                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1707                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1708                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1709                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1710                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1711                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1712                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1713                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1714                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1715
1716                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1717
1718                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
1719                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
1720                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1721                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1722                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1723                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1724                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1725                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1726                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1727                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1728
1729                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
1730
1731                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1732                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1733                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1734                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1735                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1736                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1737                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1738                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1739
1740                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
1741                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
1742
1743                         "addl $8, %%eax                 \n\t"
1744                         "cmpl %4, %%eax                 \n\t"
1745                         " jb 1b                         \n\t"
1746                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1747                         : "memory", "%eax"
1748                 );
1749
1750                 ydst += lumStride;
1751                 src  += srcStride;
1752
1753                 asm volatile(
1754                         "xorl %%eax, %%eax              \n\t"
1755                         ".balign 16                     \n\t"
1756                         "1:                             \n\t"
1757                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1758                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1759                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1760                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
1761                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
1762                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1763                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1764                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1765                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1766                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1767                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1768
1769                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
1770                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
1771
1772                         "addl $8, %%eax                 \n\t"
1773                         "cmpl %4, %%eax                 \n\t"
1774                         " jb 1b                         \n\t"
1775
1776                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1777                         : "memory", "%eax"
1778                 );
1779 #else
1780                 unsigned i;
1781                 for(i=0; i<chromWidth; i++)
1782                 {
1783                         ydst[2*i+0]     = src[4*i+0];
1784                         udst[i]         = src[4*i+1];
1785                         ydst[2*i+1]     = src[4*i+2];
1786                         vdst[i]         = src[4*i+3];
1787                 }
1788                 ydst += lumStride;
1789                 src  += srcStride;
1790
1791                 for(i=0; i<chromWidth; i++)
1792                 {
1793                         ydst[2*i+0]     = src[4*i+0];
1794                         ydst[2*i+1]     = src[4*i+2];
1795                 }
1796 #endif
1797                 udst += chromStride;
1798                 vdst += chromStride;
1799                 ydst += lumStride;
1800                 src  += srcStride;
1801         }
1802 #ifdef HAVE_MMX
1803 asm volatile(   EMMS" \n\t"
1804                 SFENCE" \n\t"
1805                 :::"memory");
1806 #endif
1807 }
1808
1809 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1810         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1811         unsigned int width, unsigned int height, int lumStride, int chromStride)
1812 {
1813         /* Y Plane */
1814         memcpy(ydst, ysrc, width*height);
1815
1816         /* XXX: implement upscaling for U,V */
1817 }
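/* Illustrative sketch of the upscaling the XXX above asks for (hypothetical,
 * not wired in anywhere): YVU9 carries one chroma sample per 4x4 luma block,
 * YV12 one per 2x2, so each source chroma sample has to be replicated into a
 * 2x2 block (nearest neighbour).  A real version would also want an MMX path. */
#if 0
static void example_chroma_upsample2x(const uint8_t *src, uint8_t *dst,
                                      unsigned srcW, unsigned srcH,
                                      int srcStride, int dstStride)
{
        unsigned x, y;
        for(y=0; y<srcH; y++)
                for(x=0; x<srcW; x++)
                {
                        const uint8_t c= src[y*srcStride + x];
                        dst[(2*y  )*dstStride + 2*x  ]= c;
                        dst[(2*y  )*dstStride + 2*x+1]= c;
                        dst[(2*y+1)*dstStride + 2*x  ]= c;
                        dst[(2*y+1)*dstStride + 2*x+1]= c;
                }
}
#endif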
1818
1819 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1820 {
1821         int x,y;
1822         
1823         dst[0]= src[0];
1824         
1825         // first line
1826         for(x=0; x<srcWidth-1; x++){
1827                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1828                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1829         }
1830         dst[2*srcWidth-1]= src[srcWidth-1];
1831         
1832         dst+= dstStride;
1833
1834         for(y=1; y<srcHeight; y++){
1835 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1836                 const int mmxSize= srcWidth&~15;
1837                 asm volatile(
1838                         "movl %4, %%eax                 \n\t"
1839                         "1:                             \n\t"
1840                         "movq (%0, %%eax), %%mm0        \n\t"
1841                         "movq (%1, %%eax), %%mm1        \n\t"
1842                         "movq 1(%0, %%eax), %%mm2       \n\t"
1843                         "movq 1(%1, %%eax), %%mm3       \n\t"
1844                         "movq -1(%0, %%eax), %%mm4      \n\t"
1845                         "movq -1(%1, %%eax), %%mm5      \n\t"
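                        /* each PAVGB below is applied twice with the same source
                           operand: avg(avg(a,b),b) ~ (a + 3*b)/4, which matches the
                           (3*x + y)>>2 weighting of the C code further down, up to a
                           rounding difference of at most 1 */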
1846                         PAVGB" %%mm0, %%mm5             \n\t"
1847                         PAVGB" %%mm0, %%mm3             \n\t"
1848                         PAVGB" %%mm0, %%mm5             \n\t"
1849                         PAVGB" %%mm0, %%mm3             \n\t"
1850                         PAVGB" %%mm1, %%mm4             \n\t"
1851                         PAVGB" %%mm1, %%mm2             \n\t"
1852                         PAVGB" %%mm1, %%mm4             \n\t"
1853                         PAVGB" %%mm1, %%mm2             \n\t"
1854                         "movq %%mm5, %%mm7              \n\t"
1855                         "movq %%mm4, %%mm6              \n\t"
1856                         "punpcklbw %%mm3, %%mm5         \n\t"
1857                         "punpckhbw %%mm3, %%mm7         \n\t"
1858                         "punpcklbw %%mm2, %%mm4         \n\t"
1859                         "punpckhbw %%mm2, %%mm6         \n\t"
1860 #if 1
1861                         MOVNTQ" %%mm5, (%2, %%eax, 2)   \n\t"
1862                         MOVNTQ" %%mm7, 8(%2, %%eax, 2)  \n\t"
1863                         MOVNTQ" %%mm4, (%3, %%eax, 2)   \n\t"
1864                         MOVNTQ" %%mm6, 8(%3, %%eax, 2)  \n\t"
1865 #else
1866                         "movq %%mm5, (%2, %%eax, 2)     \n\t"
1867                         "movq %%mm7, 8(%2, %%eax, 2)    \n\t"
1868                         "movq %%mm4, (%3, %%eax, 2)     \n\t"
1869                         "movq %%mm6, 8(%3, %%eax, 2)    \n\t"
1870 #endif
1871                         "addl $8, %%eax                 \n\t"
1872                         " js 1b                         \n\t"
1873                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1874                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1875                            "g" (-mmxSize)
1876                         : "%eax"
1877
1878                 );
1879 #else
1880                 const int mmxSize=1;
1881 #endif
1882                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1883                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1884
1885                 for(x=mmxSize-1; x<srcWidth-1; x++){
1886                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1887                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1888                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1889                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1890                 }
1891                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1892                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1893
1894                 dst+=dstStride*2;
1895                 src+=srcStride;
1896         }
1897         
1898         // last line
1899 #if 1
1900         dst[0]= src[0];
1901         
1902         for(x=0; x<srcWidth-1; x++){
1903                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1904                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1905         }
1906         dst[2*srcWidth-1]= src[srcWidth-1];
1907 #else
1908         for(x=0; x<srcWidth; x++){
1909                 dst[2*x+0]=
1910                 dst[2*x+1]= src[x];
1911         }
1912 #endif
1913
1914 #ifdef HAVE_MMX
1915 asm volatile(   EMMS" \n\t"
1916                 SFENCE" \n\t"
1917                 :::"memory");
1918 #endif
1919 }
1920
1921 /**
1922  *
1923  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1924  * problem for anyone then tell me, and I'll fix it)
1925  * chrominance data is only taken from every second line; the others are ignored. FIXME: write HQ version
1926  */
1927 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1928         unsigned int width, unsigned int height,
1929         int lumStride, int chromStride, int srcStride)
1930 {
1931         unsigned y;
1932         const unsigned chromWidth= width>>1;
1933         for(y=0; y<height; y+=2)
1934         {
1935 #ifdef HAVE_MMX
1936                 asm volatile(
1937                         "xorl %%eax, %%eax              \n\t"
1938                         "pcmpeqw %%mm7, %%mm7           \n\t"
1939                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1940                         ".balign 16                     \n\t"
1941                         "1:                             \n\t"
1942                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1943                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1944                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1945                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
1946                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
1947                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
1948                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
1949                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1950                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1951                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1952                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1953
1954                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1955
1956                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
1957                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
1958                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
1959                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
1960                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
1961                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
1962                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1963                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1964                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1965                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1966
1967                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
1968
1969                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1970                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1971                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1972                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1973                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1974                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1975                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1976                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1977
1978                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
1979                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
1980
1981                         "addl $8, %%eax                 \n\t"
1982                         "cmpl %4, %%eax                 \n\t"
1983                         " jb 1b                         \n\t"
1984                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1985                         : "memory", "%eax"
1986                 );
1987
1988                 ydst += lumStride;
1989                 src  += srcStride;
1990
1991                 asm volatile(
1992                         "xorl %%eax, %%eax              \n\t"
1993                         ".balign 16                     \n\t"
1994                         "1:                             \n\t"
1995                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1996                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1997                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1998                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(8)
1999                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // UYVY UYVY(12)
2000                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2001                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2002                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2003                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2004                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
2005                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
2006
2007                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
2008                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
2009
2010                         "addl $8, %%eax                 \n\t"
2011                         "cmpl %4, %%eax                 \n\t"
2012                         " jb 1b                         \n\t"
2013
2014                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2015                         : "memory", "%eax"
2016                 );
2017 #else
2018                 unsigned i;
2019                 for(i=0; i<chromWidth; i++)
2020                 {
2021                         udst[i]         = src[4*i+0];
2022                         ydst[2*i+0]     = src[4*i+1];
2023                         vdst[i]         = src[4*i+2];
2024                         ydst[2*i+1]     = src[4*i+3];
2025                 }
2026                 ydst += lumStride;
2027                 src  += srcStride;
2028
2029                 for(i=0; i<chromWidth; i++)
2030                 {
2031                         ydst[2*i+0]     = src[4*i+1];
2032                         ydst[2*i+1]     = src[4*i+3];
2033                 }
2034 #endif
2035                 udst += chromStride;
2036                 vdst += chromStride;
2037                 ydst += lumStride;
2038                 src  += srcStride;
2039         }
2040 #ifdef HAVE_MMX
2041 asm volatile(   EMMS" \n\t"
2042                 SFENCE" \n\t"
2043                 :::"memory");
2044 #endif
2045 }
2046
2047 /**
2048  *
2049  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2050  * problem for anyone then tell me, and I'll fix it)
2051  * chrominance data is only taken from every second line; the others are ignored in the C version. FIXME: write HQ version
2052  */
2053 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2054         unsigned int width, unsigned int height,
2055         int lumStride, int chromStride, int srcStride)
2056 {
2057         unsigned y;
2058         const unsigned chromWidth= width>>1;
2059 #ifdef HAVE_MMX
2060         for(y=0; y<height-2; y+=2)
2061         {
2062                 unsigned i;
2063                 for(i=0; i<2; i++)
2064                 {
2065                         asm volatile(
2066                                 "movl %2, %%eax                 \n\t"
2067                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
2068                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
2069                                 "pxor %%mm7, %%mm7              \n\t"
2070                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
2071                                 ".balign 16                     \n\t"
2072                                 "1:                             \n\t"
2073                                 PREFETCH" 64(%0, %%ebx)         \n\t"
2074                                 "movd (%0, %%ebx), %%mm0        \n\t"
2075                                 "movd 3(%0, %%ebx), %%mm1       \n\t"
2076                                 "punpcklbw %%mm7, %%mm0         \n\t"
2077                                 "punpcklbw %%mm7, %%mm1         \n\t"
2078                                 "movd 6(%0, %%ebx), %%mm2       \n\t"
2079                                 "movd 9(%0, %%ebx), %%mm3       \n\t"
2080                                 "punpcklbw %%mm7, %%mm2         \n\t"
2081                                 "punpcklbw %%mm7, %%mm3         \n\t"
2082                                 "pmaddwd %%mm6, %%mm0           \n\t"
2083                                 "pmaddwd %%mm6, %%mm1           \n\t"
2084                                 "pmaddwd %%mm6, %%mm2           \n\t"
2085                                 "pmaddwd %%mm6, %%mm3           \n\t"
2086 #ifndef FAST_BGR2YV12
2087                                 "psrad $8, %%mm0                \n\t"
2088                                 "psrad $8, %%mm1                \n\t"
2089                                 "psrad $8, %%mm2                \n\t"
2090                                 "psrad $8, %%mm3                \n\t"
2091 #endif
2092                                 "packssdw %%mm1, %%mm0          \n\t"
2093                                 "packssdw %%mm3, %%mm2          \n\t"
2094                                 "pmaddwd %%mm5, %%mm0           \n\t"
2095                                 "pmaddwd %%mm5, %%mm2           \n\t"
2096                                 "packssdw %%mm2, %%mm0          \n\t"
2097                                 "psraw $7, %%mm0                \n\t"
2098
2099                                 "movd 12(%0, %%ebx), %%mm4      \n\t"
2100                                 "movd 15(%0, %%ebx), %%mm1      \n\t"
2101                                 "punpcklbw %%mm7, %%mm4         \n\t"
2102                                 "punpcklbw %%mm7, %%mm1         \n\t"
2103                                 "movd 18(%0, %%ebx), %%mm2      \n\t"
2104                                 "movd 21(%0, %%ebx), %%mm3      \n\t"
2105                                 "punpcklbw %%mm7, %%mm2         \n\t"
2106                                 "punpcklbw %%mm7, %%mm3         \n\t"
2107                                 "pmaddwd %%mm6, %%mm4           \n\t"
2108                                 "pmaddwd %%mm6, %%mm1           \n\t"
2109                                 "pmaddwd %%mm6, %%mm2           \n\t"
2110                                 "pmaddwd %%mm6, %%mm3           \n\t"
2111 #ifndef FAST_BGR2YV12
2112                                 "psrad $8, %%mm4                \n\t"
2113                                 "psrad $8, %%mm1                \n\t"
2114                                 "psrad $8, %%mm2                \n\t"
2115                                 "psrad $8, %%mm3                \n\t"
2116 #endif
2117                                 "packssdw %%mm1, %%mm4          \n\t"
2118                                 "packssdw %%mm3, %%mm2          \n\t"
2119                                 "pmaddwd %%mm5, %%mm4           \n\t"
2120                                 "pmaddwd %%mm5, %%mm2           \n\t"
2121                                 "addl $24, %%ebx                \n\t"
2122                                 "packssdw %%mm2, %%mm4          \n\t"
2123                                 "psraw $7, %%mm4                \n\t"
2124
2125                                 "packuswb %%mm4, %%mm0          \n\t"
2126                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
2127
2128                                 MOVNTQ" %%mm0, (%1, %%eax)      \n\t"
2129                                 "addl $8, %%eax                 \n\t"
2130                                 " js 1b                         \n\t"
2131                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2132                                 : "%eax", "%ebx"
2133                         );
2134                         ydst += lumStride;
2135                         src  += srcStride;
2136                 }
2137                 src -= srcStride*2;
2138                 asm volatile(
2139                         "movl %4, %%eax                 \n\t"
2140                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2141                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
2142                         "pxor %%mm7, %%mm7              \n\t"
2143                         "leal (%%eax, %%eax, 2), %%ebx  \n\t"
2144                         "addl %%ebx, %%ebx              \n\t"
2145                         ".balign 16                     \n\t"
2146                         "1:                             \n\t"
2147                         PREFETCH" 64(%0, %%ebx)         \n\t"
2148                         PREFETCH" 64(%1, %%ebx)         \n\t"
2149 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2150                         "movq (%0, %%ebx), %%mm0        \n\t"
2151                         "movq (%1, %%ebx), %%mm1        \n\t"
2152                         "movq 6(%0, %%ebx), %%mm2       \n\t"
2153                         "movq 6(%1, %%ebx), %%mm3       \n\t"
2154                         PAVGB" %%mm1, %%mm0             \n\t"
2155                         PAVGB" %%mm3, %%mm2             \n\t"
2156                         "movq %%mm0, %%mm1              \n\t"
2157                         "movq %%mm2, %%mm3              \n\t"
2158                         "psrlq $24, %%mm0               \n\t"
2159                         "psrlq $24, %%mm2               \n\t"
2160                         PAVGB" %%mm1, %%mm0             \n\t"
2161                         PAVGB" %%mm3, %%mm2             \n\t"
2162                         "punpcklbw %%mm7, %%mm0         \n\t"
2163                         "punpcklbw %%mm7, %%mm2         \n\t"
2164 #else
2165                         "movd (%0, %%ebx), %%mm0        \n\t"
2166                         "movd (%1, %%ebx), %%mm1        \n\t"
2167                         "movd 3(%0, %%ebx), %%mm2       \n\t"
2168                         "movd 3(%1, %%ebx), %%mm3       \n\t"
2169                         "punpcklbw %%mm7, %%mm0         \n\t"
2170                         "punpcklbw %%mm7, %%mm1         \n\t"
2171                         "punpcklbw %%mm7, %%mm2         \n\t"
2172                         "punpcklbw %%mm7, %%mm3         \n\t"
2173                         "paddw %%mm1, %%mm0             \n\t"
2174                         "paddw %%mm3, %%mm2             \n\t"
2175                         "paddw %%mm2, %%mm0             \n\t"
2176                         "movd 6(%0, %%ebx), %%mm4       \n\t"
2177                         "movd 6(%1, %%ebx), %%mm1       \n\t"
2178                         "movd 9(%0, %%ebx), %%mm2       \n\t"
2179                         "movd 9(%1, %%ebx), %%mm3       \n\t"
2180                         "punpcklbw %%mm7, %%mm4         \n\t"
2181                         "punpcklbw %%mm7, %%mm1         \n\t"
2182                         "punpcklbw %%mm7, %%mm2         \n\t"
2183                         "punpcklbw %%mm7, %%mm3         \n\t"
2184                         "paddw %%mm1, %%mm4             \n\t"
2185                         "paddw %%mm3, %%mm2             \n\t"
2186                         "paddw %%mm4, %%mm2             \n\t"
2187                         "psrlw $2, %%mm0                \n\t"
2188                         "psrlw $2, %%mm2                \n\t"
2189 #endif
2190                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2191                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2192
2193                         "pmaddwd %%mm0, %%mm1           \n\t"
2194                         "pmaddwd %%mm2, %%mm3           \n\t"
2195                         "pmaddwd %%mm6, %%mm0           \n\t"
2196                         "pmaddwd %%mm6, %%mm2           \n\t"
2197 #ifndef FAST_BGR2YV12
2198                         "psrad $8, %%mm0                \n\t"
2199                         "psrad $8, %%mm1                \n\t"
2200                         "psrad $8, %%mm2                \n\t"
2201                         "psrad $8, %%mm3                \n\t"
2202 #endif
2203                         "packssdw %%mm2, %%mm0          \n\t"
2204                         "packssdw %%mm3, %%mm1          \n\t"
2205                         "pmaddwd %%mm5, %%mm0           \n\t"
2206                         "pmaddwd %%mm5, %%mm1           \n\t"
2207                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2208                         "psraw $7, %%mm0                \n\t"
2209
2210 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2211                         "movq 12(%0, %%ebx), %%mm4      \n\t"
2212                         "movq 12(%1, %%ebx), %%mm1      \n\t"
2213                         "movq 18(%0, %%ebx), %%mm2      \n\t"
2214                         "movq 18(%1, %%ebx), %%mm3      \n\t"
2215                         PAVGB" %%mm1, %%mm4             \n\t"
2216                         PAVGB" %%mm3, %%mm2             \n\t"
2217                         "movq %%mm4, %%mm1              \n\t"
2218                         "movq %%mm2, %%mm3              \n\t"
2219                         "psrlq $24, %%mm4               \n\t"
2220                         "psrlq $24, %%mm2               \n\t"
2221                         PAVGB" %%mm1, %%mm4             \n\t"
2222                         PAVGB" %%mm3, %%mm2             \n\t"
2223                         "punpcklbw %%mm7, %%mm4         \n\t"
2224                         "punpcklbw %%mm7, %%mm2         \n\t"
2225 #else
2226                         "movd 12(%0, %%ebx), %%mm4      \n\t"
2227                         "movd 12(%1, %%ebx), %%mm1      \n\t"
2228                         "movd 15(%0, %%ebx), %%mm2      \n\t"
2229                         "movd 15(%1, %%ebx), %%mm3      \n\t"
2230                         "punpcklbw %%mm7, %%mm4         \n\t"
2231                         "punpcklbw %%mm7, %%mm1         \n\t"
2232                         "punpcklbw %%mm7, %%mm2         \n\t"
2233                         "punpcklbw %%mm7, %%mm3         \n\t"
2234                         "paddw %%mm1, %%mm4             \n\t"
2235                         "paddw %%mm3, %%mm2             \n\t"
2236                         "paddw %%mm2, %%mm4             \n\t"
2237                         "movd 18(%0, %%ebx), %%mm5      \n\t"
2238                         "movd 18(%1, %%ebx), %%mm1      \n\t"
2239                         "movd 21(%0, %%ebx), %%mm2      \n\t"
2240                         "movd 21(%1, %%ebx), %%mm3      \n\t"
2241                         "punpcklbw %%mm7, %%mm5         \n\t"
2242                         "punpcklbw %%mm7, %%mm1         \n\t"
2243                         "punpcklbw %%mm7, %%mm2         \n\t"
2244                         "punpcklbw %%mm7, %%mm3         \n\t"
2245                         "paddw %%mm1, %%mm5             \n\t"
2246                         "paddw %%mm3, %%mm2             \n\t"
2247                         "paddw %%mm5, %%mm2             \n\t"
2248                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2249                         "psrlw $2, %%mm4                \n\t"
2250                         "psrlw $2, %%mm2                \n\t"
2251 #endif
2252                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2253                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2254
2255                         "pmaddwd %%mm4, %%mm1           \n\t"
2256                         "pmaddwd %%mm2, %%mm3           \n\t"
2257                         "pmaddwd %%mm6, %%mm4           \n\t"
2258                         "pmaddwd %%mm6, %%mm2           \n\t"
2259 #ifndef FAST_BGR2YV12
2260                         "psrad $8, %%mm4                \n\t"
2261                         "psrad $8, %%mm1                \n\t"
2262                         "psrad $8, %%mm2                \n\t"
2263                         "psrad $8, %%mm3                \n\t"
2264 #endif
2265                         "packssdw %%mm2, %%mm4          \n\t"
2266                         "packssdw %%mm3, %%mm1          \n\t"
2267                         "pmaddwd %%mm5, %%mm4           \n\t"
2268                         "pmaddwd %%mm5, %%mm1           \n\t"
2269                         "addl $24, %%ebx                \n\t"
2270                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2271                         "psraw $7, %%mm4                \n\t"
2272
2273                         "movq %%mm0, %%mm1              \n\t"
2274                         "punpckldq %%mm4, %%mm0         \n\t"
2275                         "punpckhdq %%mm4, %%mm1         \n\t"
2276                         "packsswb %%mm1, %%mm0          \n\t"
2277                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2278
2279                         "movd %%mm0, (%2, %%eax)        \n\t"
2280                         "punpckhdq %%mm0, %%mm0         \n\t"
2281                         "movd %%mm0, (%3, %%eax)        \n\t"
2282                         "addl $4, %%eax                 \n\t"
2283                         " js 1b                         \n\t"
2284                         : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2285                         : "%eax", "%ebx"
2286                 );
2287
2288                 udst += chromStride;
2289                 vdst += chromStride;
2290                 src  += srcStride*2;
2291         }
2292
2293         asm volatile(   EMMS" \n\t"
2294                         SFENCE" \n\t"
2295                         :::"memory");
2296 #else
2297         y=0;
2298 #endif
2299         for(; y<height; y+=2)
2300         {
2301                 unsigned i;
2302                 for(i=0; i<chromWidth; i++)
2303                 {
2304                         unsigned int b= src[6*i+0];
2305                         unsigned int g= src[6*i+1];
2306                         unsigned int r= src[6*i+2];
2307
2308                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2309                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2310                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2311
2312                         udst[i]         = U;
2313                         vdst[i]         = V;
2314                         ydst[2*i]       = Y;
2315
2316                         b= src[6*i+3];
2317                         g= src[6*i+4];
2318                         r= src[6*i+5];
2319
2320                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2321                         ydst[2*i+1]     = Y;
2322                 }
2323                 ydst += lumStride;
2324                 src  += srcStride;
2325
2326                 for(i=0; i<chromWidth; i++)
2327                 {
2328                         unsigned int b= src[6*i+0];
2329                         unsigned int g= src[6*i+1];
2330                         unsigned int r= src[6*i+2];
2331
2332                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2333
2334                         ydst[2*i]       = Y;
2335
2336                         b= src[6*i+3];
2337                         g= src[6*i+4];
2338                         r= src[6*i+5];
2339
2340                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2341                         ydst[2*i+1]     = Y;
2342                 }
2343                 udst += chromStride;
2344                 vdst += chromStride;
2345                 ydst += lumStride;
2346                 src  += srcStride;
2347         }
2348 }
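/* Worked example for the C path above, assuming RY/GY/BY and RGB2YUV_SHIFT
 * implement the usual studio-range BT.601 matrix: for pure white (r=g=b=255)
 * the three Y coefficients sum to roughly 219/255 after the shift, so
 * Y = 219 + 16 = 235; for pure black Y = 16.  Grey (r=g=b) likewise gives
 * U = V = 128, i.e. luma is produced in 16..235 and chroma centred on 128. */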
2349
2350 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2351                             unsigned width, unsigned height, int src1Stride,
2352                             int src2Stride, int dstStride){
2353         unsigned h;
2354
2355         for(h=0; h < height; h++)
2356         {
2357                 unsigned w;
2358
2359 #ifdef HAVE_MMX
2360 #ifdef HAVE_SSE2
2361                 asm(
2362                         "xorl %%eax, %%eax              \n\t"
2363                         "1:                             \n\t"
2364                         PREFETCH" 64(%1, %%eax)         \n\t"
2365                         PREFETCH" 64(%2, %%eax)         \n\t"
2366                         "movdqa (%1, %%eax), %%xmm0     \n\t"
2367                         "movdqa (%1, %%eax), %%xmm1     \n\t"
2368                         "movdqa (%2, %%eax), %%xmm2     \n\t"
2369                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2370                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2371                         "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2372                         "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2373                         "addl $16, %%eax                        \n\t"
2374                         "cmpl %3, %%eax                 \n\t"
2375                         " jb 1b                         \n\t"
2376                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2377                         : "memory", "%eax"
2378                 );
2379 #else
2380                 asm(
2381                         "xorl %%eax, %%eax              \n\t"
2382                         "1:                             \n\t"
2383                         PREFETCH" 64(%1, %%eax)         \n\t"
2384                         PREFETCH" 64(%2, %%eax)         \n\t"
2385                         "movq (%1, %%eax), %%mm0        \n\t"
2386                         "movq 8(%1, %%eax), %%mm2       \n\t"
2387                         "movq %%mm0, %%mm1              \n\t"
2388                         "movq %%mm2, %%mm3              \n\t"
2389                         "movq (%2, %%eax), %%mm4        \n\t"
2390                         "movq 8(%2, %%eax), %%mm5       \n\t"
2391                         "punpcklbw %%mm4, %%mm0         \n\t"
2392                         "punpckhbw %%mm4, %%mm1         \n\t"
2393                         "punpcklbw %%mm5, %%mm2         \n\t"
2394                         "punpckhbw %%mm5, %%mm3         \n\t"
2395                         MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
2396                         MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
2397                         MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2398                         MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2399                         "addl $16, %%eax                        \n\t"
2400                         "cmpl %3, %%eax                 \n\t"
2401                         " jb 1b                         \n\t"
2402                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2403                         : "memory", "%eax"
2404                 );
2405 #endif
2406                 for(w= (width&(~15)); w < width; w++)
2407                 {
2408                         dest[2*w+0] = src1[w];
2409                         dest[2*w+1] = src2[w];
2410                 }
2411 #else
2412                 for(w=0; w < width; w++)
2413                 {
2414                         dest[2*w+0] = src1[w];
2415                         dest[2*w+1] = src2[w];
2416                 }
2417 #endif
2418                 dest += dstStride;
2419                 src1 += src1Stride;
2420                 src2 += src2Stride;
2421         }
2422 #ifdef HAVE_MMX
2423         asm(
2424                 EMMS" \n\t"
2425                 SFENCE" \n\t"
2426                 ::: "memory"
2427                 );
2428 #endif
2429 }
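/* Usage sketch (hypothetical, not used in this file): interleaving separate
 * U and V planes of a (width/2 x height/2) chroma image into one packed
 * UVUV... plane:
 *
 *   RENAME(interleaveBytes)(u, v, uv, width/2, height/2, uStride, vStride, uvStride);
 *
 * The MMX/SSE2 loops above work in 16-byte steps; the scalar tail loop then
 * finishes the last width&15 bytes of each line. */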
2430
2431 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2432                         uint8_t *dst1, uint8_t *dst2,
2433                         unsigned width, unsigned height,
2434                         int srcStride1, int srcStride2,
2435                         int dstStride1, int dstStride2)
2436 {
2437     unsigned int y,x,h;
2438     int w;
2439     w=width/2; h=height/2;
2440 #ifdef HAVE_MMX
2441     asm volatile(
2442         PREFETCH" %0\n\t"
2443         PREFETCH" %1\n\t"
2444         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2445 #endif
2446     for(y=0;y<h;y++){
2447         const uint8_t* s1=src1+srcStride1*(y>>1);
2448         uint8_t* d=dst1+dstStride1*y;
2449         x=0;
2450 #ifdef HAVE_MMX
2451         for(;x<w-31;x+=32)
2452         {
2453             asm volatile(
2454                 PREFETCH" 32%1\n\t"
2455                 "movq   %1, %%mm0\n\t"
2456                 "movq   8%1, %%mm2\n\t"
2457                 "movq   16%1, %%mm4\n\t"
2458                 "movq   24%1, %%mm6\n\t"
2459                 "movq   %%mm0, %%mm1\n\t"
2460                 "movq   %%mm2, %%mm3\n\t"
2461                 "movq   %%mm4, %%mm5\n\t"
2462                 "movq   %%mm6, %%mm7\n\t"
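                /* unpacking each register with itself duplicates every byte,
                   which is the horizontal nearest-neighbour 2x upscale */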
2463                 "punpcklbw %%mm0, %%mm0\n\t"
2464                 "punpckhbw %%mm1, %%mm1\n\t"
2465                 "punpcklbw %%mm2, %%mm2\n\t"
2466                 "punpckhbw %%mm3, %%mm3\n\t"
2467                 "punpcklbw %%mm4, %%mm4\n\t"
2468                 "punpckhbw %%mm5, %%mm5\n\t"
2469                 "punpcklbw %%mm6, %%mm6\n\t"
2470                 "punpckhbw %%mm7, %%mm7\n\t"
2471                 MOVNTQ" %%mm0, %0\n\t"
2472                 MOVNTQ" %%mm1, 8%0\n\t"
2473                 MOVNTQ" %%mm2, 16%0\n\t"
2474                 MOVNTQ" %%mm3, 24%0\n\t"
2475                 MOVNTQ" %%mm4, 32%0\n\t"
2476                 MOVNTQ" %%mm5, 40%0\n\t"
2477                 MOVNTQ" %%mm6, 48%0\n\t"
2478                 MOVNTQ" %%mm7, 56%0"
2479                 :"=m"(d[2*x])
2480                 :"m"(s1[x])
2481                 :"memory");
2482         }
2483 #endif
2484         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2485     }
2486     for(y=0;y<h;y++){
2487         const uint8_t* s2=src2+srcStride2*(y>>1);
2488         uint8_t* d=dst2+dstStride2*y;
2489         x=0;
2490 #ifdef HAVE_MMX
2491         for(;x<w-31;x+=32)
2492         {
2493             asm volatile(
2494                 PREFETCH" 32%1\n\t"
2495                 "movq   %1, %%mm0\n\t"
2496                 "movq   8%1, %%mm2\n\t"
2497                 "movq   16%1, %%mm4\n\t"
2498                 "movq   24%1, %%mm6\n\t"
2499                 "movq   %%mm0, %%mm1\n\t"
2500                 "movq   %%mm2, %%mm3\n\t"
2501                 "movq   %%mm4, %%mm5\n\t"
2502                 "movq   %%mm6, %%mm7\n\t"
2503                 "punpcklbw %%mm0, %%mm0\n\t"
2504                 "punpckhbw %%mm1, %%mm1\n\t"
2505                 "punpcklbw %%mm2, %%mm2\n\t"
2506                 "punpckhbw %%mm3, %%mm3\n\t"
2507                 "punpcklbw %%mm4, %%mm4\n\t"
2508                 "punpckhbw %%mm5, %%mm5\n\t"
2509                 "punpcklbw %%mm6, %%mm6\n\t"
2510                 "punpckhbw %%mm7, %%mm7\n\t"
2511                 MOVNTQ" %%mm0, %0\n\t"
2512                 MOVNTQ" %%mm1, 8%0\n\t"
2513                 MOVNTQ" %%mm2, 16%0\n\t"
2514                 MOVNTQ" %%mm3, 24%0\n\t"
2515                 MOVNTQ" %%mm4, 32%0\n\t"
2516                 MOVNTQ" %%mm5, 40%0\n\t"
2517                 MOVNTQ" %%mm6, 48%0\n\t"
2518                 MOVNTQ" %%mm7, 56%0"
2519                 :"=m"(d[2*x])
2520                 :"m"(s2[x])
2521                 :"memory");
2522         }
2523 #endif
2524         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2525     }
2526 #ifdef HAVE_MMX
2527         asm(
2528                 EMMS" \n\t"
2529                 SFENCE" \n\t"
2530                 ::: "memory"
2531                 );
2532 #endif
2533 }
2534
2535 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2536                         uint8_t *dst,
2537                         unsigned width, unsigned height,
2538                         int srcStride1, int srcStride2,
2539                         int srcStride3, int dstStride)
2540 {
2541     unsigned y,x,w,h;
2542     w=width/2; h=height;
2543     for(y=0;y<h;y++){
2544         const uint8_t* yp=src1+srcStride1*y;
2545         const uint8_t* up=src2+srcStride2*(y>>2);
2546         const uint8_t* vp=src3+srcStride3*(y>>2);
2547         uint8_t* d=dst+dstStride*y;
2548         x=0;
2549 #ifdef HAVE_MMX
2550         for(;x<w-7;x+=8)
2551         {
2552             asm volatile(
2553                 PREFETCH" 32(%1, %0)\n\t"
2554                 PREFETCH" 32(%2, %0)\n\t"
2555                 PREFETCH" 32(%3, %0)\n\t"
2556                 "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2557                 "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2558                 "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
2559                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2560                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2561                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2562                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2563                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2564                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2565                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2566
2567                 "movq   %%mm1, %%mm6\n\t"
2568                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2569                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2570                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2571                 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2572                 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2573                 
2574                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2575                 "movq   8(%1, %0, 4), %%mm0\n\t"
2576                 "movq   %%mm0, %%mm3\n\t"
2577                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2578                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2579                 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2580                 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2581
2582                 "movq   %%mm4, %%mm6\n\t"
2583                 "movq   16(%1, %0, 4), %%mm0\n\t"
2584                 "movq   %%mm0, %%mm3\n\t"
2585                 "punpcklbw %%mm5, %%mm4\n\t"
2586                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2587                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2588                 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2589                 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2590                 
2591                 "punpckhbw %%mm5, %%mm6\n\t"
2592                 "movq   24(%1, %0, 4), %%mm0\n\t"
2593                 "movq   %%mm0, %%mm3\n\t"
2594                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2595                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2596                 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2597                 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2598
2599                 : "+r" (x)
2600                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2601                 :"memory");
2602         }
2603 #endif
2604         for(; x<w; x++)
2605         {
2606             const int x2= x<<2;
2607             d[8*x+0]=yp[x2];
2608             d[8*x+1]=up[x];
2609             d[8*x+2]=yp[x2+1];
2610             d[8*x+3]=vp[x];
2611             d[8*x+4]=yp[x2+2];
2612             d[8*x+5]=up[x];
2613             d[8*x+6]=yp[x2+3];
2614             d[8*x+7]=vp[x];
2615         }
2616     }
2617 #ifdef HAVE_MMX
2618         asm(
2619                 EMMS" \n\t"
2620                 SFENCE" \n\t"
2621                 ::: "memory"
2622                 );
2623 #endif
2624 }