1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB convertor
4  *  plus:        Software PAL8 to RGB convertor
5  *               Software YUV to YUV convertor
6  *               Software YUV to RGB convertor
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  */
10
11 #include <stddef.h>
12 #include <inttypes.h> /* for __WORDSIZE */
13
14 #ifndef __WORDSIZE
15 // #warning You have a misconfigured system and will probably lose performance!
16 #define __WORDSIZE MP_WORDSIZE
17 #endif
18
19 #undef PREFETCH
20 #undef MOVNTQ
21 #undef EMMS
22 #undef SFENCE
23 #undef MMREG_SIZE
24 #undef PREFETCHW
25 #undef PAVGB
26
27 #ifdef HAVE_SSE2
28 #define MMREG_SIZE 16
29 #else
30 #define MMREG_SIZE 8
31 #endif
32
33 #ifdef HAVE_3DNOW
34 #define PREFETCH  "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB     "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #define PAVGB     "pavgb"
41 #else
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
44 #endif
45
46 #ifdef HAVE_3DNOW
47 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
48 #define EMMS     "femms"
49 #else
50 #define EMMS     "emms"
51 #endif
52
53 #ifdef HAVE_MMX2
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
56 #else
57 #define MOVNTQ "movq"
58 #define SFENCE "/nop"
59 #endif
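/*
 * Summary of the macros above: PREFETCH/PREFETCHW select the 3DNow! or SSE
 * prefetch hints (or a harmless placeholder when neither is available),
 * MOVNTQ falls back from the non-temporal store to a plain movq without MMX2,
 * SFENCE likewise degrades to a placeholder, and EMMS becomes the faster
 * femms on 3DNow! CPUs, so the RENAME()d function bodies below can be built
 * once per CPU family from the same source.
 */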
60
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
62 {
63   uint8_t *dest = dst;
64   const uint8_t *s = src;
65   const uint8_t *end;
66 #ifdef HAVE_MMX
67   const uint8_t *mm_end;
68 #endif
69   end = s + src_size;
70 #ifdef HAVE_MMX
71   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
72   mm_end = end - 23;
73   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
74   while(s < mm_end)
75   {
76     __asm __volatile(
77         PREFETCH"       32%1\n\t"
78         "movd   %1, %%mm0\n\t"
79         "punpckldq 3%1, %%mm0\n\t"
80         "movd   6%1, %%mm1\n\t"
81         "punpckldq 9%1, %%mm1\n\t"
82         "movd   12%1, %%mm2\n\t"
83         "punpckldq 15%1, %%mm2\n\t"
84         "movd   18%1, %%mm3\n\t"
85         "punpckldq 21%1, %%mm3\n\t"
86         "pand   %%mm7, %%mm0\n\t"
87         "pand   %%mm7, %%mm1\n\t"
88         "pand   %%mm7, %%mm2\n\t"
89         "pand   %%mm7, %%mm3\n\t"
90         MOVNTQ" %%mm0, %0\n\t"
91         MOVNTQ" %%mm1, 8%0\n\t"
92         MOVNTQ" %%mm2, 16%0\n\t"
93         MOVNTQ" %%mm3, 24%0"
94         :"=m"(*dest)
95         :"m"(*s)
96         :"memory");
97     dest += 32;
98     s += 24;
99   }
100   __asm __volatile(SFENCE:::"memory");
101   __asm __volatile(EMMS:::"memory");
102 #endif
103   while(s < end)
104   {
105     *dest++ = *s++;
106     *dest++ = *s++;
107     *dest++ = *s++;
108     *dest++ = 0;
109   }
110 }
111
112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
113 {
114   uint8_t *dest = dst;
115   const uint8_t *s = src;
116   const uint8_t *end;
117 #ifdef HAVE_MMX
118   const uint8_t *mm_end;
119 #endif
120   end = s + src_size;
121 #ifdef HAVE_MMX
122   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
123   mm_end = end - 31;
124   while(s < mm_end)
125   {
126     __asm __volatile(
127         PREFETCH"       32%1\n\t"
128         "movq   %1, %%mm0\n\t"
129         "movq   8%1, %%mm1\n\t"
130         "movq   16%1, %%mm4\n\t"
131         "movq   24%1, %%mm5\n\t"
132         "movq   %%mm0, %%mm2\n\t"
133         "movq   %%mm1, %%mm3\n\t"
134         "movq   %%mm4, %%mm6\n\t"
135         "movq   %%mm5, %%mm7\n\t"
136         "psrlq  $8, %%mm2\n\t"
137         "psrlq  $8, %%mm3\n\t"
138         "psrlq  $8, %%mm6\n\t"
139         "psrlq  $8, %%mm7\n\t"
140         "pand   %2, %%mm0\n\t"
141         "pand   %2, %%mm1\n\t"
142         "pand   %2, %%mm4\n\t"
143         "pand   %2, %%mm5\n\t"
144         "pand   %3, %%mm2\n\t"
145         "pand   %3, %%mm3\n\t"
146         "pand   %3, %%mm6\n\t"
147         "pand   %3, %%mm7\n\t"
148         "por    %%mm2, %%mm0\n\t"
149         "por    %%mm3, %%mm1\n\t"
150         "por    %%mm6, %%mm4\n\t"
151         "por    %%mm7, %%mm5\n\t"
152
153         "movq   %%mm1, %%mm2\n\t"
154         "movq   %%mm4, %%mm3\n\t"
155         "psllq  $48, %%mm2\n\t"
156         "psllq  $32, %%mm3\n\t"
157         "pand   %4, %%mm2\n\t"
158         "pand   %5, %%mm3\n\t"
159         "por    %%mm2, %%mm0\n\t"
160         "psrlq  $16, %%mm1\n\t"
161         "psrlq  $32, %%mm4\n\t"
162         "psllq  $16, %%mm5\n\t"
163         "por    %%mm3, %%mm1\n\t"
164         "pand   %6, %%mm5\n\t"
165         "por    %%mm5, %%mm4\n\t"
166
167         MOVNTQ" %%mm0, %0\n\t"
168         MOVNTQ" %%mm1, 8%0\n\t"
169         MOVNTQ" %%mm4, 16%0"
170         :"=m"(*dest)
171         :"m"(*s),"m"(mask24l),
172          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
173         :"memory");
174     dest += 24;
175     s += 32;
176   }
177   __asm __volatile(SFENCE:::"memory");
178   __asm __volatile(EMMS:::"memory");
179 #endif
180   while(s < end)
181   {
182     *dest++ = *s++;
183     *dest++ = *s++;
184     *dest++ = *s++;
185     s++;
186   }
187 }
188
189 /*
190  Original by Strepto/Astral
191  ported to gcc & bugfixed : A'rpi
192  MMX2, 3DNOW optimization by Nick Kurshev
193  32bit c version, and and&add trick by Michael Niedermayer
194 */
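/*
 The and&add trick, one 16-bit pixel at a time (a sketch of what the scalar
 tail below computes; the helper name is hypothetical): masking with 0x7FE0
 selects the two upper fields (bits 5..14), and adding that back to the full
 15-bit value doubles those fields, i.e. shifts them up by one bit while the
 low 5-bit field (bits 0..4) stays put, so 0xxxxxyyyyyzzzzz becomes
 xxxxxyyyyy0zzzzz -- 5:6:5 with a zero LSB in the middle field.

   static inline uint16_t rgb15to16_one(uint16_t x)
   {
       return (x & 0x7FFF) + (x & 0x7FE0);
   }
*/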
195 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
196 {
197   register const uint8_t* s=src;
198   register uint8_t* d=dst;
199   register const uint8_t *end;
200   const uint8_t *mm_end;
201   end = s + src_size;
202 #ifdef HAVE_MMX
203   __asm __volatile(PREFETCH"    %0"::"m"(*s));
204   __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
205   mm_end = end - 15;
206   while(s<mm_end)
207   {
208         __asm __volatile(
209                 PREFETCH"       32%1\n\t"
210                 "movq   %1, %%mm0\n\t"
211                 "movq   8%1, %%mm2\n\t"
212                 "movq   %%mm0, %%mm1\n\t"
213                 "movq   %%mm2, %%mm3\n\t"
214                 "pand   %%mm4, %%mm0\n\t"
215                 "pand   %%mm4, %%mm2\n\t"
216                 "paddw  %%mm1, %%mm0\n\t"
217                 "paddw  %%mm3, %%mm2\n\t"
218                 MOVNTQ" %%mm0, %0\n\t"
219                 MOVNTQ" %%mm2, 8%0"
220                 :"=m"(*d)
221                 :"m"(*s)
222                 );
223         d+=16;
224         s+=16;
225   }
226   __asm __volatile(SFENCE:::"memory");
227   __asm __volatile(EMMS:::"memory");
228 #endif
229     mm_end = end - 3;
230     while(s < mm_end)
231     {
232         register unsigned x= *((uint32_t *)s);
233         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
234         d+=4;
235         s+=4;
236     }
237     if(s < end)
238     {
239         register unsigned short x= *((uint16_t *)s);
240         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
241     }
242 }
243
244 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
245 {
246   register const uint8_t* s=src;
247   register uint8_t* d=dst;
248   register const uint8_t *end;
249   const uint8_t *mm_end;
250   end = s + src_size;
251 #ifdef HAVE_MMX
252   __asm __volatile(PREFETCH"    %0"::"m"(*s));
253   __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
254   __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
255   mm_end = end - 15;
256   while(s<mm_end)
257   {
258         __asm __volatile(
259                 PREFETCH"       32%1\n\t"
260                 "movq   %1, %%mm0\n\t"
261                 "movq   8%1, %%mm2\n\t"
262                 "movq   %%mm0, %%mm1\n\t"
263                 "movq   %%mm2, %%mm3\n\t"
264                 "psrlq  $1, %%mm0\n\t"
265                 "psrlq  $1, %%mm2\n\t"
266                 "pand   %%mm7, %%mm0\n\t"
267                 "pand   %%mm7, %%mm2\n\t"
268                 "pand   %%mm6, %%mm1\n\t"
269                 "pand   %%mm6, %%mm3\n\t"
270                 "por    %%mm1, %%mm0\n\t"
271                 "por    %%mm3, %%mm2\n\t"
272                 MOVNTQ" %%mm0, %0\n\t"
273                 MOVNTQ" %%mm2, 8%0"
274                 :"=m"(*d)
275                 :"m"(*s)
276                 );
277         d+=16;
278         s+=16;
279   }
280   __asm __volatile(SFENCE:::"memory");
281   __asm __volatile(EMMS:::"memory");
282 #endif
283     mm_end = end - 3;
284     while(s < mm_end)
285     {
286         register uint32_t x= *((uint32_t *)s);
287         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
288         s+=4;
289         d+=4;
290     }
291     if(s < end)
292     {
293         register uint16_t x= *((uint16_t *)s);
294         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
295         s+=2;
296         d+=2;
297     }
298 }
299
300 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
301 {
302         const uint8_t *s = src;
303         const uint8_t *end;
304 #ifdef HAVE_MMX
305         const uint8_t *mm_end;
306 #endif
307         uint16_t *d = (uint16_t *)dst;
308         end = s + src_size;
309 #ifdef HAVE_MMX
310         mm_end = end - 15;
311 #if 1 //is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
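        /*
         * The pmaddwd variant below: blue and red are masked into one
         * register (mask3216br) and green into another (mask3216g); the
         * per-word multiply-add against mul3216 scales blue and red by
         * different constants, i.e. shifts both into place in a single
         * instruction, so that after or-ing the green bits back in, one
         * shift drops all three channels into their 5:6:5 slots (psrld $5
         * for the even pixels, pslld $11 to park the odd pixels in the
         * upper half of each 32-bit lane). The mask and multiplier
         * constants are defined in the file that includes this template.
         */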
312         asm volatile(
313                 "movq %3, %%mm5                 \n\t"
314                 "movq %4, %%mm6                 \n\t"
315                 "movq %5, %%mm7                 \n\t"
316                 ".balign 16                     \n\t"
317                 "1:                             \n\t"
318                 PREFETCH" 32(%1)                \n\t"
319                 "movd   (%1), %%mm0             \n\t"
320                 "movd   4(%1), %%mm3            \n\t"
321                 "punpckldq 8(%1), %%mm0         \n\t"
322                 "punpckldq 12(%1), %%mm3        \n\t"
323                 "movq %%mm0, %%mm1              \n\t"
324                 "movq %%mm3, %%mm4              \n\t"
325                 "pand %%mm6, %%mm0              \n\t"
326                 "pand %%mm6, %%mm3              \n\t"
327                 "pmaddwd %%mm7, %%mm0           \n\t"
328                 "pmaddwd %%mm7, %%mm3           \n\t"
329                 "pand %%mm5, %%mm1              \n\t"
330                 "pand %%mm5, %%mm4              \n\t"
331                 "por %%mm1, %%mm0               \n\t"   
332                 "por %%mm4, %%mm3               \n\t"
333                 "psrld $5, %%mm0                \n\t"
334                 "pslld $11, %%mm3               \n\t"
335                 "por %%mm3, %%mm0               \n\t"
336                 MOVNTQ" %%mm0, (%0)             \n\t"
337                 "addl $16, %1                   \n\t"
338                 "addl $8, %0                    \n\t"
339                 "cmpl %2, %1                    \n\t"
340                 " jb 1b                         \n\t"
341                 : "+r" (d), "+r"(s)
342                 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
343         );
344 #else
345         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
346         __asm __volatile(
347             "movq       %0, %%mm7\n\t"
348             "movq       %1, %%mm6\n\t"
349             ::"m"(red_16mask),"m"(green_16mask));
350         while(s < mm_end)
351         {
352             __asm __volatile(
353                 PREFETCH" 32%1\n\t"
354                 "movd   %1, %%mm0\n\t"
355                 "movd   4%1, %%mm3\n\t"
356                 "punpckldq 8%1, %%mm0\n\t"
357                 "punpckldq 12%1, %%mm3\n\t"
358                 "movq   %%mm0, %%mm1\n\t"
359                 "movq   %%mm0, %%mm2\n\t"
360                 "movq   %%mm3, %%mm4\n\t"
361                 "movq   %%mm3, %%mm5\n\t"
362                 "psrlq  $3, %%mm0\n\t"
363                 "psrlq  $3, %%mm3\n\t"
364                 "pand   %2, %%mm0\n\t"
365                 "pand   %2, %%mm3\n\t"
366                 "psrlq  $5, %%mm1\n\t"
367                 "psrlq  $5, %%mm4\n\t"
368                 "pand   %%mm6, %%mm1\n\t"
369                 "pand   %%mm6, %%mm4\n\t"
370                 "psrlq  $8, %%mm2\n\t"
371                 "psrlq  $8, %%mm5\n\t"
372                 "pand   %%mm7, %%mm2\n\t"
373                 "pand   %%mm7, %%mm5\n\t"
374                 "por    %%mm1, %%mm0\n\t"
375                 "por    %%mm4, %%mm3\n\t"
376                 "por    %%mm2, %%mm0\n\t"
377                 "por    %%mm5, %%mm3\n\t"
378                 "psllq  $16, %%mm3\n\t"
379                 "por    %%mm3, %%mm0\n\t"
380                 MOVNTQ" %%mm0, %0\n\t"
381                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
382                 d += 4;
383                 s += 16;
384         }
385 #endif
386         __asm __volatile(SFENCE:::"memory");
387         __asm __volatile(EMMS:::"memory");
388 #endif
389         while(s < end)
390         {
391                 const int src= *((uint32_t*)s)++;
392                 *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393 //              *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
394         }
395 }
396
397 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
398 {
399         const uint8_t *s = src;
400         const uint8_t *end;
401 #ifdef HAVE_MMX
402         const uint8_t *mm_end;
403 #endif
404         uint16_t *d = (uint16_t *)dst;
405         end = s + src_size;
406 #ifdef HAVE_MMX
407         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
408         __asm __volatile(
409             "movq       %0, %%mm7\n\t"
410             "movq       %1, %%mm6\n\t"
411             ::"m"(red_16mask),"m"(green_16mask));
412         mm_end = end - 15;
413         while(s < mm_end)
414         {
415             __asm __volatile(
416                 PREFETCH" 32%1\n\t"
417                 "movd   %1, %%mm0\n\t"
418                 "movd   4%1, %%mm3\n\t"
419                 "punpckldq 8%1, %%mm0\n\t"
420                 "punpckldq 12%1, %%mm3\n\t"
421                 "movq   %%mm0, %%mm1\n\t"
422                 "movq   %%mm0, %%mm2\n\t"
423                 "movq   %%mm3, %%mm4\n\t"
424                 "movq   %%mm3, %%mm5\n\t"
425                 "psllq  $8, %%mm0\n\t"
426                 "psllq  $8, %%mm3\n\t"
427                 "pand   %%mm7, %%mm0\n\t"
428                 "pand   %%mm7, %%mm3\n\t"
429                 "psrlq  $5, %%mm1\n\t"
430                 "psrlq  $5, %%mm4\n\t"
431                 "pand   %%mm6, %%mm1\n\t"
432                 "pand   %%mm6, %%mm4\n\t"
433                 "psrlq  $19, %%mm2\n\t"
434                 "psrlq  $19, %%mm5\n\t"
435                 "pand   %2, %%mm2\n\t"
436                 "pand   %2, %%mm5\n\t"
437                 "por    %%mm1, %%mm0\n\t"
438                 "por    %%mm4, %%mm3\n\t"
439                 "por    %%mm2, %%mm0\n\t"
440                 "por    %%mm5, %%mm3\n\t"
441                 "psllq  $16, %%mm3\n\t"
442                 "por    %%mm3, %%mm0\n\t"
443                 MOVNTQ" %%mm0, %0\n\t"
444                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
445                 d += 4;
446                 s += 16;
447         }
448         __asm __volatile(SFENCE:::"memory");
449         __asm __volatile(EMMS:::"memory");
450 #endif
451         while(s < end)
452         {
453                 const int src= *((uint32_t*)s)++;
454                 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
455         }
456 }
457
458 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
459 {
460         const uint8_t *s = src;
461         const uint8_t *end;
462 #ifdef HAVE_MMX
463         const uint8_t *mm_end;
464 #endif
465         uint16_t *d = (uint16_t *)dst;
466         end = s + src_size;
467 #ifdef HAVE_MMX
468         mm_end = end - 15;
469 #if 1 //is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
470         asm volatile(
471                 "movq %3, %%mm5                 \n\t"
472                 "movq %4, %%mm6                 \n\t"
473                 "movq %5, %%mm7                 \n\t"
474                 ".balign 16                     \n\t"
475                 "1:                             \n\t"
476                 PREFETCH" 32(%1)                \n\t"
477                 "movd   (%1), %%mm0             \n\t"
478                 "movd   4(%1), %%mm3            \n\t"
479                 "punpckldq 8(%1), %%mm0         \n\t"
480                 "punpckldq 12(%1), %%mm3        \n\t"
481                 "movq %%mm0, %%mm1              \n\t"
482                 "movq %%mm3, %%mm4              \n\t"
483                 "pand %%mm6, %%mm0              \n\t"
484                 "pand %%mm6, %%mm3              \n\t"
485                 "pmaddwd %%mm7, %%mm0           \n\t"
486                 "pmaddwd %%mm7, %%mm3           \n\t"
487                 "pand %%mm5, %%mm1              \n\t"
488                 "pand %%mm5, %%mm4              \n\t"
489                 "por %%mm1, %%mm0               \n\t"   
490                 "por %%mm4, %%mm3               \n\t"
491                 "psrld $6, %%mm0                \n\t"
492                 "pslld $10, %%mm3               \n\t"
493                 "por %%mm3, %%mm0               \n\t"
494                 MOVNTQ" %%mm0, (%0)             \n\t"
495                 "addl $16, %1                   \n\t"
496                 "addl $8, %0                    \n\t"
497                 "cmpl %2, %1                    \n\t"
498                 " jb 1b                         \n\t"
499                 : "+r" (d), "+r"(s)
500                 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
501         );
502 #else
503         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
504         __asm __volatile(
505             "movq       %0, %%mm7\n\t"
506             "movq       %1, %%mm6\n\t"
507             ::"m"(red_15mask),"m"(green_15mask));
508         while(s < mm_end)
509         {
510             __asm __volatile(
511                 PREFETCH" 32%1\n\t"
512                 "movd   %1, %%mm0\n\t"
513                 "movd   4%1, %%mm3\n\t"
514                 "punpckldq 8%1, %%mm0\n\t"
515                 "punpckldq 12%1, %%mm3\n\t"
516                 "movq   %%mm0, %%mm1\n\t"
517                 "movq   %%mm0, %%mm2\n\t"
518                 "movq   %%mm3, %%mm4\n\t"
519                 "movq   %%mm3, %%mm5\n\t"
520                 "psrlq  $3, %%mm0\n\t"
521                 "psrlq  $3, %%mm3\n\t"
522                 "pand   %2, %%mm0\n\t"
523                 "pand   %2, %%mm3\n\t"
524                 "psrlq  $6, %%mm1\n\t"
525                 "psrlq  $6, %%mm4\n\t"
526                 "pand   %%mm6, %%mm1\n\t"
527                 "pand   %%mm6, %%mm4\n\t"
528                 "psrlq  $9, %%mm2\n\t"
529                 "psrlq  $9, %%mm5\n\t"
530                 "pand   %%mm7, %%mm2\n\t"
531                 "pand   %%mm7, %%mm5\n\t"
532                 "por    %%mm1, %%mm0\n\t"
533                 "por    %%mm4, %%mm3\n\t"
534                 "por    %%mm2, %%mm0\n\t"
535                 "por    %%mm5, %%mm3\n\t"
536                 "psllq  $16, %%mm3\n\t"
537                 "por    %%mm3, %%mm0\n\t"
538                 MOVNTQ" %%mm0, %0\n\t"
539                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
540                 d += 4;
541                 s += 16;
542         }
543 #endif
544         __asm __volatile(SFENCE:::"memory");
545         __asm __volatile(EMMS:::"memory");
546 #endif
547         while(s < end)
548         {
549                 const int src= *((uint32_t*)s)++;
550                 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
551         }
552 }
553
554 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
555 {
556         const uint8_t *s = src;
557         const uint8_t *end;
558 #ifdef HAVE_MMX
559         const uint8_t *mm_end;
560 #endif
561         uint16_t *d = (uint16_t *)dst;
562         end = s + src_size;
563 #ifdef HAVE_MMX
564         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
565         __asm __volatile(
566             "movq       %0, %%mm7\n\t"
567             "movq       %1, %%mm6\n\t"
568             ::"m"(red_15mask),"m"(green_15mask));
569         mm_end = end - 15;
570         while(s < mm_end)
571         {
572             __asm __volatile(
573                 PREFETCH" 32%1\n\t"
574                 "movd   %1, %%mm0\n\t"
575                 "movd   4%1, %%mm3\n\t"
576                 "punpckldq 8%1, %%mm0\n\t"
577                 "punpckldq 12%1, %%mm3\n\t"
578                 "movq   %%mm0, %%mm1\n\t"
579                 "movq   %%mm0, %%mm2\n\t"
580                 "movq   %%mm3, %%mm4\n\t"
581                 "movq   %%mm3, %%mm5\n\t"
582                 "psllq  $7, %%mm0\n\t"
583                 "psllq  $7, %%mm3\n\t"
584                 "pand   %%mm7, %%mm0\n\t"
585                 "pand   %%mm7, %%mm3\n\t"
586                 "psrlq  $6, %%mm1\n\t"
587                 "psrlq  $6, %%mm4\n\t"
588                 "pand   %%mm6, %%mm1\n\t"
589                 "pand   %%mm6, %%mm4\n\t"
590                 "psrlq  $19, %%mm2\n\t"
591                 "psrlq  $19, %%mm5\n\t"
592                 "pand   %2, %%mm2\n\t"
593                 "pand   %2, %%mm5\n\t"
594                 "por    %%mm1, %%mm0\n\t"
595                 "por    %%mm4, %%mm3\n\t"
596                 "por    %%mm2, %%mm0\n\t"
597                 "por    %%mm5, %%mm3\n\t"
598                 "psllq  $16, %%mm3\n\t"
599                 "por    %%mm3, %%mm0\n\t"
600                 MOVNTQ" %%mm0, %0\n\t"
601                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
602                 d += 4;
603                 s += 16;
604         }
605         __asm __volatile(SFENCE:::"memory");
606         __asm __volatile(EMMS:::"memory");
607 #endif
608         while(s < end)
609         {
610                 const int src= *((uint32_t*)s)++;
611                 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
612         }
613 }
614
615 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
616 {
617         const uint8_t *s = src;
618         const uint8_t *end;
619 #ifdef HAVE_MMX
620         const uint8_t *mm_end;
621 #endif
622         uint16_t *d = (uint16_t *)dst;
623         end = s + src_size;
624 #ifdef HAVE_MMX
625         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
626         __asm __volatile(
627             "movq       %0, %%mm7\n\t"
628             "movq       %1, %%mm6\n\t"
629             ::"m"(red_16mask),"m"(green_16mask));
630         mm_end = end - 11;
631         while(s < mm_end)
632         {
633             __asm __volatile(
634                 PREFETCH" 32%1\n\t"
635                 "movd   %1, %%mm0\n\t"
636                 "movd   3%1, %%mm3\n\t"
637                 "punpckldq 6%1, %%mm0\n\t"
638                 "punpckldq 9%1, %%mm3\n\t"
639                 "movq   %%mm0, %%mm1\n\t"
640                 "movq   %%mm0, %%mm2\n\t"
641                 "movq   %%mm3, %%mm4\n\t"
642                 "movq   %%mm3, %%mm5\n\t"
643                 "psrlq  $3, %%mm0\n\t"
644                 "psrlq  $3, %%mm3\n\t"
645                 "pand   %2, %%mm0\n\t"
646                 "pand   %2, %%mm3\n\t"
647                 "psrlq  $5, %%mm1\n\t"
648                 "psrlq  $5, %%mm4\n\t"
649                 "pand   %%mm6, %%mm1\n\t"
650                 "pand   %%mm6, %%mm4\n\t"
651                 "psrlq  $8, %%mm2\n\t"
652                 "psrlq  $8, %%mm5\n\t"
653                 "pand   %%mm7, %%mm2\n\t"
654                 "pand   %%mm7, %%mm5\n\t"
655                 "por    %%mm1, %%mm0\n\t"
656                 "por    %%mm4, %%mm3\n\t"
657                 "por    %%mm2, %%mm0\n\t"
658                 "por    %%mm5, %%mm3\n\t"
659                 "psllq  $16, %%mm3\n\t"
660                 "por    %%mm3, %%mm0\n\t"
661                 MOVNTQ" %%mm0, %0\n\t"
662                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
663                 d += 4;
664                 s += 12;
665         }
666         __asm __volatile(SFENCE:::"memory");
667         __asm __volatile(EMMS:::"memory");
668 #endif
669         while(s < end)
670         {
671                 const int b= *s++;
672                 const int g= *s++;
673                 const int r= *s++;
674                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
675         }
676 }
677
678 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
679 {
680         const uint8_t *s = src;
681         const uint8_t *end;
682 #ifdef HAVE_MMX
683         const uint8_t *mm_end;
684 #endif
685         uint16_t *d = (uint16_t *)dst;
686         end = s + src_size;
687 #ifdef HAVE_MMX
688         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
689         __asm __volatile(
690             "movq       %0, %%mm7\n\t"
691             "movq       %1, %%mm6\n\t"
692             ::"m"(red_16mask),"m"(green_16mask));
693         mm_end = end - 15;
694         while(s < mm_end)
695         {
696             __asm __volatile(
697                 PREFETCH" 32%1\n\t"
698                 "movd   %1, %%mm0\n\t"
699                 "movd   3%1, %%mm3\n\t"
700                 "punpckldq 6%1, %%mm0\n\t"
701                 "punpckldq 9%1, %%mm3\n\t"
702                 "movq   %%mm0, %%mm1\n\t"
703                 "movq   %%mm0, %%mm2\n\t"
704                 "movq   %%mm3, %%mm4\n\t"
705                 "movq   %%mm3, %%mm5\n\t"
706                 "psllq  $8, %%mm0\n\t"
707                 "psllq  $8, %%mm3\n\t"
708                 "pand   %%mm7, %%mm0\n\t"
709                 "pand   %%mm7, %%mm3\n\t"
710                 "psrlq  $5, %%mm1\n\t"
711                 "psrlq  $5, %%mm4\n\t"
712                 "pand   %%mm6, %%mm1\n\t"
713                 "pand   %%mm6, %%mm4\n\t"
714                 "psrlq  $19, %%mm2\n\t"
715                 "psrlq  $19, %%mm5\n\t"
716                 "pand   %2, %%mm2\n\t"
717                 "pand   %2, %%mm5\n\t"
718                 "por    %%mm1, %%mm0\n\t"
719                 "por    %%mm4, %%mm3\n\t"
720                 "por    %%mm2, %%mm0\n\t"
721                 "por    %%mm5, %%mm3\n\t"
722                 "psllq  $16, %%mm3\n\t"
723                 "por    %%mm3, %%mm0\n\t"
724                 MOVNTQ" %%mm0, %0\n\t"
725                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
726                 d += 4;
727                 s += 12;
728         }
729         __asm __volatile(SFENCE:::"memory");
730         __asm __volatile(EMMS:::"memory");
731 #endif
732         while(s < end)
733         {
734                 const int r= *s++;
735                 const int g= *s++;
736                 const int b= *s++;
737                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
738         }
739 }
740
741 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
742 {
743         const uint8_t *s = src;
744         const uint8_t *end;
745 #ifdef HAVE_MMX
746         const uint8_t *mm_end;
747 #endif
748         uint16_t *d = (uint16_t *)dst;
749         end = s + src_size;
750 #ifdef HAVE_MMX
751         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
752         __asm __volatile(
753             "movq       %0, %%mm7\n\t"
754             "movq       %1, %%mm6\n\t"
755             ::"m"(red_15mask),"m"(green_15mask));
756         mm_end = end - 11;
757         while(s < mm_end)
758         {
759             __asm __volatile(
760                 PREFETCH" 32%1\n\t"
761                 "movd   %1, %%mm0\n\t"
762                 "movd   3%1, %%mm3\n\t"
763                 "punpckldq 6%1, %%mm0\n\t"
764                 "punpckldq 9%1, %%mm3\n\t"
765                 "movq   %%mm0, %%mm1\n\t"
766                 "movq   %%mm0, %%mm2\n\t"
767                 "movq   %%mm3, %%mm4\n\t"
768                 "movq   %%mm3, %%mm5\n\t"
769                 "psrlq  $3, %%mm0\n\t"
770                 "psrlq  $3, %%mm3\n\t"
771                 "pand   %2, %%mm0\n\t"
772                 "pand   %2, %%mm3\n\t"
773                 "psrlq  $6, %%mm1\n\t"
774                 "psrlq  $6, %%mm4\n\t"
775                 "pand   %%mm6, %%mm1\n\t"
776                 "pand   %%mm6, %%mm4\n\t"
777                 "psrlq  $9, %%mm2\n\t"
778                 "psrlq  $9, %%mm5\n\t"
779                 "pand   %%mm7, %%mm2\n\t"
780                 "pand   %%mm7, %%mm5\n\t"
781                 "por    %%mm1, %%mm0\n\t"
782                 "por    %%mm4, %%mm3\n\t"
783                 "por    %%mm2, %%mm0\n\t"
784                 "por    %%mm5, %%mm3\n\t"
785                 "psllq  $16, %%mm3\n\t"
786                 "por    %%mm3, %%mm0\n\t"
787                 MOVNTQ" %%mm0, %0\n\t"
788                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
789                 d += 4;
790                 s += 12;
791         }
792         __asm __volatile(SFENCE:::"memory");
793         __asm __volatile(EMMS:::"memory");
794 #endif
795         while(s < end)
796         {
797                 const int b= *s++;
798                 const int g= *s++;
799                 const int r= *s++;
800                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
801         }
802 }
803
804 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
805 {
806         const uint8_t *s = src;
807         const uint8_t *end;
808 #ifdef HAVE_MMX
809         const uint8_t *mm_end;
810 #endif
811         uint16_t *d = (uint16_t *)dst;
812         end = s + src_size;
813 #ifdef HAVE_MMX
814         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
815         __asm __volatile(
816             "movq       %0, %%mm7\n\t"
817             "movq       %1, %%mm6\n\t"
818             ::"m"(red_15mask),"m"(green_15mask));
819         mm_end = end - 15;
820         while(s < mm_end)
821         {
822             __asm __volatile(
823                 PREFETCH" 32%1\n\t"
824                 "movd   %1, %%mm0\n\t"
825                 "movd   3%1, %%mm3\n\t"
826                 "punpckldq 6%1, %%mm0\n\t"
827                 "punpckldq 9%1, %%mm3\n\t"
828                 "movq   %%mm0, %%mm1\n\t"
829                 "movq   %%mm0, %%mm2\n\t"
830                 "movq   %%mm3, %%mm4\n\t"
831                 "movq   %%mm3, %%mm5\n\t"
832                 "psllq  $7, %%mm0\n\t"
833                 "psllq  $7, %%mm3\n\t"
834                 "pand   %%mm7, %%mm0\n\t"
835                 "pand   %%mm7, %%mm3\n\t"
836                 "psrlq  $6, %%mm1\n\t"
837                 "psrlq  $6, %%mm4\n\t"
838                 "pand   %%mm6, %%mm1\n\t"
839                 "pand   %%mm6, %%mm4\n\t"
840                 "psrlq  $19, %%mm2\n\t"
841                 "psrlq  $19, %%mm5\n\t"
842                 "pand   %2, %%mm2\n\t"
843                 "pand   %2, %%mm5\n\t"
844                 "por    %%mm1, %%mm0\n\t"
845                 "por    %%mm4, %%mm3\n\t"
846                 "por    %%mm2, %%mm0\n\t"
847                 "por    %%mm5, %%mm3\n\t"
848                 "psllq  $16, %%mm3\n\t"
849                 "por    %%mm3, %%mm0\n\t"
850                 MOVNTQ" %%mm0, %0\n\t"
851                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
852                 d += 4;
853                 s += 12;
854         }
855         __asm __volatile(SFENCE:::"memory");
856         __asm __volatile(EMMS:::"memory");
857 #endif
858         while(s < end)
859         {
860                 const int r= *s++;
861                 const int g= *s++;
862                 const int b= *s++;
863                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
864         }
865 }
866
867 /*
868   Here I use a less accurate approximation by simply
869   left-shifting the input
870   value and filling the low-order bits with
871   zeroes. This method improves PNG
872   compression, but this scheme cannot reproduce white exactly, since it does not
873   generate an all-ones maximum value; the net effect is to darken the
874   image slightly.
875
876   A better method would be "left bit replication":
877
878    4 3 2 1 0
879    ---------
880    1 1 0 1 1
881
882    7 6 5 4 3  2 1 0
883    ----------------
884    1 1 0 1 1  1 1 0
885    |=======|  |===|
886        |      Leftmost Bits Repeated to Fill Open Bits
887        |
888    Original Bits
889 */
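/*
 A minimal sketch of left bit replication for the 5-bit case (the helper
 name is hypothetical; the converters below intentionally keep the simpler
 zero-fill approach described above):

   static inline uint8_t replicate_5to8(uint8_t v)
   {
       return (v << 3) | (v >> 2); // top three bits of v fill the low three bits, so 31 -> 255 and 0 -> 0
   }
*/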
890 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
891 {
892         const uint16_t *end;
893 #ifdef HAVE_MMX
894         const uint16_t *mm_end;
895 #endif
896         uint8_t *d = (uint8_t *)dst;
897         const uint16_t *s = (uint16_t *)src;
898         end = s + src_size/2;
899 #ifdef HAVE_MMX
900         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
901         mm_end = end - 7;
902         while(s < mm_end)
903         {
904             __asm __volatile(
905                 PREFETCH" 32%1\n\t"
906                 "movq   %1, %%mm0\n\t"
907                 "movq   %1, %%mm1\n\t"
908                 "movq   %1, %%mm2\n\t"
909                 "pand   %2, %%mm0\n\t"
910                 "pand   %3, %%mm1\n\t"
911                 "pand   %4, %%mm2\n\t"
912                 "psllq  $3, %%mm0\n\t"
913                 "psrlq  $2, %%mm1\n\t"
914                 "psrlq  $7, %%mm2\n\t"
915                 "movq   %%mm0, %%mm3\n\t"
916                 "movq   %%mm1, %%mm4\n\t"
917                 "movq   %%mm2, %%mm5\n\t"
918                 "punpcklwd %5, %%mm0\n\t"
919                 "punpcklwd %5, %%mm1\n\t"
920                 "punpcklwd %5, %%mm2\n\t"
921                 "punpckhwd %5, %%mm3\n\t"
922                 "punpckhwd %5, %%mm4\n\t"
923                 "punpckhwd %5, %%mm5\n\t"
924                 "psllq  $8, %%mm1\n\t"
925                 "psllq  $16, %%mm2\n\t"
926                 "por    %%mm1, %%mm0\n\t"
927                 "por    %%mm2, %%mm0\n\t"
928                 "psllq  $8, %%mm4\n\t"
929                 "psllq  $16, %%mm5\n\t"
930                 "por    %%mm4, %%mm3\n\t"
931                 "por    %%mm5, %%mm3\n\t"
932
933                 "movq   %%mm0, %%mm6\n\t"
934                 "movq   %%mm3, %%mm7\n\t"
935                 
936                 "movq   8%1, %%mm0\n\t"
937                 "movq   8%1, %%mm1\n\t"
938                 "movq   8%1, %%mm2\n\t"
939                 "pand   %2, %%mm0\n\t"
940                 "pand   %3, %%mm1\n\t"
941                 "pand   %4, %%mm2\n\t"
942                 "psllq  $3, %%mm0\n\t"
943                 "psrlq  $2, %%mm1\n\t"
944                 "psrlq  $7, %%mm2\n\t"
945                 "movq   %%mm0, %%mm3\n\t"
946                 "movq   %%mm1, %%mm4\n\t"
947                 "movq   %%mm2, %%mm5\n\t"
948                 "punpcklwd %5, %%mm0\n\t"
949                 "punpcklwd %5, %%mm1\n\t"
950                 "punpcklwd %5, %%mm2\n\t"
951                 "punpckhwd %5, %%mm3\n\t"
952                 "punpckhwd %5, %%mm4\n\t"
953                 "punpckhwd %5, %%mm5\n\t"
954                 "psllq  $8, %%mm1\n\t"
955                 "psllq  $16, %%mm2\n\t"
956                 "por    %%mm1, %%mm0\n\t"
957                 "por    %%mm2, %%mm0\n\t"
958                 "psllq  $8, %%mm4\n\t"
959                 "psllq  $16, %%mm5\n\t"
960                 "por    %%mm4, %%mm3\n\t"
961                 "por    %%mm5, %%mm3\n\t"
962
963                 :"=m"(*d)
964                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
965                 :"memory");
966             /* Borrowed from the 32-to-24 conversion above */
967             __asm __volatile(
968                 "movq   %%mm0, %%mm4\n\t"
969                 "movq   %%mm3, %%mm5\n\t"
970                 "movq   %%mm6, %%mm0\n\t"
971                 "movq   %%mm7, %%mm1\n\t"
972                 
973                 "movq   %%mm4, %%mm6\n\t"
974                 "movq   %%mm5, %%mm7\n\t"
975                 "movq   %%mm0, %%mm2\n\t"
976                 "movq   %%mm1, %%mm3\n\t"
977
978                 "psrlq  $8, %%mm2\n\t"
979                 "psrlq  $8, %%mm3\n\t"
980                 "psrlq  $8, %%mm6\n\t"
981                 "psrlq  $8, %%mm7\n\t"
982                 "pand   %2, %%mm0\n\t"
983                 "pand   %2, %%mm1\n\t"
984                 "pand   %2, %%mm4\n\t"
985                 "pand   %2, %%mm5\n\t"
986                 "pand   %3, %%mm2\n\t"
987                 "pand   %3, %%mm3\n\t"
988                 "pand   %3, %%mm6\n\t"
989                 "pand   %3, %%mm7\n\t"
990                 "por    %%mm2, %%mm0\n\t"
991                 "por    %%mm3, %%mm1\n\t"
992                 "por    %%mm6, %%mm4\n\t"
993                 "por    %%mm7, %%mm5\n\t"
994
995                 "movq   %%mm1, %%mm2\n\t"
996                 "movq   %%mm4, %%mm3\n\t"
997                 "psllq  $48, %%mm2\n\t"
998                 "psllq  $32, %%mm3\n\t"
999                 "pand   %4, %%mm2\n\t"
1000                 "pand   %5, %%mm3\n\t"
1001                 "por    %%mm2, %%mm0\n\t"
1002                 "psrlq  $16, %%mm1\n\t"
1003                 "psrlq  $32, %%mm4\n\t"
1004                 "psllq  $16, %%mm5\n\t"
1005                 "por    %%mm3, %%mm1\n\t"
1006                 "pand   %6, %%mm5\n\t"
1007                 "por    %%mm5, %%mm4\n\t"
1008
1009                 MOVNTQ" %%mm0, %0\n\t"
1010                 MOVNTQ" %%mm1, 8%0\n\t"
1011                 MOVNTQ" %%mm4, 16%0"
1012
1013                 :"=m"(*d)
1014                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1015                 :"memory");
1016                 d += 24;
1017                 s += 8;
1018         }
1019         __asm __volatile(SFENCE:::"memory");
1020         __asm __volatile(EMMS:::"memory");
1021 #endif
1022         while(s < end)
1023         {
1024                 register uint16_t bgr;
1025                 bgr = *s++;
1026                 *d++ = (bgr&0x1F)<<3;
1027                 *d++ = (bgr&0x3E0)>>2;
1028                 *d++ = (bgr&0x7C00)>>7;
1029         }
1030 }
1031
1032 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1033 {
1034         const uint16_t *end;
1035 #ifdef HAVE_MMX
1036         const uint16_t *mm_end;
1037 #endif
1038         uint8_t *d = (uint8_t *)dst;
1039         const uint16_t *s = (const uint16_t *)src;
1040         end = s + src_size/2;
1041 #ifdef HAVE_MMX
1042         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1043         mm_end = end - 7;
1044         while(s < mm_end)
1045         {
1046             __asm __volatile(
1047                 PREFETCH" 32%1\n\t"
1048                 "movq   %1, %%mm0\n\t"
1049                 "movq   %1, %%mm1\n\t"
1050                 "movq   %1, %%mm2\n\t"
1051                 "pand   %2, %%mm0\n\t"
1052                 "pand   %3, %%mm1\n\t"
1053                 "pand   %4, %%mm2\n\t"
1054                 "psllq  $3, %%mm0\n\t"
1055                 "psrlq  $3, %%mm1\n\t"
1056                 "psrlq  $8, %%mm2\n\t"
1057                 "movq   %%mm0, %%mm3\n\t"
1058                 "movq   %%mm1, %%mm4\n\t"
1059                 "movq   %%mm2, %%mm5\n\t"
1060                 "punpcklwd %5, %%mm0\n\t"
1061                 "punpcklwd %5, %%mm1\n\t"
1062                 "punpcklwd %5, %%mm2\n\t"
1063                 "punpckhwd %5, %%mm3\n\t"
1064                 "punpckhwd %5, %%mm4\n\t"
1065                 "punpckhwd %5, %%mm5\n\t"
1066                 "psllq  $8, %%mm1\n\t"
1067                 "psllq  $16, %%mm2\n\t"
1068                 "por    %%mm1, %%mm0\n\t"
1069                 "por    %%mm2, %%mm0\n\t"
1070                 "psllq  $8, %%mm4\n\t"
1071                 "psllq  $16, %%mm5\n\t"
1072                 "por    %%mm4, %%mm3\n\t"
1073                 "por    %%mm5, %%mm3\n\t"
1074                 
1075                 "movq   %%mm0, %%mm6\n\t"
1076                 "movq   %%mm3, %%mm7\n\t"
1077
1078                 "movq   8%1, %%mm0\n\t"
1079                 "movq   8%1, %%mm1\n\t"
1080                 "movq   8%1, %%mm2\n\t"
1081                 "pand   %2, %%mm0\n\t"
1082                 "pand   %3, %%mm1\n\t"
1083                 "pand   %4, %%mm2\n\t"
1084                 "psllq  $3, %%mm0\n\t"
1085                 "psrlq  $3, %%mm1\n\t"
1086                 "psrlq  $8, %%mm2\n\t"
1087                 "movq   %%mm0, %%mm3\n\t"
1088                 "movq   %%mm1, %%mm4\n\t"
1089                 "movq   %%mm2, %%mm5\n\t"
1090                 "punpcklwd %5, %%mm0\n\t"
1091                 "punpcklwd %5, %%mm1\n\t"
1092                 "punpcklwd %5, %%mm2\n\t"
1093                 "punpckhwd %5, %%mm3\n\t"
1094                 "punpckhwd %5, %%mm4\n\t"
1095                 "punpckhwd %5, %%mm5\n\t"
1096                 "psllq  $8, %%mm1\n\t"
1097                 "psllq  $16, %%mm2\n\t"
1098                 "por    %%mm1, %%mm0\n\t"
1099                 "por    %%mm2, %%mm0\n\t"
1100                 "psllq  $8, %%mm4\n\t"
1101                 "psllq  $16, %%mm5\n\t"
1102                 "por    %%mm4, %%mm3\n\t"
1103                 "por    %%mm5, %%mm3\n\t"
1104                 :"=m"(*d)
1105                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1106                 :"memory");
1107             /* Borrowed from the 32-to-24 conversion above */
1108             __asm __volatile(
1109                 "movq   %%mm0, %%mm4\n\t"
1110                 "movq   %%mm3, %%mm5\n\t"
1111                 "movq   %%mm6, %%mm0\n\t"
1112                 "movq   %%mm7, %%mm1\n\t"
1113                 
1114                 "movq   %%mm4, %%mm6\n\t"
1115                 "movq   %%mm5, %%mm7\n\t"
1116                 "movq   %%mm0, %%mm2\n\t"
1117                 "movq   %%mm1, %%mm3\n\t"
1118
1119                 "psrlq  $8, %%mm2\n\t"
1120                 "psrlq  $8, %%mm3\n\t"
1121                 "psrlq  $8, %%mm6\n\t"
1122                 "psrlq  $8, %%mm7\n\t"
1123                 "pand   %2, %%mm0\n\t"
1124                 "pand   %2, %%mm1\n\t"
1125                 "pand   %2, %%mm4\n\t"
1126                 "pand   %2, %%mm5\n\t"
1127                 "pand   %3, %%mm2\n\t"
1128                 "pand   %3, %%mm3\n\t"
1129                 "pand   %3, %%mm6\n\t"
1130                 "pand   %3, %%mm7\n\t"
1131                 "por    %%mm2, %%mm0\n\t"
1132                 "por    %%mm3, %%mm1\n\t"
1133                 "por    %%mm6, %%mm4\n\t"
1134                 "por    %%mm7, %%mm5\n\t"
1135
1136                 "movq   %%mm1, %%mm2\n\t"
1137                 "movq   %%mm4, %%mm3\n\t"
1138                 "psllq  $48, %%mm2\n\t"
1139                 "psllq  $32, %%mm3\n\t"
1140                 "pand   %4, %%mm2\n\t"
1141                 "pand   %5, %%mm3\n\t"
1142                 "por    %%mm2, %%mm0\n\t"
1143                 "psrlq  $16, %%mm1\n\t"
1144                 "psrlq  $32, %%mm4\n\t"
1145                 "psllq  $16, %%mm5\n\t"
1146                 "por    %%mm3, %%mm1\n\t"
1147                 "pand   %6, %%mm5\n\t"
1148                 "por    %%mm5, %%mm4\n\t"
1149
1150                 MOVNTQ" %%mm0, %0\n\t"
1151                 MOVNTQ" %%mm1, 8%0\n\t"
1152                 MOVNTQ" %%mm4, 16%0"
1153
1154                 :"=m"(*d)
1155                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1156                 :"memory");
1157                 d += 24;
1158                 s += 8;
1159         }
1160         __asm __volatile(SFENCE:::"memory");
1161         __asm __volatile(EMMS:::"memory");
1162 #endif
1163         while(s < end)
1164         {
1165                 register uint16_t bgr;
1166                 bgr = *s++;
1167                 *d++ = (bgr&0x1F)<<3;
1168                 *d++ = (bgr&0x7E0)>>3;
1169                 *d++ = (bgr&0xF800)>>8;
1170         }
1171 }
1172
1173 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1174 {
1175         const uint16_t *end;
1176 #ifdef HAVE_MMX
1177         const uint16_t *mm_end;
1178 #endif
1179         uint8_t *d = (uint8_t *)dst;
1180         const uint16_t *s = (const uint16_t *)src;
1181         end = s + src_size/2;
1182 #ifdef HAVE_MMX
1183         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1184         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1185         mm_end = end - 3;
1186         while(s < mm_end)
1187         {
1188             __asm __volatile(
1189                 PREFETCH" 32%1\n\t"
1190                 "movq   %1, %%mm0\n\t"
1191                 "movq   %1, %%mm1\n\t"
1192                 "movq   %1, %%mm2\n\t"
1193                 "pand   %2, %%mm0\n\t"
1194                 "pand   %3, %%mm1\n\t"
1195                 "pand   %4, %%mm2\n\t"
1196                 "psllq  $3, %%mm0\n\t"
1197                 "psrlq  $2, %%mm1\n\t"
1198                 "psrlq  $7, %%mm2\n\t"
1199                 "movq   %%mm0, %%mm3\n\t"
1200                 "movq   %%mm1, %%mm4\n\t"
1201                 "movq   %%mm2, %%mm5\n\t"
1202                 "punpcklwd %%mm7, %%mm0\n\t"
1203                 "punpcklwd %%mm7, %%mm1\n\t"
1204                 "punpcklwd %%mm7, %%mm2\n\t"
1205                 "punpckhwd %%mm7, %%mm3\n\t"
1206                 "punpckhwd %%mm7, %%mm4\n\t"
1207                 "punpckhwd %%mm7, %%mm5\n\t"
1208                 "psllq  $8, %%mm1\n\t"
1209                 "psllq  $16, %%mm2\n\t"
1210                 "por    %%mm1, %%mm0\n\t"
1211                 "por    %%mm2, %%mm0\n\t"
1212                 "psllq  $8, %%mm4\n\t"
1213                 "psllq  $16, %%mm5\n\t"
1214                 "por    %%mm4, %%mm3\n\t"
1215                 "por    %%mm5, %%mm3\n\t"
1216                 MOVNTQ" %%mm0, %0\n\t"
1217                 MOVNTQ" %%mm3, 8%0\n\t"
1218                 :"=m"(*d)
1219                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1220                 :"memory");
1221                 d += 16;
1222                 s += 4;
1223         }
1224         __asm __volatile(SFENCE:::"memory");
1225         __asm __volatile(EMMS:::"memory");
1226 #endif
1227         while(s < end)
1228         {
1229 #if 0 //slightly slower on athlon
1230                 int bgr= *s++;
1231                 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1232 #else
1233 //FIXME this is very likely wrong for bigendian (and the following converters too)
1234                 register uint16_t bgr;
1235                 bgr = *s++;
1236                 *d++ = (bgr&0x1F)<<3;
1237                 *d++ = (bgr&0x3E0)>>2;
1238                 *d++ = (bgr&0x7C00)>>7;
1239                 *d++ = 0;
1240 #endif
1241         }
1242 }
1243
1244 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1245 {
1246         const uint16_t *end;
1247 #ifdef HAVE_MMX
1248         const uint16_t *mm_end;
1249 #endif
1250         uint8_t *d = (uint8_t *)dst;
1251         const uint16_t *s = (uint16_t *)src;
1252         end = s + src_size/2;
1253 #ifdef HAVE_MMX
1254         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1255         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1256         mm_end = end - 3;
1257         while(s < mm_end)
1258         {
1259             __asm __volatile(
1260                 PREFETCH" 32%1\n\t"
1261                 "movq   %1, %%mm0\n\t"
1262                 "movq   %1, %%mm1\n\t"
1263                 "movq   %1, %%mm2\n\t"
1264                 "pand   %2, %%mm0\n\t"
1265                 "pand   %3, %%mm1\n\t"
1266                 "pand   %4, %%mm2\n\t"
1267                 "psllq  $3, %%mm0\n\t"
1268                 "psrlq  $3, %%mm1\n\t"
1269                 "psrlq  $8, %%mm2\n\t"
1270                 "movq   %%mm0, %%mm3\n\t"
1271                 "movq   %%mm1, %%mm4\n\t"
1272                 "movq   %%mm2, %%mm5\n\t"
1273                 "punpcklwd %%mm7, %%mm0\n\t"
1274                 "punpcklwd %%mm7, %%mm1\n\t"
1275                 "punpcklwd %%mm7, %%mm2\n\t"
1276                 "punpckhwd %%mm7, %%mm3\n\t"
1277                 "punpckhwd %%mm7, %%mm4\n\t"
1278                 "punpckhwd %%mm7, %%mm5\n\t"
1279                 "psllq  $8, %%mm1\n\t"
1280                 "psllq  $16, %%mm2\n\t"
1281                 "por    %%mm1, %%mm0\n\t"
1282                 "por    %%mm2, %%mm0\n\t"
1283                 "psllq  $8, %%mm4\n\t"
1284                 "psllq  $16, %%mm5\n\t"
1285                 "por    %%mm4, %%mm3\n\t"
1286                 "por    %%mm5, %%mm3\n\t"
1287                 MOVNTQ" %%mm0, %0\n\t"
1288                 MOVNTQ" %%mm3, 8%0\n\t"
1289                 :"=m"(*d)
1290                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1291                 :"memory");
1292                 d += 16;
1293                 s += 4;
1294         }
1295         __asm __volatile(SFENCE:::"memory");
1296         __asm __volatile(EMMS:::"memory");
1297 #endif
1298         while(s < end)
1299         {
1300                 register uint16_t bgr;
1301                 bgr = *s++;
1302                 *d++ = (bgr&0x1F)<<3;
1303                 *d++ = (bgr&0x7E0)>>3;
1304                 *d++ = (bgr&0xF800)>>8;
1305                 *d++ = 0;
1306         }
1307 }
1308
1309 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1310 {
1311 #ifdef HAVE_MMX
1312 /* TODO: unroll this loop */
1313         asm volatile (
1314                 "xorl %%eax, %%eax              \n\t"
1315                 ".balign 16                     \n\t"
1316                 "1:                             \n\t"
1317                 PREFETCH" 32(%0, %%eax)         \n\t"
1318                 "movq (%0, %%eax), %%mm0        \n\t"
1319                 "movq %%mm0, %%mm1              \n\t"
1320                 "movq %%mm0, %%mm2              \n\t"
1321                 "pslld $16, %%mm0               \n\t"
1322                 "psrld $16, %%mm1               \n\t"
1323                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
1324                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
1325                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
1326                 "por %%mm0, %%mm2               \n\t"
1327                 "por %%mm1, %%mm2               \n\t"
1328                 MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
1329                 "addl $8, %%eax                 \n\t"
1330                 "cmpl %2, %%eax                 \n\t"
1331                 " jb 1b                         \n\t"
1332                 :: "r" (src), "r"(dst), "r" (src_size-7)
1333                 : "%eax"
1334         );
1335
1336         __asm __volatile(SFENCE:::"memory");
1337         __asm __volatile(EMMS:::"memory");
1338 #else
1339         unsigned i;
1340         unsigned num_pixels = src_size >> 2;
1341         for(i=0; i<num_pixels; i++)
1342         {
1343 #ifdef WORDS_BIGENDIAN  
1344           dst[4*i + 1] = src[4*i + 3];
1345           dst[4*i + 2] = src[4*i + 2];
1346           dst[4*i + 3] = src[4*i + 1];
1347 #else
1348           dst[4*i + 0] = src[4*i + 2];
1349           dst[4*i + 1] = src[4*i + 1];
1350           dst[4*i + 2] = src[4*i + 0];
1351 #endif
1352         }
1353 #endif
1354 }
1355
1356 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1357 {
1358         unsigned i;
1359 #ifdef HAVE_MMX
1360         int mmx_size= 23 - src_size;
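        /* mmx_size starts out negative; the asm below biases the src/dst
           pointers by -mmx_size so that (%1, %%eax) initially points at src,
           and runs %%eax from that negative value towards zero in 24-byte
           (8-pixel) steps, looping while it is still negative ("js 1b").
           The final value left in mmx_size is then used to work out how many
           tail bytes the scalar loop at the end of the function must handle. */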
1361         asm volatile (
1362                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1363                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1364                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1365                 ".balign 16                     \n\t"
1366                 "1:                             \n\t"
1367                 PREFETCH" 32(%1, %%eax)         \n\t"
1368                 "movq   (%1, %%eax), %%mm0      \n\t" // BGR BGR BG
1369                 "movq   (%1, %%eax), %%mm1      \n\t" // BGR BGR BG
1370                 "movq  2(%1, %%eax), %%mm2      \n\t" // R BGR BGR B
1371                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1372                 "pand %%mm5, %%mm0              \n\t"
1373                 "pand %%mm6, %%mm1              \n\t"
1374                 "pand %%mm7, %%mm2              \n\t"
1375                 "por %%mm0, %%mm1               \n\t"
1376                 "por %%mm2, %%mm1               \n\t"                
1377                 "movq  6(%1, %%eax), %%mm0      \n\t" // BGR BGR BG
1378                 MOVNTQ" %%mm1,   (%2, %%eax)    \n\t" // RGB RGB RG
1379                 "movq  8(%1, %%eax), %%mm1      \n\t" // R BGR BGR B
1380                 "movq 10(%1, %%eax), %%mm2      \n\t" // GR BGR BGR
1381                 "pand %%mm7, %%mm0              \n\t"
1382                 "pand %%mm5, %%mm1              \n\t"
1383                 "pand %%mm6, %%mm2              \n\t"
1384                 "por %%mm0, %%mm1               \n\t"
1385                 "por %%mm2, %%mm1               \n\t"                
1386                 "movq 14(%1, %%eax), %%mm0      \n\t" // R BGR BGR B
1387                 MOVNTQ" %%mm1,  8(%2, %%eax)    \n\t" // B RGB RGB R
1388                 "movq 16(%1, %%eax), %%mm1      \n\t" // GR BGR BGR
1389                 "movq 18(%1, %%eax), %%mm2      \n\t" // BGR BGR BG
1390                 "pand %%mm6, %%mm0              \n\t"
1391                 "pand %%mm7, %%mm1              \n\t"
1392                 "pand %%mm5, %%mm2              \n\t"
1393                 "por %%mm0, %%mm1               \n\t"
1394                 "por %%mm2, %%mm1               \n\t"                
1395                 MOVNTQ" %%mm1, 16(%2, %%eax)    \n\t"
1396                 "addl $24, %%eax                \n\t"
1397                 " js 1b                         \n\t"
1398                 : "+a" (mmx_size)
1399                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1400         );
1401
1402         __asm __volatile(SFENCE:::"memory");
1403         __asm __volatile(EMMS:::"memory");
1404
1405         if(mmx_size==23) return; //finished, src_size was a multiple of 8 pixels
1406
1407         src+= src_size;
1408         dst+= src_size;
1409         src_size= 23-mmx_size;
1410         src-= src_size;
1411         dst-= src_size;
1412 #endif
1413         for(i=0; i<src_size; i+=3)
1414         {
1415                 register uint8_t x;
1416                 x          = src[i + 2];
1417                 dst[i + 1] = src[i + 1];
1418                 dst[i + 2] = src[i + 0];
1419                 dst[i + 0] = x;
1420         }
1421 }
1422
1423 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1424         unsigned int width, unsigned int height,
1425         int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1426 {
1427         unsigned y;
1428         const unsigned chromWidth= width>>1;
1429         for(y=0; y<height; y++)
1430         {
1431 #ifdef HAVE_MMX
1432 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1433                 asm volatile(
1434                         "xorl %%eax, %%eax              \n\t"
1435                         ".balign 16                     \n\t"
1436                         "1:                             \n\t"
1437                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
1438                         PREFETCH" 32(%2, %%eax)         \n\t"
1439                         PREFETCH" 32(%3, %%eax)         \n\t"
1440                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
1441                         "movq %%mm0, %%mm2              \n\t" // U(0)
1442                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
1443                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1444                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1445
1446                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
1447                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
1448                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1449                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1450                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1451                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1452                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1453                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1454
1455                         MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
1456                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
1457                         MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1458                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1459
1460                         "addl $8, %%eax                 \n\t"
1461                         "cmpl %4, %%eax                 \n\t"
1462                         " jb 1b                         \n\t"
1463                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1464                         : "%eax"
1465                 );
1466 #else
1467
1468 #if defined ARCH_ALPHA && defined HAVE_MVI
1469 #define pl2yuy2(n)                                      \
1470         y1 = yc[n];                                     \
1471         y2 = yc2[n];                                    \
1472         u = uc[n];                                      \
1473         v = vc[n];                                      \
1474         asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
1475         asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
1476         asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1477         asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1478         yuv1 = (u << 8) + (v << 24);                    \
1479         yuv2 = yuv1 + y2;                               \
1480         yuv1 += y1;                                     \
1481         qdst[n] = yuv1;                                 \
1482         qdst2[n] = yuv2;
1483
1484                 int i;
1485                 uint64_t *qdst = (uint64_t *) dst;
1486                 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1487                 const uint32_t *yc = (uint32_t *) ysrc;
1488                 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1489                 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1490                 for(i = 0; i < chromWidth; i += 8){
1491                         uint64_t y1, y2, yuv1, yuv2;
1492                         uint64_t u, v;
1493                         /* Prefetch */
1494                         asm("ldq $31,64(%0)" :: "r"(yc));
1495                         asm("ldq $31,64(%0)" :: "r"(yc2));
1496                         asm("ldq $31,64(%0)" :: "r"(uc));
1497                         asm("ldq $31,64(%0)" :: "r"(vc));
1498
1499                         pl2yuy2(0);
1500                         pl2yuy2(1);
1501                         pl2yuy2(2);
1502                         pl2yuy2(3);
1503
1504                         yc += 4;
1505                         yc2 += 4;
1506                         uc += 4;
1507                         vc += 4;
1508                         qdst += 4;
1509                         qdst2 += 4;
1510                 }
1511                 y++;
1512                 ysrc += lumStride;
1513                 dst += dstStride;
1514
1515 #elif __WORDSIZE >= 64
1516                 int i;
1517                 uint64_t *ldst = (uint64_t *) dst;
1518                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1519                 for(i = 0; i < chromWidth; i += 2){
1520                         uint64_t k, l;
1521                         k = yc[0] + (uc[0] << 8) +
1522                             (yc[1] << 16) + (vc[0] << 24);
1523                         l = yc[2] + (uc[1] << 8) +
1524                             (yc[3] << 16) + (vc[1] << 24);
1525                         *ldst++ = k + (l << 32);
1526                         yc += 4;
1527                         uc += 2;
1528                         vc += 2;
1529                 }
1530
1531 #else
1532                 int i, *idst = (int32_t *) dst;
1533                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1534                 for(i = 0; i < chromWidth; i++){
1535                         *idst++ = yc[0] + (uc[0] << 8) +
1536                             (yc[1] << 16) + (vc[0] << 24);
1537                         yc += 2;
1538                         uc++;
1539                         vc++;
1540                 }
1541 #endif
1542 #endif
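                /* the chroma planes are vertically subsampled: advance usrc/vsrc
                   only once every vertLumPerChroma luma lines */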
1543                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1544                 {
1545                         usrc += chromStride;
1546                         vsrc += chromStride;
1547                 }
1548                 ysrc += lumStride;
1549                 dst += dstStride;
1550         }
1551 #ifdef HAVE_MMX
1552 asm(    EMMS" \n\t"
1553         SFENCE" \n\t"
1554         :::"memory");
1555 #endif
1556 }
1557
1558 /**
1559  * Converts planar YV12 to packed YUY2 (no chroma interpolation, see FIXME below).
1560  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1561  * problem for anyone then tell me, and I'll fix it)
1562  */
1563 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1564         unsigned int width, unsigned int height,
1565         int lumStride, int chromStride, int dstStride)
1566 {
1567         //FIXME interpolate chroma
1568         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1569 }
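
/* Illustrative usage sketch (not part of the converter): packing one YV12 frame
   into a YUY2 buffer. The frame pointers and dimensions below are hypothetical
   and assume contiguous planes with no padding. */
#if 0
{
        int w = 640, h = 480;                 /* multiples of 16 and 2, see above      */
        uint8_t *y_plane, *u_plane, *v_plane; /* allocated and filled elsewhere        */
        uint8_t *yuy2;                        /* destination, w*h*2 bytes              */

        RENAME(yv12toyuy2)(y_plane, u_plane, v_plane, yuy2,
                           w, h,
                           w,     /* lumStride            */
                           w/2,   /* chromStride          */
                           w*2);  /* dstStride (in bytes) */
}
#endif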
1570
1571 /**
1572  * Converts planar YUV 4:2:2 to packed YUY2.
1573  * width should be a multiple of 16
1574  */
1575 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1576         unsigned int width, unsigned int height,
1577         int lumStride, int chromStride, int dstStride)
1578 {
1579         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1580 }
1581
1582 /**
1583  * Converts packed YUY2 to planar YV12.
1584  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1585  * problem for anyone then tell me, and I'll fix it)
1586  */
1587 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1588         unsigned int width, unsigned int height,
1589         int lumStride, int chromStride, int srcStride)
1590 {
1591         unsigned y;
1592         const unsigned chromWidth= width>>1;
1593         for(y=0; y<height; y+=2)
1594         {
1595 #ifdef HAVE_MMX
1596                 asm volatile(
1597                         "xorl %%eax, %%eax              \n\t"
1598                         "pcmpeqw %%mm7, %%mm7           \n\t"
1599                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1600                         ".balign 16                     \n\t"
1601                         "1:                             \n\t"
1602                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1603                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1604                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1605                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1606                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1607                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1608                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1609                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1610                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1611                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1612                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1613
1614                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1615
1616                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
1617                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
1618                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1619                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1620                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1621                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1622                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1623                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1624                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1625                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1626
1627                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
1628
1629                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1630                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1631                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1632                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1633                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1634                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1635                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1636                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1637
1638                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
1639                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
1640
1641                         "addl $8, %%eax                 \n\t"
1642                         "cmpl %4, %%eax                 \n\t"
1643                         " jb 1b                         \n\t"
1644                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1645                         : "memory", "%eax"
1646                 );
1647
1648                 ydst += lumStride;
1649                 src  += srcStride;
1650
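                /* second (odd) source line: only the luma is extracted here;
                   the chroma of this line is dropped (4:2:0 vertical subsampling) */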
1651                 asm volatile(
1652                         "xorl %%eax, %%eax              \n\t"
1653                         ".balign 16                     \n\t"
1654                         "1:                             \n\t"
1655                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1656                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1657                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1658                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
1659                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
1660                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1661                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1662                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1663                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1664                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1665                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1666
1667                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
1668                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
1669
1670                         "addl $8, %%eax                 \n\t"
1671                         "cmpl %4, %%eax                 \n\t"
1672                         " jb 1b                         \n\t"
1673
1674                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1675                         : "memory", "%eax"
1676                 );
1677 #else
1678                 unsigned i;
1679                 for(i=0; i<chromWidth; i++)
1680                 {
1681                         ydst[2*i+0]     = src[4*i+0];
1682                         udst[i]         = src[4*i+1];
1683                         ydst[2*i+1]     = src[4*i+2];
1684                         vdst[i]         = src[4*i+3];
1685                 }
1686                 ydst += lumStride;
1687                 src  += srcStride;
1688
1689                 for(i=0; i<chromWidth; i++)
1690                 {
1691                         ydst[2*i+0]     = src[4*i+0];
1692                         ydst[2*i+1]     = src[4*i+2];
1693                 }
1694 #endif
1695                 udst += chromStride;
1696                 vdst += chromStride;
1697                 ydst += lumStride;
1698                 src  += srcStride;
1699         }
1700 #ifdef HAVE_MMX
1701 asm volatile(   EMMS" \n\t"
1702                 SFENCE" \n\t"
1703                 :::"memory");
1704 #endif
1705 }
1706
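/**
 * YVU9 to YV12: currently only the Y plane is copied; the 2x chroma
 * upscaling is still missing (see the XXX below).
 */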
1707 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1708         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1709         unsigned int width, unsigned int height, int lumStride, int chromStride)
1710 {
1711         /* Y Plane */
1712         memcpy(ydst, ysrc, width*height);
1713
1714         /* XXX: implement upscaling for U,V */
1715 }
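
/* A minimal sketch (not in the original) of the chroma upscaling the XXX above
   asks for: nearest-neighbour 2x upsampling of the quarter-resolution YVU9
   chroma planes to the half-resolution YV12 planes, written in terms of the
   parameters of RENAME(yvu9toyv12) above. It assumes chromStride is the
   destination chroma stride and that the source chroma planes are packed with
   a stride of width/4; both assumptions would need checking against the callers. */
#if 0
{
        unsigned cy, cx;
        const unsigned srcChromStride = width/4;   /* assumption */
        for(cy=0; cy<height/2; cy++)
                for(cx=0; cx<width/2; cx++)
                {
                        udst[cy*chromStride + cx] = usrc[(cy/2)*srcChromStride + cx/2];
                        vdst[cy*chromStride + cx] = vsrc[(cy/2)*srcChromStride + cx/2];
                }
}
#endif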
1716
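/**
 * Doubles a plane in both dimensions. Interior samples are interpolated with
 * 3:1 / 1:3 weights, border samples are replicated.
 */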
1717 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1718 {
1719         int x,y;
1720         
1721         dst[0]= src[0];
1722         
1723         // first line
1724         for(x=0; x<srcWidth-1; x++){
1725                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1726                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1727         }
1728         dst[2*srcWidth-1]= src[srcWidth-1];
1729         
1730         dst+= dstStride;
1731
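        /* each 2x2 output block blends the surrounding source samples with
           3:1 / 1:3 weights; the MMX2/3DNow path below approximates the same
           weights with chained PAVGB averages */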
1732         for(y=1; y<srcHeight; y++){
1733 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1734                 const int mmxSize= srcWidth&~15;
1735                 asm volatile(
1736                         "movl %4, %%eax                 \n\t"
1737                         "1:                             \n\t"
1738                         "movq (%0, %%eax), %%mm0        \n\t"
1739                         "movq (%1, %%eax), %%mm1        \n\t"
1740                         "movq 1(%0, %%eax), %%mm2       \n\t"
1741                         "movq 1(%1, %%eax), %%mm3       \n\t"
1742                         "movq -1(%0, %%eax), %%mm4      \n\t"
1743                         "movq -1(%1, %%eax), %%mm5      \n\t"
1744                         PAVGB" %%mm0, %%mm5             \n\t"
1745                         PAVGB" %%mm0, %%mm3             \n\t"
1746                         PAVGB" %%mm0, %%mm5             \n\t"
1747                         PAVGB" %%mm0, %%mm3             \n\t"
1748                         PAVGB" %%mm1, %%mm4             \n\t"
1749                         PAVGB" %%mm1, %%mm2             \n\t"
1750                         PAVGB" %%mm1, %%mm4             \n\t"
1751                         PAVGB" %%mm1, %%mm2             \n\t"
1752                         "movq %%mm5, %%mm7              \n\t"
1753                         "movq %%mm4, %%mm6              \n\t"
1754                         "punpcklbw %%mm3, %%mm5         \n\t"
1755                         "punpckhbw %%mm3, %%mm7         \n\t"
1756                         "punpcklbw %%mm2, %%mm4         \n\t"
1757                         "punpckhbw %%mm2, %%mm6         \n\t"
1758 #if 1
1759                         MOVNTQ" %%mm5, (%2, %%eax, 2)   \n\t"
1760                         MOVNTQ" %%mm7, 8(%2, %%eax, 2)  \n\t"
1761                         MOVNTQ" %%mm4, (%3, %%eax, 2)   \n\t"
1762                         MOVNTQ" %%mm6, 8(%3, %%eax, 2)  \n\t"
1763 #else
1764                         "movq %%mm5, (%2, %%eax, 2)     \n\t"
1765                         "movq %%mm7, 8(%2, %%eax, 2)    \n\t"
1766                         "movq %%mm4, (%3, %%eax, 2)     \n\t"
1767                         "movq %%mm6, 8(%3, %%eax, 2)    \n\t"
1768 #endif
1769                         "addl $8, %%eax                 \n\t"
1770                         " js 1b                         \n\t"
1771                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1772                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1773                            "g" (-mmxSize)
1774                         : "%eax"
1775
1776                 );
1777 #else
1778                 const int mmxSize=1;
1779 #endif
1780                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1781                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1782
1783                 for(x=mmxSize-1; x<srcWidth-1; x++){
1784                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1785                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1786                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1787                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1788                 }
1789                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1790                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1791
1792                 dst+=dstStride*2;
1793                 src+=srcStride;
1794         }
1795         
1796         // last line
1797 #if 1
1798         dst[0]= src[0];
1799         
1800         for(x=0; x<srcWidth-1; x++){
1801                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1802                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1803         }
1804         dst[2*srcWidth-1]= src[srcWidth-1];
1805 #else
1806         for(x=0; x<srcWidth; x++){
1807                 dst[2*x+0]=
1808                 dst[2*x+1]= src[x];
1809         }
1810 #endif
1811
1812 #ifdef HAVE_MMX
1813 asm volatile(   EMMS" \n\t"
1814                 SFENCE" \n\t"
1815                 :::"memory");
1816 #endif
1817 }
1818
1819 /**
1820  * Converts packed UYVY to planar YV12.
1821  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1822  * problem for anyone then tell me, and I'll fix it)
1823  * chrominance data is only taken from every second line, the others are ignored. FIXME: write HQ version
1824  */
1825 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1826         unsigned int width, unsigned int height,
1827         int lumStride, int chromStride, int srcStride)
1828 {
1829         unsigned y;
1830         const unsigned chromWidth= width>>1;
1831         for(y=0; y<height; y+=2)
1832         {
1833 #ifdef HAVE_MMX
1834                 asm volatile(
1835                         "xorl %%eax, %%eax              \n\t"
1836                         "pcmpeqw %%mm7, %%mm7           \n\t"
1837                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1838                         ".balign 16                     \n\t"
1839                         "1:                             \n\t"
1840                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1841                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1842                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1843                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
1844                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
1845                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
1846                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
1847                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1848                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1849                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1850                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1851
1852                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1853
1854                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
1855                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
1856                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
1857                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
1858                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
1859                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
1860                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1861                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1862                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1863                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1864
1865                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
1866
1867                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1868                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1869                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1870                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1871                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1872                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1873                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1874                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1875
1876                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
1877                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
1878
1879                         "addl $8, %%eax                 \n\t"
1880                         "cmpl %4, %%eax                 \n\t"
1881                         " jb 1b                         \n\t"
1882                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1883                         : "memory", "%eax"
1884                 );
1885
1886                 ydst += lumStride;
1887                 src  += srcStride;
1888
1889                 asm volatile(
1890                         "xorl %%eax, %%eax              \n\t"
1891                         ".balign 16                     \n\t"
1892                         "1:                             \n\t"
1893                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1894                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1895                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1896                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
1897                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
1898                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1899                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1900                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1901                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1902                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1903                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1904
1905                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
1906                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
1907
1908                         "addl $8, %%eax                 \n\t"
1909                         "cmpl %4, %%eax                 \n\t"
1910                         " jb 1b                         \n\t"
1911
1912                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1913                         : "memory", "%eax"
1914                 );
1915 #else
1916                 unsigned i;
1917                 for(i=0; i<chromWidth; i++)
1918                 {
1919                         udst[i]         = src[4*i+0];
1920                         ydst[2*i+0]     = src[4*i+1];
1921                         vdst[i]         = src[4*i+2];
1922                         ydst[2*i+1]     = src[4*i+3];
1923                 }
1924                 ydst += lumStride;
1925                 src  += srcStride;
1926
1927                 for(i=0; i<chromWidth; i++)
1928                 {
1929                         ydst[2*i+0]     = src[4*i+1];
1930                         ydst[2*i+1]     = src[4*i+3];
1931                 }
1932 #endif
1933                 udst += chromStride;
1934                 vdst += chromStride;
1935                 ydst += lumStride;
1936                 src  += srcStride;
1937         }
1938 #ifdef HAVE_MMX
1939 asm volatile(   EMMS" \n\t"
1940                 SFENCE" \n\t"
1941                 :::"memory");
1942 #endif
1943 }
1944
1945 /**
1946  * Converts 24bpp RGB to planar YV12 (YUV 4:2:0).
1947  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1948  * problem for anyone then tell me, and I'll fix it)
1949  * in the C version chrominance data is only taken from every second line, the others are ignored. FIXME: write HQ version
1950  */
1951 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1952         unsigned int width, unsigned int height,
1953         int lumStride, int chromStride, int srcStride)
1954 {
1955         unsigned y;
1956         const unsigned chromWidth= width>>1;
1957 #ifdef HAVE_MMX
1958         for(y=0; y<height-2; y+=2)
1959         {
1960                 unsigned i;
1961                 for(i=0; i<2; i++)
1962                 {
1963                         asm volatile(
1964                                 "movl %2, %%eax                 \n\t"
1965                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1966                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1967                                 "pxor %%mm7, %%mm7              \n\t"
1968                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1969                                 ".balign 16                     \n\t"
1970                                 "1:                             \n\t"
1971                                 PREFETCH" 64(%0, %%ebx)         \n\t"
1972                                 "movd (%0, %%ebx), %%mm0        \n\t"
1973                                 "movd 3(%0, %%ebx), %%mm1       \n\t"
1974                                 "punpcklbw %%mm7, %%mm0         \n\t"
1975                                 "punpcklbw %%mm7, %%mm1         \n\t"
1976                                 "movd 6(%0, %%ebx), %%mm2       \n\t"
1977                                 "movd 9(%0, %%ebx), %%mm3       \n\t"
1978                                 "punpcklbw %%mm7, %%mm2         \n\t"
1979                                 "punpcklbw %%mm7, %%mm3         \n\t"
1980                                 "pmaddwd %%mm6, %%mm0           \n\t"
1981                                 "pmaddwd %%mm6, %%mm1           \n\t"
1982                                 "pmaddwd %%mm6, %%mm2           \n\t"
1983                                 "pmaddwd %%mm6, %%mm3           \n\t"
1984 #ifndef FAST_BGR2YV12
1985                                 "psrad $8, %%mm0                \n\t"
1986                                 "psrad $8, %%mm1                \n\t"
1987                                 "psrad $8, %%mm2                \n\t"
1988                                 "psrad $8, %%mm3                \n\t"
1989 #endif
1990                                 "packssdw %%mm1, %%mm0          \n\t"
1991                                 "packssdw %%mm3, %%mm2          \n\t"
1992                                 "pmaddwd %%mm5, %%mm0           \n\t"
1993                                 "pmaddwd %%mm5, %%mm2           \n\t"
1994                                 "packssdw %%mm2, %%mm0          \n\t"
1995                                 "psraw $7, %%mm0                \n\t"
1996
1997                                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1998                                 "movd 15(%0, %%ebx), %%mm1      \n\t"
1999                                 "punpcklbw %%mm7, %%mm4         \n\t"
2000                                 "punpcklbw %%mm7, %%mm1         \n\t"
2001                                 "movd 18(%0, %%ebx), %%mm2      \n\t"
2002                                 "movd 21(%0, %%ebx), %%mm3      \n\t"
2003                                 "punpcklbw %%mm7, %%mm2         \n\t"
2004                                 "punpcklbw %%mm7, %%mm3         \n\t"
2005                                 "pmaddwd %%mm6, %%mm4           \n\t"
2006                                 "pmaddwd %%mm6, %%mm1           \n\t"
2007                                 "pmaddwd %%mm6, %%mm2           \n\t"
2008                                 "pmaddwd %%mm6, %%mm3           \n\t"
2009 #ifndef FAST_BGR2YV12
2010                                 "psrad $8, %%mm4                \n\t"
2011                                 "psrad $8, %%mm1                \n\t"
2012                                 "psrad $8, %%mm2                \n\t"
2013                                 "psrad $8, %%mm3                \n\t"
2014 #endif
2015                                 "packssdw %%mm1, %%mm4          \n\t"
2016                                 "packssdw %%mm3, %%mm2          \n\t"
2017                                 "pmaddwd %%mm5, %%mm4           \n\t"
2018                                 "pmaddwd %%mm5, %%mm2           \n\t"
2019                                 "addl $24, %%ebx                \n\t"
2020                                 "packssdw %%mm2, %%mm4          \n\t"
2021                                 "psraw $7, %%mm4                \n\t"
2022
2023                                 "packuswb %%mm4, %%mm0          \n\t"
2024                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
2025
2026                                 MOVNTQ" %%mm0, (%1, %%eax)      \n\t"
2027                                 "addl $8, %%eax                 \n\t"
2028                                 " js 1b                         \n\t"
2029                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2030                                 : "%eax", "%ebx"
2031                         );
2032                         ydst += lumStride;
2033                         src  += srcStride;
2034                 }
2035                 src -= srcStride*2;
2036                 asm volatile(
2037                         "movl %4, %%eax                 \n\t"
2038                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2039                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
2040                         "pxor %%mm7, %%mm7              \n\t"
2041                         "leal (%%eax, %%eax, 2), %%ebx  \n\t"
2042                         "addl %%ebx, %%ebx              \n\t"
2043                         ".balign 16                     \n\t"
2044                         "1:                             \n\t"
2045                         PREFETCH" 64(%0, %%ebx)         \n\t"
2046                         PREFETCH" 64(%1, %%ebx)         \n\t"
2047 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2048                         "movq (%0, %%ebx), %%mm0        \n\t"
2049                         "movq (%1, %%ebx), %%mm1        \n\t"
2050                         "movq 6(%0, %%ebx), %%mm2       \n\t"
2051                         "movq 6(%1, %%ebx), %%mm3       \n\t"
2052                         PAVGB" %%mm1, %%mm0             \n\t"
2053                         PAVGB" %%mm3, %%mm2             \n\t"
2054                         "movq %%mm0, %%mm1              \n\t"
2055                         "movq %%mm2, %%mm3              \n\t"
2056                         "psrlq $24, %%mm0               \n\t"
2057                         "psrlq $24, %%mm2               \n\t"
2058                         PAVGB" %%mm1, %%mm0             \n\t"
2059                         PAVGB" %%mm3, %%mm2             \n\t"
2060                         "punpcklbw %%mm7, %%mm0         \n\t"
2061                         "punpcklbw %%mm7, %%mm2         \n\t"
2062 #else
2063                         "movd (%0, %%ebx), %%mm0        \n\t"
2064                         "movd (%1, %%ebx), %%mm1        \n\t"
2065                         "movd 3(%0, %%ebx), %%mm2       \n\t"
2066                         "movd 3(%1, %%ebx), %%mm3       \n\t"
2067                         "punpcklbw %%mm7, %%mm0         \n\t"
2068                         "punpcklbw %%mm7, %%mm1         \n\t"
2069                         "punpcklbw %%mm7, %%mm2         \n\t"
2070                         "punpcklbw %%mm7, %%mm3         \n\t"
2071                         "paddw %%mm1, %%mm0             \n\t"
2072                         "paddw %%mm3, %%mm2             \n\t"
2073                         "paddw %%mm2, %%mm0             \n\t"
2074                         "movd 6(%0, %%ebx), %%mm4       \n\t"
2075                         "movd 6(%1, %%ebx), %%mm1       \n\t"
2076                         "movd 9(%0, %%ebx), %%mm2       \n\t"
2077                         "movd 9(%1, %%ebx), %%mm3       \n\t"
2078                         "punpcklbw %%mm7, %%mm4         \n\t"
2079                         "punpcklbw %%mm7, %%mm1         \n\t"
2080                         "punpcklbw %%mm7, %%mm2         \n\t"
2081                         "punpcklbw %%mm7, %%mm3         \n\t"
2082                         "paddw %%mm1, %%mm4             \n\t"
2083                         "paddw %%mm3, %%mm2             \n\t"
2084                         "paddw %%mm4, %%mm2             \n\t"
2085                         "psrlw $2, %%mm0                \n\t"
2086                         "psrlw $2, %%mm2                \n\t"
2087 #endif
2088                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2089                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2090
2091                         "pmaddwd %%mm0, %%mm1           \n\t"
2092                         "pmaddwd %%mm2, %%mm3           \n\t"
2093                         "pmaddwd %%mm6, %%mm0           \n\t"
2094                         "pmaddwd %%mm6, %%mm2           \n\t"
2095 #ifndef FAST_BGR2YV12
2096                         "psrad $8, %%mm0                \n\t"
2097                         "psrad $8, %%mm1                \n\t"
2098                         "psrad $8, %%mm2                \n\t"
2099                         "psrad $8, %%mm3                \n\t"
2100 #endif
2101                         "packssdw %%mm2, %%mm0          \n\t"
2102                         "packssdw %%mm3, %%mm1          \n\t"
2103                         "pmaddwd %%mm5, %%mm0           \n\t"
2104                         "pmaddwd %%mm5, %%mm1           \n\t"
2105                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2106                         "psraw $7, %%mm0                \n\t"
2107
2108 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2109                         "movq 12(%0, %%ebx), %%mm4      \n\t"
2110                         "movq 12(%1, %%ebx), %%mm1      \n\t"
2111                         "movq 18(%0, %%ebx), %%mm2      \n\t"
2112                         "movq 18(%1, %%ebx), %%mm3      \n\t"
2113                         PAVGB" %%mm1, %%mm4             \n\t"
2114                         PAVGB" %%mm3, %%mm2             \n\t"
2115                         "movq %%mm4, %%mm1              \n\t"
2116                         "movq %%mm2, %%mm3              \n\t"
2117                         "psrlq $24, %%mm4               \n\t"
2118                         "psrlq $24, %%mm2               \n\t"
2119                         PAVGB" %%mm1, %%mm4             \n\t"
2120                         PAVGB" %%mm3, %%mm2             \n\t"
2121                         "punpcklbw %%mm7, %%mm4         \n\t"
2122                         "punpcklbw %%mm7, %%mm2         \n\t"
2123 #else
2124                         "movd 12(%0, %%ebx), %%mm4      \n\t"
2125                         "movd 12(%1, %%ebx), %%mm1      \n\t"
2126                         "movd 15(%0, %%ebx), %%mm2      \n\t"
2127                         "movd 15(%1, %%ebx), %%mm3      \n\t"
2128                         "punpcklbw %%mm7, %%mm4         \n\t"
2129                         "punpcklbw %%mm7, %%mm1         \n\t"
2130                         "punpcklbw %%mm7, %%mm2         \n\t"
2131                         "punpcklbw %%mm7, %%mm3         \n\t"
2132                         "paddw %%mm1, %%mm4             \n\t"
2133                         "paddw %%mm3, %%mm2             \n\t"
2134                         "paddw %%mm2, %%mm4             \n\t"
2135                         "movd 18(%0, %%ebx), %%mm5      \n\t"
2136                         "movd 18(%1, %%ebx), %%mm1      \n\t"
2137                         "movd 21(%0, %%ebx), %%mm2      \n\t"
2138                         "movd 21(%1, %%ebx), %%mm3      \n\t"
2139                         "punpcklbw %%mm7, %%mm5         \n\t"
2140                         "punpcklbw %%mm7, %%mm1         \n\t"
2141                         "punpcklbw %%mm7, %%mm2         \n\t"
2142                         "punpcklbw %%mm7, %%mm3         \n\t"
2143                         "paddw %%mm1, %%mm5             \n\t"
2144                         "paddw %%mm3, %%mm2             \n\t"
2145                         "paddw %%mm5, %%mm2             \n\t"
2146                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2147                         "psrlw $2, %%mm4                \n\t"
2148                         "psrlw $2, %%mm2                \n\t"
2149 #endif
2150                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2151                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2152
2153                         "pmaddwd %%mm4, %%mm1           \n\t"
2154                         "pmaddwd %%mm2, %%mm3           \n\t"
2155                         "pmaddwd %%mm6, %%mm4           \n\t"
2156                         "pmaddwd %%mm6, %%mm2           \n\t"
2157 #ifndef FAST_BGR2YV12
2158                         "psrad $8, %%mm4                \n\t"
2159                         "psrad $8, %%mm1                \n\t"
2160                         "psrad $8, %%mm2                \n\t"
2161                         "psrad $8, %%mm3                \n\t"
2162 #endif
2163                         "packssdw %%mm2, %%mm4          \n\t"
2164                         "packssdw %%mm3, %%mm1          \n\t"
2165                         "pmaddwd %%mm5, %%mm4           \n\t"
2166                         "pmaddwd %%mm5, %%mm1           \n\t"
2167                         "addl $24, %%ebx                \n\t"
2168                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2169                         "psraw $7, %%mm4                \n\t"
2170
2171                         "movq %%mm0, %%mm1              \n\t"
2172                         "punpckldq %%mm4, %%mm0         \n\t"
2173                         "punpckhdq %%mm4, %%mm1         \n\t"
2174                         "packsswb %%mm1, %%mm0          \n\t"
2175                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2176
2177                         "movd %%mm0, (%2, %%eax)        \n\t"
2178                         "punpckhdq %%mm0, %%mm0         \n\t"
2179                         "movd %%mm0, (%3, %%eax)        \n\t"
2180                         "addl $4, %%eax                 \n\t"
2181                         " js 1b                         \n\t"
2182                         : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2183                         : "%eax", "%ebx"
2184                 );
2185
2186                 udst += chromStride;
2187                 vdst += chromStride;
2188                 src  += srcStride*2;
2189         }
2190
2191         asm volatile(   EMMS" \n\t"
2192                         SFENCE" \n\t"
2193                         :::"memory");
2194 #else
2195         y=0;
2196 #endif
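        /* C fallback (and, in the MMX build, the remaining bottom lines):
           fixed-point RGB->YUV using the RY..BV coefficients; chroma is
           computed from the even lines only, luma from every line */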
2197         for(; y<height; y+=2)
2198         {
2199                 unsigned i;
2200                 for(i=0; i<chromWidth; i++)
2201                 {
2202                         unsigned int b= src[6*i+0];
2203                         unsigned int g= src[6*i+1];
2204                         unsigned int r= src[6*i+2];
2205
2206                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2207                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2208                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2209
2210                         udst[i]         = U;
2211                         vdst[i]         = V;
2212                         ydst[2*i]       = Y;
2213
2214                         b= src[6*i+3];
2215                         g= src[6*i+4];
2216                         r= src[6*i+5];
2217
2218                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2219                         ydst[2*i+1]     = Y;
2220                 }
2221                 ydst += lumStride;
2222                 src  += srcStride;
2223
2224                 for(i=0; i<chromWidth; i++)
2225                 {
2226                         unsigned int b= src[6*i+0];
2227                         unsigned int g= src[6*i+1];
2228                         unsigned int r= src[6*i+2];
2229
2230                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2231
2232                         ydst[2*i]       = Y;
2233
2234                         b= src[6*i+3];
2235                         g= src[6*i+4];
2236                         r= src[6*i+5];
2237
2238                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2239                         ydst[2*i+1]     = Y;
2240                 }
2241                 udst += chromStride;
2242                 vdst += chromStride;
2243                 ydst += lumStride;
2244                 src  += srcStride;
2245         }
2246 }
2247
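/**
 * Interleaves two byte planes: dest[2*i] = src1[i], dest[2*i+1] = src2[i].
 */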
2248 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2249                             unsigned width, unsigned height, int src1Stride,
2250                             int src2Stride, int dstStride){
2251         unsigned h;
2252
2253         for(h=0; h < height; h++)
2254         {
2255                 unsigned w;
2256
2257 #ifdef HAVE_MMX
2258 #ifdef HAVE_SSE2
2259                 asm(
2260                         "xorl %%eax, %%eax              \n\t"
2261                         "1:                             \n\t"
2262                         PREFETCH" 64(%1, %%eax)         \n\t"
2263                         PREFETCH" 64(%2, %%eax)         \n\t"
2264                         "movdqa (%1, %%eax), %%xmm0     \n\t"
2265                         "movdqa (%1, %%eax), %%xmm1     \n\t"
2266                         "movdqa (%2, %%eax), %%xmm2     \n\t"
2267                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2268                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2269                         "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2270                         "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2271                         "addl $16, %%eax                        \n\t"
2272                         "cmpl %3, %%eax                 \n\t"
2273                         " jb 1b                         \n\t"
2274                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2275                         : "memory", "%eax"
2276                 );
2277 #else
2278                 asm(
2279                         "xorl %%eax, %%eax              \n\t"
2280                         "1:                             \n\t"
2281                         PREFETCH" 64(%1, %%eax)         \n\t"
2282                         PREFETCH" 64(%2, %%eax)         \n\t"
2283                         "movq (%1, %%eax), %%mm0        \n\t"
2284                         "movq 8(%1, %%eax), %%mm2       \n\t"
2285                         "movq %%mm0, %%mm1              \n\t"
2286                         "movq %%mm2, %%mm3              \n\t"
2287                         "movq (%2, %%eax), %%mm4        \n\t"
2288                         "movq 8(%2, %%eax), %%mm5       \n\t"
2289                         "punpcklbw %%mm4, %%mm0         \n\t"
2290                         "punpckhbw %%mm4, %%mm1         \n\t"
2291                         "punpcklbw %%mm5, %%mm2         \n\t"
2292                         "punpckhbw %%mm5, %%mm3         \n\t"
2293                         MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
2294                         MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
2295                         MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2296                         MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2297                         "addl $16, %%eax                        \n\t"
2298                         "cmpl %3, %%eax                 \n\t"
2299                         " jb 1b                         \n\t"
2300                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2301                         : "memory", "%eax"
2302                 );
2303 #endif
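                /* scalar tail for the last width%16 bytes of the row */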
2304                 for(w= (width&(~15)); w < width; w++)
2305                 {
2306                         dest[2*w+0] = src1[w];
2307                         dest[2*w+1] = src2[w];
2308                 }
2309 #else
2310                 for(w=0; w < width; w++)
2311                 {
2312                         dest[2*w+0] = src1[w];
2313                         dest[2*w+1] = src2[w];
2314                 }
2315 #endif
2316                 dest += dstStride;
2317                 src1 += src1Stride;
2318                 src2 += src2Stride;
2319         }
2320 #ifdef HAVE_MMX
2321         asm(
2322                 EMMS" \n\t"
2323                 SFENCE" \n\t"
2324                 ::: "memory"
2325                 );
2326 #endif
2327 }
2328
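/**
 * 2x nearest-neighbour upscale of two chroma planes: every source byte is
 * duplicated horizontally and every source line is used for two output lines.
 */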
2329 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2330                         uint8_t *dst1, uint8_t *dst2,
2331                         unsigned width, unsigned height,
2332                         int srcStride1, int srcStride2,
2333                         int dstStride1, int dstStride2)
2334 {
2335     unsigned int y,x,h;
2336     int w;
2337     w=width/2; h=height/2;
2338 #ifdef HAVE_MMX
2339     asm volatile(
2340         PREFETCH" %0\n\t"
2341         PREFETCH" %1\n\t"
2342         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2343 #endif
2344     for(y=0;y<h;y++){
2345         const uint8_t* s1=src1+srcStride1*(y>>1);
2346         uint8_t* d=dst1+dstStride1*y;
2347         x=0;
2348 #ifdef HAVE_MMX
2349         for(;x<w-31;x+=32)
2350         {
2351             asm volatile(
2352                 PREFETCH" 32%1\n\t"
2353                 "movq   %1, %%mm0\n\t"
2354                 "movq   8%1, %%mm2\n\t"
2355                 "movq   16%1, %%mm4\n\t"
2356                 "movq   24%1, %%mm6\n\t"
2357                 "movq   %%mm0, %%mm1\n\t"
2358                 "movq   %%mm2, %%mm3\n\t"
2359                 "movq   %%mm4, %%mm5\n\t"
2360                 "movq   %%mm6, %%mm7\n\t"
2361                 "punpcklbw %%mm0, %%mm0\n\t"
2362                 "punpckhbw %%mm1, %%mm1\n\t"
2363                 "punpcklbw %%mm2, %%mm2\n\t"
2364                 "punpckhbw %%mm3, %%mm3\n\t"
2365                 "punpcklbw %%mm4, %%mm4\n\t"
2366                 "punpckhbw %%mm5, %%mm5\n\t"
2367                 "punpcklbw %%mm6, %%mm6\n\t"
2368                 "punpckhbw %%mm7, %%mm7\n\t"
2369                 MOVNTQ" %%mm0, %0\n\t"
2370                 MOVNTQ" %%mm1, 8%0\n\t"
2371                 MOVNTQ" %%mm2, 16%0\n\t"
2372                 MOVNTQ" %%mm3, 24%0\n\t"
2373                 MOVNTQ" %%mm4, 32%0\n\t"
2374                 MOVNTQ" %%mm5, 40%0\n\t"
2375                 MOVNTQ" %%mm6, 48%0\n\t"
2376                 MOVNTQ" %%mm7, 56%0"
2377                 :"=m"(d[2*x])
2378                 :"m"(s1[x])
2379                 :"memory");
2380         }
2381 #endif
2382         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2383     }
2384     for(y=0;y<h;y++){
2385         const uint8_t* s2=src2+srcStride2*(y>>1);
2386         uint8_t* d=dst2+dstStride2*y;
2387         x=0;
2388 #ifdef HAVE_MMX
2389         for(;x<w-31;x+=32)
2390         {
2391             asm volatile(
2392                 PREFETCH" 32%1\n\t"
2393                 "movq   %1, %%mm0\n\t"
2394                 "movq   8%1, %%mm2\n\t"
2395                 "movq   16%1, %%mm4\n\t"
2396                 "movq   24%1, %%mm6\n\t"
2397                 "movq   %%mm0, %%mm1\n\t"
2398                 "movq   %%mm2, %%mm3\n\t"
2399                 "movq   %%mm4, %%mm5\n\t"
2400                 "movq   %%mm6, %%mm7\n\t"
2401                 "punpcklbw %%mm0, %%mm0\n\t"
2402                 "punpckhbw %%mm1, %%mm1\n\t"
2403                 "punpcklbw %%mm2, %%mm2\n\t"
2404                 "punpckhbw %%mm3, %%mm3\n\t"
2405                 "punpcklbw %%mm4, %%mm4\n\t"
2406                 "punpckhbw %%mm5, %%mm5\n\t"
2407                 "punpcklbw %%mm6, %%mm6\n\t"
2408                 "punpckhbw %%mm7, %%mm7\n\t"
2409                 MOVNTQ" %%mm0, %0\n\t"
2410                 MOVNTQ" %%mm1, 8%0\n\t"
2411                 MOVNTQ" %%mm2, 16%0\n\t"
2412                 MOVNTQ" %%mm3, 24%0\n\t"
2413                 MOVNTQ" %%mm4, 32%0\n\t"
2414                 MOVNTQ" %%mm5, 40%0\n\t"
2415                 MOVNTQ" %%mm6, 48%0\n\t"
2416                 MOVNTQ" %%mm7, 56%0"
2417                 :"=m"(d[2*x])
2418                 :"m"(s2[x])
2419                 :"memory");
2420         }
2421 #endif
2422         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2423     }
2424 #ifdef HAVE_MMX
2425         asm(
2426                 EMMS" \n\t"
2427                 SFENCE" \n\t"
2428                 ::: "memory"
2429                 );
2430 #endif
2431 }
2432
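/**
 * Interleaves planar YVU9 into packed YUY2; the quarter-resolution chroma is
 * reused for 4 luma lines (note the y>>2 indexing) and replicated horizontally.
 */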
2433 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2434                         uint8_t *dst,
2435                         unsigned width, unsigned height,
2436                         int srcStride1, int srcStride2,
2437                         int srcStride3, int dstStride)
2438 {
2439     unsigned y,x,w,h;
2440     w=width/2; h=height;
2441     for(y=0;y<h;y++){
2442         const uint8_t* yp=src1+srcStride1*y;
2443         const uint8_t* up=src2+srcStride2*(y>>2);
2444         const uint8_t* vp=src3+srcStride3*(y>>2);
2445         uint8_t* d=dst+dstStride*y;
2446         x=0;
2447 #ifdef HAVE_MMX
2448         for(;x<w-7;x+=8)
2449         {
2450             asm volatile(
2451                 PREFETCH" 32(%1, %0)\n\t"
2452                 PREFETCH" 32(%2, %0)\n\t"
2453                 PREFETCH" 32(%3, %0)\n\t"
2454                 "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2455                 "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2456                 "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
2457                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2458                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2459                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2460                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2461                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2462                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2463                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2464
2465                 "movq   %%mm1, %%mm6\n\t"
2466                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2467                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2468                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2469                 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2470                 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2471                 
2472                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2473                 "movq   8(%1, %0, 4), %%mm0\n\t"
2474                 "movq   %%mm0, %%mm3\n\t"
2475                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2476                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2477                 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2478                 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2479
2480                 "movq   %%mm4, %%mm6\n\t"
2481                 "movq   16(%1, %0, 4), %%mm0\n\t"
2482                 "movq   %%mm0, %%mm3\n\t"
2483                 "punpcklbw %%mm5, %%mm4\n\t"
2484                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2485                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2486                 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2487                 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2488                 
2489                 "punpckhbw %%mm5, %%mm6\n\t"
2490                 "movq   24(%1, %0, 4), %%mm0\n\t"
2491                 "movq   %%mm0, %%mm3\n\t"
2492                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2493                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2494                 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2495                 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2496
2497                 : "+r" (x)
2498                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2499                 :"memory");
2500         }
2501 #endif
2502         for(; x<w; x++)
2503         {
2504             const int x2= x<<2;
2505             d[8*x+0]=yp[x2];
2506             d[8*x+1]=up[x];
2507             d[8*x+2]=yp[x2+1];
2508             d[8*x+3]=vp[x];
2509             d[8*x+4]=yp[x2+2];
2510             d[8*x+5]=up[x];
2511             d[8*x+6]=yp[x2+3];
2512             d[8*x+7]=vp[x];
2513         }
2514     }
2515 #ifdef HAVE_MMX
2516         asm(
2517                 EMMS" \n\t"
2518                 SFENCE" \n\t"
2519                 ::: "memory"
2520                 );
2521 #endif
2522 }