1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB converter
4  *  plus:        Software PAL8 to RGB converter
5  *               Software YUV to YUV converter
6  *               Software YUV to RGB converter
7  *  Written by Nick Kurshev.
8  *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) (under GPL)
9  */
10
11 #include <stddef.h>
12 #include <inttypes.h> /* for __WORDSIZE */
13
14 #ifndef __WORDSIZE
15 // #warning You have a misconfigured system and will probably lose performance!
16 #define __WORDSIZE MP_WORDSIZE
17 #endif
18
19 #undef PREFETCH
20 #undef MOVNTQ
21 #undef EMMS
22 #undef SFENCE
23 #undef MMREG_SIZE
24 #undef PREFETCHW
25 #undef PAVGB
26
27 #ifdef HAVE_SSE2
28 #define MMREG_SIZE 16
29 #else
30 #define MMREG_SIZE 8
31 #endif
32
33 #ifdef HAVE_3DNOW
34 #define PREFETCH  "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB     "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #define PAVGB     "pavgb"
41 #else
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
44 #endif
45
46 #ifdef HAVE_3DNOW
47 /* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */
48 #define EMMS     "femms"
49 #else
50 #define EMMS     "emms"
51 #endif
52
53 #ifdef HAVE_MMX2
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
56 #else
57 #define MOVNTQ "movq"
58 #define SFENCE "/nop"
59 #endif
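/* With MMX2, MOVNTQ is a non-temporal (streaming) store: it bypasses the cache,
   which suits these write-once destination buffers, and SFENCE makes the queued
   stores globally visible before each function returns. Without MMX2 both
   degrade to a plain movq and a nop, as defined above. */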
60
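/* rgb24to32: expand packed 24 bpp pixels to 32 bpp by appending a zero byte to
   every 3-byte pixel. The MMX path assembles two pixels per register with
   movd/punpckldq from overlapping 3-byte offsets and clears the junk high byte
   with mask32; the trailing C loop handles the last few pixels. */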
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
62 {
63   uint8_t *dest = dst;
64   const uint8_t *s = src;
65   const uint8_t *end;
66 #ifdef HAVE_MMX
67   const uint8_t *mm_end;
68 #endif
69   end = s + src_size;
70 #ifdef HAVE_MMX
71   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
72   mm_end = end - 23;
73   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
74   while(s < mm_end)
75   {
76     __asm __volatile(
77         PREFETCH"       32%1\n\t"
78         "movd   %1, %%mm0\n\t"
79         "punpckldq 3%1, %%mm0\n\t"
80         "movd   6%1, %%mm1\n\t"
81         "punpckldq 9%1, %%mm1\n\t"
82         "movd   12%1, %%mm2\n\t"
83         "punpckldq 15%1, %%mm2\n\t"
84         "movd   18%1, %%mm3\n\t"
85         "punpckldq 21%1, %%mm3\n\t"
86         "pand   %%mm7, %%mm0\n\t"
87         "pand   %%mm7, %%mm1\n\t"
88         "pand   %%mm7, %%mm2\n\t"
89         "pand   %%mm7, %%mm3\n\t"
90         MOVNTQ" %%mm0, %0\n\t"
91         MOVNTQ" %%mm1, 8%0\n\t"
92         MOVNTQ" %%mm2, 16%0\n\t"
93         MOVNTQ" %%mm3, 24%0"
94         :"=m"(*dest)
95         :"m"(*s)
96         :"memory");
97     dest += 32;
98     s += 24;
99   }
100   __asm __volatile(SFENCE:::"memory");
101   __asm __volatile(EMMS:::"memory");
102 #endif
103   while(s < end)
104   {
105     *dest++ = *s++;
106     *dest++ = *s++;
107     *dest++ = *s++;
108     *dest++ = 0;
109   }
110 }
111
112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
113 {
114   uint8_t *dest = dst;
115   const uint8_t *s = src;
116   const uint8_t *end;
117 #ifdef HAVE_MMX
118   const uint8_t *mm_end;
119 #endif
120   end = s + src_size;
121 #ifdef HAVE_MMX
122   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
123   mm_end = end - 31;
124   while(s < mm_end)
125   {
126     __asm __volatile(
127         PREFETCH"       32%1\n\t"
128         "movq   %1, %%mm0\n\t"
129         "movq   8%1, %%mm1\n\t"
130         "movq   16%1, %%mm4\n\t"
131         "movq   24%1, %%mm5\n\t"
132         "movq   %%mm0, %%mm2\n\t"
133         "movq   %%mm1, %%mm3\n\t"
134         "movq   %%mm4, %%mm6\n\t"
135         "movq   %%mm5, %%mm7\n\t"
136         "psrlq  $8, %%mm2\n\t"
137         "psrlq  $8, %%mm3\n\t"
138         "psrlq  $8, %%mm6\n\t"
139         "psrlq  $8, %%mm7\n\t"
140         "pand   %2, %%mm0\n\t"
141         "pand   %2, %%mm1\n\t"
142         "pand   %2, %%mm4\n\t"
143         "pand   %2, %%mm5\n\t"
144         "pand   %3, %%mm2\n\t"
145         "pand   %3, %%mm3\n\t"
146         "pand   %3, %%mm6\n\t"
147         "pand   %3, %%mm7\n\t"
148         "por    %%mm2, %%mm0\n\t"
149         "por    %%mm3, %%mm1\n\t"
150         "por    %%mm6, %%mm4\n\t"
151         "por    %%mm7, %%mm5\n\t"
152
153         "movq   %%mm1, %%mm2\n\t"
154         "movq   %%mm4, %%mm3\n\t"
155         "psllq  $48, %%mm2\n\t"
156         "psllq  $32, %%mm3\n\t"
157         "pand   %4, %%mm2\n\t"
158         "pand   %5, %%mm3\n\t"
159         "por    %%mm2, %%mm0\n\t"
160         "psrlq  $16, %%mm1\n\t"
161         "psrlq  $32, %%mm4\n\t"
162         "psllq  $16, %%mm5\n\t"
163         "por    %%mm3, %%mm1\n\t"
164         "pand   %6, %%mm5\n\t"
165         "por    %%mm5, %%mm4\n\t"
166
167         MOVNTQ" %%mm0, %0\n\t"
168         MOVNTQ" %%mm1, 8%0\n\t"
169         MOVNTQ" %%mm4, 16%0"
170         :"=m"(*dest)
171         :"m"(*s),"m"(mask24l),
172          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
173         :"memory");
174     dest += 24;
175     s += 32;
176   }
177   __asm __volatile(SFENCE:::"memory");
178   __asm __volatile(EMMS:::"memory");
179 #endif
180   while(s < end)
181   {
182     *dest++ = *s++;
183     *dest++ = *s++;
184     *dest++ = *s++;
185     s++;
186   }
187 }
188
189 /*
190  Original by Strepto/Astral
191  ported to gcc & bugfixed: A'rpi
192  MMX2, 3DNOW optimization by Nick Kurshev
193  32-bit C version, and the and&add trick, by Michael Niedermayer
194 */
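/* A scalar sketch of the and&add trick (this is exactly the form the 32-bit C
 * fallback below applies to two pixels at a time): an RGB555 word is
 * 0RRRRRGGGGGBBBBB and RGB565 needs red and green one bit higher while blue
 * stays put, so adding the masked R+G field to the pixel doubles exactly those
 * bits:
 *
 *     (x & 0x7FFF) + (x & 0x7FE0)  ==  (x & 0x001F) | ((x & 0x7FE0) << 1)
 *
 * with the new green LSB zero-filled. The MMX loop does the same on four pixels
 * per register, with mask15s held in %%mm4. */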
195 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
196 {
197   register const uint8_t* s=src;
198   register uint8_t* d=dst;
199   register const uint8_t *end;
200   const uint8_t *mm_end;
201   end = s + src_size;
202 #ifdef HAVE_MMX
203   __asm __volatile(PREFETCH"    %0"::"m"(*s));
204   __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
205   mm_end = end - 15;
206   while(s<mm_end)
207   {
208         __asm __volatile(
209                 PREFETCH"       32%1\n\t"
210                 "movq   %1, %%mm0\n\t"
211                 "movq   8%1, %%mm2\n\t"
212                 "movq   %%mm0, %%mm1\n\t"
213                 "movq   %%mm2, %%mm3\n\t"
214                 "pand   %%mm4, %%mm0\n\t"
215                 "pand   %%mm4, %%mm2\n\t"
216                 "paddw  %%mm1, %%mm0\n\t"
217                 "paddw  %%mm3, %%mm2\n\t"
218                 MOVNTQ" %%mm0, %0\n\t"
219                 MOVNTQ" %%mm2, 8%0"
220                 :"=m"(*d)
221                 :"m"(*s)
222                 );
223         d+=16;
224         s+=16;
225   }
226   __asm __volatile(SFENCE:::"memory");
227   __asm __volatile(EMMS:::"memory");
228 #endif
229     mm_end = end - 3;
230     while(s < mm_end)
231     {
232         register unsigned x= *((uint32_t *)s);
233         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
234         d+=4;
235         s+=4;
236     }
237     if(s < end)
238     {
239         register unsigned short x= *((uint16_t *)s);
240         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
241     }
242 }
243
244 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
245 {
246   register const uint8_t* s=src;
247   register uint8_t* d=dst;
248   register const uint8_t *end;
249   const uint8_t *mm_end;
250   end = s + src_size;
251 #ifdef HAVE_MMX
252   __asm __volatile(PREFETCH"    %0"::"m"(*s));
253   __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
254   __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
255   mm_end = end - 15;
256   while(s<mm_end)
257   {
258         __asm __volatile(
259                 PREFETCH"       32%1\n\t"
260                 "movq   %1, %%mm0\n\t"
261                 "movq   8%1, %%mm2\n\t"
262                 "movq   %%mm0, %%mm1\n\t"
263                 "movq   %%mm2, %%mm3\n\t"
264                 "psrlq  $1, %%mm0\n\t"
265                 "psrlq  $1, %%mm2\n\t"
266                 "pand   %%mm7, %%mm0\n\t"
267                 "pand   %%mm7, %%mm2\n\t"
268                 "pand   %%mm6, %%mm1\n\t"
269                 "pand   %%mm6, %%mm3\n\t"
270                 "por    %%mm1, %%mm0\n\t"
271                 "por    %%mm3, %%mm2\n\t"
272                 MOVNTQ" %%mm0, %0\n\t"
273                 MOVNTQ" %%mm2, 8%0"
274                 :"=m"(*d)
275                 :"m"(*s)
276                 );
277         d+=16;
278         s+=16;
279   }
280   __asm __volatile(SFENCE:::"memory");
281   __asm __volatile(EMMS:::"memory");
282 #endif
283     mm_end = end - 3;
284     while(s < mm_end)
285     {
286         register uint32_t x= *((uint32_t *)s);
287         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
288         s+=4;
289         d+=4;
290     }
291     if(s < end)
292     {
293         register uint16_t x= *((uint16_t *)s);
294         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
295         s+=2;
296         d+=2;
297     }
298 }
299
300 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
301 {
302         const uint8_t *s = src;
303         const uint8_t *end;
304 #ifdef HAVE_MMX
305         const uint8_t *mm_end;
306 #endif
307         uint16_t *d = (uint16_t *)dst;
308         end = s + src_size;
309 #ifdef HAVE_MMX
310         mm_end = end - 15;
311 #if 1 // is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it's slightly faster)
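        /* This branch splits each pixel into a blue+red part and a green part,
           then lets one pmaddwd per register scale and position blue and red at
           once; the green is ORed back in and a single shift drops the packed
           value into 565 layout (pixels 1 and 3 are shifted left instead, so the
           final por interleaves them into the high halves). The #else branch
           below is the plain shift-and-mask version; which one is faster depends
           on the CPU's multiplier, hence the FIXME above. */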
312         asm volatile(
313                 "movq %3, %%mm5                 \n\t"
314                 "movq %4, %%mm6                 \n\t"
315                 "movq %5, %%mm7                 \n\t"
316                 ".balign 16                     \n\t"
317                 "1:                             \n\t"
318                 PREFETCH" 32(%1)                \n\t"
319                 "movd   (%1), %%mm0             \n\t"
320                 "movd   4(%1), %%mm3            \n\t"
321                 "punpckldq 8(%1), %%mm0         \n\t"
322                 "punpckldq 12(%1), %%mm3        \n\t"
323                 "movq %%mm0, %%mm1              \n\t"
324                 "movq %%mm3, %%mm4              \n\t"
325                 "pand %%mm6, %%mm0              \n\t"
326                 "pand %%mm6, %%mm3              \n\t"
327                 "pmaddwd %%mm7, %%mm0           \n\t"
328                 "pmaddwd %%mm7, %%mm3           \n\t"
329                 "pand %%mm5, %%mm1              \n\t"
330                 "pand %%mm5, %%mm4              \n\t"
331                 "por %%mm1, %%mm0               \n\t"   
332                 "por %%mm4, %%mm3               \n\t"
333                 "psrld $5, %%mm0                \n\t"
334                 "pslld $11, %%mm3               \n\t"
335                 "por %%mm3, %%mm0               \n\t"
336                 MOVNTQ" %%mm0, (%0)             \n\t"
337                 "addl $16, %1                   \n\t"
338                 "addl $8, %0                    \n\t"
339                 "cmpl %2, %1                    \n\t"
340                 " jb 1b                         \n\t"
341                 : "+r" (d), "+r"(s)
342                 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
343         );
344 #else
345         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
346         __asm __volatile(
347             "movq       %0, %%mm7\n\t"
348             "movq       %1, %%mm6\n\t"
349             ::"m"(red_16mask),"m"(green_16mask));
350         while(s < mm_end)
351         {
352             __asm __volatile(
353                 PREFETCH" 32%1\n\t"
354                 "movd   %1, %%mm0\n\t"
355                 "movd   4%1, %%mm3\n\t"
356                 "punpckldq 8%1, %%mm0\n\t"
357                 "punpckldq 12%1, %%mm3\n\t"
358                 "movq   %%mm0, %%mm1\n\t"
359                 "movq   %%mm0, %%mm2\n\t"
360                 "movq   %%mm3, %%mm4\n\t"
361                 "movq   %%mm3, %%mm5\n\t"
362                 "psrlq  $3, %%mm0\n\t"
363                 "psrlq  $3, %%mm3\n\t"
364                 "pand   %2, %%mm0\n\t"
365                 "pand   %2, %%mm3\n\t"
366                 "psrlq  $5, %%mm1\n\t"
367                 "psrlq  $5, %%mm4\n\t"
368                 "pand   %%mm6, %%mm1\n\t"
369                 "pand   %%mm6, %%mm4\n\t"
370                 "psrlq  $8, %%mm2\n\t"
371                 "psrlq  $8, %%mm5\n\t"
372                 "pand   %%mm7, %%mm2\n\t"
373                 "pand   %%mm7, %%mm5\n\t"
374                 "por    %%mm1, %%mm0\n\t"
375                 "por    %%mm4, %%mm3\n\t"
376                 "por    %%mm2, %%mm0\n\t"
377                 "por    %%mm5, %%mm3\n\t"
378                 "psllq  $16, %%mm3\n\t"
379                 "por    %%mm3, %%mm0\n\t"
380                 MOVNTQ" %%mm0, %0\n\t"
381                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
382                 d += 4;
383                 s += 16;
384         }
385 #endif
386         __asm __volatile(SFENCE:::"memory");
387         __asm __volatile(EMMS:::"memory");
388 #endif
389         while(s < end)
390         {
391                 const int src= *(const uint32_t*)s; s += 4;
392                 *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393 //              *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
394         }
395 }
396
397 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
398 {
399         const uint8_t *s = src;
400         const uint8_t *end;
401 #ifdef HAVE_MMX
402         const uint8_t *mm_end;
403 #endif
404         uint16_t *d = (uint16_t *)dst;
405         end = s + src_size;
406 #ifdef HAVE_MMX
407         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
408         __asm __volatile(
409             "movq       %0, %%mm7\n\t"
410             "movq       %1, %%mm6\n\t"
411             ::"m"(red_16mask),"m"(green_16mask));
412         mm_end = end - 15;
413         while(s < mm_end)
414         {
415             __asm __volatile(
416                 PREFETCH" 32%1\n\t"
417                 "movd   %1, %%mm0\n\t"
418                 "movd   4%1, %%mm3\n\t"
419                 "punpckldq 8%1, %%mm0\n\t"
420                 "punpckldq 12%1, %%mm3\n\t"
421                 "movq   %%mm0, %%mm1\n\t"
422                 "movq   %%mm0, %%mm2\n\t"
423                 "movq   %%mm3, %%mm4\n\t"
424                 "movq   %%mm3, %%mm5\n\t"
425                 "psllq  $8, %%mm0\n\t"
426                 "psllq  $8, %%mm3\n\t"
427                 "pand   %%mm7, %%mm0\n\t"
428                 "pand   %%mm7, %%mm3\n\t"
429                 "psrlq  $5, %%mm1\n\t"
430                 "psrlq  $5, %%mm4\n\t"
431                 "pand   %%mm6, %%mm1\n\t"
432                 "pand   %%mm6, %%mm4\n\t"
433                 "psrlq  $19, %%mm2\n\t"
434                 "psrlq  $19, %%mm5\n\t"
435                 "pand   %2, %%mm2\n\t"
436                 "pand   %2, %%mm5\n\t"
437                 "por    %%mm1, %%mm0\n\t"
438                 "por    %%mm4, %%mm3\n\t"
439                 "por    %%mm2, %%mm0\n\t"
440                 "por    %%mm5, %%mm3\n\t"
441                 "psllq  $16, %%mm3\n\t"
442                 "por    %%mm3, %%mm0\n\t"
443                 MOVNTQ" %%mm0, %0\n\t"
444                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
445                 d += 4;
446                 s += 16;
447         }
448         __asm __volatile(SFENCE:::"memory");
449         __asm __volatile(EMMS:::"memory");
450 #endif
451         while(s < end)
452         {
453                 const int src= *(const uint32_t*)s; s += 4;
454                 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
455         }
456 }
457
458 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
459 {
460         const uint8_t *s = src;
461         const uint8_t *end;
462 #ifdef HAVE_MMX
463         const uint8_t *mm_end;
464 #endif
465         uint16_t *d = (uint16_t *)dst;
466         end = s + src_size;
467 #ifdef HAVE_MMX
468         mm_end = end - 15;
469 #if 1 // is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it's slightly faster)
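        /* Same pmaddwd approach as in rgb32to16 above, only targeting a 5-bit
           green field, hence the psrld $6 / pslld $10 at the end. */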
470         asm volatile(
471                 "movq %3, %%mm5                 \n\t"
472                 "movq %4, %%mm6                 \n\t"
473                 "movq %5, %%mm7                 \n\t"
474                 ".balign 16                     \n\t"
475                 "1:                             \n\t"
476                 PREFETCH" 32(%1)                \n\t"
477                 "movd   (%1), %%mm0             \n\t"
478                 "movd   4(%1), %%mm3            \n\t"
479                 "punpckldq 8(%1), %%mm0         \n\t"
480                 "punpckldq 12(%1), %%mm3        \n\t"
481                 "movq %%mm0, %%mm1              \n\t"
482                 "movq %%mm3, %%mm4              \n\t"
483                 "pand %%mm6, %%mm0              \n\t"
484                 "pand %%mm6, %%mm3              \n\t"
485                 "pmaddwd %%mm7, %%mm0           \n\t"
486                 "pmaddwd %%mm7, %%mm3           \n\t"
487                 "pand %%mm5, %%mm1              \n\t"
488                 "pand %%mm5, %%mm4              \n\t"
489                 "por %%mm1, %%mm0               \n\t"   
490                 "por %%mm4, %%mm3               \n\t"
491                 "psrld $6, %%mm0                \n\t"
492                 "pslld $10, %%mm3               \n\t"
493                 "por %%mm3, %%mm0               \n\t"
494                 MOVNTQ" %%mm0, (%0)             \n\t"
495                 "addl $16, %1                   \n\t"
496                 "addl $8, %0                    \n\t"
497                 "cmpl %2, %1                    \n\t"
498                 " jb 1b                         \n\t"
499                 : "+r" (d), "+r"(s)
500                 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
501         );
502 #else
503         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
504         __asm __volatile(
505             "movq       %0, %%mm7\n\t"
506             "movq       %1, %%mm6\n\t"
507             ::"m"(red_15mask),"m"(green_15mask));
508         while(s < mm_end)
509         {
510             __asm __volatile(
511                 PREFETCH" 32%1\n\t"
512                 "movd   %1, %%mm0\n\t"
513                 "movd   4%1, %%mm3\n\t"
514                 "punpckldq 8%1, %%mm0\n\t"
515                 "punpckldq 12%1, %%mm3\n\t"
516                 "movq   %%mm0, %%mm1\n\t"
517                 "movq   %%mm0, %%mm2\n\t"
518                 "movq   %%mm3, %%mm4\n\t"
519                 "movq   %%mm3, %%mm5\n\t"
520                 "psrlq  $3, %%mm0\n\t"
521                 "psrlq  $3, %%mm3\n\t"
522                 "pand   %2, %%mm0\n\t"
523                 "pand   %2, %%mm3\n\t"
524                 "psrlq  $6, %%mm1\n\t"
525                 "psrlq  $6, %%mm4\n\t"
526                 "pand   %%mm6, %%mm1\n\t"
527                 "pand   %%mm6, %%mm4\n\t"
528                 "psrlq  $9, %%mm2\n\t"
529                 "psrlq  $9, %%mm5\n\t"
530                 "pand   %%mm7, %%mm2\n\t"
531                 "pand   %%mm7, %%mm5\n\t"
532                 "por    %%mm1, %%mm0\n\t"
533                 "por    %%mm4, %%mm3\n\t"
534                 "por    %%mm2, %%mm0\n\t"
535                 "por    %%mm5, %%mm3\n\t"
536                 "psllq  $16, %%mm3\n\t"
537                 "por    %%mm3, %%mm0\n\t"
538                 MOVNTQ" %%mm0, %0\n\t"
539                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
540                 d += 4;
541                 s += 16;
542         }
543 #endif
544         __asm __volatile(SFENCE:::"memory");
545         __asm __volatile(EMMS:::"memory");
546 #endif
547         while(s < end)
548         {
549                 const int src= *(const uint32_t*)s; s += 4;
550                 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
551         }
552 }
553
554 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
555 {
556         const uint8_t *s = src;
557         const uint8_t *end;
558 #ifdef HAVE_MMX
559         const uint8_t *mm_end;
560 #endif
561         uint16_t *d = (uint16_t *)dst;
562         end = s + src_size;
563 #ifdef HAVE_MMX
564         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
565         __asm __volatile(
566             "movq       %0, %%mm7\n\t"
567             "movq       %1, %%mm6\n\t"
568             ::"m"(red_15mask),"m"(green_15mask));
569         mm_end = end - 15;
570         while(s < mm_end)
571         {
572             __asm __volatile(
573                 PREFETCH" 32%1\n\t"
574                 "movd   %1, %%mm0\n\t"
575                 "movd   4%1, %%mm3\n\t"
576                 "punpckldq 8%1, %%mm0\n\t"
577                 "punpckldq 12%1, %%mm3\n\t"
578                 "movq   %%mm0, %%mm1\n\t"
579                 "movq   %%mm0, %%mm2\n\t"
580                 "movq   %%mm3, %%mm4\n\t"
581                 "movq   %%mm3, %%mm5\n\t"
582                 "psllq  $7, %%mm0\n\t"
583                 "psllq  $7, %%mm3\n\t"
584                 "pand   %%mm7, %%mm0\n\t"
585                 "pand   %%mm7, %%mm3\n\t"
586                 "psrlq  $6, %%mm1\n\t"
587                 "psrlq  $6, %%mm4\n\t"
588                 "pand   %%mm6, %%mm1\n\t"
589                 "pand   %%mm6, %%mm4\n\t"
590                 "psrlq  $19, %%mm2\n\t"
591                 "psrlq  $19, %%mm5\n\t"
592                 "pand   %2, %%mm2\n\t"
593                 "pand   %2, %%mm5\n\t"
594                 "por    %%mm1, %%mm0\n\t"
595                 "por    %%mm4, %%mm3\n\t"
596                 "por    %%mm2, %%mm0\n\t"
597                 "por    %%mm5, %%mm3\n\t"
598                 "psllq  $16, %%mm3\n\t"
599                 "por    %%mm3, %%mm0\n\t"
600                 MOVNTQ" %%mm0, %0\n\t"
601                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
602                 d += 4;
603                 s += 16;
604         }
605         __asm __volatile(SFENCE:::"memory");
606         __asm __volatile(EMMS:::"memory");
607 #endif
608         while(s < end)
609         {
610                 const int src= *(const uint32_t*)s; s += 4;
611                 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
612         }
613 }
614
615 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
616 {
617         const uint8_t *s = src;
618         const uint8_t *end;
619 #ifdef HAVE_MMX
620         const uint8_t *mm_end;
621 #endif
622         uint16_t *d = (uint16_t *)dst;
623         end = s + src_size;
624 #ifdef HAVE_MMX
625         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
626         __asm __volatile(
627             "movq       %0, %%mm7\n\t"
628             "movq       %1, %%mm6\n\t"
629             ::"m"(red_16mask),"m"(green_16mask));
630         mm_end = end - 11;
631         while(s < mm_end)
632         {
633             __asm __volatile(
634                 PREFETCH" 32%1\n\t"
635                 "movd   %1, %%mm0\n\t"
636                 "movd   3%1, %%mm3\n\t"
637                 "punpckldq 6%1, %%mm0\n\t"
638                 "punpckldq 9%1, %%mm3\n\t"
639                 "movq   %%mm0, %%mm1\n\t"
640                 "movq   %%mm0, %%mm2\n\t"
641                 "movq   %%mm3, %%mm4\n\t"
642                 "movq   %%mm3, %%mm5\n\t"
643                 "psrlq  $3, %%mm0\n\t"
644                 "psrlq  $3, %%mm3\n\t"
645                 "pand   %2, %%mm0\n\t"
646                 "pand   %2, %%mm3\n\t"
647                 "psrlq  $5, %%mm1\n\t"
648                 "psrlq  $5, %%mm4\n\t"
649                 "pand   %%mm6, %%mm1\n\t"
650                 "pand   %%mm6, %%mm4\n\t"
651                 "psrlq  $8, %%mm2\n\t"
652                 "psrlq  $8, %%mm5\n\t"
653                 "pand   %%mm7, %%mm2\n\t"
654                 "pand   %%mm7, %%mm5\n\t"
655                 "por    %%mm1, %%mm0\n\t"
656                 "por    %%mm4, %%mm3\n\t"
657                 "por    %%mm2, %%mm0\n\t"
658                 "por    %%mm5, %%mm3\n\t"
659                 "psllq  $16, %%mm3\n\t"
660                 "por    %%mm3, %%mm0\n\t"
661                 MOVNTQ" %%mm0, %0\n\t"
662                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
663                 d += 4;
664                 s += 12;
665         }
666         __asm __volatile(SFENCE:::"memory");
667         __asm __volatile(EMMS:::"memory");
668 #endif
669         while(s < end)
670         {
671                 const int b= *s++;
672                 const int g= *s++;
673                 const int r= *s++;
674                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
675         }
676 }
677
678 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
679 {
680         const uint8_t *s = src;
681         const uint8_t *end;
682 #ifdef HAVE_MMX
683         const uint8_t *mm_end;
684 #endif
685         uint16_t *d = (uint16_t *)dst;
686         end = s + src_size;
687 #ifdef HAVE_MMX
688         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
689         __asm __volatile(
690             "movq       %0, %%mm7\n\t"
691             "movq       %1, %%mm6\n\t"
692             ::"m"(red_16mask),"m"(green_16mask));
693         mm_end = end - 15;
694         while(s < mm_end)
695         {
696             __asm __volatile(
697                 PREFETCH" 32%1\n\t"
698                 "movd   %1, %%mm0\n\t"
699                 "movd   3%1, %%mm3\n\t"
700                 "punpckldq 6%1, %%mm0\n\t"
701                 "punpckldq 9%1, %%mm3\n\t"
702                 "movq   %%mm0, %%mm1\n\t"
703                 "movq   %%mm0, %%mm2\n\t"
704                 "movq   %%mm3, %%mm4\n\t"
705                 "movq   %%mm3, %%mm5\n\t"
706                 "psllq  $8, %%mm0\n\t"
707                 "psllq  $8, %%mm3\n\t"
708                 "pand   %%mm7, %%mm0\n\t"
709                 "pand   %%mm7, %%mm3\n\t"
710                 "psrlq  $5, %%mm1\n\t"
711                 "psrlq  $5, %%mm4\n\t"
712                 "pand   %%mm6, %%mm1\n\t"
713                 "pand   %%mm6, %%mm4\n\t"
714                 "psrlq  $19, %%mm2\n\t"
715                 "psrlq  $19, %%mm5\n\t"
716                 "pand   %2, %%mm2\n\t"
717                 "pand   %2, %%mm5\n\t"
718                 "por    %%mm1, %%mm0\n\t"
719                 "por    %%mm4, %%mm3\n\t"
720                 "por    %%mm2, %%mm0\n\t"
721                 "por    %%mm5, %%mm3\n\t"
722                 "psllq  $16, %%mm3\n\t"
723                 "por    %%mm3, %%mm0\n\t"
724                 MOVNTQ" %%mm0, %0\n\t"
725                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
726                 d += 4;
727                 s += 12;
728         }
729         __asm __volatile(SFENCE:::"memory");
730         __asm __volatile(EMMS:::"memory");
731 #endif
732         while(s < end)
733         {
734                 const int r= *s++;
735                 const int g= *s++;
736                 const int b= *s++;
737                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
738         }
739 }
740
741 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
742 {
743         const uint8_t *s = src;
744         const uint8_t *end;
745 #ifdef HAVE_MMX
746         const uint8_t *mm_end;
747 #endif
748         uint16_t *d = (uint16_t *)dst;
749         end = s + src_size;
750 #ifdef HAVE_MMX
751         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
752         __asm __volatile(
753             "movq       %0, %%mm7\n\t"
754             "movq       %1, %%mm6\n\t"
755             ::"m"(red_15mask),"m"(green_15mask));
756         mm_end = end - 11;
757         while(s < mm_end)
758         {
759             __asm __volatile(
760                 PREFETCH" 32%1\n\t"
761                 "movd   %1, %%mm0\n\t"
762                 "movd   3%1, %%mm3\n\t"
763                 "punpckldq 6%1, %%mm0\n\t"
764                 "punpckldq 9%1, %%mm3\n\t"
765                 "movq   %%mm0, %%mm1\n\t"
766                 "movq   %%mm0, %%mm2\n\t"
767                 "movq   %%mm3, %%mm4\n\t"
768                 "movq   %%mm3, %%mm5\n\t"
769                 "psrlq  $3, %%mm0\n\t"
770                 "psrlq  $3, %%mm3\n\t"
771                 "pand   %2, %%mm0\n\t"
772                 "pand   %2, %%mm3\n\t"
773                 "psrlq  $6, %%mm1\n\t"
774                 "psrlq  $6, %%mm4\n\t"
775                 "pand   %%mm6, %%mm1\n\t"
776                 "pand   %%mm6, %%mm4\n\t"
777                 "psrlq  $9, %%mm2\n\t"
778                 "psrlq  $9, %%mm5\n\t"
779                 "pand   %%mm7, %%mm2\n\t"
780                 "pand   %%mm7, %%mm5\n\t"
781                 "por    %%mm1, %%mm0\n\t"
782                 "por    %%mm4, %%mm3\n\t"
783                 "por    %%mm2, %%mm0\n\t"
784                 "por    %%mm5, %%mm3\n\t"
785                 "psllq  $16, %%mm3\n\t"
786                 "por    %%mm3, %%mm0\n\t"
787                 MOVNTQ" %%mm0, %0\n\t"
788                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
789                 d += 4;
790                 s += 12;
791         }
792         __asm __volatile(SFENCE:::"memory");
793         __asm __volatile(EMMS:::"memory");
794 #endif
795         while(s < end)
796         {
797                 const int b= *s++;
798                 const int g= *s++;
799                 const int r= *s++;
800                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
801         }
802 }
803
804 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
805 {
806         const uint8_t *s = src;
807         const uint8_t *end;
808 #ifdef HAVE_MMX
809         const uint8_t *mm_end;
810 #endif
811         uint16_t *d = (uint16_t *)dst;
812         end = s + src_size;
813 #ifdef HAVE_MMX
814         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
815         __asm __volatile(
816             "movq       %0, %%mm7\n\t"
817             "movq       %1, %%mm6\n\t"
818             ::"m"(red_15mask),"m"(green_15mask));
819         mm_end = end - 15;
820         while(s < mm_end)
821         {
822             __asm __volatile(
823                 PREFETCH" 32%1\n\t"
824                 "movd   %1, %%mm0\n\t"
825                 "movd   3%1, %%mm3\n\t"
826                 "punpckldq 6%1, %%mm0\n\t"
827                 "punpckldq 9%1, %%mm3\n\t"
828                 "movq   %%mm0, %%mm1\n\t"
829                 "movq   %%mm0, %%mm2\n\t"
830                 "movq   %%mm3, %%mm4\n\t"
831                 "movq   %%mm3, %%mm5\n\t"
832                 "psllq  $7, %%mm0\n\t"
833                 "psllq  $7, %%mm3\n\t"
834                 "pand   %%mm7, %%mm0\n\t"
835                 "pand   %%mm7, %%mm3\n\t"
836                 "psrlq  $6, %%mm1\n\t"
837                 "psrlq  $6, %%mm4\n\t"
838                 "pand   %%mm6, %%mm1\n\t"
839                 "pand   %%mm6, %%mm4\n\t"
840                 "psrlq  $19, %%mm2\n\t"
841                 "psrlq  $19, %%mm5\n\t"
842                 "pand   %2, %%mm2\n\t"
843                 "pand   %2, %%mm5\n\t"
844                 "por    %%mm1, %%mm0\n\t"
845                 "por    %%mm4, %%mm3\n\t"
846                 "por    %%mm2, %%mm0\n\t"
847                 "por    %%mm5, %%mm3\n\t"
848                 "psllq  $16, %%mm3\n\t"
849                 "por    %%mm3, %%mm0\n\t"
850                 MOVNTQ" %%mm0, %0\n\t"
851                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
852                 d += 4;
853                 s += 12;
854         }
855         __asm __volatile(SFENCE:::"memory");
856         __asm __volatile(EMMS:::"memory");
857 #endif
858         while(s < end)
859         {
860                 const int r= *s++;
861                 const int g= *s++;
862                 const int b= *s++;
863                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
864         }
865 }
866
867 /*
868   Here I use a less accurate approximation: the input value is simply
869   left-shifted and the low-order bits are filled with zeroes. This
870   method improves PNG compression, but the scheme cannot reproduce
871   white exactly, since it does not generate an all-ones maximum value;
872   the net effect is to darken the image slightly.
873
874   The better method would be "left bit replication":
877
878    4 3 2 1 0
879    ---------
880    1 1 0 1 1
881
882    7 6 5 4 3  2 1 0
883    ----------------
884    1 1 0 1 1  1 1 0
885    |=======|  |===|
886        |      Leftmost Bits Repeated to Fill Open Bits
887        |
888    Original Bits
889 */
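/* For reference, left bit replication for one 5-bit channel could look like the
   sketch below (illustrative only; the converters in this file use the cheaper
   zero-fill shift instead):

       static inline uint8_t expand5to8(uint8_t c5)
       {
           return (uint8_t)((c5 << 3) | (c5 >> 2)); // repeat the top bits into the low bits
       }

   expand5to8(0x1F) == 0xFF, so maximum intensity maps to 0xFF rather than 0xF8. */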
890 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
891 {
892         const uint16_t *end;
893 #ifdef HAVE_MMX
894         const uint16_t *mm_end;
895 #endif
896         uint8_t *d = (uint8_t *)dst;
897         const uint16_t *s = (uint16_t *)src;
898         end = s + src_size/2;
899 #ifdef HAVE_MMX
900         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
901         mm_end = end - 7;
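        /* Each iteration expands 8 RGB15 pixels to 32 bpp in registers (the same
           unpacking as in rgb15to32 below), parks the first four expanded pixels
           in %%mm6/%%mm7, and then reuses the 32->24 packing from rgb32to24 to
           emit 24 contiguous bytes. */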
902         while(s < mm_end)
903         {
904             __asm __volatile(
905                 PREFETCH" 32%1\n\t"
906                 "movq   %1, %%mm0\n\t"
907                 "movq   %1, %%mm1\n\t"
908                 "movq   %1, %%mm2\n\t"
909                 "pand   %2, %%mm0\n\t"
910                 "pand   %3, %%mm1\n\t"
911                 "pand   %4, %%mm2\n\t"
912                 "psllq  $3, %%mm0\n\t"
913                 "psrlq  $2, %%mm1\n\t"
914                 "psrlq  $7, %%mm2\n\t"
915                 "movq   %%mm0, %%mm3\n\t"
916                 "movq   %%mm1, %%mm4\n\t"
917                 "movq   %%mm2, %%mm5\n\t"
918                 "punpcklwd %5, %%mm0\n\t"
919                 "punpcklwd %5, %%mm1\n\t"
920                 "punpcklwd %5, %%mm2\n\t"
921                 "punpckhwd %5, %%mm3\n\t"
922                 "punpckhwd %5, %%mm4\n\t"
923                 "punpckhwd %5, %%mm5\n\t"
924                 "psllq  $8, %%mm1\n\t"
925                 "psllq  $16, %%mm2\n\t"
926                 "por    %%mm1, %%mm0\n\t"
927                 "por    %%mm2, %%mm0\n\t"
928                 "psllq  $8, %%mm4\n\t"
929                 "psllq  $16, %%mm5\n\t"
930                 "por    %%mm4, %%mm3\n\t"
931                 "por    %%mm5, %%mm3\n\t"
932
933                 "movq   %%mm0, %%mm6\n\t"
934                 "movq   %%mm3, %%mm7\n\t"
935                 
936                 "movq   8%1, %%mm0\n\t"
937                 "movq   8%1, %%mm1\n\t"
938                 "movq   8%1, %%mm2\n\t"
939                 "pand   %2, %%mm0\n\t"
940                 "pand   %3, %%mm1\n\t"
941                 "pand   %4, %%mm2\n\t"
942                 "psllq  $3, %%mm0\n\t"
943                 "psrlq  $2, %%mm1\n\t"
944                 "psrlq  $7, %%mm2\n\t"
945                 "movq   %%mm0, %%mm3\n\t"
946                 "movq   %%mm1, %%mm4\n\t"
947                 "movq   %%mm2, %%mm5\n\t"
948                 "punpcklwd %5, %%mm0\n\t"
949                 "punpcklwd %5, %%mm1\n\t"
950                 "punpcklwd %5, %%mm2\n\t"
951                 "punpckhwd %5, %%mm3\n\t"
952                 "punpckhwd %5, %%mm4\n\t"
953                 "punpckhwd %5, %%mm5\n\t"
954                 "psllq  $8, %%mm1\n\t"
955                 "psllq  $16, %%mm2\n\t"
956                 "por    %%mm1, %%mm0\n\t"
957                 "por    %%mm2, %%mm0\n\t"
958                 "psllq  $8, %%mm4\n\t"
959                 "psllq  $16, %%mm5\n\t"
960                 "por    %%mm4, %%mm3\n\t"
961                 "por    %%mm5, %%mm3\n\t"
962
963                 :"=m"(*d)
964                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
965                 :"memory");
966             /* Borrowed from the 32->24 packing in rgb32to24 above */
967             __asm __volatile(
968                 "movq   %%mm0, %%mm4\n\t"
969                 "movq   %%mm3, %%mm5\n\t"
970                 "movq   %%mm6, %%mm0\n\t"
971                 "movq   %%mm7, %%mm1\n\t"
972                 
973                 "movq   %%mm4, %%mm6\n\t"
974                 "movq   %%mm5, %%mm7\n\t"
975                 "movq   %%mm0, %%mm2\n\t"
976                 "movq   %%mm1, %%mm3\n\t"
977
978                 "psrlq  $8, %%mm2\n\t"
979                 "psrlq  $8, %%mm3\n\t"
980                 "psrlq  $8, %%mm6\n\t"
981                 "psrlq  $8, %%mm7\n\t"
982                 "pand   %2, %%mm0\n\t"
983                 "pand   %2, %%mm1\n\t"
984                 "pand   %2, %%mm4\n\t"
985                 "pand   %2, %%mm5\n\t"
986                 "pand   %3, %%mm2\n\t"
987                 "pand   %3, %%mm3\n\t"
988                 "pand   %3, %%mm6\n\t"
989                 "pand   %3, %%mm7\n\t"
990                 "por    %%mm2, %%mm0\n\t"
991                 "por    %%mm3, %%mm1\n\t"
992                 "por    %%mm6, %%mm4\n\t"
993                 "por    %%mm7, %%mm5\n\t"
994
995                 "movq   %%mm1, %%mm2\n\t"
996                 "movq   %%mm4, %%mm3\n\t"
997                 "psllq  $48, %%mm2\n\t"
998                 "psllq  $32, %%mm3\n\t"
999                 "pand   %4, %%mm2\n\t"
1000                 "pand   %5, %%mm3\n\t"
1001                 "por    %%mm2, %%mm0\n\t"
1002                 "psrlq  $16, %%mm1\n\t"
1003                 "psrlq  $32, %%mm4\n\t"
1004                 "psllq  $16, %%mm5\n\t"
1005                 "por    %%mm3, %%mm1\n\t"
1006                 "pand   %6, %%mm5\n\t"
1007                 "por    %%mm5, %%mm4\n\t"
1008
1009                 MOVNTQ" %%mm0, %0\n\t"
1010                 MOVNTQ" %%mm1, 8%0\n\t"
1011                 MOVNTQ" %%mm4, 16%0"
1012
1013                 :"=m"(*d)
1014                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1015                 :"memory");
1016                 d += 24;
1017                 s += 8;
1018         }
1019         __asm __volatile(SFENCE:::"memory");
1020         __asm __volatile(EMMS:::"memory");
1021 #endif
1022         while(s < end)
1023         {
1024                 register uint16_t bgr;
1025                 bgr = *s++;
1026                 *d++ = (bgr&0x1F)<<3;
1027                 *d++ = (bgr&0x3E0)>>2;
1028                 *d++ = (bgr&0x7C00)>>7;
1029         }
1030 }
1031
1032 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1033 {
1034         const uint16_t *end;
1035 #ifdef HAVE_MMX
1036         const uint16_t *mm_end;
1037 #endif
1038         uint8_t *d = (uint8_t *)dst;
1039         const uint16_t *s = (const uint16_t *)src;
1040         end = s + src_size/2;
1041 #ifdef HAVE_MMX
1042         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1043         mm_end = end - 7;
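        /* Same structure as rgb15to24 above, using the 565 masks and shift
           amounts (6-bit green) before the borrowed 32->24 packing. */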
1044         while(s < mm_end)
1045         {
1046             __asm __volatile(
1047                 PREFETCH" 32%1\n\t"
1048                 "movq   %1, %%mm0\n\t"
1049                 "movq   %1, %%mm1\n\t"
1050                 "movq   %1, %%mm2\n\t"
1051                 "pand   %2, %%mm0\n\t"
1052                 "pand   %3, %%mm1\n\t"
1053                 "pand   %4, %%mm2\n\t"
1054                 "psllq  $3, %%mm0\n\t"
1055                 "psrlq  $3, %%mm1\n\t"
1056                 "psrlq  $8, %%mm2\n\t"
1057                 "movq   %%mm0, %%mm3\n\t"
1058                 "movq   %%mm1, %%mm4\n\t"
1059                 "movq   %%mm2, %%mm5\n\t"
1060                 "punpcklwd %5, %%mm0\n\t"
1061                 "punpcklwd %5, %%mm1\n\t"
1062                 "punpcklwd %5, %%mm2\n\t"
1063                 "punpckhwd %5, %%mm3\n\t"
1064                 "punpckhwd %5, %%mm4\n\t"
1065                 "punpckhwd %5, %%mm5\n\t"
1066                 "psllq  $8, %%mm1\n\t"
1067                 "psllq  $16, %%mm2\n\t"
1068                 "por    %%mm1, %%mm0\n\t"
1069                 "por    %%mm2, %%mm0\n\t"
1070                 "psllq  $8, %%mm4\n\t"
1071                 "psllq  $16, %%mm5\n\t"
1072                 "por    %%mm4, %%mm3\n\t"
1073                 "por    %%mm5, %%mm3\n\t"
1074                 
1075                 "movq   %%mm0, %%mm6\n\t"
1076                 "movq   %%mm3, %%mm7\n\t"
1077
1078                 "movq   8%1, %%mm0\n\t"
1079                 "movq   8%1, %%mm1\n\t"
1080                 "movq   8%1, %%mm2\n\t"
1081                 "pand   %2, %%mm0\n\t"
1082                 "pand   %3, %%mm1\n\t"
1083                 "pand   %4, %%mm2\n\t"
1084                 "psllq  $3, %%mm0\n\t"
1085                 "psrlq  $3, %%mm1\n\t"
1086                 "psrlq  $8, %%mm2\n\t"
1087                 "movq   %%mm0, %%mm3\n\t"
1088                 "movq   %%mm1, %%mm4\n\t"
1089                 "movq   %%mm2, %%mm5\n\t"
1090                 "punpcklwd %5, %%mm0\n\t"
1091                 "punpcklwd %5, %%mm1\n\t"
1092                 "punpcklwd %5, %%mm2\n\t"
1093                 "punpckhwd %5, %%mm3\n\t"
1094                 "punpckhwd %5, %%mm4\n\t"
1095                 "punpckhwd %5, %%mm5\n\t"
1096                 "psllq  $8, %%mm1\n\t"
1097                 "psllq  $16, %%mm2\n\t"
1098                 "por    %%mm1, %%mm0\n\t"
1099                 "por    %%mm2, %%mm0\n\t"
1100                 "psllq  $8, %%mm4\n\t"
1101                 "psllq  $16, %%mm5\n\t"
1102                 "por    %%mm4, %%mm3\n\t"
1103                 "por    %%mm5, %%mm3\n\t"
1104                 :"=m"(*d)
1105                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1106                 :"memory");
1107             /* Borrowed from the 32->24 packing in rgb32to24 above */
1108             __asm __volatile(
1109                 "movq   %%mm0, %%mm4\n\t"
1110                 "movq   %%mm3, %%mm5\n\t"
1111                 "movq   %%mm6, %%mm0\n\t"
1112                 "movq   %%mm7, %%mm1\n\t"
1113                 
1114                 "movq   %%mm4, %%mm6\n\t"
1115                 "movq   %%mm5, %%mm7\n\t"
1116                 "movq   %%mm0, %%mm2\n\t"
1117                 "movq   %%mm1, %%mm3\n\t"
1118
1119                 "psrlq  $8, %%mm2\n\t"
1120                 "psrlq  $8, %%mm3\n\t"
1121                 "psrlq  $8, %%mm6\n\t"
1122                 "psrlq  $8, %%mm7\n\t"
1123                 "pand   %2, %%mm0\n\t"
1124                 "pand   %2, %%mm1\n\t"
1125                 "pand   %2, %%mm4\n\t"
1126                 "pand   %2, %%mm5\n\t"
1127                 "pand   %3, %%mm2\n\t"
1128                 "pand   %3, %%mm3\n\t"
1129                 "pand   %3, %%mm6\n\t"
1130                 "pand   %3, %%mm7\n\t"
1131                 "por    %%mm2, %%mm0\n\t"
1132                 "por    %%mm3, %%mm1\n\t"
1133                 "por    %%mm6, %%mm4\n\t"
1134                 "por    %%mm7, %%mm5\n\t"
1135
1136                 "movq   %%mm1, %%mm2\n\t"
1137                 "movq   %%mm4, %%mm3\n\t"
1138                 "psllq  $48, %%mm2\n\t"
1139                 "psllq  $32, %%mm3\n\t"
1140                 "pand   %4, %%mm2\n\t"
1141                 "pand   %5, %%mm3\n\t"
1142                 "por    %%mm2, %%mm0\n\t"
1143                 "psrlq  $16, %%mm1\n\t"
1144                 "psrlq  $32, %%mm4\n\t"
1145                 "psllq  $16, %%mm5\n\t"
1146                 "por    %%mm3, %%mm1\n\t"
1147                 "pand   %6, %%mm5\n\t"
1148                 "por    %%mm5, %%mm4\n\t"
1149
1150                 MOVNTQ" %%mm0, %0\n\t"
1151                 MOVNTQ" %%mm1, 8%0\n\t"
1152                 MOVNTQ" %%mm4, 16%0"
1153
1154                 :"=m"(*d)
1155                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1156                 :"memory");
1157                 d += 24;
1158                 s += 8;
1159         }
1160         __asm __volatile(SFENCE:::"memory");
1161         __asm __volatile(EMMS:::"memory");
1162 #endif
1163         while(s < end)
1164         {
1165                 register uint16_t bgr;
1166                 bgr = *s++;
1167                 *d++ = (bgr&0x1F)<<3;
1168                 *d++ = (bgr&0x7E0)>>3;
1169                 *d++ = (bgr&0xF800)>>8;
1170         }
1171 }
1172
1173 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1174 {
1175         const uint16_t *end;
1176 #ifdef HAVE_MMX
1177         const uint16_t *mm_end;
1178 #endif
1179         uint8_t *d = (uint8_t *)dst;
1180         const uint16_t *s = (const uint16_t *)src;
1181         end = s + src_size/2;
1182 #ifdef HAVE_MMX
1183         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1184         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1185         mm_end = end - 3;
1186         while(s < mm_end)
1187         {
1188             __asm __volatile(
1189                 PREFETCH" 32%1\n\t"
1190                 "movq   %1, %%mm0\n\t"
1191                 "movq   %1, %%mm1\n\t"
1192                 "movq   %1, %%mm2\n\t"
1193                 "pand   %2, %%mm0\n\t"
1194                 "pand   %3, %%mm1\n\t"
1195                 "pand   %4, %%mm2\n\t"
1196                 "psllq  $3, %%mm0\n\t"
1197                 "psrlq  $2, %%mm1\n\t"
1198                 "psrlq  $7, %%mm2\n\t"
1199                 "movq   %%mm0, %%mm3\n\t"
1200                 "movq   %%mm1, %%mm4\n\t"
1201                 "movq   %%mm2, %%mm5\n\t"
1202                 "punpcklwd %%mm7, %%mm0\n\t"
1203                 "punpcklwd %%mm7, %%mm1\n\t"
1204                 "punpcklwd %%mm7, %%mm2\n\t"
1205                 "punpckhwd %%mm7, %%mm3\n\t"
1206                 "punpckhwd %%mm7, %%mm4\n\t"
1207                 "punpckhwd %%mm7, %%mm5\n\t"
1208                 "psllq  $8, %%mm1\n\t"
1209                 "psllq  $16, %%mm2\n\t"
1210                 "por    %%mm1, %%mm0\n\t"
1211                 "por    %%mm2, %%mm0\n\t"
1212                 "psllq  $8, %%mm4\n\t"
1213                 "psllq  $16, %%mm5\n\t"
1214                 "por    %%mm4, %%mm3\n\t"
1215                 "por    %%mm5, %%mm3\n\t"
1216                 MOVNTQ" %%mm0, %0\n\t"
1217                 MOVNTQ" %%mm3, 8%0\n\t"
1218                 :"=m"(*d)
1219                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1220                 :"memory");
1221                 d += 16;
1222                 s += 4;
1223         }
1224         __asm __volatile(SFENCE:::"memory");
1225         __asm __volatile(EMMS:::"memory");
1226 #endif
1227         while(s < end)
1228         {
1229 #if 0 //slightly slower on athlon
1230                 int bgr= *s++;
1231                 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1232 #else
1233 //FIXME this is very likely wrong for big-endian systems (and the following converters too)
1234                 register uint16_t bgr;
1235                 bgr = *s++;
1236                 *d++ = (bgr&0x1F)<<3;
1237                 *d++ = (bgr&0x3E0)>>2;
1238                 *d++ = (bgr&0x7C00)>>7;
1239                 *d++ = 0;
1240 #endif
1241         }
1242 }
1243
1244 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1245 {
1246         const uint16_t *end;
1247 #ifdef HAVE_MMX
1248         const uint16_t *mm_end;
1249 #endif
1250         uint8_t *d = (uint8_t *)dst;
1251         const uint16_t *s = (uint16_t *)src;
1252         end = s + src_size/2;
1253 #ifdef HAVE_MMX
1254         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1255         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1256         mm_end = end - 3;
1257         while(s < mm_end)
1258         {
1259             __asm __volatile(
1260                 PREFETCH" 32%1\n\t"
1261                 "movq   %1, %%mm0\n\t"
1262                 "movq   %1, %%mm1\n\t"
1263                 "movq   %1, %%mm2\n\t"
1264                 "pand   %2, %%mm0\n\t"
1265                 "pand   %3, %%mm1\n\t"
1266                 "pand   %4, %%mm2\n\t"
1267                 "psllq  $3, %%mm0\n\t"
1268                 "psrlq  $3, %%mm1\n\t"
1269                 "psrlq  $8, %%mm2\n\t"
1270                 "movq   %%mm0, %%mm3\n\t"
1271                 "movq   %%mm1, %%mm4\n\t"
1272                 "movq   %%mm2, %%mm5\n\t"
1273                 "punpcklwd %%mm7, %%mm0\n\t"
1274                 "punpcklwd %%mm7, %%mm1\n\t"
1275                 "punpcklwd %%mm7, %%mm2\n\t"
1276                 "punpckhwd %%mm7, %%mm3\n\t"
1277                 "punpckhwd %%mm7, %%mm4\n\t"
1278                 "punpckhwd %%mm7, %%mm5\n\t"
1279                 "psllq  $8, %%mm1\n\t"
1280                 "psllq  $16, %%mm2\n\t"
1281                 "por    %%mm1, %%mm0\n\t"
1282                 "por    %%mm2, %%mm0\n\t"
1283                 "psllq  $8, %%mm4\n\t"
1284                 "psllq  $16, %%mm5\n\t"
1285                 "por    %%mm4, %%mm3\n\t"
1286                 "por    %%mm5, %%mm3\n\t"
1287                 MOVNTQ" %%mm0, %0\n\t"
1288                 MOVNTQ" %%mm3, 8%0\n\t"
1289                 :"=m"(*d)
1290                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1291                 :"memory");
1292                 d += 16;
1293                 s += 4;
1294         }
1295         __asm __volatile(SFENCE:::"memory");
1296         __asm __volatile(EMMS:::"memory");
1297 #endif
1298         while(s < end)
1299         {
1300                 register uint16_t bgr;
1301                 bgr = *s++;
1302                 *d++ = (bgr&0x1F)<<3;
1303                 *d++ = (bgr&0x7E0)>>3;
1304                 *d++ = (bgr&0xF800)>>8;
1305                 *d++ = 0;
1306         }
1307 }
1308
1309 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1310 {
1311 #ifdef HAVE_MMX
1312 /* TODO: unroll this loop */
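/* The MMX loop swaps R and B inside each 32-bit pixel: one copy is shifted left
   by 16 so the old B lands in the R slot, one copy is shifted right by 16 so the
   old R lands in the B slot, each copy is masked to the channel it now occupies,
   and the green from the unshifted copy is ORed in. */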
1313         asm volatile (
1314                 "xorl %%eax, %%eax              \n\t"
1315                 ".balign 16                     \n\t"
1316                 "1:                             \n\t"
1317                 PREFETCH" 32(%0, %%eax)         \n\t"
1318                 "movq (%0, %%eax), %%mm0        \n\t"
1319                 "movq %%mm0, %%mm1              \n\t"
1320                 "movq %%mm0, %%mm2              \n\t"
1321                 "pslld $16, %%mm0               \n\t"
1322                 "psrld $16, %%mm1               \n\t"
1323                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
1324                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
1325                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
1326                 "por %%mm0, %%mm2               \n\t"
1327                 "por %%mm1, %%mm2               \n\t"
1328                 MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
1329                 "addl $8, %%eax                 \n\t"
1330                 "cmpl %2, %%eax                 \n\t"
1331                 " jb 1b                         \n\t"
1332                 :: "r" (src), "r"(dst), "r" (src_size-7)
1333                 : "%eax"
1334         );
1335
1336         __asm __volatile(SFENCE:::"memory");
1337         __asm __volatile(EMMS:::"memory");
1338 #else
1339         unsigned i;
1340         unsigned num_pixels = src_size >> 2;
1341         for(i=0; i<num_pixels; i++)
1342         {
1343                 dst[4*i + 0] = src[4*i + 2];
1344                 dst[4*i + 1] = src[4*i + 1];
1345                 dst[4*i + 2] = src[4*i + 0];
1346         }
1347 #endif
1348 }
1349
1350 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1351 {
1352         unsigned i;
1353 #ifdef HAVE_MMX
1354         int mmx_size= 23 - src_size;
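        /* mmx_size starts at 23 - src_size (negative for any non-trivial size)
           and the src/dst operands are biased by the same amount, so
           (%1, %%eax) begins exactly at src. eax advances by 24 bytes (8 pixels)
           per iteration and the loop repeats while it is still negative
           ("js 1b"), leaving at most 23 tail bytes for the C loop below. */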
1355         asm volatile (
1356                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1357                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1358                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1359                 ".balign 16                     \n\t"
1360                 "1:                             \n\t"
1361                 PREFETCH" 32(%1, %%eax)         \n\t"
1362                 "movq   (%1, %%eax), %%mm0      \n\t" // BGR BGR BG
1363                 "movq   (%1, %%eax), %%mm1      \n\t" // BGR BGR BG
1364                 "movq  2(%1, %%eax), %%mm2      \n\t" // R BGR BGR B
1365                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1366                 "pand %%mm5, %%mm0              \n\t"
1367                 "pand %%mm6, %%mm1              \n\t"
1368                 "pand %%mm7, %%mm2              \n\t"
1369                 "por %%mm0, %%mm1               \n\t"
1370                 "por %%mm2, %%mm1               \n\t"                
1371                 "movq  6(%1, %%eax), %%mm0      \n\t" // BGR BGR BG
1372                 MOVNTQ" %%mm1,   (%2, %%eax)    \n\t" // RGB RGB RG
1373                 "movq  8(%1, %%eax), %%mm1      \n\t" // R BGR BGR B
1374                 "movq 10(%1, %%eax), %%mm2      \n\t" // GR BGR BGR
1375                 "pand %%mm7, %%mm0              \n\t"
1376                 "pand %%mm5, %%mm1              \n\t"
1377                 "pand %%mm6, %%mm2              \n\t"
1378                 "por %%mm0, %%mm1               \n\t"
1379                 "por %%mm2, %%mm1               \n\t"                
1380                 "movq 14(%1, %%eax), %%mm0      \n\t" // R BGR BGR B
1381                 MOVNTQ" %%mm1,  8(%2, %%eax)    \n\t" // B RGB RGB R
1382                 "movq 16(%1, %%eax), %%mm1      \n\t" // GR BGR BGR
1383                 "movq 18(%1, %%eax), %%mm2      \n\t" // BGR BGR BG
1384                 "pand %%mm6, %%mm0              \n\t"
1385                 "pand %%mm7, %%mm1              \n\t"
1386                 "pand %%mm5, %%mm2              \n\t"
1387                 "por %%mm0, %%mm1               \n\t"
1388                 "por %%mm2, %%mm1               \n\t"                
1389                 MOVNTQ" %%mm1, 16(%2, %%eax)    \n\t"
1390                 "addl $24, %%eax                \n\t"
1391                 " js 1b                         \n\t"
1392                 : "+a" (mmx_size)
1393                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1394         );
1395
1396         __asm __volatile(SFENCE:::"memory");
1397         __asm __volatile(EMMS:::"memory");
1398
1399         if(mmx_size==23) return; // finished, src_size was a multiple of 8 pixels (24 bytes)
1400
1401         src+= src_size;
1402         dst+= src_size;
1403         src_size= 23-mmx_size;
1404         src-= src_size;
1405         dst-= src_size;
1406 #endif
1407         for(i=0; i<src_size; i+=3)
1408         {
1409                 register uint8_t x;
1410                 x          = src[i + 2];
1411                 dst[i + 1] = src[i + 1];
1412                 dst[i + 2] = src[i + 0];
1413                 dst[i + 0] = x;
1414         }
1415 }
1416
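/* Pack planar Y, U and V into interleaved YUYV (YUY2). Each pair of horizontally
   adjacent luma samples shares one U and one V sample; vertLumPerChroma gives
   the number of luma lines per chroma line (2 for 4:2:0 sources, 1 for 4:2:2). */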
1417 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1418         unsigned int width, unsigned int height,
1419         int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1420 {
1421         unsigned y;
1422         const unsigned chromWidth= width>>1;
1423         for(y=0; y<height; y++)
1424         {
1425 #ifdef HAVE_MMX
1426 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
1427                 asm volatile(
1428                         "xorl %%eax, %%eax              \n\t"
1429                         ".balign 16                     \n\t"
1430                         "1:                             \n\t"
1431                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
1432                         PREFETCH" 32(%2, %%eax)         \n\t"
1433                         PREFETCH" 32(%3, %%eax)         \n\t"
1434                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
1435                         "movq %%mm0, %%mm2              \n\t" // U(0)
1436                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
1437                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1438                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1439
1440                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
1441                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
1442                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1443                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1444                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1445                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1446                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1447                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1448
1449                         MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
1450                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
1451                         MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1452                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1453
1454                         "addl $8, %%eax                 \n\t"
1455                         "cmpl %4, %%eax                 \n\t"
1456                         " jb 1b                         \n\t"
1457                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1458                         : "%eax"
1459                 );
1460 #else
1461
1462 #if defined ARCH_ALPHA && defined HAVE_MVI
1463 #define pl2yuy2(n)                                      \
1464         y1 = yc[n];                                     \
1465         y2 = yc2[n];                                    \
1466         u = uc[n];                                      \
1467         v = vc[n];                                      \
1468         asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
1469         asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
1470         asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1471         asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1472         yuv1 = (u << 8) + (v << 24);                    \
1473         yuv2 = yuv1 + y2;                               \
1474         yuv1 += y1;                                     \
1475         qdst[n] = yuv1;                                 \
1476         qdst2[n] = yuv2;
1477
1478                 int i;
1479                 uint64_t *qdst = (uint64_t *) dst;
1480                 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1481                 const uint32_t *yc = (uint32_t *) ysrc;
1482                 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1483                 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1484                 for(i = 0; i < chromWidth; i += 8){
1485                         uint64_t y1, y2, yuv1, yuv2;
1486                         uint64_t u, v;
1487                         /* Prefetch */
1488                         asm("ldq $31,64(%0)" :: "r"(yc));
1489                         asm("ldq $31,64(%0)" :: "r"(yc2));
1490                         asm("ldq $31,64(%0)" :: "r"(uc));
1491                         asm("ldq $31,64(%0)" :: "r"(vc));
1492
1493                         pl2yuy2(0);
1494                         pl2yuy2(1);
1495                         pl2yuy2(2);
1496                         pl2yuy2(3);
1497
1498                         yc += 4;
1499                         yc2 += 4;
1500                         uc += 4;
1501                         vc += 4;
1502                         qdst += 4;
1503                         qdst2 += 4;
1504                 }
1505                 y++;
1506                 ysrc += lumStride;
1507                 dst += dstStride;
1508
1509 #elif __WORDSIZE >= 64
1510                 int i;
1511                 uint64_t *ldst = (uint64_t *) dst;
1512                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1513                 for(i = 0; i < chromWidth; i += 2){
1514                         uint64_t k, l;
1515                         k = yc[0] + (uc[0] << 8) +
1516                             (yc[1] << 16) + (vc[0] << 24);
1517                         l = yc[2] + (uc[1] << 8) +
1518                             (yc[3] << 16) + (vc[1] << 24);
1519                         *ldst++ = k + (l << 32);
1520                         yc += 4;
1521                         uc += 2;
1522                         vc += 2;
1523                 }
1524
1525 #else
1526                 int i, *idst = (int32_t *) dst;
1527                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1528                 for(i = 0; i < chromWidth; i++){
1529                         *idst++ = yc[0] + (uc[0] << 8) +
1530                             (yc[1] << 16) + (vc[0] << 24);
1531                         yc += 2;
1532                         uc++;
1533                         vc++;
1534                 }
1535 #endif
1536 #endif
1537                 if((y&(vertLumPerChroma-1)) == (vertLumPerChroma-1))
1538                 {
1539                         usrc += chromStride;
1540                         vsrc += chromStride;
1541                 }
1542                 ysrc += lumStride;
1543                 dst += dstStride;
1544         }
1545 #ifdef HAVE_MMX
1546 asm(    EMMS" \n\t"
1547         SFENCE" \n\t"
1548         :::"memory");
1549 #endif
1550 }
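/*
 * Packing done by the plain C fall-back above: two luma samples share one U
 * and one V sample, and on a little-endian host
 *
 *     *idst = Y0 | (U0<<8) | (Y1<<16) | (V0<<24);
 *
 * stores the byte sequence Y0 U0 Y1 V0, i.e. one YUY2 macropixel.
 */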
1551
1552 /**
1553  *
1554  * height should be a multiple of 2 and width should be a multiple of 16
1555  * (if this is a problem for anyone then tell me, and I'll fix it)
1556  */
1557 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1558         unsigned int width, unsigned int height,
1559         int lumStride, int chromStride, int dstStride)
1560 {
1561         //FIXME interpolate chroma
1562         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1563 }
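/*
 * Usage sketch (buffer names and sizes are hypothetical, not part of this
 * file): converting a tightly packed WxH YV12 frame to YUY2 through the
 * dispatching yv12toyuy2 entry point could look like
 *
 *     yv12toyuy2(y_plane, u_plane, v_plane, yuy2_buf,
 *                W, H, W, W/2, 2*W);
 *
 * i.e. lumStride = W, chromStride = W/2 and dstStride = 2*W bytes, with W a
 * multiple of 16 and H a multiple of 2 as noted above.
 */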
1564
1565 /**
1566  *
1567  * width should be a multiple of 16
1568  */
1569 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1570         unsigned int width, unsigned int height,
1571         int lumStride, int chromStride, int dstStride)
1572 {
1573         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1574 }
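/*
 * The only difference from yv12toyuy2() above is vertLumPerChroma: with 1 the
 * chroma pointers advance on every line (4:2:2 input), with 2 only on every
 * second line (4:2:0 input), see the (y&(vertLumPerChroma-1)) test in
 * yuvPlanartoyuy2().
 */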
1575
1576 /**
1577  *
1578  * height should be a multiple of 2 and width should be a multiple of 16
1579  * (if this is a problem for anyone then tell me, and I'll fix it)
1580  */
1581 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1582         unsigned int width, unsigned int height,
1583         int lumStride, int chromStride, int srcStride)
1584 {
1585         unsigned y;
1586         const unsigned chromWidth= width>>1;
1587         for(y=0; y<height; y+=2)
1588         {
1589 #ifdef HAVE_MMX
1590                 asm volatile(
1591                         "xorl %%eax, %%eax              \n\t"
1592                         "pcmpeqw %%mm7, %%mm7           \n\t"
1593                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1594                         ".balign 16                     \n\t"
1595                         "1:                             \n\t"
1596                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1597                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1598                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1599                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1600                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1601                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1602                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1603                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1604                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1605                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1606                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1607
1608                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1609
1610                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
1611                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
1612                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1613                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1614                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1615                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1616                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1617                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1618                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1619                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1620
1621                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
1622
1623                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1624                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1625                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1626                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1627                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1628                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1629                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1630                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1631
1632                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
1633                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
1634
1635                         "addl $8, %%eax                 \n\t"
1636                         "cmpl %4, %%eax                 \n\t"
1637                         " jb 1b                         \n\t"
1638                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1639                         : "memory", "%eax"
1640                 );
1641
1642                 ydst += lumStride;
1643                 src  += srcStride;
1644
1645                 asm volatile(
1646                         "xorl %%eax, %%eax              \n\t"
1647                         ".balign 16                     \n\t"
1648                         "1:                             \n\t"
1649                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1650                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1651                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1652                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
1653                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
1654                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1655                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1656                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1657                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1658                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1659                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1660
1661                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
1662                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
1663
1664                         "addl $8, %%eax                 \n\t"
1665                         "cmpl %4, %%eax                 \n\t"
1666                         " jb 1b                         \n\t"
1667
1668                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1669                         : "memory", "%eax"
1670                 );
1671 #else
1672                 unsigned i;
1673                 for(i=0; i<chromWidth; i++)
1674                 {
1675                         ydst[2*i+0]     = src[4*i+0];
1676                         udst[i]         = src[4*i+1];
1677                         ydst[2*i+1]     = src[4*i+2];
1678                         vdst[i]         = src[4*i+3];
1679                 }
1680                 ydst += lumStride;
1681                 src  += srcStride;
1682
1683                 for(i=0; i<chromWidth; i++)
1684                 {
1685                         ydst[2*i+0]     = src[4*i+0];
1686                         ydst[2*i+1]     = src[4*i+2];
1687                 }
1688 #endif
1689                 udst += chromStride;
1690                 vdst += chromStride;
1691                 ydst += lumStride;
1692                 src  += srcStride;
1693         }
1694 #ifdef HAVE_MMX
1695 asm volatile(   EMMS" \n\t"
1696                 SFENCE" \n\t"
1697                 :::"memory");
1698 #endif
1699 }
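/*
 * What both the MMX and the C path above compute, per YUY2 macropixel
 * Y0 U Y1 V, on the first line of each pair:
 *
 *     ydst[2*i+0] = src[4*i+0];   udst[i] = src[4*i+1];
 *     ydst[2*i+1] = src[4*i+2];   vdst[i] = src[4*i+3];
 *
 * The second line of the pair contributes luma only, which is what makes the
 * output 4:2:0 (YV12).
 */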
1700
1701 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1702         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1703         unsigned int width, unsigned int height, int lumStride, int chromStride)
1704 {
1705         /* Y Plane */
1706         memcpy(ydst, ysrc, width*height);
1707
1708         /* XXX: implement upscaling for U,V */
1709 }
1710
1711 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1712 {
1713         int x,y;
1714         
1715         dst[0]= src[0];
1716         
1717         // first line
1718         for(x=0; x<srcWidth-1; x++){
1719                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1720                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1721         }
1722         dst[2*srcWidth-1]= src[srcWidth-1];
1723         
1724         dst+= dstStride;
1725
1726         for(y=1; y<srcHeight; y++){
1727 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1728                 const int mmxSize= srcWidth&~15;
1729                 asm volatile(
1730                         "movl %4, %%eax                 \n\t"
1731                         "1:                             \n\t"
1732                         "movq (%0, %%eax), %%mm0        \n\t"
1733                         "movq (%1, %%eax), %%mm1        \n\t"
1734                         "movq 1(%0, %%eax), %%mm2       \n\t"
1735                         "movq 1(%1, %%eax), %%mm3       \n\t"
1736                         "movq -1(%0, %%eax), %%mm4      \n\t"
1737                         "movq -1(%1, %%eax), %%mm5      \n\t"
1738                         PAVGB" %%mm0, %%mm5             \n\t"
1739                         PAVGB" %%mm0, %%mm3             \n\t"
1740                         PAVGB" %%mm0, %%mm5             \n\t" // 2nd PAVGB: mm5 ~= (3*mm0 + mm5)/4
1741                         PAVGB" %%mm0, %%mm3             \n\t" // 2nd PAVGB: mm3 ~= (3*mm0 + mm3)/4
1742                         PAVGB" %%mm1, %%mm4             \n\t"
1743                         PAVGB" %%mm1, %%mm2             \n\t"
1744                         PAVGB" %%mm1, %%mm4             \n\t" // 2nd PAVGB: mm4 ~= (3*mm1 + mm4)/4
1745                         PAVGB" %%mm1, %%mm2             \n\t" // 2nd PAVGB: mm2 ~= (3*mm1 + mm2)/4
1746                         "movq %%mm5, %%mm7              \n\t"
1747                         "movq %%mm4, %%mm6              \n\t"
1748                         "punpcklbw %%mm3, %%mm5         \n\t"
1749                         "punpckhbw %%mm3, %%mm7         \n\t"
1750                         "punpcklbw %%mm2, %%mm4         \n\t"
1751                         "punpckhbw %%mm2, %%mm6         \n\t"
1752 #if 1
1753                         MOVNTQ" %%mm5, (%2, %%eax, 2)   \n\t"
1754                         MOVNTQ" %%mm7, 8(%2, %%eax, 2)  \n\t"
1755                         MOVNTQ" %%mm4, (%3, %%eax, 2)   \n\t"
1756                         MOVNTQ" %%mm6, 8(%3, %%eax, 2)  \n\t"
1757 #else
1758                         "movq %%mm5, (%2, %%eax, 2)     \n\t"
1759                         "movq %%mm7, 8(%2, %%eax, 2)    \n\t"
1760                         "movq %%mm4, (%3, %%eax, 2)     \n\t"
1761                         "movq %%mm6, 8(%3, %%eax, 2)    \n\t"
1762 #endif
1763                         "addl $8, %%eax                 \n\t"
1764                         " js 1b                         \n\t"
1765                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1766                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1767                            "g" (-mmxSize)
1768                         : "%eax"
1769
1770                 );
1771 #else
1772                 const int mmxSize=1;
1773 #endif
1774                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1775                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1776
1777                 for(x=mmxSize-1; x<srcWidth-1; x++){
1778                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1779                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1780                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1781                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1782                 }
1783                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1784                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1785
1786                 dst+=dstStride*2;
1787                 src+=srcStride;
1788         }
1789         
1790         // last line
1791 #if 1
1792         dst[0]= src[0];
1793         
1794         for(x=0; x<srcWidth-1; x++){
1795                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1796                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1797         }
1798         dst[2*srcWidth-1]= src[srcWidth-1];
1799 #else
1800         for(x=0; x<srcWidth; x++){
1801                 dst[2*x+0]=
1802                 dst[2*x+1]= src[x];
1803         }
1804 #endif
1805
1806 #ifdef HAVE_MMX
1807 asm volatile(   EMMS" \n\t"
1808                 SFENCE" \n\t"
1809                 :::"memory");
1810 #endif
1811 }
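/*
 * Interpolation used by planar2x() above: every output sample is a 3:1 blend
 * of its two nearest source samples, without a rounding term. For example,
 * with src[x]=100 and src[x+1]=104:
 *
 *     dst[2*x+1] = (3*100 + 104) >> 2 = 101;
 *     dst[2*x+2] = (100 + 3*104) >> 2 = 103;
 *
 * The MMX2/3DNow path approximates the same 3:1 weights with two PAVGB passes.
 */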
1812
1813 /**
1814  *
1815  * height should be a multiple of 2 and width should be a multiple of 16
1816  * (if this is a problem for anyone then tell me, and I'll fix it)
1817  * chrominance data is only taken from every second line, others are ignored. FIXME: write HQ version
1818  */
1819 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1820         unsigned int width, unsigned int height,
1821         int lumStride, int chromStride, int srcStride)
1822 {
1823         unsigned y;
1824         const unsigned chromWidth= width>>1;
1825         for(y=0; y<height; y+=2)
1826         {
1827 #ifdef HAVE_MMX
1828                 asm volatile(
1829                         "xorl %%eax, %%eax              \n\t"
1830                         "pcmpeqw %%mm7, %%mm7           \n\t"
1831                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1832                         ".balign 16                     \n\t"
1833                         "1:                             \n\t"
1834                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1835                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1836                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1837                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
1838                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
1839                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
1840                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
1841                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1842                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1843                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1844                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1845
1846                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1847
1848                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
1849                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
1850                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
1851                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
1852                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
1853                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
1854                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1855                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1856                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1857                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1858
1859                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
1860
1861                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1862                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1863                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1864                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1865                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1866                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1867                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1868                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1869
1870                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
1871                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
1872
1873                         "addl $8, %%eax                 \n\t"
1874                         "cmpl %4, %%eax                 \n\t"
1875                         " jb 1b                         \n\t"
1876                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1877                         : "memory", "%eax"
1878                 );
1879
1880                 ydst += lumStride;
1881                 src  += srcStride;
1882
1883                 asm volatile(
1884                         "xorl %%eax, %%eax              \n\t"
1885                         ".balign 16                     \n\t"
1886                         "1:                             \n\t"
1887                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1888                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1889                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1890                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
1891                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
1892                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1893                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1894                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1895                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1896                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1897                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1898
1899                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
1900                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
1901
1902                         "addl $8, %%eax                 \n\t"
1903                         "cmpl %4, %%eax                 \n\t"
1904                         " jb 1b                         \n\t"
1905
1906                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1907                         : "memory", "%eax"
1908                 );
1909 #else
1910                 unsigned i;
1911                 for(i=0; i<chromWidth; i++)
1912                 {
1913                         udst[i]         = src[4*i+0];
1914                         ydst[2*i+0]     = src[4*i+1];
1915                         vdst[i]         = src[4*i+2];
1916                         ydst[2*i+1]     = src[4*i+3];
1917                 }
1918                 ydst += lumStride;
1919                 src  += srcStride;
1920
1921                 for(i=0; i<chromWidth; i++)
1922                 {
1923                         ydst[2*i+0]     = src[4*i+1];
1924                         ydst[2*i+1]     = src[4*i+3];
1925                 }
1926 #endif
1927                 udst += chromStride;
1928                 vdst += chromStride;
1929                 ydst += lumStride;
1930                 src  += srcStride;
1931         }
1932 #ifdef HAVE_MMX
1933 asm volatile(   EMMS" \n\t"
1934                 SFENCE" \n\t"
1935                 :::"memory");
1936 #endif
1937 }
1938
1939 /**
1940  *
1941  * height should be a multiple of 2 and width should be a multiple of 2
1942  * (if this is a problem for anyone then tell me, and I'll fix it)
1943  * chrominance data is only taken from every second line, others are ignored in the C version. FIXME: write HQ version
1944  */
1945 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1946         unsigned int width, unsigned int height,
1947         int lumStride, int chromStride, int srcStride)
1948 {
1949         unsigned y;
1950         const unsigned chromWidth= width>>1;
1951 #ifdef HAVE_MMX
1952         for(y=0; y<height-2; y+=2)
1953         {
1954                 unsigned i;
1955                 for(i=0; i<2; i++)
1956                 {
1957                         asm volatile(
1958                                 "movl %2, %%eax                 \n\t"
1959                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1960                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1961                                 "pxor %%mm7, %%mm7              \n\t"
1962                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1963                                 ".balign 16                     \n\t"
1964                                 "1:                             \n\t"
1965                                 PREFETCH" 64(%0, %%ebx)         \n\t"
1966                                 "movd (%0, %%ebx), %%mm0        \n\t"
1967                                 "movd 3(%0, %%ebx), %%mm1       \n\t"
1968                                 "punpcklbw %%mm7, %%mm0         \n\t"
1969                                 "punpcklbw %%mm7, %%mm1         \n\t"
1970                                 "movd 6(%0, %%ebx), %%mm2       \n\t"
1971                                 "movd 9(%0, %%ebx), %%mm3       \n\t"
1972                                 "punpcklbw %%mm7, %%mm2         \n\t"
1973                                 "punpcklbw %%mm7, %%mm3         \n\t"
1974                                 "pmaddwd %%mm6, %%mm0           \n\t"
1975                                 "pmaddwd %%mm6, %%mm1           \n\t"
1976                                 "pmaddwd %%mm6, %%mm2           \n\t"
1977                                 "pmaddwd %%mm6, %%mm3           \n\t"
1978 #ifndef FAST_BGR2YV12
1979                                 "psrad $8, %%mm0                \n\t"
1980                                 "psrad $8, %%mm1                \n\t"
1981                                 "psrad $8, %%mm2                \n\t"
1982                                 "psrad $8, %%mm3                \n\t"
1983 #endif
1984                                 "packssdw %%mm1, %%mm0          \n\t"
1985                                 "packssdw %%mm3, %%mm2          \n\t"
1986                                 "pmaddwd %%mm5, %%mm0           \n\t"
1987                                 "pmaddwd %%mm5, %%mm2           \n\t"
1988                                 "packssdw %%mm2, %%mm0          \n\t"
1989                                 "psraw $7, %%mm0                \n\t"
1990
1991                                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1992                                 "movd 15(%0, %%ebx), %%mm1      \n\t"
1993                                 "punpcklbw %%mm7, %%mm4         \n\t"
1994                                 "punpcklbw %%mm7, %%mm1         \n\t"
1995                                 "movd 18(%0, %%ebx), %%mm2      \n\t"
1996                                 "movd 21(%0, %%ebx), %%mm3      \n\t"
1997                                 "punpcklbw %%mm7, %%mm2         \n\t"
1998                                 "punpcklbw %%mm7, %%mm3         \n\t"
1999                                 "pmaddwd %%mm6, %%mm4           \n\t"
2000                                 "pmaddwd %%mm6, %%mm1           \n\t"
2001                                 "pmaddwd %%mm6, %%mm2           \n\t"
2002                                 "pmaddwd %%mm6, %%mm3           \n\t"
2003 #ifndef FAST_BGR2YV12
2004                                 "psrad $8, %%mm4                \n\t"
2005                                 "psrad $8, %%mm1                \n\t"
2006                                 "psrad $8, %%mm2                \n\t"
2007                                 "psrad $8, %%mm3                \n\t"
2008 #endif
2009                                 "packssdw %%mm1, %%mm4          \n\t"
2010                                 "packssdw %%mm3, %%mm2          \n\t"
2011                                 "pmaddwd %%mm5, %%mm4           \n\t"
2012                                 "pmaddwd %%mm5, %%mm2           \n\t"
2013                                 "addl $24, %%ebx                \n\t"
2014                                 "packssdw %%mm2, %%mm4          \n\t"
2015                                 "psraw $7, %%mm4                \n\t"
2016
2017                                 "packuswb %%mm4, %%mm0          \n\t"
2018                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
2019
2020                                 MOVNTQ" %%mm0, (%1, %%eax)      \n\t"
2021                                 "addl $8, %%eax                 \n\t"
2022                                 " js 1b                         \n\t"
2023                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2024                                 : "%eax", "%ebx"
2025                         );
2026                         ydst += lumStride;
2027                         src  += srcStride;
2028                 }
2029                 src -= srcStride*2;
2030                 asm volatile(
2031                         "movl %4, %%eax                 \n\t"
2032                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2033                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
2034                         "pxor %%mm7, %%mm7              \n\t"
2035                         "leal (%%eax, %%eax, 2), %%ebx  \n\t"
2036                         "addl %%ebx, %%ebx              \n\t"
2037                         ".balign 16                     \n\t"
2038                         "1:                             \n\t"
2039                         PREFETCH" 64(%0, %%ebx)         \n\t"
2040                         PREFETCH" 64(%1, %%ebx)         \n\t"
2041 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2042                         "movq (%0, %%ebx), %%mm0        \n\t"
2043                         "movq (%1, %%ebx), %%mm1        \n\t"
2044                         "movq 6(%0, %%ebx), %%mm2       \n\t"
2045                         "movq 6(%1, %%ebx), %%mm3       \n\t"
2046                         PAVGB" %%mm1, %%mm0             \n\t"
2047                         PAVGB" %%mm3, %%mm2             \n\t"
2048                         "movq %%mm0, %%mm1              \n\t"
2049                         "movq %%mm2, %%mm3              \n\t"
2050                         "psrlq $24, %%mm0               \n\t"
2051                         "psrlq $24, %%mm2               \n\t"
2052                         PAVGB" %%mm1, %%mm0             \n\t"
2053                         PAVGB" %%mm3, %%mm2             \n\t"
2054                         "punpcklbw %%mm7, %%mm0         \n\t"
2055                         "punpcklbw %%mm7, %%mm2         \n\t"
2056 #else
2057                         "movd (%0, %%ebx), %%mm0        \n\t"
2058                         "movd (%1, %%ebx), %%mm1        \n\t"
2059                         "movd 3(%0, %%ebx), %%mm2       \n\t"
2060                         "movd 3(%1, %%ebx), %%mm3       \n\t"
2061                         "punpcklbw %%mm7, %%mm0         \n\t"
2062                         "punpcklbw %%mm7, %%mm1         \n\t"
2063                         "punpcklbw %%mm7, %%mm2         \n\t"
2064                         "punpcklbw %%mm7, %%mm3         \n\t"
2065                         "paddw %%mm1, %%mm0             \n\t"
2066                         "paddw %%mm3, %%mm2             \n\t"
2067                         "paddw %%mm2, %%mm0             \n\t"
2068                         "movd 6(%0, %%ebx), %%mm4       \n\t"
2069                         "movd 6(%1, %%ebx), %%mm1       \n\t"
2070                         "movd 9(%0, %%ebx), %%mm2       \n\t"
2071                         "movd 9(%1, %%ebx), %%mm3       \n\t"
2072                         "punpcklbw %%mm7, %%mm4         \n\t"
2073                         "punpcklbw %%mm7, %%mm1         \n\t"
2074                         "punpcklbw %%mm7, %%mm2         \n\t"
2075                         "punpcklbw %%mm7, %%mm3         \n\t"
2076                         "paddw %%mm1, %%mm4             \n\t"
2077                         "paddw %%mm3, %%mm2             \n\t"
2078                         "paddw %%mm4, %%mm2             \n\t"
2079                         "psrlw $2, %%mm0                \n\t"
2080                         "psrlw $2, %%mm2                \n\t"
2081 #endif
2082                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2083                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2084
2085                         "pmaddwd %%mm0, %%mm1           \n\t"
2086                         "pmaddwd %%mm2, %%mm3           \n\t"
2087                         "pmaddwd %%mm6, %%mm0           \n\t"
2088                         "pmaddwd %%mm6, %%mm2           \n\t"
2089 #ifndef FAST_BGR2YV12
2090                         "psrad $8, %%mm0                \n\t"
2091                         "psrad $8, %%mm1                \n\t"
2092                         "psrad $8, %%mm2                \n\t"
2093                         "psrad $8, %%mm3                \n\t"
2094 #endif
2095                         "packssdw %%mm2, %%mm0          \n\t"
2096                         "packssdw %%mm3, %%mm1          \n\t"
2097                         "pmaddwd %%mm5, %%mm0           \n\t"
2098                         "pmaddwd %%mm5, %%mm1           \n\t"
2099                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2100                         "psraw $7, %%mm0                \n\t"
2101
2102 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2103                         "movq 12(%0, %%ebx), %%mm4      \n\t"
2104                         "movq 12(%1, %%ebx), %%mm1      \n\t"
2105                         "movq 18(%0, %%ebx), %%mm2      \n\t"
2106                         "movq 18(%1, %%ebx), %%mm3      \n\t"
2107                         PAVGB" %%mm1, %%mm4             \n\t"
2108                         PAVGB" %%mm3, %%mm2             \n\t"
2109                         "movq %%mm4, %%mm1              \n\t"
2110                         "movq %%mm2, %%mm3              \n\t"
2111                         "psrlq $24, %%mm4               \n\t"
2112                         "psrlq $24, %%mm2               \n\t"
2113                         PAVGB" %%mm1, %%mm4             \n\t"
2114                         PAVGB" %%mm3, %%mm2             \n\t"
2115                         "punpcklbw %%mm7, %%mm4         \n\t"
2116                         "punpcklbw %%mm7, %%mm2         \n\t"
2117 #else
2118                         "movd 12(%0, %%ebx), %%mm4      \n\t"
2119                         "movd 12(%1, %%ebx), %%mm1      \n\t"
2120                         "movd 15(%0, %%ebx), %%mm2      \n\t"
2121                         "movd 15(%1, %%ebx), %%mm3      \n\t"
2122                         "punpcklbw %%mm7, %%mm4         \n\t"
2123                         "punpcklbw %%mm7, %%mm1         \n\t"
2124                         "punpcklbw %%mm7, %%mm2         \n\t"
2125                         "punpcklbw %%mm7, %%mm3         \n\t"
2126                         "paddw %%mm1, %%mm4             \n\t"
2127                         "paddw %%mm3, %%mm2             \n\t"
2128                         "paddw %%mm2, %%mm4             \n\t"
2129                         "movd 18(%0, %%ebx), %%mm5      \n\t"
2130                         "movd 18(%1, %%ebx), %%mm1      \n\t"
2131                         "movd 21(%0, %%ebx), %%mm2      \n\t"
2132                         "movd 21(%1, %%ebx), %%mm3      \n\t"
2133                         "punpcklbw %%mm7, %%mm5         \n\t"
2134                         "punpcklbw %%mm7, %%mm1         \n\t"
2135                         "punpcklbw %%mm7, %%mm2         \n\t"
2136                         "punpcklbw %%mm7, %%mm3         \n\t"
2137                         "paddw %%mm1, %%mm5             \n\t"
2138                         "paddw %%mm3, %%mm2             \n\t"
2139                         "paddw %%mm5, %%mm2             \n\t"
2140                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2141                         "psrlw $2, %%mm4                \n\t"
2142                         "psrlw $2, %%mm2                \n\t"
2143 #endif
2144                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2145                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2146
2147                         "pmaddwd %%mm4, %%mm1           \n\t"
2148                         "pmaddwd %%mm2, %%mm3           \n\t"
2149                         "pmaddwd %%mm6, %%mm4           \n\t"
2150                         "pmaddwd %%mm6, %%mm2           \n\t"
2151 #ifndef FAST_BGR2YV12
2152                         "psrad $8, %%mm4                \n\t"
2153                         "psrad $8, %%mm1                \n\t"
2154                         "psrad $8, %%mm2                \n\t"
2155                         "psrad $8, %%mm3                \n\t"
2156 #endif
2157                         "packssdw %%mm2, %%mm4          \n\t"
2158                         "packssdw %%mm3, %%mm1          \n\t"
2159                         "pmaddwd %%mm5, %%mm4           \n\t"
2160                         "pmaddwd %%mm5, %%mm1           \n\t"
2161                         "addl $24, %%ebx                \n\t"
2162                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2163                         "psraw $7, %%mm4                \n\t"
2164
2165                         "movq %%mm0, %%mm1              \n\t"
2166                         "punpckldq %%mm4, %%mm0         \n\t"
2167                         "punpckhdq %%mm4, %%mm1         \n\t"
2168                         "packsswb %%mm1, %%mm0          \n\t"
2169                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2170
2171                         "movd %%mm0, (%2, %%eax)        \n\t"
2172                         "punpckhdq %%mm0, %%mm0         \n\t"
2173                         "movd %%mm0, (%3, %%eax)        \n\t"
2174                         "addl $4, %%eax                 \n\t"
2175                         " js 1b                         \n\t"
2176                         : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2177                         : "%eax", "%ebx"
2178                 );
2179
2180                 udst += chromStride;
2181                 vdst += chromStride;
2182                 src  += srcStride*2;
2183         }
2184
2185         asm volatile(   EMMS" \n\t"
2186                         SFENCE" \n\t"
2187                         :::"memory");
2188 #else
2189         y=0;
2190 #endif
2191         for(; y<height; y+=2)
2192         {
2193                 unsigned i;
2194                 for(i=0; i<chromWidth; i++)
2195                 {
2196                         unsigned int b= src[6*i+0];
2197                         unsigned int g= src[6*i+1];
2198                         unsigned int r= src[6*i+2];
2199
2200                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2201                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2202                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2203
2204                         udst[i]         = U;
2205                         vdst[i]         = V;
2206                         ydst[2*i]       = Y;
2207
2208                         b= src[6*i+3];
2209                         g= src[6*i+4];
2210                         r= src[6*i+5];
2211
2212                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2213                         ydst[2*i+1]     = Y;
2214                 }
2215                 ydst += lumStride;
2216                 src  += srcStride;
2217
2218                 for(i=0; i<chromWidth; i++)
2219                 {
2220                         unsigned int b= src[6*i+0];
2221                         unsigned int g= src[6*i+1];
2222                         unsigned int r= src[6*i+2];
2223
2224                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2225
2226                         ydst[2*i]       = Y;
2227
2228                         b= src[6*i+3];
2229                         g= src[6*i+4];
2230                         r= src[6*i+5];
2231
2232                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2233                         ydst[2*i+1]     = Y;
2234                 }
2235                 udst += chromStride;
2236                 vdst += chromStride;
2237                 ydst += lumStride;
2238                 src  += srcStride;
2239         }
2240 }
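/*
 * Conversion computed by the C path above (the MMX path does the same with
 * the bgr2YCoeff/bgr2UCoeff/bgr2VCoeff tables defined elsewhere in rgb2rgb):
 *
 *     Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) +  16;
 *     U = ((RU*r + GU*g + BU*b) >> RGB2YUV_SHIFT) + 128;
 *     V = ((RV*r + GV*g + BV*b) >> RGB2YUV_SHIFT) + 128;
 *
 * Note that byte 0 of every 3 byte pixel is read as blue (b= src[6*i+0]), so
 * despite the rgb24 name the input is expected in B,G,R memory order.
 */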
2241
2242 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2243                             unsigned width, unsigned height, int src1Stride,
2244                             int src2Stride, int dstStride){
2245         unsigned h;
2246
2247         for(h=0; h < height; h++)
2248         {
2249                 unsigned w;
2250
2251 #ifdef HAVE_MMX
2252 #ifdef HAVE_SSE2
2253                 asm(
2254                         "xorl %%eax, %%eax              \n\t"
2255                         "1:                             \n\t"
2256                         PREFETCH" 64(%1, %%eax)         \n\t"
2257                         PREFETCH" 64(%2, %%eax)         \n\t"
2258                         "movdqa (%1, %%eax), %%xmm0     \n\t"
2259                         "movdqa (%1, %%eax), %%xmm1     \n\t" // same 16 src1 bytes again (a copy of xmm0)
2260                         "movdqa (%2, %%eax), %%xmm2     \n\t"
2261                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2262                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2263                         "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2264                         "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2265                         "addl $16, %%eax                        \n\t"
2266                         "cmpl %3, %%eax                 \n\t"
2267                         " jb 1b                         \n\t"
2268                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2269                         : "memory", "%eax"
2270                 );
2271 #else
2272                 asm(
2273                         "xorl %%eax, %%eax              \n\t"
2274                         "1:                             \n\t"
2275                         PREFETCH" 64(%1, %%eax)         \n\t"
2276                         PREFETCH" 64(%2, %%eax)         \n\t"
2277                         "movq (%1, %%eax), %%mm0        \n\t"
2278                         "movq 8(%1, %%eax), %%mm2       \n\t"
2279                         "movq %%mm0, %%mm1              \n\t"
2280                         "movq %%mm2, %%mm3              \n\t"
2281                         "movq (%2, %%eax), %%mm4        \n\t"
2282                         "movq 8(%2, %%eax), %%mm5       \n\t"
2283                         "punpcklbw %%mm4, %%mm0         \n\t"
2284                         "punpckhbw %%mm4, %%mm1         \n\t"
2285                         "punpcklbw %%mm5, %%mm2         \n\t"
2286                         "punpckhbw %%mm5, %%mm3         \n\t"
2287                         MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
2288                         MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
2289                         MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2290                         MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2291                         "addl $16, %%eax                        \n\t"
2292                         "cmpl %3, %%eax                 \n\t"
2293                         " jb 1b                         \n\t"
2294                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2295                         : "memory", "%eax"
2296                 );
2297 #endif
2298                 for(w= (width&(~15)); w < width; w++)
2299                 {
2300                         dest[2*w+0] = src1[w];
2301                         dest[2*w+1] = src2[w];
2302                 }
2303 #else
2304                 for(w=0; w < width; w++)
2305                 {
2306                         dest[2*w+0] = src1[w];
2307                         dest[2*w+1] = src2[w];
2308                 }
2309 #endif
2310                 dest += dstStride;
2311                 src1 += src1Stride;
2312                 src2 += src2Stride;
2313         }
2314 #ifdef HAVE_MMX
2315         asm(
2316                 EMMS" \n\t"
2317                 SFENCE" \n\t"
2318                 ::: "memory"
2319                 );
2320 #endif
2321 }
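/*
 * interleaveBytes() zips two byte planes together, dest[2*w] = src1[w] and
 * dest[2*w+1] = src2[w], as the C fall-back shows. A typical (hypothetical)
 * use is merging separate U and V planes into one packed UVUV... plane:
 *
 *     interleaveBytes(u_plane, v_plane, uv_packed, W/2, H/2,
 *                     W/2, W/2, W);
 */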
2322
2323 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2324                         uint8_t *dst1, uint8_t *dst2,
2325                         unsigned width, unsigned height,
2326                         int srcStride1, int srcStride2,
2327                         int dstStride1, int dstStride2)
2328 {
2329     unsigned int y,x,h;
2330     int w;
2331     w=width/2; h=height/2;
2332 #ifdef HAVE_MMX
2333     asm volatile(
2334         PREFETCH" %0\n\t"
2335         PREFETCH" %1\n\t"
2336         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2337 #endif
2338     for(y=0;y<h;y++){
2339         const uint8_t* s1=src1+srcStride1*(y>>1);
2340         uint8_t* d=dst1+dstStride1*y;
2341         x=0;
2342 #ifdef HAVE_MMX
2343         for(;x<w-31;x+=32)
2344         {
2345             asm volatile(
2346                 PREFETCH" 32%1\n\t"
2347                 "movq   %1, %%mm0\n\t"
2348                 "movq   8%1, %%mm2\n\t"
2349                 "movq   16%1, %%mm4\n\t"
2350                 "movq   24%1, %%mm6\n\t"
2351                 "movq   %%mm0, %%mm1\n\t"
2352                 "movq   %%mm2, %%mm3\n\t"
2353                 "movq   %%mm4, %%mm5\n\t"
2354                 "movq   %%mm6, %%mm7\n\t"
2355                 "punpcklbw %%mm0, %%mm0\n\t"
2356                 "punpckhbw %%mm1, %%mm1\n\t"
2357                 "punpcklbw %%mm2, %%mm2\n\t"
2358                 "punpckhbw %%mm3, %%mm3\n\t"
2359                 "punpcklbw %%mm4, %%mm4\n\t"
2360                 "punpckhbw %%mm5, %%mm5\n\t"
2361                 "punpcklbw %%mm6, %%mm6\n\t"
2362                 "punpckhbw %%mm7, %%mm7\n\t"
2363                 MOVNTQ" %%mm0, %0\n\t"
2364                 MOVNTQ" %%mm1, 8%0\n\t"
2365                 MOVNTQ" %%mm2, 16%0\n\t"
2366                 MOVNTQ" %%mm3, 24%0\n\t"
2367                 MOVNTQ" %%mm4, 32%0\n\t"
2368                 MOVNTQ" %%mm5, 40%0\n\t"
2369                 MOVNTQ" %%mm6, 48%0\n\t"
2370                 MOVNTQ" %%mm7, 56%0"
2371                 :"=m"(d[2*x])
2372                 :"m"(s1[x])
2373                 :"memory");
2374         }
2375 #endif
2376         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2377     }
2378     for(y=0;y<h;y++){
2379         const uint8_t* s2=src2+srcStride2*(y>>1);
2380         uint8_t* d=dst2+dstStride2*y;
2381         x=0;
2382 #ifdef HAVE_MMX
2383         for(;x<w-31;x+=32)
2384         {
2385             asm volatile(
2386                 PREFETCH" 32%1\n\t"
2387                 "movq   %1, %%mm0\n\t"
2388                 "movq   8%1, %%mm2\n\t"
2389                 "movq   16%1, %%mm4\n\t"
2390                 "movq   24%1, %%mm6\n\t"
2391                 "movq   %%mm0, %%mm1\n\t"
2392                 "movq   %%mm2, %%mm3\n\t"
2393                 "movq   %%mm4, %%mm5\n\t"
2394                 "movq   %%mm6, %%mm7\n\t"
2395                 "punpcklbw %%mm0, %%mm0\n\t"
2396                 "punpckhbw %%mm1, %%mm1\n\t"
2397                 "punpcklbw %%mm2, %%mm2\n\t"
2398                 "punpckhbw %%mm3, %%mm3\n\t"
2399                 "punpcklbw %%mm4, %%mm4\n\t"
2400                 "punpckhbw %%mm5, %%mm5\n\t"
2401                 "punpcklbw %%mm6, %%mm6\n\t"
2402                 "punpckhbw %%mm7, %%mm7\n\t"
2403                 MOVNTQ" %%mm0, %0\n\t"
2404                 MOVNTQ" %%mm1, 8%0\n\t"
2405                 MOVNTQ" %%mm2, 16%0\n\t"
2406                 MOVNTQ" %%mm3, 24%0\n\t"
2407                 MOVNTQ" %%mm4, 32%0\n\t"
2408                 MOVNTQ" %%mm5, 40%0\n\t"
2409                 MOVNTQ" %%mm6, 48%0\n\t"
2410                 MOVNTQ" %%mm7, 56%0"
2411                 :"=m"(d[2*x])
2412                 :"m"(s2[x])
2413                 :"memory");
2414         }
2415 #endif
2416         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2417     }
2418 #ifdef HAVE_MMX
2419         asm(
2420                 EMMS" \n\t"
2421                 SFENCE" \n\t"
2422                 ::: "memory"
2423                 );
2424 #endif
2425 }
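/*
 * vu9_to_vu12() upscales both chroma planes by 2x in each direction using
 * plain sample replication: horizontally via d[2*x] = d[2*x+1] = s[x], and
 * vertically by reading the same source line for two consecutive output
 * lines (srcStride*(y>>1)).
 */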
2426
2427 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2428                         uint8_t *dst,
2429                         unsigned width, unsigned height,
2430                         int srcStride1, int srcStride2,
2431                         int srcStride3, int dstStride)
2432 {
2433     unsigned y,x,w,h;
2434     w=width/2; h=height;
2435     for(y=0;y<h;y++){
2436         const uint8_t* yp=src1+srcStride1*y;
2437         const uint8_t* up=src2+srcStride2*(y>>2);
2438         const uint8_t* vp=src3+srcStride3*(y>>2);
2439         uint8_t* d=dst+dstStride*y;
2440         x=0;
2441 #ifdef HAVE_MMX
2442         for(;x<w-7;x+=8)
2443         {
2444             asm volatile(
2445                 PREFETCH" 32(%1, %0)\n\t"
2446                 PREFETCH" 32(%2, %0)\n\t"
2447                 PREFETCH" 32(%3, %0)\n\t"
2448                 "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2449                 "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2450                 "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
2451                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2452                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2453                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2454                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2455                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2456                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2457                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2458
2459                 "movq   %%mm1, %%mm6\n\t"
2460                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2461                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2462                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2463                 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2464                 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2465                 
2466                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2467                 "movq   8(%1, %0, 4), %%mm0\n\t"
2468                 "movq   %%mm0, %%mm3\n\t"
2469                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2470                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2471                 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2472                 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2473
2474                 "movq   %%mm4, %%mm6\n\t"
2475                 "movq   16(%1, %0, 4), %%mm0\n\t"
2476                 "movq   %%mm0, %%mm3\n\t"
2477                 "punpcklbw %%mm5, %%mm4\n\t"
2478                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2479                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2480                 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2481                 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2482                 
2483                 "punpckhbw %%mm5, %%mm6\n\t"
2484                 "movq   24(%1, %0, 4), %%mm0\n\t"
2485                 "movq   %%mm0, %%mm3\n\t"
2486                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2487                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2488                 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2489                 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2490
2491                 : "+r" (x)
2492                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2493                 :"memory");
2494         }
2495 #endif
2496         for(; x<w; x++)
2497         {
2498             const int x2= x<<2;
2499             d[8*x+0]=yp[x2];
2500             d[8*x+1]=up[x];
2501             d[8*x+2]=yp[x2+1];
2502             d[8*x+3]=vp[x];
2503             d[8*x+4]=yp[x2+2];
2504             d[8*x+5]=up[x];
2505             d[8*x+6]=yp[x2+3];
2506             d[8*x+7]=vp[x];
2507         }
2508     }
2509 #ifdef HAVE_MMX
2510         asm(
2511                 EMMS" \n\t"
2512                 SFENCE" \n\t"
2513                 ::: "memory"
2514                 );
2515 #endif
2516 }
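/*
 * yvu9_to_yuy2() packs YVU9 (chroma subsampled by 4 both horizontally and
 * vertically) into YUY2 by replicating each chroma sample across four luma
 * samples per line (yp[x2..x2+3] with x2 = x<<2) and across four lines
 * (srcStride*(y>>2)), as the C fall-back above shows.
 */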