1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB converter
4  *  plus:      Software PAL8 to RGB converter
5  *             Software YUV to YUV converter
6  *             Software YUV to RGB converter
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  *  lots of big-endian byteorder fixes by Alex Beregszaszi
10  */
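/*
 * Note: this file is a template.  It is #included (typically from rgb2rgb.c)
 * once per CPU flavour, with RENAME() mapping every function below to its
 * plain C, MMX, MMX2 or 3DNow! variant; the best variant is then selected at
 * runtime.  A hypothetical instantiation (the real one lives elsewhere) would
 * look roughly like:
 *
 *   #define RENAME(a) a ## _MMX
 *   #include "rgb2rgb_template.c"
 */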
11
12 #include <stddef.h>
13 #include <inttypes.h> /* for __WORDSIZE */
14
15 #ifndef __WORDSIZE
16 // #warning You have a misconfigured system and will probably lose performance!
17 #define __WORDSIZE MP_WORDSIZE
18 #endif
19
20 #undef PREFETCH
21 #undef MOVNTQ
22 #undef EMMS
23 #undef SFENCE
24 #undef MMREG_SIZE
25 #undef PREFETCHW
26 #undef PAVGB
27
28 #ifdef HAVE_SSE2
29 #define MMREG_SIZE 16
30 #else
31 #define MMREG_SIZE 8
32 #endif
33
34 #ifdef HAVE_3DNOW
35 #define PREFETCH  "prefetch"
36 #define PREFETCHW "prefetchw"
37 #define PAVGB     "pavgusb"
38 #elif defined ( HAVE_MMX2 )
39 #define PREFETCH "prefetchnta"
40 #define PREFETCHW "prefetcht0"
41 #define PAVGB     "pavgb"
42 #else
43 #define PREFETCH "/nop"
44 #define PREFETCHW "/nop"
45 #endif
46
47 #ifdef HAVE_3DNOW
48 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
49 #define EMMS     "femms"
50 #else
51 #define EMMS     "emms"
52 #endif
53
54 #ifdef HAVE_MMX2
55 #define MOVNTQ "movntq"
56 #define SFENCE "sfence"
57 #else
58 #define MOVNTQ "movq"
59 #define SFENCE "/nop"
60 #endif
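/*
 * MOVNTQ is a non-temporal (cache-bypassing) store and SFENCE orders those
 * stores before the buffer is handed back; without MMX2 they fall back to a
 * plain movq and a nop, so the code stays correct, just without the
 * cache-bypass benefit.
 */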
61
62 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
63 {
64   uint8_t *dest = dst;
65   const uint8_t *s = src;
66   const uint8_t *end;
67 #ifdef HAVE_MMX
68   const uint8_t *mm_end;
69 #endif
70   end = s + src_size;
71 #ifdef HAVE_MMX
72   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
73   mm_end = end - 23;
74   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
75   while(s < mm_end)
76   {
77     __asm __volatile(
78         PREFETCH"       32%1\n\t"
79         "movd   %1, %%mm0\n\t"
80         "punpckldq 3%1, %%mm0\n\t"
81         "movd   6%1, %%mm1\n\t"
82         "punpckldq 9%1, %%mm1\n\t"
83         "movd   12%1, %%mm2\n\t"
84         "punpckldq 15%1, %%mm2\n\t"
85         "movd   18%1, %%mm3\n\t"
86         "punpckldq 21%1, %%mm3\n\t"
87         "pand   %%mm7, %%mm0\n\t"
88         "pand   %%mm7, %%mm1\n\t"
89         "pand   %%mm7, %%mm2\n\t"
90         "pand   %%mm7, %%mm3\n\t"
91         MOVNTQ" %%mm0, %0\n\t"
92         MOVNTQ" %%mm1, 8%0\n\t"
93         MOVNTQ" %%mm2, 16%0\n\t"
94         MOVNTQ" %%mm3, 24%0"
95         :"=m"(*dest)
96         :"m"(*s)
97         :"memory");
98     dest += 32;
99     s += 24;
100   }
101   __asm __volatile(SFENCE:::"memory");
102   __asm __volatile(EMMS:::"memory");
103 #endif
104   while(s < end)
105   {
106 #ifdef WORDS_BIGENDIAN
107     /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
108     *dest++ = 0;
109     *dest++ = s[2];
110     *dest++ = s[1];
111     *dest++ = s[0];
112     s+=3;
113 #else
114     *dest++ = *s++;
115     *dest++ = *s++;
116     *dest++ = *s++;
117     *dest++ = 0;
118 #endif
119   }
120 }
121
122 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
123 {
124   uint8_t *dest = dst;
125   const uint8_t *s = src;
126   const uint8_t *end;
127 #ifdef HAVE_MMX
128   const uint8_t *mm_end;
129 #endif
130   end = s + src_size;
131 #ifdef HAVE_MMX
132   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
133   mm_end = end - 31;
134   while(s < mm_end)
135   {
136     __asm __volatile(
137         PREFETCH"       32%1\n\t"
138         "movq   %1, %%mm0\n\t"
139         "movq   8%1, %%mm1\n\t"
140         "movq   16%1, %%mm4\n\t"
141         "movq   24%1, %%mm5\n\t"
142         "movq   %%mm0, %%mm2\n\t"
143         "movq   %%mm1, %%mm3\n\t"
144         "movq   %%mm4, %%mm6\n\t"
145         "movq   %%mm5, %%mm7\n\t"
146         "psrlq  $8, %%mm2\n\t"
147         "psrlq  $8, %%mm3\n\t"
148         "psrlq  $8, %%mm6\n\t"
149         "psrlq  $8, %%mm7\n\t"
150         "pand   %2, %%mm0\n\t"
151         "pand   %2, %%mm1\n\t"
152         "pand   %2, %%mm4\n\t"
153         "pand   %2, %%mm5\n\t"
154         "pand   %3, %%mm2\n\t"
155         "pand   %3, %%mm3\n\t"
156         "pand   %3, %%mm6\n\t"
157         "pand   %3, %%mm7\n\t"
158         "por    %%mm2, %%mm0\n\t"
159         "por    %%mm3, %%mm1\n\t"
160         "por    %%mm6, %%mm4\n\t"
161         "por    %%mm7, %%mm5\n\t"
162
163         "movq   %%mm1, %%mm2\n\t"
164         "movq   %%mm4, %%mm3\n\t"
165         "psllq  $48, %%mm2\n\t"
166         "psllq  $32, %%mm3\n\t"
167         "pand   %4, %%mm2\n\t"
168         "pand   %5, %%mm3\n\t"
169         "por    %%mm2, %%mm0\n\t"
170         "psrlq  $16, %%mm1\n\t"
171         "psrlq  $32, %%mm4\n\t"
172         "psllq  $16, %%mm5\n\t"
173         "por    %%mm3, %%mm1\n\t"
174         "pand   %6, %%mm5\n\t"
175         "por    %%mm5, %%mm4\n\t"
176
177         MOVNTQ" %%mm0, %0\n\t"
178         MOVNTQ" %%mm1, 8%0\n\t"
179         MOVNTQ" %%mm4, 16%0"
180         :"=m"(*dest)
181         :"m"(*s),"m"(mask24l),
182          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
183         :"memory");
184     dest += 24;
185     s += 32;
186   }
187   __asm __volatile(SFENCE:::"memory");
188   __asm __volatile(EMMS:::"memory");
189 #endif
190   while(s < end)
191   {
192 #ifdef WORDS_BIGENDIAN
193     /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
194     s++;
195     dest[2] = *s++;
196     dest[1] = *s++;
197     dest[0] = *s++;
198     dest += 3;
199 #else
200     *dest++ = *s++;
201     *dest++ = *s++;
202     *dest++ = *s++;
203     s++;
204 #endif
205   }
206 }
207
208 /*
209  Original by Strepto/Astral
210  ported to gcc & bugfixed: A'rpi
211  MMX2, 3DNow! optimization by Nick Kurshev
212  32-bit C version, and the and&add trick by Michael Niedermayer
213 */
214 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
215 {
216   register const uint8_t* s=src;
217   register uint8_t* d=dst;
218   register const uint8_t *end;
219   const uint8_t *mm_end;
220   end = s + src_size;
221 #ifdef HAVE_MMX
222   __asm __volatile(PREFETCH"    %0"::"m"(*s));
223   __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
224   mm_end = end - 15;
225   while(s<mm_end)
226   {
227         __asm __volatile(
228                 PREFETCH"       32%1\n\t"
229                 "movq   %1, %%mm0\n\t"
230                 "movq   8%1, %%mm2\n\t"
231                 "movq   %%mm0, %%mm1\n\t"
232                 "movq   %%mm2, %%mm3\n\t"
233                 "pand   %%mm4, %%mm0\n\t"
234                 "pand   %%mm4, %%mm2\n\t"
235                 "paddw  %%mm1, %%mm0\n\t"
236                 "paddw  %%mm3, %%mm2\n\t"
237                 MOVNTQ" %%mm0, %0\n\t"
238                 MOVNTQ" %%mm2, 8%0"
239                 :"=m"(*d)
240                 :"m"(*s)
241                 );
242         d+=16;
243         s+=16;
244   }
245   __asm __volatile(SFENCE:::"memory");
246   __asm __volatile(EMMS:::"memory");
247 #endif
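    /*
     * C fallback using the and&add trick credited above: x&0x7FE0 selects the
     * R and G bits of each 15 bit pixel, and adding that to x&0x7FFF doubles
     * the R/G field, i.e. shifts it up by one bit, while the 5 blue bits stay
     * put; the new lowest green bit is left at 0.  The 32 bit store below
     * handles two pixels per iteration.
     */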
248     mm_end = end - 3;
249     while(s < mm_end)
250     {
251         register unsigned x= *((uint32_t *)s);
252         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
253         d+=4;
254         s+=4;
255     }
256     if(s < end)
257     {
258         register unsigned short x= *((uint16_t *)s);
259         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
260     }
261 }
262
263 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
264 {
265   register const uint8_t* s=src;
266   register uint8_t* d=dst;
267   register const uint8_t *end;
268   const uint8_t *mm_end;
269   end = s + src_size;
270 #ifdef HAVE_MMX
271   __asm __volatile(PREFETCH"    %0"::"m"(*s));
272   __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
273   __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
274   mm_end = end - 15;
275   while(s<mm_end)
276   {
277         __asm __volatile(
278                 PREFETCH"       32%1\n\t"
279                 "movq   %1, %%mm0\n\t"
280                 "movq   8%1, %%mm2\n\t"
281                 "movq   %%mm0, %%mm1\n\t"
282                 "movq   %%mm2, %%mm3\n\t"
283                 "psrlq  $1, %%mm0\n\t"
284                 "psrlq  $1, %%mm2\n\t"
285                 "pand   %%mm7, %%mm0\n\t"
286                 "pand   %%mm7, %%mm2\n\t"
287                 "pand   %%mm6, %%mm1\n\t"
288                 "pand   %%mm6, %%mm3\n\t"
289                 "por    %%mm1, %%mm0\n\t"
290                 "por    %%mm3, %%mm2\n\t"
291                 MOVNTQ" %%mm0, %0\n\t"
292                 MOVNTQ" %%mm2, 8%0"
293                 :"=m"(*d)
294                 :"m"(*s)
295                 );
296         d+=16;
297         s+=16;
298   }
299   __asm __volatile(SFENCE:::"memory");
300   __asm __volatile(EMMS:::"memory");
301 #endif
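    /*
     * C fallback: shift the R and G fields right by one (dropping the lowest
     * green bit) and keep the 5 blue bits unchanged; the top bit of the
     * result is always 0, as required for 15 bit RGB.
     */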
302     mm_end = end - 3;
303     while(s < mm_end)
304     {
305         register uint32_t x= *((uint32_t *)s);
306         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
307         s+=4;
308         d+=4;
309     }
310     if(s < end)
311     {
312         register uint16_t x= *((uint16_t *)s);
313         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
314         s+=2;
315         d+=2;
316     }
317 }
318
319 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
320 {
321         const uint8_t *s = src;
322         const uint8_t *end;
323 #ifdef HAVE_MMX
324         const uint8_t *mm_end;
325 #endif
326         uint16_t *d = (uint16_t *)dst;
327         end = s + src_size;
328 #ifdef HAVE_MMX
329         mm_end = end - 15;
330 #if 1 // is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it is slightly faster)
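                /*
                 * The variant below isolates green with one mask and blue+red
                 * with another, then uses a single pmaddwd (constants
                 * mask3216g, mask3216br and mul3216, defined elsewhere) to
                 * move blue and red into their 565 positions in one
                 * multiply-accumulate, so each pair of pixels only needs the
                 * final shift and OR.
                 */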
331         asm volatile(
332                 "movq %3, %%mm5                 \n\t"
333                 "movq %4, %%mm6                 \n\t"
334                 "movq %5, %%mm7                 \n\t"
335                 ".balign 16                     \n\t"
336                 "1:                             \n\t"
337                 PREFETCH" 32(%1)                \n\t"
338                 "movd   (%1), %%mm0             \n\t"
339                 "movd   4(%1), %%mm3            \n\t"
340                 "punpckldq 8(%1), %%mm0         \n\t"
341                 "punpckldq 12(%1), %%mm3        \n\t"
342                 "movq %%mm0, %%mm1              \n\t"
343                 "movq %%mm3, %%mm4              \n\t"
344                 "pand %%mm6, %%mm0              \n\t"
345                 "pand %%mm6, %%mm3              \n\t"
346                 "pmaddwd %%mm7, %%mm0           \n\t"
347                 "pmaddwd %%mm7, %%mm3           \n\t"
348                 "pand %%mm5, %%mm1              \n\t"
349                 "pand %%mm5, %%mm4              \n\t"
350                 "por %%mm1, %%mm0               \n\t"   
351                 "por %%mm4, %%mm3               \n\t"
352                 "psrld $5, %%mm0                \n\t"
353                 "pslld $11, %%mm3               \n\t"
354                 "por %%mm3, %%mm0               \n\t"
355                 MOVNTQ" %%mm0, (%0)             \n\t"
356                 "add $16, %1                    \n\t"
357                 "add $8, %0                     \n\t"
358                 "cmp %2, %1                     \n\t"
359                 " jb 1b                         \n\t"
360                 : "+r" (d), "+r"(s)
361                 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
362         );
363 #else
364         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
365         __asm __volatile(
366             "movq       %0, %%mm7\n\t"
367             "movq       %1, %%mm6\n\t"
368             ::"m"(red_16mask),"m"(green_16mask));
369         while(s < mm_end)
370         {
371             __asm __volatile(
372                 PREFETCH" 32%1\n\t"
373                 "movd   %1, %%mm0\n\t"
374                 "movd   4%1, %%mm3\n\t"
375                 "punpckldq 8%1, %%mm0\n\t"
376                 "punpckldq 12%1, %%mm3\n\t"
377                 "movq   %%mm0, %%mm1\n\t"
378                 "movq   %%mm0, %%mm2\n\t"
379                 "movq   %%mm3, %%mm4\n\t"
380                 "movq   %%mm3, %%mm5\n\t"
381                 "psrlq  $3, %%mm0\n\t"
382                 "psrlq  $3, %%mm3\n\t"
383                 "pand   %2, %%mm0\n\t"
384                 "pand   %2, %%mm3\n\t"
385                 "psrlq  $5, %%mm1\n\t"
386                 "psrlq  $5, %%mm4\n\t"
387                 "pand   %%mm6, %%mm1\n\t"
388                 "pand   %%mm6, %%mm4\n\t"
389                 "psrlq  $8, %%mm2\n\t"
390                 "psrlq  $8, %%mm5\n\t"
391                 "pand   %%mm7, %%mm2\n\t"
392                 "pand   %%mm7, %%mm5\n\t"
393                 "por    %%mm1, %%mm0\n\t"
394                 "por    %%mm4, %%mm3\n\t"
395                 "por    %%mm2, %%mm0\n\t"
396                 "por    %%mm5, %%mm3\n\t"
397                 "psllq  $16, %%mm3\n\t"
398                 "por    %%mm3, %%mm0\n\t"
399                 MOVNTQ" %%mm0, %0\n\t"
400                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
401                 d += 4;
402                 s += 16;
403         }
404 #endif
405         __asm __volatile(SFENCE:::"memory");
406         __asm __volatile(EMMS:::"memory");
407 #endif
408         while(s < end)
409         {
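                /* scalar tail: low byte >>3 -> bits 0-4, top 6 bits of the
                   next byte -> bits 5-10, top 5 bits of the third byte ->
                   bits 11-15 */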
410                 register int rgb = *(uint32_t*)s; s += 4;
411                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
412         }
413 }
414
415 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
416 {
417         const uint8_t *s = src;
418         const uint8_t *end;
419 #ifdef HAVE_MMX
420         const uint8_t *mm_end;
421 #endif
422         uint16_t *d = (uint16_t *)dst;
423         end = s + src_size;
424 #ifdef HAVE_MMX
425         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
426         __asm __volatile(
427             "movq       %0, %%mm7\n\t"
428             "movq       %1, %%mm6\n\t"
429             ::"m"(red_16mask),"m"(green_16mask));
430         mm_end = end - 15;
431         while(s < mm_end)
432         {
433             __asm __volatile(
434                 PREFETCH" 32%1\n\t"
435                 "movd   %1, %%mm0\n\t"
436                 "movd   4%1, %%mm3\n\t"
437                 "punpckldq 8%1, %%mm0\n\t"
438                 "punpckldq 12%1, %%mm3\n\t"
439                 "movq   %%mm0, %%mm1\n\t"
440                 "movq   %%mm0, %%mm2\n\t"
441                 "movq   %%mm3, %%mm4\n\t"
442                 "movq   %%mm3, %%mm5\n\t"
443                 "psllq  $8, %%mm0\n\t"
444                 "psllq  $8, %%mm3\n\t"
445                 "pand   %%mm7, %%mm0\n\t"
446                 "pand   %%mm7, %%mm3\n\t"
447                 "psrlq  $5, %%mm1\n\t"
448                 "psrlq  $5, %%mm4\n\t"
449                 "pand   %%mm6, %%mm1\n\t"
450                 "pand   %%mm6, %%mm4\n\t"
451                 "psrlq  $19, %%mm2\n\t"
452                 "psrlq  $19, %%mm5\n\t"
453                 "pand   %2, %%mm2\n\t"
454                 "pand   %2, %%mm5\n\t"
455                 "por    %%mm1, %%mm0\n\t"
456                 "por    %%mm4, %%mm3\n\t"
457                 "por    %%mm2, %%mm0\n\t"
458                 "por    %%mm5, %%mm3\n\t"
459                 "psllq  $16, %%mm3\n\t"
460                 "por    %%mm3, %%mm0\n\t"
461                 MOVNTQ" %%mm0, %0\n\t"
462                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
463                 d += 4;
464                 s += 16;
465         }
466         __asm __volatile(SFENCE:::"memory");
467         __asm __volatile(EMMS:::"memory");
468 #endif
469         while(s < end)
470         {
471                 register int rgb = *(uint32_t*)s; s += 4;
472                 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
473         }
474 }
475
476 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
477 {
478         const uint8_t *s = src;
479         const uint8_t *end;
480 #ifdef HAVE_MMX
481         const uint8_t *mm_end;
482 #endif
483         uint16_t *d = (uint16_t *)dst;
484         end = s + src_size;
485 #ifdef HAVE_MMX
486         mm_end = end - 15;
487 #if 1 // is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it is slightly faster)
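                /*
                 * Same pmaddwd trick as in rgb32to16 above, but with 5 bit
                 * green (constants mask3215g and mul3215, defined elsewhere).
                 */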
488         asm volatile(
489                 "movq %3, %%mm5                 \n\t"
490                 "movq %4, %%mm6                 \n\t"
491                 "movq %5, %%mm7                 \n\t"
492                 ".balign 16                     \n\t"
493                 "1:                             \n\t"
494                 PREFETCH" 32(%1)                \n\t"
495                 "movd   (%1), %%mm0             \n\t"
496                 "movd   4(%1), %%mm3            \n\t"
497                 "punpckldq 8(%1), %%mm0         \n\t"
498                 "punpckldq 12(%1), %%mm3        \n\t"
499                 "movq %%mm0, %%mm1              \n\t"
500                 "movq %%mm3, %%mm4              \n\t"
501                 "pand %%mm6, %%mm0              \n\t"
502                 "pand %%mm6, %%mm3              \n\t"
503                 "pmaddwd %%mm7, %%mm0           \n\t"
504                 "pmaddwd %%mm7, %%mm3           \n\t"
505                 "pand %%mm5, %%mm1              \n\t"
506                 "pand %%mm5, %%mm4              \n\t"
507                 "por %%mm1, %%mm0               \n\t"   
508                 "por %%mm4, %%mm3               \n\t"
509                 "psrld $6, %%mm0                \n\t"
510                 "pslld $10, %%mm3               \n\t"
511                 "por %%mm3, %%mm0               \n\t"
512                 MOVNTQ" %%mm0, (%0)             \n\t"
513                 "add $16, %1                    \n\t"
514                 "add $8, %0                     \n\t"
515                 "cmp %2, %1                     \n\t"
516                 " jb 1b                         \n\t"
517                 : "+r" (d), "+r"(s)
518                 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
519         );
520 #else
521         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
522         __asm __volatile(
523             "movq       %0, %%mm7\n\t"
524             "movq       %1, %%mm6\n\t"
525             ::"m"(red_15mask),"m"(green_15mask));
526         while(s < mm_end)
527         {
528             __asm __volatile(
529                 PREFETCH" 32%1\n\t"
530                 "movd   %1, %%mm0\n\t"
531                 "movd   4%1, %%mm3\n\t"
532                 "punpckldq 8%1, %%mm0\n\t"
533                 "punpckldq 12%1, %%mm3\n\t"
534                 "movq   %%mm0, %%mm1\n\t"
535                 "movq   %%mm0, %%mm2\n\t"
536                 "movq   %%mm3, %%mm4\n\t"
537                 "movq   %%mm3, %%mm5\n\t"
538                 "psrlq  $3, %%mm0\n\t"
539                 "psrlq  $3, %%mm3\n\t"
540                 "pand   %2, %%mm0\n\t"
541                 "pand   %2, %%mm3\n\t"
542                 "psrlq  $6, %%mm1\n\t"
543                 "psrlq  $6, %%mm4\n\t"
544                 "pand   %%mm6, %%mm1\n\t"
545                 "pand   %%mm6, %%mm4\n\t"
546                 "psrlq  $9, %%mm2\n\t"
547                 "psrlq  $9, %%mm5\n\t"
548                 "pand   %%mm7, %%mm2\n\t"
549                 "pand   %%mm7, %%mm5\n\t"
550                 "por    %%mm1, %%mm0\n\t"
551                 "por    %%mm4, %%mm3\n\t"
552                 "por    %%mm2, %%mm0\n\t"
553                 "por    %%mm5, %%mm3\n\t"
554                 "psllq  $16, %%mm3\n\t"
555                 "por    %%mm3, %%mm0\n\t"
556                 MOVNTQ" %%mm0, %0\n\t"
557                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
558                 d += 4;
559                 s += 16;
560         }
561 #endif
562         __asm __volatile(SFENCE:::"memory");
563         __asm __volatile(EMMS:::"memory");
564 #endif
565         while(s < end)
566         {
567                 register int rgb = *(uint32_t*)s; s += 4;
568                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
569         }
570 }
571
572 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
573 {
574         const uint8_t *s = src;
575         const uint8_t *end;
576 #ifdef HAVE_MMX
577         const uint8_t *mm_end;
578 #endif
579         uint16_t *d = (uint16_t *)dst;
580         end = s + src_size;
581 #ifdef HAVE_MMX
582         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
583         __asm __volatile(
584             "movq       %0, %%mm7\n\t"
585             "movq       %1, %%mm6\n\t"
586             ::"m"(red_15mask),"m"(green_15mask));
587         mm_end = end - 15;
588         while(s < mm_end)
589         {
590             __asm __volatile(
591                 PREFETCH" 32%1\n\t"
592                 "movd   %1, %%mm0\n\t"
593                 "movd   4%1, %%mm3\n\t"
594                 "punpckldq 8%1, %%mm0\n\t"
595                 "punpckldq 12%1, %%mm3\n\t"
596                 "movq   %%mm0, %%mm1\n\t"
597                 "movq   %%mm0, %%mm2\n\t"
598                 "movq   %%mm3, %%mm4\n\t"
599                 "movq   %%mm3, %%mm5\n\t"
600                 "psllq  $7, %%mm0\n\t"
601                 "psllq  $7, %%mm3\n\t"
602                 "pand   %%mm7, %%mm0\n\t"
603                 "pand   %%mm7, %%mm3\n\t"
604                 "psrlq  $6, %%mm1\n\t"
605                 "psrlq  $6, %%mm4\n\t"
606                 "pand   %%mm6, %%mm1\n\t"
607                 "pand   %%mm6, %%mm4\n\t"
608                 "psrlq  $19, %%mm2\n\t"
609                 "psrlq  $19, %%mm5\n\t"
610                 "pand   %2, %%mm2\n\t"
611                 "pand   %2, %%mm5\n\t"
612                 "por    %%mm1, %%mm0\n\t"
613                 "por    %%mm4, %%mm3\n\t"
614                 "por    %%mm2, %%mm0\n\t"
615                 "por    %%mm5, %%mm3\n\t"
616                 "psllq  $16, %%mm3\n\t"
617                 "por    %%mm3, %%mm0\n\t"
618                 MOVNTQ" %%mm0, %0\n\t"
619                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
620                 d += 4;
621                 s += 16;
622         }
623         __asm __volatile(SFENCE:::"memory");
624         __asm __volatile(EMMS:::"memory");
625 #endif
626         while(s < end)
627         {
628                 register int rgb = *(uint32_t*)s; s += 4;
629                 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
630         }
631 }
632
633 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
634 {
635         const uint8_t *s = src;
636         const uint8_t *end;
637 #ifdef HAVE_MMX
638         const uint8_t *mm_end;
639 #endif
640         uint16_t *d = (uint16_t *)dst;
641         end = s + src_size;
642 #ifdef HAVE_MMX
643         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
644         __asm __volatile(
645             "movq       %0, %%mm7\n\t"
646             "movq       %1, %%mm6\n\t"
647             ::"m"(red_16mask),"m"(green_16mask));
648         mm_end = end - 11;
649         while(s < mm_end)
650         {
651             __asm __volatile(
652                 PREFETCH" 32%1\n\t"
653                 "movd   %1, %%mm0\n\t"
654                 "movd   3%1, %%mm3\n\t"
655                 "punpckldq 6%1, %%mm0\n\t"
656                 "punpckldq 9%1, %%mm3\n\t"
657                 "movq   %%mm0, %%mm1\n\t"
658                 "movq   %%mm0, %%mm2\n\t"
659                 "movq   %%mm3, %%mm4\n\t"
660                 "movq   %%mm3, %%mm5\n\t"
661                 "psrlq  $3, %%mm0\n\t"
662                 "psrlq  $3, %%mm3\n\t"
663                 "pand   %2, %%mm0\n\t"
664                 "pand   %2, %%mm3\n\t"
665                 "psrlq  $5, %%mm1\n\t"
666                 "psrlq  $5, %%mm4\n\t"
667                 "pand   %%mm6, %%mm1\n\t"
668                 "pand   %%mm6, %%mm4\n\t"
669                 "psrlq  $8, %%mm2\n\t"
670                 "psrlq  $8, %%mm5\n\t"
671                 "pand   %%mm7, %%mm2\n\t"
672                 "pand   %%mm7, %%mm5\n\t"
673                 "por    %%mm1, %%mm0\n\t"
674                 "por    %%mm4, %%mm3\n\t"
675                 "por    %%mm2, %%mm0\n\t"
676                 "por    %%mm5, %%mm3\n\t"
677                 "psllq  $16, %%mm3\n\t"
678                 "por    %%mm3, %%mm0\n\t"
679                 MOVNTQ" %%mm0, %0\n\t"
680                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
681                 d += 4;
682                 s += 12;
683         }
684         __asm __volatile(SFENCE:::"memory");
685         __asm __volatile(EMMS:::"memory");
686 #endif
687         while(s < end)
688         {
689                 const int b= *s++;
690                 const int g= *s++;
691                 const int r= *s++;
692                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
693         }
694 }
695
696 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
697 {
698         const uint8_t *s = src;
699         const uint8_t *end;
700 #ifdef HAVE_MMX
701         const uint8_t *mm_end;
702 #endif
703         uint16_t *d = (uint16_t *)dst;
704         end = s + src_size;
705 #ifdef HAVE_MMX
706         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
707         __asm __volatile(
708             "movq       %0, %%mm7\n\t"
709             "movq       %1, %%mm6\n\t"
710             ::"m"(red_16mask),"m"(green_16mask));
711         mm_end = end - 15;
712         while(s < mm_end)
713         {
714             __asm __volatile(
715                 PREFETCH" 32%1\n\t"
716                 "movd   %1, %%mm0\n\t"
717                 "movd   3%1, %%mm3\n\t"
718                 "punpckldq 6%1, %%mm0\n\t"
719                 "punpckldq 9%1, %%mm3\n\t"
720                 "movq   %%mm0, %%mm1\n\t"
721                 "movq   %%mm0, %%mm2\n\t"
722                 "movq   %%mm3, %%mm4\n\t"
723                 "movq   %%mm3, %%mm5\n\t"
724                 "psllq  $8, %%mm0\n\t"
725                 "psllq  $8, %%mm3\n\t"
726                 "pand   %%mm7, %%mm0\n\t"
727                 "pand   %%mm7, %%mm3\n\t"
728                 "psrlq  $5, %%mm1\n\t"
729                 "psrlq  $5, %%mm4\n\t"
730                 "pand   %%mm6, %%mm1\n\t"
731                 "pand   %%mm6, %%mm4\n\t"
732                 "psrlq  $19, %%mm2\n\t"
733                 "psrlq  $19, %%mm5\n\t"
734                 "pand   %2, %%mm2\n\t"
735                 "pand   %2, %%mm5\n\t"
736                 "por    %%mm1, %%mm0\n\t"
737                 "por    %%mm4, %%mm3\n\t"
738                 "por    %%mm2, %%mm0\n\t"
739                 "por    %%mm5, %%mm3\n\t"
740                 "psllq  $16, %%mm3\n\t"
741                 "por    %%mm3, %%mm0\n\t"
742                 MOVNTQ" %%mm0, %0\n\t"
743                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
744                 d += 4;
745                 s += 12;
746         }
747         __asm __volatile(SFENCE:::"memory");
748         __asm __volatile(EMMS:::"memory");
749 #endif
750         while(s < end)
751         {
752                 const int r= *s++;
753                 const int g= *s++;
754                 const int b= *s++;
755                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
756         }
757 }
758
759 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
760 {
761         const uint8_t *s = src;
762         const uint8_t *end;
763 #ifdef HAVE_MMX
764         const uint8_t *mm_end;
765 #endif
766         uint16_t *d = (uint16_t *)dst;
767         end = s + src_size;
768 #ifdef HAVE_MMX
769         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
770         __asm __volatile(
771             "movq       %0, %%mm7\n\t"
772             "movq       %1, %%mm6\n\t"
773             ::"m"(red_15mask),"m"(green_15mask));
774         mm_end = end - 11;
775         while(s < mm_end)
776         {
777             __asm __volatile(
778                 PREFETCH" 32%1\n\t"
779                 "movd   %1, %%mm0\n\t"
780                 "movd   3%1, %%mm3\n\t"
781                 "punpckldq 6%1, %%mm0\n\t"
782                 "punpckldq 9%1, %%mm3\n\t"
783                 "movq   %%mm0, %%mm1\n\t"
784                 "movq   %%mm0, %%mm2\n\t"
785                 "movq   %%mm3, %%mm4\n\t"
786                 "movq   %%mm3, %%mm5\n\t"
787                 "psrlq  $3, %%mm0\n\t"
788                 "psrlq  $3, %%mm3\n\t"
789                 "pand   %2, %%mm0\n\t"
790                 "pand   %2, %%mm3\n\t"
791                 "psrlq  $6, %%mm1\n\t"
792                 "psrlq  $6, %%mm4\n\t"
793                 "pand   %%mm6, %%mm1\n\t"
794                 "pand   %%mm6, %%mm4\n\t"
795                 "psrlq  $9, %%mm2\n\t"
796                 "psrlq  $9, %%mm5\n\t"
797                 "pand   %%mm7, %%mm2\n\t"
798                 "pand   %%mm7, %%mm5\n\t"
799                 "por    %%mm1, %%mm0\n\t"
800                 "por    %%mm4, %%mm3\n\t"
801                 "por    %%mm2, %%mm0\n\t"
802                 "por    %%mm5, %%mm3\n\t"
803                 "psllq  $16, %%mm3\n\t"
804                 "por    %%mm3, %%mm0\n\t"
805                 MOVNTQ" %%mm0, %0\n\t"
806                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
807                 d += 4;
808                 s += 12;
809         }
810         __asm __volatile(SFENCE:::"memory");
811         __asm __volatile(EMMS:::"memory");
812 #endif
813         while(s < end)
814         {
815                 const int b= *s++;
816                 const int g= *s++;
817                 const int r= *s++;
818                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
819         }
820 }
821
822 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
823 {
824         const uint8_t *s = src;
825         const uint8_t *end;
826 #ifdef HAVE_MMX
827         const uint8_t *mm_end;
828 #endif
829         uint16_t *d = (uint16_t *)dst;
830         end = s + src_size;
831 #ifdef HAVE_MMX
832         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
833         __asm __volatile(
834             "movq       %0, %%mm7\n\t"
835             "movq       %1, %%mm6\n\t"
836             ::"m"(red_15mask),"m"(green_15mask));
837         mm_end = end - 15;
838         while(s < mm_end)
839         {
840             __asm __volatile(
841                 PREFETCH" 32%1\n\t"
842                 "movd   %1, %%mm0\n\t"
843                 "movd   3%1, %%mm3\n\t"
844                 "punpckldq 6%1, %%mm0\n\t"
845                 "punpckldq 9%1, %%mm3\n\t"
846                 "movq   %%mm0, %%mm1\n\t"
847                 "movq   %%mm0, %%mm2\n\t"
848                 "movq   %%mm3, %%mm4\n\t"
849                 "movq   %%mm3, %%mm5\n\t"
850                 "psllq  $7, %%mm0\n\t"
851                 "psllq  $7, %%mm3\n\t"
852                 "pand   %%mm7, %%mm0\n\t"
853                 "pand   %%mm7, %%mm3\n\t"
854                 "psrlq  $6, %%mm1\n\t"
855                 "psrlq  $6, %%mm4\n\t"
856                 "pand   %%mm6, %%mm1\n\t"
857                 "pand   %%mm6, %%mm4\n\t"
858                 "psrlq  $19, %%mm2\n\t"
859                 "psrlq  $19, %%mm5\n\t"
860                 "pand   %2, %%mm2\n\t"
861                 "pand   %2, %%mm5\n\t"
862                 "por    %%mm1, %%mm0\n\t"
863                 "por    %%mm4, %%mm3\n\t"
864                 "por    %%mm2, %%mm0\n\t"
865                 "por    %%mm5, %%mm3\n\t"
866                 "psllq  $16, %%mm3\n\t"
867                 "por    %%mm3, %%mm0\n\t"
868                 MOVNTQ" %%mm0, %0\n\t"
869                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
870                 d += 4;
871                 s += 12;
872         }
873         __asm __volatile(SFENCE:::"memory");
874         __asm __volatile(EMMS:::"memory");
875 #endif
876         while(s < end)
877         {
878                 const int r= *s++;
879                 const int g= *s++;
880                 const int b= *s++;
881                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
882         }
883 }
884
885 /*
886   Here a less accurate approximation is used: the input
887   value is simply left-shifted and the low-order bits are
888   filled with zeroes. This method improves PNG compression,
889   but it cannot reproduce white exactly, since it does not
890   generate an all-ones maximum value; the net effect is to
891   darken the image slightly.
892
893
894   The better method should be "left bit replication":
895
896    4 3 2 1 0
897    ---------
898    1 1 0 1 1
899
900    7 6 5 4 3  2 1 0
901    ----------------
902    1 1 0 1 1  1 1 0
903    |=======|  |===|
904        |      Leftmost Bits Repeated to Fill Open Bits
905        |
906    Original Bits
907 */
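/*
 * For illustration only: a minimal sketch of the left bit replication scheme
 * described above, for a single 5 bit channel.  The helper name is made up and
 * nothing in this file uses it; the converters below keep the faster
 * shift-only approximation.
 */
static inline uint8_t replicate_5bit_to_8bit(uint8_t v)
{
        /* repeat the 3 leftmost of the 5 input bits into the freed low bits,
           so 0x1F expands to 0xFF and pure white stays pure white */
        return (uint8_t)((v << 3) | (v >> 2));
}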
908 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
909 {
910         const uint16_t *end;
911 #ifdef HAVE_MMX
912         const uint16_t *mm_end;
913 #endif
914         uint8_t *d = (uint8_t *)dst;
915         const uint16_t *s = (uint16_t *)src;
916         end = s + src_size/2;
917 #ifdef HAVE_MMX
918         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
919         mm_end = end - 7;
920         while(s < mm_end)
921         {
922             __asm __volatile(
923                 PREFETCH" 32%1\n\t"
924                 "movq   %1, %%mm0\n\t"
925                 "movq   %1, %%mm1\n\t"
926                 "movq   %1, %%mm2\n\t"
927                 "pand   %2, %%mm0\n\t"
928                 "pand   %3, %%mm1\n\t"
929                 "pand   %4, %%mm2\n\t"
930                 "psllq  $3, %%mm0\n\t"
931                 "psrlq  $2, %%mm1\n\t"
932                 "psrlq  $7, %%mm2\n\t"
933                 "movq   %%mm0, %%mm3\n\t"
934                 "movq   %%mm1, %%mm4\n\t"
935                 "movq   %%mm2, %%mm5\n\t"
936                 "punpcklwd %5, %%mm0\n\t"
937                 "punpcklwd %5, %%mm1\n\t"
938                 "punpcklwd %5, %%mm2\n\t"
939                 "punpckhwd %5, %%mm3\n\t"
940                 "punpckhwd %5, %%mm4\n\t"
941                 "punpckhwd %5, %%mm5\n\t"
942                 "psllq  $8, %%mm1\n\t"
943                 "psllq  $16, %%mm2\n\t"
944                 "por    %%mm1, %%mm0\n\t"
945                 "por    %%mm2, %%mm0\n\t"
946                 "psllq  $8, %%mm4\n\t"
947                 "psllq  $16, %%mm5\n\t"
948                 "por    %%mm4, %%mm3\n\t"
949                 "por    %%mm5, %%mm3\n\t"
950
951                 "movq   %%mm0, %%mm6\n\t"
952                 "movq   %%mm3, %%mm7\n\t"
953                 
954                 "movq   8%1, %%mm0\n\t"
955                 "movq   8%1, %%mm1\n\t"
956                 "movq   8%1, %%mm2\n\t"
957                 "pand   %2, %%mm0\n\t"
958                 "pand   %3, %%mm1\n\t"
959                 "pand   %4, %%mm2\n\t"
960                 "psllq  $3, %%mm0\n\t"
961                 "psrlq  $2, %%mm1\n\t"
962                 "psrlq  $7, %%mm2\n\t"
963                 "movq   %%mm0, %%mm3\n\t"
964                 "movq   %%mm1, %%mm4\n\t"
965                 "movq   %%mm2, %%mm5\n\t"
966                 "punpcklwd %5, %%mm0\n\t"
967                 "punpcklwd %5, %%mm1\n\t"
968                 "punpcklwd %5, %%mm2\n\t"
969                 "punpckhwd %5, %%mm3\n\t"
970                 "punpckhwd %5, %%mm4\n\t"
971                 "punpckhwd %5, %%mm5\n\t"
972                 "psllq  $8, %%mm1\n\t"
973                 "psllq  $16, %%mm2\n\t"
974                 "por    %%mm1, %%mm0\n\t"
975                 "por    %%mm2, %%mm0\n\t"
976                 "psllq  $8, %%mm4\n\t"
977                 "psllq  $16, %%mm5\n\t"
978                 "por    %%mm4, %%mm3\n\t"
979                 "por    %%mm5, %%mm3\n\t"
980
981                 :"=m"(*d)
982                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
983                 :"memory");
984             /* reuse the 32-to-24 packing from rgb32to24 above */
985             __asm __volatile(
986                 "movq   %%mm0, %%mm4\n\t"
987                 "movq   %%mm3, %%mm5\n\t"
988                 "movq   %%mm6, %%mm0\n\t"
989                 "movq   %%mm7, %%mm1\n\t"
990                 
991                 "movq   %%mm4, %%mm6\n\t"
992                 "movq   %%mm5, %%mm7\n\t"
993                 "movq   %%mm0, %%mm2\n\t"
994                 "movq   %%mm1, %%mm3\n\t"
995
996                 "psrlq  $8, %%mm2\n\t"
997                 "psrlq  $8, %%mm3\n\t"
998                 "psrlq  $8, %%mm6\n\t"
999                 "psrlq  $8, %%mm7\n\t"
1000                 "pand   %2, %%mm0\n\t"
1001                 "pand   %2, %%mm1\n\t"
1002                 "pand   %2, %%mm4\n\t"
1003                 "pand   %2, %%mm5\n\t"
1004                 "pand   %3, %%mm2\n\t"
1005                 "pand   %3, %%mm3\n\t"
1006                 "pand   %3, %%mm6\n\t"
1007                 "pand   %3, %%mm7\n\t"
1008                 "por    %%mm2, %%mm0\n\t"
1009                 "por    %%mm3, %%mm1\n\t"
1010                 "por    %%mm6, %%mm4\n\t"
1011                 "por    %%mm7, %%mm5\n\t"
1012
1013                 "movq   %%mm1, %%mm2\n\t"
1014                 "movq   %%mm4, %%mm3\n\t"
1015                 "psllq  $48, %%mm2\n\t"
1016                 "psllq  $32, %%mm3\n\t"
1017                 "pand   %4, %%mm2\n\t"
1018                 "pand   %5, %%mm3\n\t"
1019                 "por    %%mm2, %%mm0\n\t"
1020                 "psrlq  $16, %%mm1\n\t"
1021                 "psrlq  $32, %%mm4\n\t"
1022                 "psllq  $16, %%mm5\n\t"
1023                 "por    %%mm3, %%mm1\n\t"
1024                 "pand   %6, %%mm5\n\t"
1025                 "por    %%mm5, %%mm4\n\t"
1026
1027                 MOVNTQ" %%mm0, %0\n\t"
1028                 MOVNTQ" %%mm1, 8%0\n\t"
1029                 MOVNTQ" %%mm4, 16%0"
1030
1031                 :"=m"(*d)
1032                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1033                 :"memory");
1034                 d += 24;
1035                 s += 8;
1036         }
1037         __asm __volatile(SFENCE:::"memory");
1038         __asm __volatile(EMMS:::"memory");
1039 #endif
1040         while(s < end)
1041         {
1042                 register uint16_t bgr;
1043                 bgr = *s++;
1044                 *d++ = (bgr&0x1F)<<3;
1045                 *d++ = (bgr&0x3E0)>>2;
1046                 *d++ = (bgr&0x7C00)>>7;
1047         }
1048 }
1049
1050 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1051 {
1052         const uint16_t *end;
1053 #ifdef HAVE_MMX
1054         const uint16_t *mm_end;
1055 #endif
1056         uint8_t *d = (uint8_t *)dst;
1057         const uint16_t *s = (const uint16_t *)src;
1058         end = s + src_size/2;
1059 #ifdef HAVE_MMX
1060         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1061         mm_end = end - 7;
1062         while(s < mm_end)
1063         {
1064             __asm __volatile(
1065                 PREFETCH" 32%1\n\t"
1066                 "movq   %1, %%mm0\n\t"
1067                 "movq   %1, %%mm1\n\t"
1068                 "movq   %1, %%mm2\n\t"
1069                 "pand   %2, %%mm0\n\t"
1070                 "pand   %3, %%mm1\n\t"
1071                 "pand   %4, %%mm2\n\t"
1072                 "psllq  $3, %%mm0\n\t"
1073                 "psrlq  $3, %%mm1\n\t"
1074                 "psrlq  $8, %%mm2\n\t"
1075                 "movq   %%mm0, %%mm3\n\t"
1076                 "movq   %%mm1, %%mm4\n\t"
1077                 "movq   %%mm2, %%mm5\n\t"
1078                 "punpcklwd %5, %%mm0\n\t"
1079                 "punpcklwd %5, %%mm1\n\t"
1080                 "punpcklwd %5, %%mm2\n\t"
1081                 "punpckhwd %5, %%mm3\n\t"
1082                 "punpckhwd %5, %%mm4\n\t"
1083                 "punpckhwd %5, %%mm5\n\t"
1084                 "psllq  $8, %%mm1\n\t"
1085                 "psllq  $16, %%mm2\n\t"
1086                 "por    %%mm1, %%mm0\n\t"
1087                 "por    %%mm2, %%mm0\n\t"
1088                 "psllq  $8, %%mm4\n\t"
1089                 "psllq  $16, %%mm5\n\t"
1090                 "por    %%mm4, %%mm3\n\t"
1091                 "por    %%mm5, %%mm3\n\t"
1092                 
1093                 "movq   %%mm0, %%mm6\n\t"
1094                 "movq   %%mm3, %%mm7\n\t"
1095
1096                 "movq   8%1, %%mm0\n\t"
1097                 "movq   8%1, %%mm1\n\t"
1098                 "movq   8%1, %%mm2\n\t"
1099                 "pand   %2, %%mm0\n\t"
1100                 "pand   %3, %%mm1\n\t"
1101                 "pand   %4, %%mm2\n\t"
1102                 "psllq  $3, %%mm0\n\t"
1103                 "psrlq  $3, %%mm1\n\t"
1104                 "psrlq  $8, %%mm2\n\t"
1105                 "movq   %%mm0, %%mm3\n\t"
1106                 "movq   %%mm1, %%mm4\n\t"
1107                 "movq   %%mm2, %%mm5\n\t"
1108                 "punpcklwd %5, %%mm0\n\t"
1109                 "punpcklwd %5, %%mm1\n\t"
1110                 "punpcklwd %5, %%mm2\n\t"
1111                 "punpckhwd %5, %%mm3\n\t"
1112                 "punpckhwd %5, %%mm4\n\t"
1113                 "punpckhwd %5, %%mm5\n\t"
1114                 "psllq  $8, %%mm1\n\t"
1115                 "psllq  $16, %%mm2\n\t"
1116                 "por    %%mm1, %%mm0\n\t"
1117                 "por    %%mm2, %%mm0\n\t"
1118                 "psllq  $8, %%mm4\n\t"
1119                 "psllq  $16, %%mm5\n\t"
1120                 "por    %%mm4, %%mm3\n\t"
1121                 "por    %%mm5, %%mm3\n\t"
1122                 :"=m"(*d)
1123                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1124                 :"memory");
1125             /* reuse the 32-to-24 packing from rgb32to24 above */
1126             __asm __volatile(
1127                 "movq   %%mm0, %%mm4\n\t"
1128                 "movq   %%mm3, %%mm5\n\t"
1129                 "movq   %%mm6, %%mm0\n\t"
1130                 "movq   %%mm7, %%mm1\n\t"
1131                 
1132                 "movq   %%mm4, %%mm6\n\t"
1133                 "movq   %%mm5, %%mm7\n\t"
1134                 "movq   %%mm0, %%mm2\n\t"
1135                 "movq   %%mm1, %%mm3\n\t"
1136
1137                 "psrlq  $8, %%mm2\n\t"
1138                 "psrlq  $8, %%mm3\n\t"
1139                 "psrlq  $8, %%mm6\n\t"
1140                 "psrlq  $8, %%mm7\n\t"
1141                 "pand   %2, %%mm0\n\t"
1142                 "pand   %2, %%mm1\n\t"
1143                 "pand   %2, %%mm4\n\t"
1144                 "pand   %2, %%mm5\n\t"
1145                 "pand   %3, %%mm2\n\t"
1146                 "pand   %3, %%mm3\n\t"
1147                 "pand   %3, %%mm6\n\t"
1148                 "pand   %3, %%mm7\n\t"
1149                 "por    %%mm2, %%mm0\n\t"
1150                 "por    %%mm3, %%mm1\n\t"
1151                 "por    %%mm6, %%mm4\n\t"
1152                 "por    %%mm7, %%mm5\n\t"
1153
1154                 "movq   %%mm1, %%mm2\n\t"
1155                 "movq   %%mm4, %%mm3\n\t"
1156                 "psllq  $48, %%mm2\n\t"
1157                 "psllq  $32, %%mm3\n\t"
1158                 "pand   %4, %%mm2\n\t"
1159                 "pand   %5, %%mm3\n\t"
1160                 "por    %%mm2, %%mm0\n\t"
1161                 "psrlq  $16, %%mm1\n\t"
1162                 "psrlq  $32, %%mm4\n\t"
1163                 "psllq  $16, %%mm5\n\t"
1164                 "por    %%mm3, %%mm1\n\t"
1165                 "pand   %6, %%mm5\n\t"
1166                 "por    %%mm5, %%mm4\n\t"
1167
1168                 MOVNTQ" %%mm0, %0\n\t"
1169                 MOVNTQ" %%mm1, 8%0\n\t"
1170                 MOVNTQ" %%mm4, 16%0"
1171
1172                 :"=m"(*d)
1173                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1174                 :"memory");
1175                 d += 24;
1176                 s += 8;
1177         }
1178         __asm __volatile(SFENCE:::"memory");
1179         __asm __volatile(EMMS:::"memory");
1180 #endif
1181         while(s < end)
1182         {
1183                 register uint16_t bgr;
1184                 bgr = *s++;
1185                 *d++ = (bgr&0x1F)<<3;
1186                 *d++ = (bgr&0x7E0)>>3;
1187                 *d++ = (bgr&0xF800)>>8;
1188         }
1189 }
1190
1191 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1192 {
1193         const uint16_t *end;
1194 #ifdef HAVE_MMX
1195         const uint16_t *mm_end;
1196 #endif
1197         uint8_t *d = (uint8_t *)dst;
1198         const uint16_t *s = (const uint16_t *)src;
1199         end = s + src_size/2;
1200 #ifdef HAVE_MMX
1201         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1202         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1203         mm_end = end - 3;
1204         while(s < mm_end)
1205         {
1206             __asm __volatile(
1207                 PREFETCH" 32%1\n\t"
1208                 "movq   %1, %%mm0\n\t"
1209                 "movq   %1, %%mm1\n\t"
1210                 "movq   %1, %%mm2\n\t"
1211                 "pand   %2, %%mm0\n\t"
1212                 "pand   %3, %%mm1\n\t"
1213                 "pand   %4, %%mm2\n\t"
1214                 "psllq  $3, %%mm0\n\t"
1215                 "psrlq  $2, %%mm1\n\t"
1216                 "psrlq  $7, %%mm2\n\t"
1217                 "movq   %%mm0, %%mm3\n\t"
1218                 "movq   %%mm1, %%mm4\n\t"
1219                 "movq   %%mm2, %%mm5\n\t"
1220                 "punpcklwd %%mm7, %%mm0\n\t"
1221                 "punpcklwd %%mm7, %%mm1\n\t"
1222                 "punpcklwd %%mm7, %%mm2\n\t"
1223                 "punpckhwd %%mm7, %%mm3\n\t"
1224                 "punpckhwd %%mm7, %%mm4\n\t"
1225                 "punpckhwd %%mm7, %%mm5\n\t"
1226                 "psllq  $8, %%mm1\n\t"
1227                 "psllq  $16, %%mm2\n\t"
1228                 "por    %%mm1, %%mm0\n\t"
1229                 "por    %%mm2, %%mm0\n\t"
1230                 "psllq  $8, %%mm4\n\t"
1231                 "psllq  $16, %%mm5\n\t"
1232                 "por    %%mm4, %%mm3\n\t"
1233                 "por    %%mm5, %%mm3\n\t"
1234                 MOVNTQ" %%mm0, %0\n\t"
1235                 MOVNTQ" %%mm3, 8%0\n\t"
1236                 :"=m"(*d)
1237                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1238                 :"memory");
1239                 d += 16;
1240                 s += 4;
1241         }
1242         __asm __volatile(SFENCE:::"memory");
1243         __asm __volatile(EMMS:::"memory");
1244 #endif
1245         while(s < end)
1246         {
1247 #if 0 //slightly slower on Athlon
1248                 int bgr= *s++;
1249                 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1250 #else
1251                 register uint16_t bgr;
1252                 bgr = *s++;
1253 #ifdef WORDS_BIGENDIAN
1254                 *d++ = 0;
1255                 *d++ = (bgr&0x7C00)>>7;
1256                 *d++ = (bgr&0x3E0)>>2;
1257                 *d++ = (bgr&0x1F)<<3;
1258 #else
1259                 *d++ = (bgr&0x1F)<<3;
1260                 *d++ = (bgr&0x3E0)>>2;
1261                 *d++ = (bgr&0x7C00)>>7;
1262                 *d++ = 0;
1263 #endif
1264
1265 #endif
1266         }
1267 }
1268
1269 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1270 {
1271         const uint16_t *end;
1272 #ifdef HAVE_MMX
1273         const uint16_t *mm_end;
1274 #endif
1275         uint8_t *d = (uint8_t *)dst;
1276         const uint16_t *s = (uint16_t *)src;
1277         end = s + src_size/2;
1278 #ifdef HAVE_MMX
1279         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1280         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1281         mm_end = end - 3;
1282         while(s < mm_end)
1283         {
1284             __asm __volatile(
1285                 PREFETCH" 32%1\n\t"
1286                 "movq   %1, %%mm0\n\t"
1287                 "movq   %1, %%mm1\n\t"
1288                 "movq   %1, %%mm2\n\t"
1289                 "pand   %2, %%mm0\n\t"
1290                 "pand   %3, %%mm1\n\t"
1291                 "pand   %4, %%mm2\n\t"
1292                 "psllq  $3, %%mm0\n\t"
1293                 "psrlq  $3, %%mm1\n\t"
1294                 "psrlq  $8, %%mm2\n\t"
1295                 "movq   %%mm0, %%mm3\n\t"
1296                 "movq   %%mm1, %%mm4\n\t"
1297                 "movq   %%mm2, %%mm5\n\t"
1298                 "punpcklwd %%mm7, %%mm0\n\t"
1299                 "punpcklwd %%mm7, %%mm1\n\t"
1300                 "punpcklwd %%mm7, %%mm2\n\t"
1301                 "punpckhwd %%mm7, %%mm3\n\t"
1302                 "punpckhwd %%mm7, %%mm4\n\t"
1303                 "punpckhwd %%mm7, %%mm5\n\t"
1304                 "psllq  $8, %%mm1\n\t"
1305                 "psllq  $16, %%mm2\n\t"
1306                 "por    %%mm1, %%mm0\n\t"
1307                 "por    %%mm2, %%mm0\n\t"
1308                 "psllq  $8, %%mm4\n\t"
1309                 "psllq  $16, %%mm5\n\t"
1310                 "por    %%mm4, %%mm3\n\t"
1311                 "por    %%mm5, %%mm3\n\t"
1312                 MOVNTQ" %%mm0, %0\n\t"
1313                 MOVNTQ" %%mm3, 8%0\n\t"
1314                 :"=m"(*d)
1315                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1316                 :"memory");
1317                 d += 16;
1318                 s += 4;
1319         }
1320         __asm __volatile(SFENCE:::"memory");
1321         __asm __volatile(EMMS:::"memory");
1322 #endif
1323         while(s < end)
1324         {
1325                 register uint16_t bgr;
1326                 bgr = *s++;
1327 #ifdef WORDS_BIGENDIAN
1328                 *d++ = 0;
1329                 *d++ = (bgr&0xF800)>>8;
1330                 *d++ = (bgr&0x7E0)>>3;
1331                 *d++ = (bgr&0x1F)<<3;
1332 #else
1333                 *d++ = (bgr&0x1F)<<3;
1334                 *d++ = (bgr&0x7E0)>>3;
1335                 *d++ = (bgr&0xF800)>>8;
1336                 *d++ = 0;
1337 #endif
1338         }
1339 }
1340
1341 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1342 {
1343 #ifdef HAVE_MMX
1344 /* TODO: unroll this loop */
1345         asm volatile (
1346                 "xor %%"REG_a", %%"REG_a"       \n\t"
1347                 ".balign 16                     \n\t"
1348                 "1:                             \n\t"
1349                 PREFETCH" 32(%0, %%"REG_a")     \n\t"
1350                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
1351                 "movq %%mm0, %%mm1              \n\t"
1352                 "movq %%mm0, %%mm2              \n\t"
1353                 "pslld $16, %%mm0               \n\t"
1354                 "psrld $16, %%mm1               \n\t"
1355                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
1356                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
1357                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
1358                 "por %%mm0, %%mm2               \n\t"
1359                 "por %%mm1, %%mm2               \n\t"
1360                 MOVNTQ" %%mm2, (%1, %%"REG_a")  \n\t"
1361                 "add $8, %%"REG_a"              \n\t"
1362                 "cmp %2, %%"REG_a"              \n\t"
1363                 " jb 1b                         \n\t"
1364                 :: "r" (src), "r"(dst), "r" (src_size-7)
1365                 : "%"REG_a
1366         );
1367
1368         __asm __volatile(SFENCE:::"memory");
1369         __asm __volatile(EMMS:::"memory");
1370 #else
1371         unsigned i;
1372         unsigned num_pixels = src_size >> 2;
1373         for(i=0; i<num_pixels; i++)
1374         {
1375 #ifdef WORDS_BIGENDIAN  
1376           dst[4*i + 1] = src[4*i + 3];
1377           dst[4*i + 2] = src[4*i + 2];
1378           dst[4*i + 3] = src[4*i + 1];
1379 #else
1380           dst[4*i + 0] = src[4*i + 2];
1381           dst[4*i + 1] = src[4*i + 1];
1382           dst[4*i + 2] = src[4*i + 0];
1383 #endif
1384         }
1385 #endif
1386 }
1387
1388 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1389 {
1390         unsigned i;
1391 #ifdef HAVE_MMX
1392         long mmx_size= 23 - src_size;
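        /*
         * mmx_size starts negative; the asm below counts REG_a up from it to 0,
         * converting 24 source bytes (8 pixels) per iteration relative to
         * src-mmx_size, and the scalar loop afterwards handles the remaining
         * 23-mmx_size byte tail.
         */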
1393         asm volatile (
1394                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1395                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1396                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1397                 ".balign 16                     \n\t"
1398                 "1:                             \n\t"
1399                 PREFETCH" 32(%1, %%"REG_a")     \n\t"
1400                 "movq   (%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1401                 "movq   (%1, %%"REG_a"), %%mm1  \n\t" // BGR BGR BG
1402                 "movq  2(%1, %%"REG_a"), %%mm2  \n\t" // R BGR BGR B
1403                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1404                 "pand %%mm5, %%mm0              \n\t"
1405                 "pand %%mm6, %%mm1              \n\t"
1406                 "pand %%mm7, %%mm2              \n\t"
1407                 "por %%mm0, %%mm1               \n\t"
1408                 "por %%mm2, %%mm1               \n\t"                
1409                 "movq  6(%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1410                 MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
1411                 "movq  8(%1, %%"REG_a"), %%mm1  \n\t" // R BGR BGR B
1412                 "movq 10(%1, %%"REG_a"), %%mm2  \n\t" // GR BGR BGR
1413                 "pand %%mm7, %%mm0              \n\t"
1414                 "pand %%mm5, %%mm1              \n\t"
1415                 "pand %%mm6, %%mm2              \n\t"
1416                 "por %%mm0, %%mm1               \n\t"
1417                 "por %%mm2, %%mm1               \n\t"                
1418                 "movq 14(%1, %%"REG_a"), %%mm0  \n\t" // R BGR BGR B
1419                 MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
1420                 "movq 16(%1, %%"REG_a"), %%mm1  \n\t" // GR BGR BGR
1421                 "movq 18(%1, %%"REG_a"), %%mm2  \n\t" // BGR BGR BG
1422                 "pand %%mm6, %%mm0              \n\t"
1423                 "pand %%mm7, %%mm1              \n\t"
1424                 "pand %%mm5, %%mm2              \n\t"
1425                 "por %%mm0, %%mm1               \n\t"
1426                 "por %%mm2, %%mm1               \n\t"                
1427                 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1428                 "add $24, %%"REG_a"             \n\t"
1429                 " js 1b                         \n\t"
1430                 : "+a" (mmx_size)
1431                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1432         );
1433
1434         __asm __volatile(SFENCE:::"memory");
1435         __asm __volatile(EMMS:::"memory");
1436
1437         if(mmx_size==23) return; // finished, was a multiple of 8
1438
1439         src+= src_size;
1440         dst+= src_size;
1441         src_size= 23-mmx_size;
1442         src-= src_size;
1443         dst-= src_size;
1444 #endif
1445         for(i=0; i<src_size; i+=3)
1446         {
1447                 register uint8_t x;
1448                 x          = src[i + 2];
1449                 dst[i + 1] = src[i + 1];
1450                 dst[i + 2] = src[i + 0];
1451                 dst[i + 0] = x;
1452         }
1453 }
1454
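/*
 * Interleave planar Y, U and V into packed YUYV.  vertLumPerChroma is the
 * number of luma lines sharing one chroma line (presumably 2 for 4:2:0 input
 * and 1 for 4:2:2).
 */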
1455 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1456         long width, long height,
1457         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1458 {
1459         long y;
1460         const long chromWidth= width>>1;
1461         for(y=0; y<height; y++)
1462         {
1463 #ifdef HAVE_MMX
1464 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
1465                 asm volatile(
1466                         "xor %%"REG_a", %%"REG_a"       \n\t"
1467                         ".balign 16                     \n\t"
1468                         "1:                             \n\t"
1469                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1470                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1471                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1472                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1473                         "movq %%mm0, %%mm2              \n\t" // U(0)
1474                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1475                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1476                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1477
1478                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1479                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1480                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1481                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1482                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1483                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1484                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1485                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1486
1487                         MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1488                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1489                         MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1490                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1491
1492                         "add $8, %%"REG_a"              \n\t"
1493                         "cmp %4, %%"REG_a"              \n\t"
1494                         " jb 1b                         \n\t"
1495                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1496                         : "%"REG_a
1497                 );
1498 #else
1499
1500 #if defined ARCH_ALPHA && defined HAVE_MVI
1501 #define pl2yuy2(n)                                      \
1502         y1 = yc[n];                                     \
1503         y2 = yc2[n];                                    \
1504         u = uc[n];                                      \
1505         v = vc[n];                                      \
1506         asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
1507         asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
1508         asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1509         asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1510         yuv1 = (u << 8) + (v << 24);                    \
1511         yuv2 = yuv1 + y2;                               \
1512         yuv1 += y1;                                     \
1513         qdst[n] = yuv1;                                 \
1514         qdst2[n] = yuv2;
1515
1516                 int i;
1517                 uint64_t *qdst = (uint64_t *) dst;
1518                 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1519                 const uint32_t *yc = (uint32_t *) ysrc;
1520                 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1521                 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1522                 for(i = 0; i < chromWidth; i += 8){
1523                         uint64_t y1, y2, yuv1, yuv2;
1524                         uint64_t u, v;
1525                         /* Prefetch */
1526                         asm("ldq $31,64(%0)" :: "r"(yc));
1527                         asm("ldq $31,64(%0)" :: "r"(yc2));
1528                         asm("ldq $31,64(%0)" :: "r"(uc));
1529                         asm("ldq $31,64(%0)" :: "r"(vc));
1530
1531                         pl2yuy2(0);
1532                         pl2yuy2(1);
1533                         pl2yuy2(2);
1534                         pl2yuy2(3);
1535
1536                         yc += 4;
1537                         yc2 += 4;
1538                         uc += 4;
1539                         vc += 4;
1540                         qdst += 4;
1541                         qdst2 += 4;
1542                 }
1543                 y++;
1544                 ysrc += lumStride;
1545                 dst += dstStride;
1546
1547 #elif __WORDSIZE >= 64
1548                 int i;
1549                 uint64_t *ldst = (uint64_t *) dst;
1550                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1551                 for(i = 0; i < chromWidth; i += 2){
1552                         uint64_t k, l;
1553                         k = yc[0] + (uc[0] << 8) +
1554                             (yc[1] << 16) + (vc[0] << 24);
1555                         l = yc[2] + (uc[1] << 8) +
1556                             (yc[3] << 16) + (vc[1] << 24);
1557                         *ldst++ = k + (l << 32);
1558                         yc += 4;
1559                         uc += 2;
1560                         vc += 2;
1561                 }
1562
1563 #else
1564                 int i, *idst = (int32_t *) dst;
1565                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1566                 for(i = 0; i < chromWidth; i++){
1567 #ifdef WORDS_BIGENDIAN
1568                         *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1569                             (yc[1] << 8) + (vc[0] << 0);
1570 #else
1571                         *idst++ = yc[0] + (uc[0] << 8) +
1572                             (yc[1] << 16) + (vc[0] << 24);
1573 #endif
1574                         yc += 2;
1575                         uc++;
1576                         vc++;
1577                 }
1578 #endif
1579 #endif
1580                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1581                 {
1582                         usrc += chromStride;
1583                         vsrc += chromStride;
1584                 }
1585                 ysrc += lumStride;
1586                 dst += dstStride;
1587         }
1588 #ifdef HAVE_MMX
1589 asm(    EMMS" \n\t"
1590         SFENCE" \n\t"
1591         :::"memory");
1592 #endif
1593 }
1594
1595 /**
1596  *
1597  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1598  * problem for anyone then tell me, and I'll fix it)
1599  */
1600 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1601         long width, long height,
1602         long lumStride, long chromStride, long dstStride)
1603 {
1604         //FIXME interpolate chroma
1605         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1606 }
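/*
 * Hedged usage sketch (illustrative only, not part of the original code):
 * converting one tightly packed 640x480 YV12 frame to packed YUY2.  The
 * buffer layout and names below are assumptions made up for the example.
 */
#if 0
static void example_yv12_to_yuy2(const uint8_t *frame, uint8_t *out)
{
        const uint8_t *y = frame;               // 640*480 luma samples
        const uint8_t *v = y + 640*480;         // YV12 stores V before U
        const uint8_t *u = v + 320*240;
        RENAME(yv12toyuy2)(y, u, v, out, 640, 480,
                           640,   // lumStride
                           320,   // chromStride
                           1280); // dstStride: 2 bytes per output pixel
}
#endif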
1607
1608 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1609         long width, long height,
1610         long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1611 {
1612         long y;
1613         const long chromWidth= width>>1;
1614         for(y=0; y<height; y++)
1615         {
1616 #ifdef HAVE_MMX
1617 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1618                 asm volatile(
1619                         "xor %%"REG_a", %%"REG_a"       \n\t"
1620                         ".balign 16                     \n\t"
1621                         "1:                             \n\t"
1622                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1623                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1624                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1625                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1626                         "movq %%mm0, %%mm2              \n\t" // U(0)
1627                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1628                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1629                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1630
1631                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1632                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1633                         "movq %%mm0, %%mm4              \n\t" // UVUV UVUV(0)
1634                         "movq %%mm2, %%mm6              \n\t" // UVUV UVUV(8)
1635                         "punpcklbw %%mm3, %%mm0         \n\t" // UYVY UYVY(0)
1636                         "punpckhbw %%mm3, %%mm4         \n\t" // UYVY UYVY(4)
1637                         "punpcklbw %%mm5, %%mm2         \n\t" // UYVY UYVY(8)
1638                         "punpckhbw %%mm5, %%mm6         \n\t" // UYVY UYVY(12)
1639
1640                         MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1641                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1642                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1643                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1644
1645                         "add $8, %%"REG_a"              \n\t"
1646                         "cmp %4, %%"REG_a"              \n\t"
1647                         " jb 1b                         \n\t"
1648                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1649                         : "%"REG_a
1650                 );
1651 #else
1652 //FIXME adapt the Alpha asm code from yv12->yuy2
1653
1654 #if __WORDSIZE >= 64
1655                 int i;
1656                 uint64_t *ldst = (uint64_t *) dst;
1657                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1658                 for(i = 0; i < chromWidth; i += 2){
1659                         uint64_t k, l;
1660                         k = uc[0] + (yc[0] << 8) +
1661                             (vc[0] << 16) + (yc[1] << 24);
1662                         l = uc[1] + (yc[2] << 8) +
1663                             (vc[1] << 16) + (yc[3] << 24);
1664                         *ldst++ = k + (l << 32);
1665                         yc += 4;
1666                         uc += 2;
1667                         vc += 2;
1668                 }
1669
1670 #else
1671                 int i, *idst = (int32_t *) dst;
1672                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1673                 for(i = 0; i < chromWidth; i++){
1674 #ifdef WORDS_BIGENDIAN
1675                         *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1676                             (vc[0] << 8) + (yc[1] << 0);
1677 #else
1678                         *idst++ = uc[0] + (yc[0] << 8) +
1679                             (vc[0] << 16) + (yc[1] << 24);
1680 #endif
1681                         yc += 2;
1682                         uc++;
1683                         vc++;
1684                 }
1685 #endif
1686 #endif
1687                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1688                 {
1689                         usrc += chromStride;
1690                         vsrc += chromStride;
1691                 }
1692                 ysrc += lumStride;
1693                 dst += dstStride;
1694         }
1695 #ifdef HAVE_MMX
1696 asm(    EMMS" \n\t"
1697         SFENCE" \n\t"
1698         :::"memory");
1699 #endif
1700 }
1701
1702 /**
1703  *
1704  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1705  * problem for anyone then tell me, and I'll fix it)
1706  */
1707 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1708         long width, long height,
1709         long lumStride, long chromStride, long dstStride)
1710 {
1711         //FIXME interpolate chroma
1712         RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1713 }
1714
1715 /**
1716  *
1717  * width should be a multiple of 16
1718  */
1719 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1720         long width, long height,
1721         long lumStride, long chromStride, long dstStride)
1722 {
1723         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1724 }
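/*
 * Note: yuvPlanartoyuy2() only advances the chroma pointers once every
 * vertLumPerChroma luma lines, so the value 1 passed here matches 4:2:2
 * input (one chroma line per luma line), while the value 2 used by
 * yv12toyuy2() above matches 4:2:0 input (one chroma line per two luma lines).
 */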
1725
1726 /**
1727  *
1728  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1729  * problem for anyone then tell me, and I'll fix it)
1730  */
1731 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1732         long width, long height,
1733         long lumStride, long chromStride, long srcStride)
1734 {
1735         long y;
1736         const long chromWidth= width>>1;
1737         for(y=0; y<height; y+=2)
1738         {
1739 #ifdef HAVE_MMX
1740                 asm volatile(
1741                         "xor %%"REG_a", %%"REG_a"       \n\t"
1742                         "pcmpeqw %%mm7, %%mm7           \n\t"
1743                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1744                         ".balign 16                     \n\t"
1745                         "1:                             \n\t"
1746                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1747                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1748                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1749                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1750                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1751                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1752                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1753                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1754                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1755                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1756                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1757
1758                         MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1759
1760                         "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1761                         "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1762                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1763                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1764                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1765                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1766                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1767                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1768                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1769                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1770
1771                         MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1772
1773                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1774                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1775                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1776                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1777                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1778                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1779                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1780                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1781
1782                         MOVNTQ" %%mm0, (%3, %%"REG_a")  \n\t"
1783                         MOVNTQ" %%mm2, (%2, %%"REG_a")  \n\t"
1784
1785                         "add $8, %%"REG_a"              \n\t"
1786                         "cmp %4, %%"REG_a"              \n\t"
1787                         " jb 1b                         \n\t"
1788                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1789                         : "memory", "%"REG_a
1790                 );
1791
1792                 ydst += lumStride;
1793                 src  += srcStride;
1794
1795                 asm volatile(
1796                         "xor %%"REG_a", %%"REG_a"       \n\t"
1797                         ".balign 16                     \n\t"
1798                         "1:                             \n\t"
1799                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1800                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1801                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1802                         "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1803                         "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1804                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1805                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1806                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1807                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1808                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1809                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1810
1811                         MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1812                         MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1813
1814                         "add $8, %%"REG_a"              \n\t"
1815                         "cmp %4, %%"REG_a"              \n\t"
1816                         " jb 1b                         \n\t"
1817
1818                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1819                         : "memory", "%"REG_a
1820                 );
1821 #else
1822                 long i;
1823                 for(i=0; i<chromWidth; i++)
1824                 {
1825                         ydst[2*i+0]     = src[4*i+0];
1826                         udst[i]         = src[4*i+1];
1827                         ydst[2*i+1]     = src[4*i+2];
1828                         vdst[i]         = src[4*i+3];
1829                 }
1830                 ydst += lumStride;
1831                 src  += srcStride;
1832
1833                 for(i=0; i<chromWidth; i++)
1834                 {
1835                         ydst[2*i+0]     = src[4*i+0];
1836                         ydst[2*i+1]     = src[4*i+2];
1837                 }
1838 #endif
1839                 udst += chromStride;
1840                 vdst += chromStride;
1841                 ydst += lumStride;
1842                 src  += srcStride;
1843         }
1844 #ifdef HAVE_MMX
1845 asm volatile(   EMMS" \n\t"
1846                 SFENCE" \n\t"
1847                 :::"memory");
1848 #endif
1849 }
1850
1851 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1852         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1853         long width, long height, long lumStride, long chromStride)
1854 {
1855         /* Y Plane */
1856         memcpy(ydst, ysrc, width*height);
1857
1858         /* XXX: implement upscaling for U,V */
1859 }
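/*
 * One possible way to fill in the missing chroma handling flagged above:
 * YVU9 chroma planes are width/4 x height/4 while YV12 expects
 * width/2 x height/2, so each source sample has to cover a 2x2 block of the
 * destination plane.  A minimal nearest-neighbour sketch follows; the
 * function name and interface are illustrative only, not the original code.
 */
#if 0
static void example_chroma_upscale_2x(const uint8_t *src, uint8_t *dst,
                                      long dstWidth, long dstHeight,
                                      long srcStride, long dstStride)
{
        long x, y;
        for(y=0; y<dstHeight; y++)
                for(x=0; x<dstWidth; x++)
                        dst[y*dstStride + x] = src[(y>>1)*srcStride + (x>>1)];
}
#endif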
1860
1861 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1862 {
1863         long x,y;
1864         
1865         dst[0]= src[0];
1866         
1867         // first line
1868         for(x=0; x<srcWidth-1; x++){
1869                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1870                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1871         }
1872         dst[2*srcWidth-1]= src[srcWidth-1];
1873         
1874         dst+= dstStride;
1875
1876         for(y=1; y<srcHeight; y++){
1877 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1878                 const long mmxSize= srcWidth&~15;
1879                 asm volatile(
1880                         "mov %4, %%"REG_a"              \n\t"
1881                         "1:                             \n\t"
1882                         "movq (%0, %%"REG_a"), %%mm0    \n\t"
1883                         "movq (%1, %%"REG_a"), %%mm1    \n\t"
1884                         "movq 1(%0, %%"REG_a"), %%mm2   \n\t"
1885                         "movq 1(%1, %%"REG_a"), %%mm3   \n\t"
1886                         "movq -1(%0, %%"REG_a"), %%mm4  \n\t"
1887                         "movq -1(%1, %%"REG_a"), %%mm5  \n\t"
1888                         PAVGB" %%mm0, %%mm5             \n\t"
1889                         PAVGB" %%mm0, %%mm3             \n\t"
1890                         PAVGB" %%mm0, %%mm5             \n\t"
1891                         PAVGB" %%mm0, %%mm3             \n\t"
1892                         PAVGB" %%mm1, %%mm4             \n\t"
1893                         PAVGB" %%mm1, %%mm2             \n\t"
1894                         PAVGB" %%mm1, %%mm4             \n\t"
1895                         PAVGB" %%mm1, %%mm2             \n\t"
1896                         "movq %%mm5, %%mm7              \n\t"
1897                         "movq %%mm4, %%mm6              \n\t"
1898                         "punpcklbw %%mm3, %%mm5         \n\t"
1899                         "punpckhbw %%mm3, %%mm7         \n\t"
1900                         "punpcklbw %%mm2, %%mm4         \n\t"
1901                         "punpckhbw %%mm2, %%mm6         \n\t"
1902 #if 1
1903                         MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1904                         MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1905                         MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1906                         MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1907 #else
1908                         "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1909                         "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1910                         "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1911                         "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1912 #endif
1913                         "add $8, %%"REG_a"              \n\t"
1914                         " js 1b                         \n\t"
1915                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1916                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1917                            "g" (-mmxSize)
1918                         : "%"REG_a
1919
1920                 );
1921 #else
1922                 const long mmxSize=1;
1923 #endif
1924                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1925                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1926
1927                 for(x=mmxSize-1; x<srcWidth-1; x++){
1928                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1929                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1930                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1931                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1932                 }
1933                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1934                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1935
1936                 dst+=dstStride*2;
1937                 src+=srcStride;
1938         }
1939         
1940         // last line
1941 #if 1
1942         dst[0]= src[0];
1943         
1944         for(x=0; x<srcWidth-1; x++){
1945                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1946                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1947         }
1948         dst[2*srcWidth-1]= src[srcWidth-1];
1949 #else
1950         for(x=0; x<srcWidth; x++){
1951                 dst[2*x+0]=
1952                 dst[2*x+1]= src[x];
1953         }
1954 #endif
1955
1956 #ifdef HAVE_MMX
1957 asm volatile(   EMMS" \n\t"
1958                 SFENCE" \n\t"
1959                 :::"memory");
1960 #endif
1961 }
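/*
 * planar2x() doubles a plane in both directions.  On the first and last
 * output lines each pair of new samples between source samples a and b is
 * weighted 3:1 and 1:3, i.e. (3*a + b)/4 and (a + 3*b)/4; for example
 * a=100, b=200 yields (3*100+200)>>2 = 125 and (100+3*200)>>2 = 175.  The
 * interior lines apply the same 3:1 weighting between a sample and its
 * diagonal neighbour on the next source line.
 */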
1962
1963 /**
1964  *
1965  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1966  * problem for anyone then tell me, and I'll fix it)
1967  * chrominance data is only taken from every second line; others are ignored. FIXME: write an HQ version
1968  */
1969 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1970         long width, long height,
1971         long lumStride, long chromStride, long srcStride)
1972 {
1973         long y;
1974         const long chromWidth= width>>1;
1975         for(y=0; y<height; y+=2)
1976         {
1977 #ifdef HAVE_MMX
1978                 asm volatile(
1979                         "xorl %%eax, %%eax              \n\t"
1980                         "pcmpeqw %%mm7, %%mm7           \n\t"
1981                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1982                         ".balign 16                     \n\t"
1983                         "1:                             \n\t"
1984                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1985                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1986                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1987                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
1988                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
1989                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
1990                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
1991                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1992                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1993                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1994                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1995
1996                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1997
1998                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
1999                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
2000                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
2001                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
2002                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
2003                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
2004                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
2005                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
2006                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
2007                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
2008
2009                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
2010
2011                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
2012                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
2013                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
2014                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
2015                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
2016                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
2017                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
2018                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
2019
2020                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
2021                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
2022
2023                         "addl $8, %%eax                 \n\t"
2024                         "cmpl %4, %%eax                 \n\t"
2025                         " jb 1b                         \n\t"
2026                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2027                         : "memory", "%eax"
2028                 );
2029
2030                 ydst += lumStride;
2031                 src  += srcStride;
2032
2033                 asm volatile(
2034                         "xorl %%eax, %%eax              \n\t"
2035                         ".balign 16                     \n\t"
2036                         "1:                             \n\t"
2037                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
2038                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
2039                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
2040                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(8)
2041                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // UYVY UYVY(12)
2042                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2043                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2044                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2045                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2046                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
2047                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
2048
2049                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
2050                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
2051
2052                         "addl $8, %%eax                 \n\t"
2053                         "cmpl %4, %%eax                 \n\t"
2054                         " jb 1b                         \n\t"
2055
2056                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2057                         : "memory", "%eax"
2058                 );
2059 #else
2060                 long i;
2061                 for(i=0; i<chromWidth; i++)
2062                 {
2063                         udst[i]         = src[4*i+0];
2064                         ydst[2*i+0]     = src[4*i+1];
2065                         vdst[i]         = src[4*i+2];
2066                         ydst[2*i+1]     = src[4*i+3];
2067                 }
2068                 ydst += lumStride;
2069                 src  += srcStride;
2070
2071                 for(i=0; i<chromWidth; i++)
2072                 {
2073                         ydst[2*i+0]     = src[4*i+1];
2074                         ydst[2*i+1]     = src[4*i+3];
2075                 }
2076 #endif
2077                 udst += chromStride;
2078                 vdst += chromStride;
2079                 ydst += lumStride;
2080                 src  += srcStride;
2081         }
2082 #ifdef HAVE_MMX
2083 asm volatile(   EMMS" \n\t"
2084                 SFENCE" \n\t"
2085                 :::"memory");
2086 #endif
2087 }
2088
2089 /**
2090  *
2091  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2092  * problem for anyone then tell me, and I'll fix it)
2093  * chrominance data is only taken from every second line; others are ignored in the C version. FIXME: write an HQ version
2094  */
2095 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2096         long width, long height,
2097         long lumStride, long chromStride, long srcStride)
2098 {
2099         long y;
2100         const long chromWidth= width>>1;
2101 #ifdef HAVE_MMX
2102         for(y=0; y<height-2; y+=2)
2103         {
2104                 long i;
2105                 for(i=0; i<2; i++)
2106                 {
2107                         asm volatile(
2108                                 "mov %2, %%"REG_a"              \n\t"
2109                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
2110                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
2111                                 "pxor %%mm7, %%mm7              \n\t"
2112                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2113                                 ".balign 16                     \n\t"
2114                                 "1:                             \n\t"
2115                                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
2116                                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
2117                                 "movd 3(%0, %%"REG_b"), %%mm1   \n\t"
2118                                 "punpcklbw %%mm7, %%mm0         \n\t"
2119                                 "punpcklbw %%mm7, %%mm1         \n\t"
2120                                 "movd 6(%0, %%"REG_b"), %%mm2   \n\t"
2121                                 "movd 9(%0, %%"REG_b"), %%mm3   \n\t"
2122                                 "punpcklbw %%mm7, %%mm2         \n\t"
2123                                 "punpcklbw %%mm7, %%mm3         \n\t"
2124                                 "pmaddwd %%mm6, %%mm0           \n\t"
2125                                 "pmaddwd %%mm6, %%mm1           \n\t"
2126                                 "pmaddwd %%mm6, %%mm2           \n\t"
2127                                 "pmaddwd %%mm6, %%mm3           \n\t"
2128 #ifndef FAST_BGR2YV12
2129                                 "psrad $8, %%mm0                \n\t"
2130                                 "psrad $8, %%mm1                \n\t"
2131                                 "psrad $8, %%mm2                \n\t"
2132                                 "psrad $8, %%mm3                \n\t"
2133 #endif
2134                                 "packssdw %%mm1, %%mm0          \n\t"
2135                                 "packssdw %%mm3, %%mm2          \n\t"
2136                                 "pmaddwd %%mm5, %%mm0           \n\t"
2137                                 "pmaddwd %%mm5, %%mm2           \n\t"
2138                                 "packssdw %%mm2, %%mm0          \n\t"
2139                                 "psraw $7, %%mm0                \n\t"
2140
2141                                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
2142                                 "movd 15(%0, %%"REG_b"), %%mm1  \n\t"
2143                                 "punpcklbw %%mm7, %%mm4         \n\t"
2144                                 "punpcklbw %%mm7, %%mm1         \n\t"
2145                                 "movd 18(%0, %%"REG_b"), %%mm2  \n\t"
2146                                 "movd 21(%0, %%"REG_b"), %%mm3  \n\t"
2147                                 "punpcklbw %%mm7, %%mm2         \n\t"
2148                                 "punpcklbw %%mm7, %%mm3         \n\t"
2149                                 "pmaddwd %%mm6, %%mm4           \n\t"
2150                                 "pmaddwd %%mm6, %%mm1           \n\t"
2151                                 "pmaddwd %%mm6, %%mm2           \n\t"
2152                                 "pmaddwd %%mm6, %%mm3           \n\t"
2153 #ifndef FAST_BGR2YV12
2154                                 "psrad $8, %%mm4                \n\t"
2155                                 "psrad $8, %%mm1                \n\t"
2156                                 "psrad $8, %%mm2                \n\t"
2157                                 "psrad $8, %%mm3                \n\t"
2158 #endif
2159                                 "packssdw %%mm1, %%mm4          \n\t"
2160                                 "packssdw %%mm3, %%mm2          \n\t"
2161                                 "pmaddwd %%mm5, %%mm4           \n\t"
2162                                 "pmaddwd %%mm5, %%mm2           \n\t"
2163                                 "add $24, %%"REG_b"             \n\t"
2164                                 "packssdw %%mm2, %%mm4          \n\t"
2165                                 "psraw $7, %%mm4                \n\t"
2166
2167                                 "packuswb %%mm4, %%mm0          \n\t"
2168                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
2169
2170                                 MOVNTQ" %%mm0, (%1, %%"REG_a")  \n\t"
2171                                 "add $8, %%"REG_a"              \n\t"
2172                                 " js 1b                         \n\t"
2173                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2174                                 : "%"REG_a, "%"REG_b
2175                         );
2176                         ydst += lumStride;
2177                         src  += srcStride;
2178                 }
2179                 src -= srcStride*2;
2180                 asm volatile(
2181                         "mov %4, %%"REG_a"              \n\t"
2182                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2183                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
2184                         "pxor %%mm7, %%mm7              \n\t"
2185                         "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2186                         "add %%"REG_b", %%"REG_b"       \n\t"
2187                         ".balign 16                     \n\t"
2188                         "1:                             \n\t"
2189                         PREFETCH" 64(%0, %%"REG_b")     \n\t"
2190                         PREFETCH" 64(%1, %%"REG_b")     \n\t"
2191 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2192                         "movq (%0, %%"REG_b"), %%mm0    \n\t"
2193                         "movq (%1, %%"REG_b"), %%mm1    \n\t"
2194                         "movq 6(%0, %%"REG_b"), %%mm2   \n\t"
2195                         "movq 6(%1, %%"REG_b"), %%mm3   \n\t"
2196                         PAVGB" %%mm1, %%mm0             \n\t"
2197                         PAVGB" %%mm3, %%mm2             \n\t"
2198                         "movq %%mm0, %%mm1              \n\t"
2199                         "movq %%mm2, %%mm3              \n\t"
2200                         "psrlq $24, %%mm0               \n\t"
2201                         "psrlq $24, %%mm2               \n\t"
2202                         PAVGB" %%mm1, %%mm0             \n\t"
2203                         PAVGB" %%mm3, %%mm2             \n\t"
2204                         "punpcklbw %%mm7, %%mm0         \n\t"
2205                         "punpcklbw %%mm7, %%mm2         \n\t"
2206 #else
2207                         "movd (%0, %%"REG_b"), %%mm0    \n\t"
2208                         "movd (%1, %%"REG_b"), %%mm1    \n\t"
2209                         "movd 3(%0, %%"REG_b"), %%mm2   \n\t"
2210                         "movd 3(%1, %%"REG_b"), %%mm3   \n\t"
2211                         "punpcklbw %%mm7, %%mm0         \n\t"
2212                         "punpcklbw %%mm7, %%mm1         \n\t"
2213                         "punpcklbw %%mm7, %%mm2         \n\t"
2214                         "punpcklbw %%mm7, %%mm3         \n\t"
2215                         "paddw %%mm1, %%mm0             \n\t"
2216                         "paddw %%mm3, %%mm2             \n\t"
2217                         "paddw %%mm2, %%mm0             \n\t"
2218                         "movd 6(%0, %%"REG_b"), %%mm4   \n\t"
2219                         "movd 6(%1, %%"REG_b"), %%mm1   \n\t"
2220                         "movd 9(%0, %%"REG_b"), %%mm2   \n\t"
2221                         "movd 9(%1, %%"REG_b"), %%mm3   \n\t"
2222                         "punpcklbw %%mm7, %%mm4         \n\t"
2223                         "punpcklbw %%mm7, %%mm1         \n\t"
2224                         "punpcklbw %%mm7, %%mm2         \n\t"
2225                         "punpcklbw %%mm7, %%mm3         \n\t"
2226                         "paddw %%mm1, %%mm4             \n\t"
2227                         "paddw %%mm3, %%mm2             \n\t"
2228                         "paddw %%mm4, %%mm2             \n\t"
2229                         "psrlw $2, %%mm0                \n\t"
2230                         "psrlw $2, %%mm2                \n\t"
2231 #endif
2232                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2233                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2234
2235                         "pmaddwd %%mm0, %%mm1           \n\t"
2236                         "pmaddwd %%mm2, %%mm3           \n\t"
2237                         "pmaddwd %%mm6, %%mm0           \n\t"
2238                         "pmaddwd %%mm6, %%mm2           \n\t"
2239 #ifndef FAST_BGR2YV12
2240                         "psrad $8, %%mm0                \n\t"
2241                         "psrad $8, %%mm1                \n\t"
2242                         "psrad $8, %%mm2                \n\t"
2243                         "psrad $8, %%mm3                \n\t"
2244 #endif
2245                         "packssdw %%mm2, %%mm0          \n\t"
2246                         "packssdw %%mm3, %%mm1          \n\t"
2247                         "pmaddwd %%mm5, %%mm0           \n\t"
2248                         "pmaddwd %%mm5, %%mm1           \n\t"
2249                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2250                         "psraw $7, %%mm0                \n\t"
2251
2252 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2253                         "movq 12(%0, %%"REG_b"), %%mm4  \n\t"
2254                         "movq 12(%1, %%"REG_b"), %%mm1  \n\t"
2255                         "movq 18(%0, %%"REG_b"), %%mm2  \n\t"
2256                         "movq 18(%1, %%"REG_b"), %%mm3  \n\t"
2257                         PAVGB" %%mm1, %%mm4             \n\t"
2258                         PAVGB" %%mm3, %%mm2             \n\t"
2259                         "movq %%mm4, %%mm1              \n\t"
2260                         "movq %%mm2, %%mm3              \n\t"
2261                         "psrlq $24, %%mm4               \n\t"
2262                         "psrlq $24, %%mm2               \n\t"
2263                         PAVGB" %%mm1, %%mm4             \n\t"
2264                         PAVGB" %%mm3, %%mm2             \n\t"
2265                         "punpcklbw %%mm7, %%mm4         \n\t"
2266                         "punpcklbw %%mm7, %%mm2         \n\t"
2267 #else
2268                         "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
2269                         "movd 12(%1, %%"REG_b"), %%mm1  \n\t"
2270                         "movd 15(%0, %%"REG_b"), %%mm2  \n\t"
2271                         "movd 15(%1, %%"REG_b"), %%mm3  \n\t"
2272                         "punpcklbw %%mm7, %%mm4         \n\t"
2273                         "punpcklbw %%mm7, %%mm1         \n\t"
2274                         "punpcklbw %%mm7, %%mm2         \n\t"
2275                         "punpcklbw %%mm7, %%mm3         \n\t"
2276                         "paddw %%mm1, %%mm4             \n\t"
2277                         "paddw %%mm3, %%mm2             \n\t"
2278                         "paddw %%mm2, %%mm4             \n\t"
2279                         "movd 18(%0, %%"REG_b"), %%mm5  \n\t"
2280                         "movd 18(%1, %%"REG_b"), %%mm1  \n\t"
2281                         "movd 21(%0, %%"REG_b"), %%mm2  \n\t"
2282                         "movd 21(%1, %%"REG_b"), %%mm3  \n\t"
2283                         "punpcklbw %%mm7, %%mm5         \n\t"
2284                         "punpcklbw %%mm7, %%mm1         \n\t"
2285                         "punpcklbw %%mm7, %%mm2         \n\t"
2286                         "punpcklbw %%mm7, %%mm3         \n\t"
2287                         "paddw %%mm1, %%mm5             \n\t"
2288                         "paddw %%mm3, %%mm2             \n\t"
2289                         "paddw %%mm5, %%mm2             \n\t"
2290                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2291                         "psrlw $2, %%mm4                \n\t"
2292                         "psrlw $2, %%mm2                \n\t"
2293 #endif
2294                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2295                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2296
2297                         "pmaddwd %%mm4, %%mm1           \n\t"
2298                         "pmaddwd %%mm2, %%mm3           \n\t"
2299                         "pmaddwd %%mm6, %%mm4           \n\t"
2300                         "pmaddwd %%mm6, %%mm2           \n\t"
2301 #ifndef FAST_BGR2YV12
2302                         "psrad $8, %%mm4                \n\t"
2303                         "psrad $8, %%mm1                \n\t"
2304                         "psrad $8, %%mm2                \n\t"
2305                         "psrad $8, %%mm3                \n\t"
2306 #endif
2307                         "packssdw %%mm2, %%mm4          \n\t"
2308                         "packssdw %%mm3, %%mm1          \n\t"
2309                         "pmaddwd %%mm5, %%mm4           \n\t"
2310                         "pmaddwd %%mm5, %%mm1           \n\t"
2311                         "add $24, %%"REG_b"             \n\t"
2312                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2313                         "psraw $7, %%mm4                \n\t"
2314
2315                         "movq %%mm0, %%mm1              \n\t"
2316                         "punpckldq %%mm4, %%mm0         \n\t"
2317                         "punpckhdq %%mm4, %%mm1         \n\t"
2318                         "packsswb %%mm1, %%mm0          \n\t"
2319                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2320                         "movd %%mm0, (%2, %%"REG_a")    \n\t"
2321                         "punpckhdq %%mm0, %%mm0         \n\t"
2322                         "movd %%mm0, (%3, %%"REG_a")    \n\t"
2323                         "add $4, %%"REG_a"              \n\t"
2324                         " js 1b                         \n\t"
2325                         : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2326                         : "%"REG_a, "%"REG_b
2327                 );
2328
2329                 udst += chromStride;
2330                 vdst += chromStride;
2331                 src  += srcStride*2;
2332         }
2333
2334         asm volatile(   EMMS" \n\t"
2335                         SFENCE" \n\t"
2336                         :::"memory");
2337 #else
2338         y=0;
2339 #endif
2340         for(; y<height; y+=2)
2341         {
2342                 long i;
2343                 for(i=0; i<chromWidth; i++)
2344                 {
2345                         unsigned int b= src[6*i+0];
2346                         unsigned int g= src[6*i+1];
2347                         unsigned int r= src[6*i+2];
2348
2349                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2350                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2351                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2352
2353                         udst[i]         = U;
2354                         vdst[i]         = V;
2355                         ydst[2*i]       = Y;
2356
2357                         b= src[6*i+3];
2358                         g= src[6*i+4];
2359                         r= src[6*i+5];
2360
2361                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2362                         ydst[2*i+1]     = Y;
2363                 }
2364                 ydst += lumStride;
2365                 src  += srcStride;
2366
2367                 for(i=0; i<chromWidth; i++)
2368                 {
2369                         unsigned int b= src[6*i+0];
2370                         unsigned int g= src[6*i+1];
2371                         unsigned int r= src[6*i+2];
2372
2373                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2374
2375                         ydst[2*i]       = Y;
2376
2377                         b= src[6*i+3];
2378                         g= src[6*i+4];
2379                         r= src[6*i+5];
2380
2381                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2382                         ydst[2*i+1]     = Y;
2383                 }
2384                 udst += chromStride;
2385                 vdst += chromStride;
2386                 ydst += lumStride;
2387                 src  += srcStride;
2388         }
2389 }
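/*
 * For reference, the scalar path above computes
 *     Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16
 * and offsets U and V by 128.  Assuming the usual BT.601 limited-range
 * coefficients (roughly 0.257, 0.504 and 0.098 scaled by 1<<RGB2YUV_SHIFT),
 * pure black maps to Y = 16 and pure white (255,255,255) to roughly
 * 0.859*255 + 16 = 235.
 */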
2390
2391 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2392                             long width, long height, long src1Stride,
2393                             long src2Stride, long dstStride){
2394         long h;
2395
2396         for(h=0; h < height; h++)
2397         {
2398                 long w;
2399
2400 #ifdef HAVE_MMX
2401 #ifdef HAVE_SSE2
2402                 asm(
2403                         "xor %%"REG_a", %%"REG_a"       \n\t"
2404                         "1:                             \n\t"
2405                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2406                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2407                         "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2408                         "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2409                         "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2410                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2411                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2412                         "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2413                         "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2414                         "add $16, %%"REG_a"             \n\t"
2415                         "cmp %3, %%"REG_a"              \n\t"
2416                         " jb 1b                         \n\t"
2417                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2418                         : "memory", "%"REG_a""
2419                 );
2420 #else
2421                 asm(
2422                         "xor %%"REG_a", %%"REG_a"       \n\t"
2423                         "1:                             \n\t"
2424                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2425                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2426                         "movq (%1, %%"REG_a"), %%mm0    \n\t"
2427                         "movq 8(%1, %%"REG_a"), %%mm2   \n\t"
2428                         "movq %%mm0, %%mm1              \n\t"
2429                         "movq %%mm2, %%mm3              \n\t"
2430                         "movq (%2, %%"REG_a"), %%mm4    \n\t"
2431                         "movq 8(%2, %%"REG_a"), %%mm5   \n\t"
2432                         "punpcklbw %%mm4, %%mm0         \n\t"
2433                         "punpckhbw %%mm4, %%mm1         \n\t"
2434                         "punpcklbw %%mm5, %%mm2         \n\t"
2435                         "punpckhbw %%mm5, %%mm3         \n\t"
2436                         MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2437                         MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2438                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2439                         MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2440                         "add $16, %%"REG_a"             \n\t"
2441                         "cmp %3, %%"REG_a"              \n\t"
2442                         " jb 1b                         \n\t"
2443                         ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2444                         : "memory", "%"REG_a
2445                 );
2446 #endif
2447                 for(w= (width&(~15)); w < width; w++)
2448                 {
2449                         dest[2*w+0] = src1[w];
2450                         dest[2*w+1] = src2[w];
2451                 }
2452 #else
2453                 for(w=0; w < width; w++)
2454                 {
2455                         dest[2*w+0] = src1[w];
2456                         dest[2*w+1] = src2[w];
2457                 }
2458 #endif
2459                 dest += dstStride;
2460                 src1 += src1Stride;
2461                 src2 += src2Stride;
2462         }
2463 #ifdef HAVE_MMX
2464         asm(
2465                 EMMS" \n\t"
2466                 SFENCE" \n\t"
2467                 ::: "memory"
2468                 );
2469 #endif
2470 }
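/*
 * Hedged usage sketch (illustrative only, not part of the original code):
 * interleaving two equally sized 8-bit planes into one packed plane, for
 * example a pair of hypothetical 320x240 U and V planes.  All names and
 * sizes are made up for the example.
 */
#if 0
static void example_interleave_uv(uint8_t *u, uint8_t *v, uint8_t *uv)
{
        RENAME(interleaveBytes)(u, v, uv, 320, 240,
                                320,  // src1Stride
                                320,  // src2Stride
                                640); // dstStride: two bytes per input byte
}
#endif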
2471
2472 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2473                         uint8_t *dst1, uint8_t *dst2,
2474                         long width, long height,
2475                         long srcStride1, long srcStride2,
2476                         long dstStride1, long dstStride2)
2477 {
2478     long y,x,w,h;
2479     w=width/2; h=height/2;
2480 #ifdef HAVE_MMX
2481     asm volatile(
2482         PREFETCH" %0\n\t"
2483         PREFETCH" %1\n\t"
2484         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2485 #endif
2486     for(y=0;y<h;y++){
2487         const uint8_t* s1=src1+srcStride1*(y>>1);
2488         uint8_t* d=dst1+dstStride1*y;
2489         x=0;
2490 #ifdef HAVE_MMX
2491         for(;x<w-31;x+=32)
2492         {
2493             asm volatile(
2494                 PREFETCH" 32%1\n\t"
2495                 "movq   %1, %%mm0\n\t"
2496                 "movq   8%1, %%mm2\n\t"
2497                 "movq   16%1, %%mm4\n\t"
2498                 "movq   24%1, %%mm6\n\t"
2499                 "movq   %%mm0, %%mm1\n\t"
2500                 "movq   %%mm2, %%mm3\n\t"
2501                 "movq   %%mm4, %%mm5\n\t"
2502                 "movq   %%mm6, %%mm7\n\t"
2503                 "punpcklbw %%mm0, %%mm0\n\t"
2504                 "punpckhbw %%mm1, %%mm1\n\t"
2505                 "punpcklbw %%mm2, %%mm2\n\t"
2506                 "punpckhbw %%mm3, %%mm3\n\t"
2507                 "punpcklbw %%mm4, %%mm4\n\t"
2508                 "punpckhbw %%mm5, %%mm5\n\t"
2509                 "punpcklbw %%mm6, %%mm6\n\t"
2510                 "punpckhbw %%mm7, %%mm7\n\t"
2511                 MOVNTQ" %%mm0, %0\n\t"
2512                 MOVNTQ" %%mm1, 8%0\n\t"
2513                 MOVNTQ" %%mm2, 16%0\n\t"
2514                 MOVNTQ" %%mm3, 24%0\n\t"
2515                 MOVNTQ" %%mm4, 32%0\n\t"
2516                 MOVNTQ" %%mm5, 40%0\n\t"
2517                 MOVNTQ" %%mm6, 48%0\n\t"
2518                 MOVNTQ" %%mm7, 56%0"
2519                 :"=m"(d[2*x])
2520                 :"m"(s1[x])
2521                 :"memory");
2522         }
2523 #endif
2524         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2525     }
2526     for(y=0;y<h;y++){
2527         const uint8_t* s2=src2+srcStride2*(y>>1);
2528         uint8_t* d=dst2+dstStride2*y;
2529         x=0;
2530 #ifdef HAVE_MMX
2531         for(;x<w-31;x+=32)
2532         {
2533             asm volatile(
2534                 PREFETCH" 32%1\n\t"
2535                 "movq   %1, %%mm0\n\t"
2536                 "movq   8%1, %%mm2\n\t"
2537                 "movq   16%1, %%mm4\n\t"
2538                 "movq   24%1, %%mm6\n\t"
2539                 "movq   %%mm0, %%mm1\n\t"
2540                 "movq   %%mm2, %%mm3\n\t"
2541                 "movq   %%mm4, %%mm5\n\t"
2542                 "movq   %%mm6, %%mm7\n\t"
2543                 "punpcklbw %%mm0, %%mm0\n\t"
2544                 "punpckhbw %%mm1, %%mm1\n\t"
2545                 "punpcklbw %%mm2, %%mm2\n\t"
2546                 "punpckhbw %%mm3, %%mm3\n\t"
2547                 "punpcklbw %%mm4, %%mm4\n\t"
2548                 "punpckhbw %%mm5, %%mm5\n\t"
2549                 "punpcklbw %%mm6, %%mm6\n\t"
2550                 "punpckhbw %%mm7, %%mm7\n\t"
2551                 MOVNTQ" %%mm0, %0\n\t"
2552                 MOVNTQ" %%mm1, 8%0\n\t"
2553                 MOVNTQ" %%mm2, 16%0\n\t"
2554                 MOVNTQ" %%mm3, 24%0\n\t"
2555                 MOVNTQ" %%mm4, 32%0\n\t"
2556                 MOVNTQ" %%mm5, 40%0\n\t"
2557                 MOVNTQ" %%mm6, 48%0\n\t"
2558                 MOVNTQ" %%mm7, 56%0"
2559                 :"=m"(d[2*x])
2560                 :"m"(s2[x])
2561                 :"memory");
2562         }
2563 #endif
2564         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2565     }
2566 #ifdef HAVE_MMX
2567         asm(
2568                 EMMS" \n\t"
2569                 SFENCE" \n\t"
2570                 ::: "memory"
2571                 );
2572 #endif
2573 }
2574
2575 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2576                         uint8_t *dst,
2577                         long width, long height,
2578                         long srcStride1, long srcStride2,
2579                         long srcStride3, long dstStride)
2580 {
2581     long y,x,w,h;
2582     w=width/2; h=height;
2583     for(y=0;y<h;y++){
2584         const uint8_t* yp=src1+srcStride1*y;
2585         const uint8_t* up=src2+srcStride2*(y>>2);
2586         const uint8_t* vp=src3+srcStride3*(y>>2);
2587         uint8_t* d=dst+dstStride*y;
2588         x=0;
2589 #ifdef HAVE_MMX
2590         for(;x<w-7;x+=8)
2591         {
2592             asm volatile(
2593                 PREFETCH" 32(%1, %0)\n\t"
2594                 PREFETCH" 32(%2, %0)\n\t"
2595                 PREFETCH" 32(%3, %0)\n\t"
2596                 "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2597                 "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2598                 "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
2599                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2600                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2601                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2602                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2603                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2604                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2605                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2606
2607                 "movq   %%mm1, %%mm6\n\t"
2608                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2609                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2610                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2611                 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2612                 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2613                 
2614                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2615                 "movq   8(%1, %0, 4), %%mm0\n\t"
2616                 "movq   %%mm0, %%mm3\n\t"
2617                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2618                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2619                 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2620                 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2621
2622                 "movq   %%mm4, %%mm6\n\t"
2623                 "movq   16(%1, %0, 4), %%mm0\n\t"
2624                 "movq   %%mm0, %%mm3\n\t"
2625                 "punpcklbw %%mm5, %%mm4\n\t"
2626                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2627                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2628                 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2629                 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2630                 
2631                 "punpckhbw %%mm5, %%mm6\n\t"
2632                 "movq   24(%1, %0, 4), %%mm0\n\t"
2633                 "movq   %%mm0, %%mm3\n\t"
2634                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2635                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2636                 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2637                 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2638
2639                 : "+r" (x)
2640                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2641                 :"memory");
2642         }
2643 #endif
2644         for(; x<w; x++)
2645         {
2646             const long x2= x<<2;
2647             d[8*x+0]=yp[x2];
2648             d[8*x+1]=up[x];
2649             d[8*x+2]=yp[x2+1];
2650             d[8*x+3]=vp[x];
2651             d[8*x+4]=yp[x2+2];
2652             d[8*x+5]=up[x];
2653             d[8*x+6]=yp[x2+3];
2654             d[8*x+7]=vp[x];
2655         }
2656     }
2657 #ifdef HAVE_MMX
2658         asm(
2659                 EMMS" \n\t"
2660                 SFENCE" \n\t"
2661                 ::: "memory"
2662                 );
2663 #endif
2664 }